<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Single records of a file split into multiple? in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140320#M19409</link>
    <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/2582/rovboyko.html" nodeid="2582"&gt;@Roman Boyko&lt;/A&gt; Is it possible to handle this scenario using PIG or HIVE?  &lt;/P&gt;&lt;P&gt;I have tried it using HIVE with the statements below:&lt;/P&gt;&lt;PRE&gt;hive&amp;gt; create table test(col1 string, col2 string, col3 string, col4 string, col5 string, col6 string, col7 string, col8 string, col9 string, col10 string) row format delimited fields terminated by '\t' stored as textfile;&lt;/PRE&gt;&lt;PRE&gt;hive&amp;gt; load data local inpath '/tmp/test.txt' into table test;&lt;/PRE&gt;</description>
    <pubDate>Sun, 14 Feb 2016 21:01:26 GMT</pubDate>
    <dc:creator>rushikeshdeshmu</dc:creator>
    <dc:date>2016-02-14T21:01:26Z</dc:date>
    <item>
      <title>Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140317#M19406</link>
      <description>&lt;P&gt;        Hi,&lt;/P&gt;&lt;P&gt;                               I am currently facing an issue where a single record of a file is split across multiple lines.
See the below example. &lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;filename - test.txt&lt;/STRONG&gt; &lt;/P&gt;&lt;P&gt;col1|col2|col3|col4|col5|col6|col7|col8|col9|col10 &lt;/P&gt;&lt;P&gt;qwe|345|point(-10.0, -11.0)|exec|1234|mana &lt;/P&gt;&lt;P&gt;ger|124|team|specia|1100 &lt;/P&gt;&lt;P&gt;mwe|123|point(-0.9,-1.0)|exec|4563|superman|134|team|special|1101 &lt;/P&gt;&lt;P&gt;                             The above one is the sample file I have in mentioned schema and two sample records.
The first record is split across two lines, so when I try to load it into Hive I get null values.
The second record's values load correctly. Currently we are handling this with Pig but are not getting accurate values. Can anyone suggest the best way of handling this scenario?&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Rushikesh&lt;/P&gt;</description>
      <pubDate>Sat, 13 Feb 2016 20:48:47 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140317#M19406</guid>
      <dc:creator>rushikeshdeshmu</dc:creator>
      <dc:date>2016-02-13T20:48:47Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140318#M19407</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/2769/rushikeshdeshmukh007.html" nodeid="2769"&gt;@Rushikesh Deshmukh&lt;/A&gt; please provide load statement&lt;/P&gt;</description>
      <pubDate>Sat, 13 Feb 2016 21:29:30 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140318#M19407</guid>
      <dc:creator>aervits</dc:creator>
      <dc:date>2016-02-13T21:29:30Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140319#M19408</link>
      <description>&lt;P&gt;Hi! &lt;/P&gt;&lt;P&gt;I think the best way is to write a MapReduce job that corrects the data in the file, based on the real and expected count of delimiters in each record.&lt;/P&gt;</description>
      <pubDate>Sun, 14 Feb 2016 02:48:18 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140319#M19408</guid>
      <dc:creator>ro_v_boyko</dc:creator>
      <dc:date>2016-02-14T02:48:18Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140320#M19409</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/2582/rovboyko.html" nodeid="2582"&gt;@Roman Boyko&lt;/A&gt; Is it possible to handle this scenario using PIG or HIVE?  &lt;/P&gt;&lt;P&gt;I have tried it using HIVE with the statements below:&lt;/P&gt;&lt;PRE&gt;hive&amp;gt; create table test(col1 string, col2 string, col3 string, col4 string, col5 string, col6 string, col7 string, col8 string, col9 string, col10 string) row format delimited fields terminated by '\t' stored as textfile;&lt;/PRE&gt;&lt;PRE&gt;hive&amp;gt; load data local inpath '/tmp/test.txt' into table test;&lt;/PRE&gt;</description>
      <pubDate>Sun, 14 Feb 2016 21:01:26 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140320#M19409</guid>
      <dc:creator>rushikeshdeshmu</dc:creator>
      <dc:date>2016-02-14T21:01:26Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140321#M19410</link>
      <description>&lt;P&gt;
	&lt;A href="https://community.hortonworks.com/users/2769/rushikeshdeshmukh007.html"&gt;@Rushikesh Deshmukh&lt;/A&gt; You can use lead and lag functions in Hive, but in this case you'll face many constraints (e.g. only one spurious record delimiter in row, no null columns and so on).&lt;/P&gt;&lt;P&gt;
	Or you can try to use Pig like &lt;A href="http://stackoverflow.com/questions/30798090/apache-pig-set-date-of-current-row-as-next-records-date-minus-one-day-for-a-giv"&gt;this&lt;/A&gt; example.&lt;/P&gt;</description>
      <pubDate>Mon, 15 Feb 2016 11:09:05 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140321#M19410</guid>
      <dc:creator>ro_v_boyko</dc:creator>
      <dc:date>2016-02-15T11:09:05Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140322#M19411</link>
      <description>&lt;P&gt;@&lt;A href="https://community.hortonworks.com/users/2582/rovboyko.html"&gt;Roman Boyko&lt;/A&gt;, thanks for sharing this information and link.&lt;/P&gt;</description>
      <pubDate>Sat, 20 Feb 2016 21:45:47 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140322#M19411</guid>
      <dc:creator>rushikeshdeshmu</dc:creator>
      <dc:date>2016-02-20T21:45:47Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140323#M19412</link>
      <description>&lt;P&gt;@&lt;A href="https://community.hortonworks.com/users/2769/rushikeshdeshmukh007.html"&gt;Rushikesh Deshmukh&lt;/A&gt;- I see a Geometry data type (point) included in your data set. For insights on geospatial calculations, you can start at &lt;A href="https://cwiki.apache.org/confluence/display/Hive/Spatial+queries" target="_blank"&gt;https://cwiki.apache.org/confluence/display/Hive/Spatial+queries&lt;/A&gt; and &lt;A href="https://github.com/Esri/spatial-framework-for-hadoop/wiki/ST_Geometry-in-Hive-versus-SQL" target="_blank"&gt;https://github.com/Esri/spatial-framework-for-hadoop/wiki/ST_Geometry-in-Hive-versus-SQL&lt;/A&gt;. &lt;/P&gt;</description>
      <pubDate>Sun, 13 Mar 2016 13:29:31 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140323#M19412</guid>
      <dc:creator>shekharonlin</dc:creator>
      <dc:date>2016-03-13T13:29:31Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140324#M19413</link>
      <description>&lt;P&gt;@&lt;A href="https://community.hortonworks.com/users/3314/shekharonlin.html"&gt;Mayank Shekhar&lt;/A&gt;, thanks for sharing this information and link.&lt;/P&gt;</description>
      <pubDate>Sun, 13 Mar 2016 15:53:43 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140324#M19413</guid>
      <dc:creator>rushikeshdeshmu</dc:creator>
      <dc:date>2016-03-13T15:53:43Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140325#M19414</link>
      <description>&lt;P&gt;&lt;A href="https://community.hortonworks.com/users/2769/rushikeshdeshmukh007.html"&gt;@Rushikesh Deshmukh &lt;/A&gt;&lt;/P&gt;&lt;P&gt;On the other hand, HIVE's regexp_replace can help in cleaning the data. For example, the statement below removes nested '\','\t' and '\r' combinations of unformatted data within a single JSON string.&lt;/P&gt;&lt;P&gt;--populate clean src table 
insert overwrite table src_clean PARTITION (ddate='${hiveconf:DATE_VALUE}')
select regexp_replace(regexp_replace(regexp_replace(regexp_replace(full_json_line, "\\\\\\\\\\\\\\\\t|\\\\\\\\\\\\\\\\n|\\\\\\\\\\\\\\\\r", "\\\\\\\\\\\\\\\\&amp;lt;t or n or r&amp;gt;"), "\\\\\\\\\\\\t|\\\\\\\\\\\\n|\\\\\\\\\\\\r", "\\\\\\\\ "), "\\\\\\\\t|\\\\\\\\n|\\\\\\\\r", "\\\\\\\\&amp;lt;t or n or r&amp;gt;"),"\\\\t|\\\\n|\\\\r", "") as full_json_line 
from src_unclean where ddate='${hiveconf:DATE_VALUE}';
&lt;/P&gt;</description>
      <pubDate>Sun, 13 Mar 2016 16:00:53 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140325#M19414</guid>
      <dc:creator>shekharonlin</dc:creator>
      <dc:date>2016-03-13T16:00:53Z</dc:date>
    </item>
    <item>
      <title>Re: Single records of a file split into multiple?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140326#M19415</link>
      <description>&lt;P&gt;@&lt;A href="https://community.hortonworks.com/users/3314/shekharonlin.html"&gt;Mayank Shekhar&lt;/A&gt;, thanks for sharing this information.&lt;/P&gt;</description>
      <pubDate>Wed, 16 Mar 2016 01:35:56 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Single-records-of-a-file-split-into-multiple/m-p/140326#M19415</guid>
      <dc:creator>rushikeshdeshmu</dc:creator>
      <dc:date>2016-03-16T01:35:56Z</dc:date>
    </item>
  </channel>
</rss>

