<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: HIVE Regex with Uppercase in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209559#M79241</link>
    <description>&lt;P&gt;Hey &lt;A rel="user" href="https://community.cloudera.com/users/10685/thierryvernhet.html" nodeid="10685"&gt;@Thierry Vernhet&lt;/A&gt;!&lt;/P&gt;&lt;P&gt;Hmm, I'm not sure if you can do this only using RegexSerde. Probably you will need to &lt;/P&gt;&lt;P&gt;- Filter those NULL values in a query &lt;BR /&gt;or &lt;BR /&gt;- Construct an ETL by passing the NULL values and grabbing the ARMING values to other table&lt;BR /&gt;or&lt;BR /&gt;- Create a View&lt;BR /&gt;or &lt;BR /&gt;- Clean the data before putting into HDFS.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;I made a research at the code and here's some parts of the code that explain it:&lt;BR /&gt;&lt;A href="https://github.com/apache/hive/blob/master/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java" target="_blank"&gt;https://github.com/apache/hive/blob/master/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java&lt;/A&gt;&lt;BR /&gt;&lt;/P&gt;&lt;PRE&gt; @Override  public Object deserialize(Writable blob) throws SerDeException {
    if (inputPattern == null) {      throw new SerDeException(          "This table does not have serde property \"input.regex\"!");    }    Text rowText = (Text) blob;
    Matcher m = inputPattern.matcher(rowText.toString());
    // If do not match, ignore the line, return a row with all nulls.    if (!m.matches()) {      unmatchedRows++;      if (unmatchedRows &amp;gt;= nextUnmatchedRows) {        nextUnmatchedRows = getNextNumberToDisplay(nextUnmatchedRows);        // Report the row        LOG.warn("" + unmatchedRows + " unmatched rows are found: " + rowText);      }      return null;    }
    // Otherwise, return the row.    for (int c = 0; c &amp;lt; numColumns; c++) {      try {        row.set(c, m.group(c + 1));      } catch (RuntimeException e) {        partialMatchedRows++;        if (partialMatchedRows &amp;gt;= nextPartialMatchedRows) {          nextPartialMatchedRows = getNextNumberToDisplay(nextPartialMatchedRows);          // Report the row          LOG.warn("" + partialMatchedRows              + " partially unmatched rows are found, " + " cannot find group "              + c + ": " + rowText);        }        row.set(c, null);      }    }    return row;  }&lt;/PRE&gt;&lt;P&gt;Hope this helps! &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 07 Jun 2018 22:46:34 GMT</pubDate>
    <dc:creator>vmurakami</dc:creator>
    <dc:date>2018-06-07T22:46:34Z</dc:date>
    <item>
      <title>HIVE Regex with Uppercase</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209556#M79238</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I would like to create an external table with an Hive regex expression by selecting lines containing "ARMING" (in uppercase).&lt;/P&gt;&lt;P&gt;The records HDFS look like this &lt;/P&gt;&lt;P&gt;2018-06-06T11:28:54+02:00 sazqye named[980]: ARMING trigger on (action:LOG) (T6-Recursive-attacks  recursive-time: 1283)&lt;/P&gt;&lt;P&gt;2018-06-06T11:20:27+02:00 sazqyd named[92960]: client (1.debian)&lt;/P&gt;&lt;P&gt;...&lt;/P&gt;&lt;P&gt;My request :&lt;/P&gt;&lt;P&gt;CREATE EXTERNAL TABLE my_arming_table (
dc_syslog_date STRING,
dc_syslog_hostname STRING,
dc_syslog_process STRING,
dc_logtype STRING, dc_message STRING)
PARTITIONED BY (yearbrut INT, monthbrut INT, daybrut INT) &lt;/P&gt;&lt;P&gt;ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' &lt;/P&gt;&lt;P&gt;WITH SERDEPROPERTIES(
  'input.regex'='^(\\S+)\\s(\\S+)\\s(\\S+)\\s(&lt;STRONG&gt;ARMING&lt;/STRONG&gt;)\\s(.*)')
STORED AS TEXTFILE;&lt;/P&gt;&lt;P&gt;The result is KO :&lt;/P&gt;&lt;P&gt;    &amp;gt; select * from my_arming_table limit 2 ; &lt;/P&gt;&lt;P&gt;OK&lt;/P&gt;&lt;P&gt;
NULL    NULL    NULL    NULL    NULL    0       0       0 &lt;/P&gt;&lt;P&gt;NULL    NULL    NULL    NULL    NULL    0       0       0&lt;/P&gt;&lt;P&gt;And if I try this request (with client in lowercase)&lt;/P&gt;&lt;P&gt;CREATE EXTERNAL TABLE my_client_table (
dc_syslog_date STRING,
dc_syslog_hostname STRING,
dc_syslog_process STRING,
dc_logtype STRING,
dc_message STRING)
PARTITIONED BY (yearbrut INT, monthbrut INT, daybrut INT) &lt;/P&gt;&lt;P&gt;ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' &lt;/P&gt;&lt;P&gt;WITH SERDEPROPERTIES(
  'input.regex'='^(\\S+)\\s(\\S+)\\s(\\S+)\\s(&lt;STRONG&gt;client&lt;/STRONG&gt;)\\s(.*)')
STORED AS TEXTFILE;&lt;/P&gt;&lt;P&gt;The result is OK&lt;/P&gt;&lt;P&gt;    &amp;gt; select * from my_client_table limit 2 ;&lt;/P&gt;&lt;P&gt;
OK&lt;/P&gt;&lt;P&gt;
2018-06-06T11:12:55+02:00       sazqyd  named[92960]:   client (swza6z) 0       0       0 &lt;/P&gt;&lt;P&gt;2018-06-06T11:13:10+02:00       sazqyd  named[92960]:   client (osce01) 0       0       0&lt;/P&gt;&lt;P&gt;Does anybody knows why it doesn't work with uppercase in the regex expression ?&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;</description>
      <pubDate>Wed, 06 Jun 2018 17:26:27 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209556#M79238</guid>
      <dc:creator>thierry_vernhet</dc:creator>
      <dc:date>2018-06-06T17:26:27Z</dc:date>
    </item>
    <item>
      <title>Re: HIVE Regex with Uppercase</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209557#M79239</link>
      <description>&lt;P&gt;Hi &lt;A rel="user" href="https://community.cloudera.com/users/10685/thierryvernhet.html" nodeid="10685"&gt;@Thierry Vernhet&lt;/A&gt;&lt;BR /&gt;Do you have more examples from your HDFS records? &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt; &lt;BR /&gt;&lt;BR /&gt;I made a simple test here, and its seems to be working your REGEX with the Uppercase&lt;BR /&gt;Here's the output&lt;/P&gt;&lt;PRE&gt;CREATE EXTERNAL TABLE my_arming_table 
( dc_syslog_date STRING, 
dc_syslog_hostname STRING, 
dc_syslog_process STRING, 
dc_logtype STRING, 
dc_message STRING) 
PARTITIONED BY (yearbrut INT, monthbrut INT, daybrut INT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES( 'input.regex'='^(\\S+)\\s(\\S+)\\s(\\S+)\\s(ARMING)\\s(.*)') STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/my_arming_table/';
ALTER TABLE my_arming_table ADD PARTITION(yearbrut=2018,monthbrut=06,daybrut=06);
#Command to put some data in the table hdfs dfs -put regex_hcc  /user/hive/warehouse/my_arming_table/yearbrut=2018/monthbrut=6/daybrut=6/
#And here's a quick describe from my tablehive&amp;gt; desc formatted my_arming_table;
OK
# col_name            	data_type           	comment             
	 	 
dc_syslog_date      	string              	                    
dc_syslog_hostname  	string              	                    
dc_syslog_process   	string              	                    
dc_logtype          	string              	                    
dc_message          	string              	                    
	 	 
# Partition Information	 	 
# col_name            	data_type           	comment             
	 	 
yearbrut            	int                 	                    
monthbrut           	int                 	                    
daybrut             	int                 	                    
	 	 
# Detailed Table Information	 	 
Database:           	default             	 
Owner:              	hive                	 
CreateTime:         	Wed Jun 06 18:00:02 UTC 2018	 
LastAccessTime:     	UNKNOWN             	 
Protect Mode:       	None                	 
Retention:          	0                   	 
Location:           	hdfs://vini-horton/user/hive/warehouse/my_arming_table	 
Table Type:         	EXTERNAL_TABLE      	 
Table Parameters:	 	 
	EXTERNAL            	TRUE                
	transient_lastDdlTime	1528308002          
	 	 
# Storage Information	 	 
SerDe Library:      	org.apache.hadoop.hive.serde2.RegexSerDe	 
InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
Compressed:         	No                  	 
Num Buckets:        	-1                  	 
Bucket Columns:     	[]                  	 
Sort Columns:       	[]                  	 
Storage Desc Params:	 	 
	input.regex         	^(\\S+)\\s(\\S+)\\s(\\S+)\\s(ARMING)\\s(.*)
	serialization.format	1   
 
#Finally the select outputhive&amp;gt;select * from my_arming_table;
OK
2018-06-06T11:28:54+02:00	sazqye	named[980]:	ARMING	trigger on (action:LOG) (T6-Recursive-attacks recursive-time: 1283)	2018	6	6
NULL	NULL	NULL	NULL	NULL	2018	6	6
Time taken: 0.175 seconds, Fetched: 2 row(s)	
&lt;/PRE&gt;</description>
      <pubDate>Thu, 07 Jun 2018 01:27:02 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209557#M79239</guid>
      <dc:creator>vmurakami</dc:creator>
      <dc:date>2018-06-07T01:27:02Z</dc:date>
    </item>
    <item>
      <title>Re: HIVE Regex with Uppercase</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209558#M79240</link>
      <description>&lt;P&gt;Hi &lt;A href="https://community.hortonworks.com/questions/194598/hive-regex-with-uppercase.html#"&gt;@Vinicius Higa Murakami&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Thanks Vinicius. You're right. I didn't see the "ARMING" records with the "select...limit 2;" because the's a lot of "client" records and few "ARMING" records.&lt;/P&gt;&lt;P&gt;But now, how exclude other records and have only "ARMING" rows in the final table and no row "NULL" ?&lt;/P&gt;</description>
      <pubDate>Thu, 07 Jun 2018 15:07:46 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209558#M79240</guid>
      <dc:creator>thierry_vernhet</dc:creator>
      <dc:date>2018-06-07T15:07:46Z</dc:date>
    </item>
    <item>
      <title>Re: HIVE Regex with Uppercase</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209559#M79241</link>
      <description>&lt;P&gt;Hey &lt;A rel="user" href="https://community.cloudera.com/users/10685/thierryvernhet.html" nodeid="10685"&gt;@Thierry Vernhet&lt;/A&gt;!&lt;/P&gt;&lt;P&gt;Hmm, I'm not sure if you can do this only using RegexSerde. Probably you will need to &lt;/P&gt;&lt;P&gt;- Filter those NULL values in a query &lt;BR /&gt;or &lt;BR /&gt;- Construct an ETL by passing the NULL values and grabbing the ARMING values to other table&lt;BR /&gt;or&lt;BR /&gt;- Create a View&lt;BR /&gt;or &lt;BR /&gt;- Clean the data before putting into HDFS.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;I made a research at the code and here's some parts of the code that explain it:&lt;BR /&gt;&lt;A href="https://github.com/apache/hive/blob/master/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java" target="_blank"&gt;https://github.com/apache/hive/blob/master/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java&lt;/A&gt;&lt;BR /&gt;&lt;/P&gt;&lt;PRE&gt; @Override  public Object deserialize(Writable blob) throws SerDeException {
    if (inputPattern == null) {      throw new SerDeException(          "This table does not have serde property \"input.regex\"!");    }    Text rowText = (Text) blob;
    Matcher m = inputPattern.matcher(rowText.toString());
    // If do not match, ignore the line, return a row with all nulls.    if (!m.matches()) {      unmatchedRows++;      if (unmatchedRows &amp;gt;= nextUnmatchedRows) {        nextUnmatchedRows = getNextNumberToDisplay(nextUnmatchedRows);        // Report the row        LOG.warn("" + unmatchedRows + " unmatched rows are found: " + rowText);      }      return null;    }
    // Otherwise, return the row.    for (int c = 0; c &amp;lt; numColumns; c++) {      try {        row.set(c, m.group(c + 1));      } catch (RuntimeException e) {        partialMatchedRows++;        if (partialMatchedRows &amp;gt;= nextPartialMatchedRows) {          nextPartialMatchedRows = getNextNumberToDisplay(nextPartialMatchedRows);          // Report the row          LOG.warn("" + partialMatchedRows              + " partially unmatched rows are found, " + " cannot find group "              + c + ": " + rowText);        }        row.set(c, null);      }    }    return row;  }&lt;/PRE&gt;&lt;P&gt;Hope this helps! &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 07 Jun 2018 22:46:34 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209559#M79241</guid>
      <dc:creator>vmurakami</dc:creator>
      <dc:date>2018-06-07T22:46:34Z</dc:date>
    </item>
    <item>
      <title>Re: HIVE Regex with Uppercase</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209560#M79242</link>
      <description>&lt;P&gt;OK&lt;/P&gt;&lt;P&gt;Thanks for all&lt;/P&gt;</description>
      <pubDate>Fri, 08 Jun 2018 12:43:23 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/HIVE-Regex-with-Uppercase/m-p/209560#M79242</guid>
      <dc:creator>thierry_vernhet</dc:creator>
      <dc:date>2018-06-08T12:43:23Z</dc:date>
    </item>
  </channel>
</rss>

