Member since: 05-30-2016
Posts: 13
Kudos Received: 1
Solutions: 0

09-21-2017 12:36 PM

Any solutions that work long term?

07-04-2016 08:59 AM

Hi, it works fine for me with small tables. It seems to corrupt data only in tables bigger than 1 block. Hadoop 2.7.1.2.4.0.0-169.

06-29-2016 01:57 PM

Okay, I played around. After removing 'skip.header.line.count'='1' and creating a new external table, count(*) = wc -l.

I will include the header and a sample row. I don't find any irregularities; the only difference is that the header lacks 2 columns from the DDL definition (K, L), which should not be a problem:

A |B |C |D |E |F |G |H |I |J
+04454.|+133322063.|A42AL|201618|20160702|N|+00000.00|0|+00001.11|0

Out of curiosity I created a table without columns K and L in order to match the header row. With the option 'skip.header.line.count'='1' it gives the wrong result, count(*) <> wc -l. Without it, it gives the right result. What is causing this? Can someone test this out with a big table? I am running Hadoop 2.7.1.2.4.0.0-169.
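
A minimal sketch of the comparison described above, assuming the same pipe-delimited file and the location from the CREATE TABLE statement elsewhere in this thread; the table names count_with_skip and count_without_skip are invented here. Each count can then be checked against wc -l on the source file:

-- Hypothetical reproduction: two external tables over the same file,
-- differing only in the header-skip property.
CREATE EXTERNAL TABLE count_with_skip (
   A INT, B BIGINT, C STRING, D STRING, E STRING,
   F STRING, G DOUBLE, H INT, I DOUBLE, J INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 'hdfs://xy:8020/data/temp'
TBLPROPERTIES ('skip.header.line.count' = '1');

CREATE EXTERNAL TABLE count_without_skip (
   A INT, B BIGINT, C STRING, D STRING, E STRING,
   F STRING, G DOUBLE, H INT, I DOUBLE, J INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 'hdfs://xy:8020/data/temp';

-- Compare each result with `wc -l` on the file; if only the first table
-- is off, the header-skip handling is the likely suspect.
SELECT COUNT(*) FROM count_with_skip;
SELECT COUNT(*) FROM count_without_skip;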

06-29-2016 12:35 PM

@Jitendra Yadav @Sindhu

06-29-2016 11:18 AM

							 CREATE EXTERNAL TABLE corrupt_rows
(
   A   INT,
   B   BIGINT,
   C   STRING,
   D   STRING,
   E   STRING,
   F   STRING,
   G   DOUBLE,
   H   INT,
   I   DOUBLE,
   J   INT,
   K   STRING,
   L   STRING
)
COMMENT 'xy'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS
   INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
   OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://xy:8020/data/temp'
TBLPROPERTIES ('COLUMN_STATS_ACCURATE' = 'false',
               'numFiles' = '1',
               'numRows' = '-1',
               'rawDataSize' = '-1',
               'skip.header.line.count' = '1',
               'totalSize' = '4969304654',
               'transient_lastDdlTime' = '1467196659') 
						
					
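
If the header-skip property turns out to be the problem, one possible workaround (a sketch only; the table name raw_rows and the view name corrupt_rows_v are invented here) is to leave every line visible to Hive and hide the header row behind a view instead:

-- Read all columns as strings so the header line survives parsing,
-- then filter it out and cast to the intended types in a view.
CREATE EXTERNAL TABLE raw_rows (
   A STRING, B STRING, C STRING, D STRING, E STRING, F STRING,
   G STRING, H STRING, I STRING, J STRING, K STRING, L STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 'hdfs://xy:8020/data/temp';

CREATE VIEW corrupt_rows_v AS
SELECT CAST(A AS INT)    AS A,
       CAST(B AS BIGINT) AS B,
       C, D, E, F,
       CAST(G AS DOUBLE) AS G,
       CAST(H AS INT)    AS H,
       CAST(I AS DOUBLE) AS I,
       CAST(J AS INT)    AS J,
       K, L
FROM raw_rows
WHERE TRIM(A) <> 'A';   -- drop the header row, whose first field is the literal 'A'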

06-29-2016 10:39 AM

Hello,

I encountered a weird problem. I pointed an external table at data in HDFS. The source file is non-compressed, pipe-delimited, about 5 GB. When I run wc -l /hdfs/fileA.arc, it reports 80,002,783 rows, but when I query select count(*) from tableA, I get 16,877,533.

I examined the file and there are no weird characters, blanks, etc. Did I do something wrong? Shouldn't the row counts be the same? Does Hive automatically remove duplicates?

Thanks
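
One way to narrow down where the missing rows go, assuming the corrupt_rows table definition posted in the reply above, is to let Hive report how many rows it reads per input file via its INPUT__FILE__NAME virtual column and compare each figure with wc -l on the corresponding file:

-- Diagnostic sketch: rows Hive actually reads, broken down by source file.
SELECT INPUT__FILE__NAME AS source_file,
       COUNT(*)          AS rows_read
FROM corrupt_rows
GROUP BY INPUT__FILE__NAME;

-- Overall total as Hive sees it, for comparison with `wc -l`.
SELECT COUNT(*) AS total_rows_read FROM corrupt_rows;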

Labels:
- Apache Hadoop
- Apache Hive

05-30-2016 02:36 PM

Hello,

I'm trying to create a workflow which incorporates retry-max and retry-interval. The workflow is controlled by a coordinator which fires it when files are available in HDFS. Part of the workflow is a Hive query, which runs fine when the files are found in the directory.

I want the workflow to run again a few times in case the files are not yet available in HDFS. Therefore I deliberately remove the files from HDFS and then reload them again in order to test the functionality of user-retry.

The workflow is KILLED and does not go to the USER-RETRY state, due to this error:

WARN HiveActionExecutor:523 - SERVER[localhost] USER[hadoopmgr] GROUP[-] TOKEN[] APP[hive-wf] JOB[0000001-160530104739010-oozie-oozi-W] ACTION[0000001-160530104739010-oozie-oozi-W@stockjob] Launcher ERROR, reason: Main class [org.apache.oozie.action.hadoop.HiveMain], exit code [40000]

Coordinator.xml:

<coordinator-app name="stockjob" frequency="${coord:days(1)}" start="${start}" end="${end}" timezone="Europe/Amsterdam"
                 xmlns="uri:oozie:coordinator:0.2">
<controls>
<concurrency>1</concurrency>
<execution>FIFO</execution>
<throttle>5</throttle>
</controls>
<datasets>
<dataset name="dindc1" frequency="${coord:days(1)}"
initial-instance="2016-05-30T07:00Z" timezone="Europe/Amsterdam">
<uri-template>${nameNode}/user/hadoopmgr/wfArc/data/in/${YEAR}${MONTH}${DAY}/</uri-template>
<done-flag></done-flag>
</dataset>
<dataset name="dout" frequency="${coord:days(1)}"
initial-instance="2016-05-30T07:00Z" timezone="Europe/Amsterdam">
<uri-template>${nameNode}/user/hadoopmgr/wfArc/data/out/${YEAR}${MONTH}${DAY}/</uri-template>
<done-flag></done-flag>
</dataset>
</datasets>
<input-events>
<data-in name="eindc1" dataset="dindc1">
<instance>${coord:current(0)}</instance>
</data-in>
</input-events>
<output-events>
<data-out name="eout" dataset="dout">
<instance>${coord:current(0)}</instance>
</data-out>
</output-events>
<action>
<workflow>
<app-path>${workflowAppUri}</app-path>
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>queueName</name>
<value>${queueName}</value>
</property>
<property>
<name>inputPath1</name>
<value>${coord:dataIn('eindc1')}</value>
</property>
<property>
<name>outputPath1</name>
<value>${coord:dataOut('eout')}</value>
</property>
<property>
<name>the_timestamp</name>
<value>${coord:formatTime(coord:nominalTime(), 'yyyy-MM-dd')}</value>
</property>
</configuration>
</workflow>
</action>
</coordinator-app>

Workflow.xml:

<workflow-app name="hive-wf" xmlns="uri:oozie:workflow:0.3">
<credentials>
<credential name="hive_credentials" type="hcat">
<property>
<name>hcat.metastore.uri</name>
<value>${tHrift}</value>
</property>
<property>
<name>hcat.metastore.principal</name>
<value>${principal}</value>
</property>
</credential>
</credentials>
<start to="stockjob"/>
<action name="stockjob" retry-max="3" retry-interval="10" cred="hive_credentials">
<hive xmlns="uri:oozie:hive-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<job-xml>/user/hadoopmgr/wfArc/wf1/hive-site.xml</job-xml>
<configuration>
<property>
<name>mapred.compress.map.output</name>
<value>true</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.hive.defaults</name>
<value>/user/hadoopmgr/wfArc/wf2/hive-site.xml</value>
</property>
</configuration>
<script>/user/hadoopmgr/wfArc/wf2/sl_stock.hql</script>
<param>inputPath1=${inputPath1}</param>
<param>tableName1=${the_timestamp}</param>
<param>outputPath1=${outputPath1}</param>
</hive>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>hql script failed [${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end"/>
</workflow-app>

Thanks,
Jan
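
The sl_stock.hql script itself is not included in the post, but as a purely hypothetical sketch, this is roughly how the three values passed via the <param> elements above (inputPath1, tableName1 and outputPath1) would be referenced inside such a script; the staging table and its single column are invented for illustration:

-- Hypothetical sl_stock.hql. Oozie substitutes ${inputPath1}, ${tableName1}
-- and ${outputPath1} from the <param> elements of the Hive action; nothing
-- below is taken from the real script.
CREATE EXTERNAL TABLE IF NOT EXISTS stock_staging (
   line STRING
)
LOCATION '${inputPath1}';

-- Write a processed copy of the day's data to the coordinator's output directory.
INSERT OVERWRITE DIRECTORY '${outputPath1}'
SELECT line
FROM stock_staging
WHERE line IS NOT NULL AND line <> '';

-- ${tableName1} carries the coordinator's nominal date (yyyy-MM-dd) and could,
-- for example, be used as a partition value or an audit tag.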
						
					
		
			
				
						

Labels:
- Apache Oozie
        







