Member since: 04-11-2020
Posts: 3
Kudos Received: 2
Solutions: 0
02-13-2024
09:09 PM
1 Kudo
I have an HDFS path containing two different file formats: 1) Avro and 2) JSON. I want to build a Hive external table on that path, but only over the Avro files. I do not have the option to keep each file format in a separate location, and I could not find a way to pick files by extension for a Hive external table. The suggested solution was to build a custom class that filters the files by the targeted type, extending the main Avro input class AvroContainerInputFormat.

package com.company.hadoop.customavroinputformat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RecordReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.hadoop.io.NullWritable;
public class CustomAvroContainerInputFormat extends AvroContainerInputFormat {
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
// Get the original splits
InputSplit[] originalSplits = super.getSplits(job, numSplits);
// Create a list to store the filtered splits
List<InputSplit> filteredSplits = new ArrayList<>();
// Your specific file extension
String fileExtension = ".avroSnappy";
for (InputSplit split : originalSplits) {
// Check if the file path ends with the specified extension
String filePath = ((FileSplit) split).getPath().toString();
if (filePath.endsWith(fileExtension)) {
System.out.println("File Path considered: " + filePath);
filteredSplits.add(split);
}
}
// Convert the list to an array
InputSplit[] filteredSplitsArray = new InputSplit[filteredSplits.size()];
return filteredSplits.toArray(filteredSplitsArray);
}
// You can override other methods as needed
// ...
}

Above is the customized class. The class is added to Hive using the command below:

ADD JAR /path/CustomDSAvroInputFormat.jar;

and the Hive external table is created as below:

CREATE EXTERNAL TABLE table_name
PARTITIONED BY (
event_day STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'com.company.hadoop.customavroinputformat.CustomAvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '.....'
TBLPROPERTIES ('avro.schema.url'='/path/AVRO_SCHEMA_FILE');

Now when I do a SELECT * from the table, I get the result as expected and the data is shown normally. But when I use SELECT COUNT(*) or a WHERE clause, it launches a MapReduce job that somehow does not use the customized class, so it scans the JSON files as well and fails.

SELECT * result:

> select event_day from <table_name> limit 2;
OK
File Path considered: hdfs://<URL><path>/c5f085e1-7a81-47e2-981e-702982c0ac00.avroSnappy
20240204000000
20240204000000
Time taken: 2.041 seconds, Fetched: 2 row(s)

SELECT COUNT result:

> select count(*) from <table_name> limit 2;
Query ID = user_20240214072828_73a1c989-a71f-4153-93e0-5e719b949014
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1707415269562_0043, Tracking URL = <url>/application_1707415269562_0043/
Kill Command = /opt/cloudera/parcels/CDH-5.16.1-1.cdh5.16.1.p0.3/lib/hadoop/bin/hadoop job -kill job_1707415269562_0043
Hadoop job information for Stage-1: number of mappers: 2; number of reducers: 1
2024-02-14 07:28:34,044 Stage-1 map = 0%, reduce = 0%
2024-02-14 07:28:52,709 Stage-1 map = 100%, reduce = 100%
Ended Job = job_1707415269562_0043 with errors
Error during job, obtaining debugging information...
Examining task ID: task_1707415269562_0043_m_000001 (and more) from job job_1707415269562_0043
Task with the most failures(4):
-----
Task ID:
task_1707415269562_0043_m_000001
URL:
<url>
-----
Diagnostic Messages for this Task:
Error: java.io.IOException: java.lang.reflect.InvocationTargetException
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:267)
at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.<init>(HadoopShimsSecure.java:213)
at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileInputFormatShim.getRecordReader(HadoopShimsSecure.java:334)
at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:695)
at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:169)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:438)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1924)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:253)
... 11 more
Caused by: java.io.IOException: Not a data file.
at org.apache.avro.file.DataFileStream.initialize(DataFileStream.java:105)
at org.apache.avro.file.DataFileReader.<init>(DataFileReader.java:97)
at org.apache.hadoop.hive.ql.io.avro.AvroGenericRecordReader.<init>(AvroGenericRecordReader.java:89)
at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.getRecordReader(AvroContainerInputFormat.java:51)
at org.apache.hadoop.hive.ql.io.CombineHiveRecordReader.<init>(CombineHiveRecordReader.java:68)
... 16 more
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask
MapReduce Jobs Launched:
Stage-Stage-1: Map: 2 Reduce: 1 HDFS Read: 0 HDFS Write: 0 FAIL
Total MapReduce CPU Time Spent: 0 msec

So, does anyone have ideas why this happens and how I can fix it, or an alternative solution? I tried creating a view on top of the main table and using nested queries, but that did not work either. Thanks in advance.
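One direction I am considering, but have not verified on this cluster, is to also filter at the record-reader level, since the failing stack trace goes through CombineHiveInputFormat and the input format's getRecordReader() rather than through my overridden getSplits(). The sketch below (the class name and the empty-reader idea are my own, not a confirmed fix) would return a reader that yields no records for non-Avro files instead of letting AvroGenericRecordReader fail with "Not a data file":

package com.company.hadoop.customavroinputformat;

import java.io.IOException;

import org.apache.hadoop.hive.serde2.avro.AvroGenericRecordWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class CustomAvroContainerInputFormatWithReaderFilter extends CustomAvroContainerInputFormat {

    private static final String FILE_EXTENSION = ".avroSnappy";

    @Override
    public RecordReader<NullWritable, AvroGenericRecordWritable> getRecordReader(
            InputSplit split, JobConf job, Reporter reporter) throws IOException {
        String filePath = ((FileSplit) split).getPath().toString();
        if (filePath.endsWith(FILE_EXTENSION)) {
            // Avro file: delegate to the normal Avro record reader
            return super.getRecordReader(split, job, reporter);
        }
        // Any other file (e.g. JSON): return a reader that produces no records
        return new RecordReader<NullWritable, AvroGenericRecordWritable>() {
            @Override public boolean next(NullWritable key, AvroGenericRecordWritable value) { return false; }
            @Override public NullWritable createKey() { return NullWritable.get(); }
            @Override public AvroGenericRecordWritable createValue() { return new AvroGenericRecordWritable(); }
            @Override public long getPos() { return 0L; }
            @Override public void close() { }
            @Override public float getProgress() { return 1.0f; }
        };
    }
}

If this approach is viable, SELECT * would keep skipping non-Avro files via getSplits(), while MapReduce jobs that build their own combined splits would at least not crash on the JSON files.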
Labels: Apache Hive, MapReduce
04-11-2020
05:45 AM
1 Kudo
@cjervis Thanks for the comment, and sorry about that, as this is my first post. I have provided full details about the question.
04-11-2020
02:10 AM
I am trying to run Java code on Windows to access Hadoop 3.1.3 in VirtualBox 6.1 with Red Hat 7.7. Hadoop is working fine with all components running (NameNode, DataNode, ...):

19856 NameNode
20864 NodeManager
20274 SecondaryNameNode
20532 ResourceManager
20043 DataNode

I am able to display the Hadoop DataNode and cluster from Windows after creating port forwarding (VBox Host IP 127.0.0.1, Guest IP 192.168.122.1) for the following ports:

9870
8088
9000
9866
9864
9869
9865
9867

Hadoop is working fine and can be displayed from both host and guest:

accessible from Windows on http://localhost:8088/cluster
accessible from Red Hat on http://192.168.122.1:8088
accessible from the Windows browser on http://localhost:9870/
accessible from the Red Hat browser on http://192.168.122.1:9870

Linux /etc/hosts:

#127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
#::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.122.1 master.hadoop master

VirtualBox network: NAT

The Java code below runs successfully on the VirtualBox guest (with Hadoop) but cannot run successfully from Windows (the host):

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
public class HadoopFileSystemCat {
public static void main(String[] args) throws IOException {
String uri = "hdfs://localhost:9000/TES";
Configuration conf = new Configuration();
conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
FileSystem fs = FileSystem.get(URI.create(uri), conf);
FSDataInputStream in = null;
try {
in = fs.open(new Path(uri));
System.out.println(fs.getFileBlockLocations(new Path(uri), 0, 0)[0]);
System.out.println(fs.getFileBlockLocations(new Path(uri), 0, 0)[0].getNames()[0]);
IOUtils.copyBytes(in, System.out, 4096, false);
} finally {
IOUtils.closeStream(in);
}
}
}

Code output:

0,10,localhost
192.168.122.1:9866

Exception message:

Exception in thread "main" org.apache.hadoop.hdfs.BlockMissingException: Could not obtain block: BP-428104912-127.0.0.1-1586539199175:blk_1073741825_1001 file=/DDD
at org.apache.hadoop.hdfs.DFSInputStream.refetchLocations(DFSInputStream.java:875)
at org.apache.hadoop.hdfs.DFSInputStream.chooseDataNode(DFSInputStream.java:858)
at org.apache.hadoop.hdfs.DFSInputStream.chooseDataNode(DFSInputStream.java:837)
at org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:566)
at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:756)
at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:825)
at java.io.DataInputStream.read(DataInputStream.java:100)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:94)
at org.apache.hadoop.io.IOUtils.copyBytes(IOUtils.java:68)
at com.mycompany.hdfs_java_api.HadoopFileSystemCat.main(HadoopFileSystemCat.java:39)

The block is actually available, as we can see through the terminal:

[muneeralnajdi@localhost hadoop]$ bin/hdfs dfs -ls /
Found 1 items
-rw-r--r-- 1 muneeralnajdi supergroup 10 2020-04-10 20:25 /DDD

Hadoop configuration:

core-site.xml:

<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://master.hadoop:9000</value>
</property>
</configuration>

hdfs-site.xml:

<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/home/muneeralnajdi/hadoop_store/hdfs/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/home/muneeralnajdi/hadoop_store/hdfs/datanode</value>
</property>
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
<description>Whether clients should use datanode hostnames when
connecting to datanodes.
</description>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>master.hadoop:9870</value>
</property>
</configuration>

Could anyone help me fix this issue? I went through some answers ("Access to file on Hadoop Virtualbox Cluster"), but the Java API still cannot access the DataNode block from Windows.
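One thing I am not sure about is whether dfs.client.use.datanode.hostname also has to be set on the client side, since I have only set it in the cluster's hdfs-site.xml. Below is a sketch of what I mean for the Windows client (the hosts-file entry and the assumption that the guest hostname resolves from Windows are my own, not verified):

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HadoopFileSystemCatFromWindows {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Ask the HDFS client to connect to datanodes by hostname instead of the
        // IP returned by the namenode (192.168.122.1, which is only reachable
        // inside the VirtualBox NAT network).
        conf.set("dfs.client.use.datanode.hostname", "true");
        // Assumption: the datanode's hostname (e.g. master.hadoop) resolves to
        // 127.0.0.1 on Windows via C:\Windows\System32\drivers\etc\hosts, and
        // port 9866 is forwarded like the other ports.
        String uri = "hdfs://localhost:9000/TES";
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(new Path(uri));
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}

If the missing piece is the client-side property, the BlockMissingException would presumably go away once the datanode address is reachable from Windows.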
Labels: Apache Hadoop