I have a large file that I have to process every month. When I import it, Hive only uses one mapper, making the import unnecessarily slow. The file is comma-delimited with double-quote text qualifiers...
Any advice on how to let the cluster parallelize the processing would be helpful. I basically create the external table and the managed table, then query the data from the external table into the managed table. See below.
(using HDP 2.6.5)
Thanks!
-- External staging table over the raw monthly CSV drop.
-- NOTE(review): OpenCSVSerde forces every column to STRING regardless of the
-- declared type; cast on the way into the managed table if typed columns are
-- needed. Also note that a gzip-compressed input file is NOT splittable and
-- will always get a single mapper — keep the drop uncompressed (or use a
-- splittable codec such as bzip2) so TextInputFormat can split it.
CREATE EXTERNAL TABLE `ext_incomingdata`(
  `record_note` string COMMENT 'from deserializer',
  `firstdate`   string COMMENT 'from deserializer',
  `lastdate`    string COMMENT 'from deserializer',
  `fn`          string COMMENT 'from deserializer',
  `ln`          string COMMENT 'from deserializer'
)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'quoteChar'='\"',
  'separatorChar'=',')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
-- LOCATION must be a DIRECTORY, not a file: point it at the folder that
-- contains import.csv, and keep only that month's file(s) in it.
LOCATION
  'hdfs://mynode.mydomain.com:8020/etl/mydir/201905'
-- Dropped the auto-generated stats properties (numFiles, numRows,
-- rawDataSize, totalSize, last_modified_*, transient_lastDdlTime): those are
-- SHOW CREATE TABLE artifacts, not valid DDL inputs.
TBLPROPERTIES (
  'skip.header.line.count'='1');
-- Managed ORC table that receives the monthly load from the external
-- staging table.
CREATE TABLE `incomingdata`(
  `record_note` string,
  `firstdate`   string,
  `lastdate`    string,
  `fn`          string,
  `ln`          string
)
STORED AS ORC;
-- Load from the external staging table into the managed ORC table.
-- Explicit column list instead of SELECT * so a schema change in either
-- table fails loudly rather than silently shifting data between columns.
INSERT INTO TABLE incomingdata
SELECT
    record_note,
    firstdate,
    lastdate,
    fn,
    ln
FROM ext_incomingdata;