I am working on a POC that consumes messages from an SQS queue via Spark and inserts the data into two separate but similar tables.
Both of the tables below have roughly 228M records in them in the partition '2017-12-11'. The select query below takes roughly 330 seconds to complete and if I add in clustering the query never completes. I have tested with vectorization, tez index filtering and modifying the tez container size. None of these adjustments have helped improve the overall performance.
Does anyone have any suggestions on how I might improve this performance?
SELECT
cds.serialid,
cds.serialid,
get_json_object(cd.data, '$.device_sku') as device_sku
from
component_data_status cds
INNER JOIN component_data cd ON cds.serialid = cd.serialid AND cds.deviceid = cd.deviceid
WHERE
cds.xdate = '2017-12-11'
and
cd.xdate = '2017-12-11'
LIMIT 10;
CREATE TABLE IF NOT EXISTS component_data (
serialid string,
deviceid string,
json_blob string)
PARTITIONED BY (xdate string)
STORED AS ORCFILE
tblproperties ("orc.compress"="SNAPPY")
CREATE TABLE IF NOT EXISTS component_data_status (
serialid string,
deviceid string,
component string)
PARTITIONED BY (xdate string)
STORED AS ORCFILE
tblproperties ("orc.compress"="SNAPPY")