<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Impala is scanning all the Partitions in join query , how to optimize query ? in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21449#M3634</link>
    <description>&lt;P&gt;How can we do partition pruninng &amp;nbsp;in impala join query , not able to figure out and where clause is also not giving the desired results , need some help on this .&lt;/P&gt;&lt;P&gt;here is the query :&lt;/P&gt;&lt;P&gt;select a11.apn apn,&lt;BR /&gt;cast(a12.tariff_plan as int) tariff_plan,&lt;BR /&gt;a11.subscriber_id subscriber_id,&lt;BR /&gt;a11.partition_date partition_date,&lt;BR /&gt;a13.name name,&lt;BR /&gt;a12.tariff_plan tariff_plan0,&lt;BR /&gt;a12.charging_node charging_node,&lt;BR /&gt;concat(substring(cast(a11.partition_date as string),5,2),'/',substring(cast(a11.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a11.partition_date as string),1,4)) CustCol_23,&lt;BR /&gt;sum(case when (a11.volume between 10*1024 and 50*1024) then 1 else 0 end) WJXBFS1,&lt;BR /&gt;sum(case when (a11.volume between 5*1024 and 10*1024) then 1 else 0 end) WJXBFS2,&lt;BR /&gt;sum(case when a11.volume &amp;lt; 5*1024 then 1 else 0 end) WJXBFS3,&lt;BR /&gt;sum(case when a11.volume&amp;gt;50*1024 then 1 else 0 end) WJXBFS4&lt;BR /&gt;from acc_volume_daily a11&lt;BR /&gt;join subscriber_history_daily a12&lt;BR /&gt;on (a11.partition_date = a12.partition_date and&lt;BR /&gt;a11.subscriber_id = a12.subscriber_id and&lt;BR /&gt;concat(substring(cast(a11.partition_date as string),5,2),'/',substring(cast(a11.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a11.partition_date as string),1,4)) = concat(substring(cast(a12.partition_date as string),5,2),'/',substring(cast(a12.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a12.partition_date as string),1,4)))&lt;BR /&gt;join l_data_plans a13&lt;BR /&gt;on (cast(a12.tariff_plan as int) = a13.id)&lt;BR /&gt;where a11.partition_date = a12.partition_date = 20140411&lt;BR /&gt;group by a11.apn,&lt;BR /&gt;cast(a12.tariff_plan as int),&lt;BR /&gt;a11.subscriber_id,&lt;BR /&gt;a11.partition_date,&lt;BR /&gt;a13.name,&lt;BR /&gt;a12.tariff_plan,&lt;BR /&gt;a12.charging_node,&lt;BR /&gt;concat(substring(cast(a11.partition_date as string),5,2),'/',substring(cast(a11.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a11.partition_date as string),1,4));&lt;/P&gt;&lt;P&gt;--------------------------------------------------------------------------------------------------------------------------------------------------------&lt;/P&gt;&lt;P&gt;here is the explain query result.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Explain String&lt;BR /&gt;----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------&lt;BR /&gt;PLAN FRAGMENT 0&lt;BR /&gt;PARTITION: UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;10:EXCHANGE&lt;BR /&gt;tuple ids: 3&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 1&lt;BR /&gt;PARTITION: HASH_PARTITIONED: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 10&lt;BR /&gt;UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;9:AGGREGATE&lt;BR /&gt;| output: SUM(SUM(CASE WHEN (a11.volume BETWEEN 10 * 1024 AND 50 * 1024) THEN 1 ELSE 0 END)), SUM(SUM(CASE WHEN (a11.volume BETWEEN 5 * 1024 AND 10 * 1024) THEN 1 ELSE 0 END)), SUM(SUM(CASE WHEN a11.volume &amp;lt; 5 * 1024 THEN 1 ELSE 0 END)), SUM(SUM(CASE WHEN a11.volume &amp;gt; 50 * 1024 THEN 1 ELSE 0 END))&lt;BR /&gt;| group by: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;| tuple ids: 3&lt;BR /&gt;|&lt;BR /&gt;8:EXCHANGE&lt;BR /&gt;tuple ids: 3&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 2&lt;BR /&gt;PARTITION: RANDOM&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 8&lt;BR /&gt;HASH_PARTITIONED: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;&lt;BR /&gt;5:AGGREGATE&lt;BR /&gt;| output: SUM(CASE WHEN (a11.volume BETWEEN 10 * 1024 AND 50 * 1024) THEN 1 ELSE 0 END), SUM(CASE WHEN (a11.volume BETWEEN 5 * 1024 AND 10 * 1024) THEN 1 ELSE 0 END), SUM(CASE WHEN a11.volume &amp;lt; 5 * 1024 THEN 1 ELSE 0 END), SUM(CASE WHEN a11.volume &amp;gt; 50 * 1024 THEN 1 ELSE 0 END)&lt;BR /&gt;| group by: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;| tuple ids: 3&lt;BR /&gt;|&lt;BR /&gt;4:HASH JOIN&lt;BR /&gt;| join op: INNER JOIN (BROADCAST)&lt;BR /&gt;| hash predicates:&lt;BR /&gt;| CAST(a12.tariff_plan AS INT) = a13.id&lt;BR /&gt;| tuple ids: 0 1 2&lt;BR /&gt;|&lt;BR /&gt;|----7:EXCHANGE&lt;BR /&gt;| tuple ids: 2&lt;BR /&gt;|&lt;BR /&gt;2:HASH JOIN&lt;BR /&gt;| join op: INNER JOIN (BROADCAST)&lt;BR /&gt;| hash predicates:&lt;BR /&gt;| a11.partition_date = a12.partition_date&lt;BR /&gt;| a11.subscriber_id = a12.subscriber_id&lt;BR /&gt;| other predicates: concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4)) = concat(substring(CAST(a12.partition_date AS STRING), 5, 2), '/', substring(CAST(a12.partition_date AS STRING), 7, 2), '/', substring(CAST(a12.partition_date AS STRING), 1, 4))&lt;BR /&gt;| tuple ids: 0 1&lt;BR /&gt;|&lt;BR /&gt;|----6:EXCHANGE&lt;BR /&gt;| tuple ids: 1&lt;BR /&gt;|&lt;BR /&gt;0:SCAN HDFS&lt;BR /&gt;table=reporting_mini.acc_volume_daily #partitions=0 size=0B&lt;BR /&gt;tuple ids: 0&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 3&lt;BR /&gt;PARTITION: RANDOM&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 7&lt;BR /&gt;UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;3:SCAN HDFS&lt;BR /&gt;table=reporting_mini.l_data_plans #partitions=1 size=21.96KB compact&lt;BR /&gt;tuple ids: 2&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 4&lt;BR /&gt;PARTITION: RANDOM&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 6&lt;BR /&gt;UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;1:SCAN HDFS&lt;BR /&gt;&lt;SPAN style="color: #ff9900;"&gt;&lt;STRONG&gt;table=reporting_mini.subscriber_history_daily #partitions=191 size=2.33GB compact&lt;/STRONG&gt; &lt;/SPAN&gt;&lt;BR /&gt;tuple ids: 1&lt;/P&gt;</description>
    <pubDate>Fri, 16 Sep 2022 09:12:32 GMT</pubDate>
    <dc:creator>laxmikant.s110688</dc:creator>
    <dc:date>2022-09-16T09:12:32Z</dc:date>
    <item>
      <title>Impala is scanning all the Partitions in join query , how to optimize query ?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21449#M3634</link>
      <description>&lt;P&gt;How can we do partition pruninng &amp;nbsp;in impala join query , not able to figure out and where clause is also not giving the desired results , need some help on this .&lt;/P&gt;&lt;P&gt;here is the query :&lt;/P&gt;&lt;P&gt;select a11.apn apn,&lt;BR /&gt;cast(a12.tariff_plan as int) tariff_plan,&lt;BR /&gt;a11.subscriber_id subscriber_id,&lt;BR /&gt;a11.partition_date partition_date,&lt;BR /&gt;a13.name name,&lt;BR /&gt;a12.tariff_plan tariff_plan0,&lt;BR /&gt;a12.charging_node charging_node,&lt;BR /&gt;concat(substring(cast(a11.partition_date as string),5,2),'/',substring(cast(a11.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a11.partition_date as string),1,4)) CustCol_23,&lt;BR /&gt;sum(case when (a11.volume between 10*1024 and 50*1024) then 1 else 0 end) WJXBFS1,&lt;BR /&gt;sum(case when (a11.volume between 5*1024 and 10*1024) then 1 else 0 end) WJXBFS2,&lt;BR /&gt;sum(case when a11.volume &amp;lt; 5*1024 then 1 else 0 end) WJXBFS3,&lt;BR /&gt;sum(case when a11.volume&amp;gt;50*1024 then 1 else 0 end) WJXBFS4&lt;BR /&gt;from acc_volume_daily a11&lt;BR /&gt;join subscriber_history_daily a12&lt;BR /&gt;on (a11.partition_date = a12.partition_date and&lt;BR /&gt;a11.subscriber_id = a12.subscriber_id and&lt;BR /&gt;concat(substring(cast(a11.partition_date as string),5,2),'/',substring(cast(a11.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a11.partition_date as string),1,4)) = concat(substring(cast(a12.partition_date as string),5,2),'/',substring(cast(a12.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a12.partition_date as string),1,4)))&lt;BR /&gt;join l_data_plans a13&lt;BR /&gt;on (cast(a12.tariff_plan as int) = a13.id)&lt;BR /&gt;where a11.partition_date = a12.partition_date = 20140411&lt;BR /&gt;group by a11.apn,&lt;BR /&gt;cast(a12.tariff_plan as int),&lt;BR /&gt;a11.subscriber_id,&lt;BR /&gt;a11.partition_date,&lt;BR /&gt;a13.name,&lt;BR /&gt;a12.tariff_plan,&lt;BR /&gt;a12.charging_node,&lt;BR /&gt;concat(substring(cast(a11.partition_date as string),5,2),'/',substring(cast(a11.partition_date as string),7,2),'/',&lt;BR /&gt;substring(cast(a11.partition_date as string),1,4));&lt;/P&gt;&lt;P&gt;--------------------------------------------------------------------------------------------------------------------------------------------------------&lt;/P&gt;&lt;P&gt;here is the explain query result.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Explain String&lt;BR /&gt;----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------&lt;BR /&gt;PLAN FRAGMENT 0&lt;BR /&gt;PARTITION: UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;10:EXCHANGE&lt;BR /&gt;tuple ids: 3&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 1&lt;BR /&gt;PARTITION: HASH_PARTITIONED: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 10&lt;BR /&gt;UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;9:AGGREGATE&lt;BR /&gt;| output: SUM(SUM(CASE WHEN (a11.volume BETWEEN 10 * 1024 AND 50 * 1024) THEN 1 ELSE 0 END)), SUM(SUM(CASE WHEN (a11.volume BETWEEN 5 * 1024 AND 10 * 1024) THEN 1 ELSE 0 END)), SUM(SUM(CASE WHEN a11.volume &amp;lt; 5 * 1024 THEN 1 ELSE 0 END)), SUM(SUM(CASE WHEN a11.volume &amp;gt; 50 * 1024 THEN 1 ELSE 0 END))&lt;BR /&gt;| group by: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;| tuple ids: 3&lt;BR /&gt;|&lt;BR /&gt;8:EXCHANGE&lt;BR /&gt;tuple ids: 3&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 2&lt;BR /&gt;PARTITION: RANDOM&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 8&lt;BR /&gt;HASH_PARTITIONED: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;&lt;BR /&gt;5:AGGREGATE&lt;BR /&gt;| output: SUM(CASE WHEN (a11.volume BETWEEN 10 * 1024 AND 50 * 1024) THEN 1 ELSE 0 END), SUM(CASE WHEN (a11.volume BETWEEN 5 * 1024 AND 10 * 1024) THEN 1 ELSE 0 END), SUM(CASE WHEN a11.volume &amp;lt; 5 * 1024 THEN 1 ELSE 0 END), SUM(CASE WHEN a11.volume &amp;gt; 50 * 1024 THEN 1 ELSE 0 END)&lt;BR /&gt;| group by: a11.apn, CAST(a12.tariff_plan AS INT), a11.subscriber_id, a11.partition_date, a13.name, a12.tariff_plan, a12.charging_node, concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4))&lt;BR /&gt;| tuple ids: 3&lt;BR /&gt;|&lt;BR /&gt;4:HASH JOIN&lt;BR /&gt;| join op: INNER JOIN (BROADCAST)&lt;BR /&gt;| hash predicates:&lt;BR /&gt;| CAST(a12.tariff_plan AS INT) = a13.id&lt;BR /&gt;| tuple ids: 0 1 2&lt;BR /&gt;|&lt;BR /&gt;|----7:EXCHANGE&lt;BR /&gt;| tuple ids: 2&lt;BR /&gt;|&lt;BR /&gt;2:HASH JOIN&lt;BR /&gt;| join op: INNER JOIN (BROADCAST)&lt;BR /&gt;| hash predicates:&lt;BR /&gt;| a11.partition_date = a12.partition_date&lt;BR /&gt;| a11.subscriber_id = a12.subscriber_id&lt;BR /&gt;| other predicates: concat(substring(CAST(a11.partition_date AS STRING), 5, 2), '/', substring(CAST(a11.partition_date AS STRING), 7, 2), '/', substring(CAST(a11.partition_date AS STRING), 1, 4)) = concat(substring(CAST(a12.partition_date AS STRING), 5, 2), '/', substring(CAST(a12.partition_date AS STRING), 7, 2), '/', substring(CAST(a12.partition_date AS STRING), 1, 4))&lt;BR /&gt;| tuple ids: 0 1&lt;BR /&gt;|&lt;BR /&gt;|----6:EXCHANGE&lt;BR /&gt;| tuple ids: 1&lt;BR /&gt;|&lt;BR /&gt;0:SCAN HDFS&lt;BR /&gt;table=reporting_mini.acc_volume_daily #partitions=0 size=0B&lt;BR /&gt;tuple ids: 0&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 3&lt;BR /&gt;PARTITION: RANDOM&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 7&lt;BR /&gt;UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;3:SCAN HDFS&lt;BR /&gt;table=reporting_mini.l_data_plans #partitions=1 size=21.96KB compact&lt;BR /&gt;tuple ids: 2&lt;BR /&gt;&lt;BR /&gt;PLAN FRAGMENT 4&lt;BR /&gt;PARTITION: RANDOM&lt;BR /&gt;&lt;BR /&gt;STREAM DATA SINK&lt;BR /&gt;EXCHANGE ID: 6&lt;BR /&gt;UNPARTITIONED&lt;BR /&gt;&lt;BR /&gt;1:SCAN HDFS&lt;BR /&gt;&lt;SPAN style="color: #ff9900;"&gt;&lt;STRONG&gt;table=reporting_mini.subscriber_history_daily #partitions=191 size=2.33GB compact&lt;/STRONG&gt; &lt;/SPAN&gt;&lt;BR /&gt;tuple ids: 1&lt;/P&gt;</description>
      <pubDate>Fri, 16 Sep 2022 09:12:32 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21449#M3634</guid>
      <dc:creator>laxmikant.s110688</dc:creator>
      <dc:date>2022-09-16T09:12:32Z</dc:date>
    </item>
    <item>
      <title>Re: Impala is scanning all the Partitions in join query , how to optimize query ?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21509#M3635</link>
      <description>&lt;P&gt;The problem is with the following&amp;nbsp;predicate in the where clause "&lt;SPAN&gt;where a11.partition_date = a12.partition_date = 20140411". The condition a12.partition_date = 20140411 is not properly pushed to the scan node of a12, hence no partition pruning is happening. You may want to rewrite it as "a11.partition_date = a12.partition_date and a12.partition_date = 20140411".&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Dimitris&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Nov 2014 02:10:54 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21509#M3635</guid>
      <dc:creator>Dimitris</dc:creator>
      <dc:date>2014-11-13T02:10:54Z</dc:date>
    </item>
    <item>
      <title>Re: Impala is scanning all the Partitions in join query , how to optimize query ?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21513#M3636</link>
      <description>&lt;P&gt;thanks for your reply , and you are right about partition pruning.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Nov 2014 04:44:45 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Impala-is-scanning-all-the-Partitions-in-join-query-how-to/m-p/21513#M3636</guid>
      <dc:creator>laxmikant.s110688</dc:creator>
      <dc:date>2014-11-13T04:44:45Z</dc:date>
    </item>
  </channel>
</rss>

