<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question How to delete duplicate data in hive table with partition? in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/How-to-delete-duplicate-data-in-hive-table-with-partition/m-p/374535#M242022</link>
    <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The duplicate data from 2023-03-26 to 2023-07-10 must be removed.&lt;/P&gt;&lt;P&gt;I'm trying to remove duplicates from the table using this command, but I'm getting an error.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;Commands:&lt;/P&gt;&lt;P&gt;set hive.exec.dynamic.partition.mode=nonstrict;&lt;BR /&gt;INSERT OVERWRITE TABLE db.table_name PARTITION(dt) select distinct * from db.table_name where dt &amp;gt;= '2023-03-26' AND dt &amp;gt;= '2023-07-10';&lt;BR /&gt;&lt;BR /&gt;Error:&amp;nbsp;&lt;BR /&gt;&lt;SPAN&gt;23/07/26 16:07:46 [LocalJobRunner Map Task Executor #0]: WARN io.CombineHiveRecordReader: Multiple partitions found; not going to pass a part spec to LLAP IO: {{dt=2023-07-10}} and {{dt=2023-07-11}} 2023-07-26 16:07:47,952 Stage-1 map = 0%, reduce = 0% 23/07/26 16:07:47 [aabca681-0714-44f6-bc8d-9be6d7fca9fc main]: WARN mapreduce.Counters: Group org.apache.hadoop.mapred.Task$Counter is deprecated. Use org.apache.hadoop.mapreduce.TaskCounter instead.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Note: The partition of this table is just the dates.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Example:&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;show partitions db.table_name;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;dt=2023-07-04&lt;BR /&gt;dt=2023-07-05&lt;BR /&gt;dt=2023-07-06&lt;BR /&gt;dt=2023-07-07&lt;BR /&gt;dt=2023-07-08&lt;BR /&gt;dt=2023-07-09&lt;BR /&gt;dt=2023-07-10&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;$ hive --version&lt;BR /&gt;Hive 2.3.3&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Hope you can suggest on this one.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Thank you!&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 27 Jul 2023 03:20:17 GMT</pubDate>
    <dc:creator>Noel_0317</dc:creator>
    <dc:date>2023-07-27T03:20:17Z</dc:date>
    <item>
      <title>How to delete duplicate data in hive table with partition?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-to-delete-duplicate-data-in-hive-table-with-partition/m-p/374535#M242022</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The duplicate data from 2023-03-26 to 2023-07-10 must be removed.&lt;/P&gt;&lt;P&gt;I'm trying to remove duplicates from the table using this command, but I'm getting an error.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;Commands:&lt;/P&gt;&lt;P&gt;set hive.exec.dynamic.partition.mode=nonstrict;&lt;BR /&gt;INSERT OVERWRITE TABLE db.table_name PARTITION(dt) select distinct * from db.table_name where dt &amp;gt;= '2023-03-26' AND dt &amp;gt;= '2023-07-10';&lt;BR /&gt;&lt;BR /&gt;Error:&amp;nbsp;&lt;BR /&gt;&lt;SPAN&gt;23/07/26 16:07:46 [LocalJobRunner Map Task Executor #0]: WARN io.CombineHiveRecordReader: Multiple partitions found; not going to pass a part spec to LLAP IO: {{dt=2023-07-10}} and {{dt=2023-07-11}} 2023-07-26 16:07:47,952 Stage-1 map = 0%, reduce = 0% 23/07/26 16:07:47 [aabca681-0714-44f6-bc8d-9be6d7fca9fc main]: WARN mapreduce.Counters: Group org.apache.hadoop.mapred.Task$Counter is deprecated. Use org.apache.hadoop.mapreduce.TaskCounter instead.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Note: The partition of this table is just the dates.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Example:&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;show partitions db.table_name;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;dt=2023-07-04&lt;BR /&gt;dt=2023-07-05&lt;BR /&gt;dt=2023-07-06&lt;BR /&gt;dt=2023-07-07&lt;BR /&gt;dt=2023-07-08&lt;BR /&gt;dt=2023-07-09&lt;BR /&gt;dt=2023-07-10&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;$ hive --version&lt;BR /&gt;Hive 2.3.3&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Hope you can suggest on this one.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Thank you!&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 27 Jul 2023 03:20:17 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-to-delete-duplicate-data-in-hive-table-with-partition/m-p/374535#M242022</guid>
      <dc:creator>Noel_0317</dc:creator>
      <dc:date>2023-07-27T03:20:17Z</dc:date>
    </item>
    <item>
      <title>Re: How to delete duplicate data in hive table with partition?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-to-delete-duplicate-data-in-hive-table-with-partition/m-p/374770#M242134</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/104325"&gt;@Noel_0317&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;The error indicates that there are multiple partitions in the where condition. Can you try the below query:&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;INSERT OVERWRITE TABLE db.table_name PARTITION(dt='2023-03-26') select distinct * from db.table_name where dt = '2023-03-26';&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Let us know how it goes.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Cheers!&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 01 Aug 2023 06:56:16 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-to-delete-duplicate-data-in-hive-table-with-partition/m-p/374770#M242134</guid>
      <dc:creator>tj2007</dc:creator>
      <dc:date>2023-08-01T06:56:16Z</dc:date>
    </item>
  </channel>
</rss>

