<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Hive original order of records using collect_list in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/103156#M15412</link>
    <description>&lt;P&gt;In Hive how to do collect_list(path) in ascending order based on servicetime column grouping by service_date,timeslot,customer_id,category,product,sub_product.&lt;/P&gt;&lt;P&gt;Path order is correct when the select query is executed.&lt;/P&gt;&lt;PRE&gt;Correct Answer Hive Query -&amp;gt; 

select sum(calls),concat_ws(':',collect_list(path)) FROM test group by service_date,timeslot,customer_id,category,product,sub_product;
Output : 38,|MN_0038|MN_0008|MN_0003|MN_0002|MN_0001|Dummy:|MN_0003|MN_0002:|MN_0001|Dummy:|MN_0001|Dummy&lt;/PRE&gt;&lt;P&gt;But when I execute on 1 gb of data for bulk aggregation from one table to another table the order of path is not correct.&lt;/P&gt;&lt;P&gt;Wrong Answer Hive Query -&amp;gt; 
with isort as &lt;/P&gt;&lt;PRE&gt;select calls,path from raw_test order by servicetime asc)
from isort
insert into table test 
select sum(calls),concat_ws(':',collect_list(path)) FROM test group by service_date,timeslot,customer_id,category,product,sub_product;
Output : 38,|MN_0003|MN_0002:|MN_0001|Dummy:|MN_0038|MN_0008|MN_0003|MN_0002|MN_0001|Dummy:|MN_0001|Dummy&lt;/PRE&gt;&lt;P&gt;Like Mysql am trying to write hive query with order by clause am facing an error&lt;/P&gt;&lt;P&gt;Error Hive Query like MySQL -&amp;gt; 
with isort as (&lt;/P&gt;&lt;PRE&gt;select calls,path from raw_test order by servicetime asc)
from isort
insert into table test 
select sum(calls),concat_ws(':',collect_list(path)) FROM test group by service_date,timeslot,customer_id,category,product,sub_product order by servicetime;&lt;/PRE&gt;</description>
    <pubDate>Tue, 19 Jan 2016 16:55:18 GMT</pubDate>
    <dc:creator>balachandrank30</dc:creator>
    <dc:date>2016-01-19T16:55:18Z</dc:date>
    <item>
      <title>Hive original order of records using collect_list</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/103156#M15412</link>
      <description>&lt;P&gt;In Hive how to do collect_list(path) in ascending order based on servicetime column grouping by service_date,timeslot,customer_id,category,product,sub_product.&lt;/P&gt;&lt;P&gt;Path order is correct when the select query is executed.&lt;/P&gt;&lt;PRE&gt;Correct Answer Hive Query -&amp;gt; 

select sum(calls),concat_ws(':',collect_list(path)) FROM test group by service_date,timeslot,customer_id,category,product,sub_product;
Output : 38,|MN_0038|MN_0008|MN_0003|MN_0002|MN_0001|Dummy:|MN_0003|MN_0002:|MN_0001|Dummy:|MN_0001|Dummy&lt;/PRE&gt;&lt;P&gt;But when I execute on 1 gb of data for bulk aggregation from one table to another table the order of path is not correct.&lt;/P&gt;&lt;P&gt;Wrong Answer Hive Query -&amp;gt; 
with isort as &lt;/P&gt;&lt;PRE&gt;select calls,path from raw_test order by servicetime asc)
from isort
insert into table test 
select sum(calls),concat_ws(':',collect_list(path)) FROM test group by service_date,timeslot,customer_id,category,product,sub_product;
Output : 38,|MN_0003|MN_0002:|MN_0001|Dummy:|MN_0038|MN_0008|MN_0003|MN_0002|MN_0001|Dummy:|MN_0001|Dummy&lt;/PRE&gt;&lt;P&gt;Like Mysql am trying to write hive query with order by clause am facing an error&lt;/P&gt;&lt;P&gt;Error Hive Query like MySQL -&amp;gt; 
with isort as (&lt;/P&gt;&lt;PRE&gt;select calls,path from raw_test order by servicetime asc)
from isort
insert into table test 
select sum(calls),concat_ws(':',collect_list(path)) FROM test group by service_date,timeslot,customer_id,category,product,sub_product order by servicetime;&lt;/PRE&gt;</description>
      <pubDate>Tue, 19 Jan 2016 16:55:18 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/103156#M15412</guid>
      <dc:creator>balachandrank30</dc:creator>
      <dc:date>2016-01-19T16:55:18Z</dc:date>
    </item>
    <item>
      <title>Re: Hive original order of records using collect_list</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/103157#M15413</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/2263/balachandrank30.html" nodeid="2263"&gt;@Balachandran Karnati&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Collect_list uses ArrayList, so the data will be kept in the same order they were added, to do that, uou need to use SORT BY clause in a subquery, don't use ORDER BY, it will cause your query to execute in a non-distributed way.&lt;/P&gt;&lt;P&gt;Find a simple example below:&lt;/P&gt;&lt;PRE&gt;drop table collect;
create external table collect(
  key string,
  order_field int,
  value double
   
)
row format delimited fields terminated by ',' 
stored as textfile 
;
&lt;/PRE&gt;&lt;PRE&gt;[root@sandbox ~]# cat collect1.txt
a,1,1.0
a,3,3.0
a,2,2.0
b,1,1.0
b,2,2.0
b,3,3.0
[root@sandbox ~]# cat collect2.txt
a,1,1.1
a,3,3.1
a,2,2.1
b,1,1.1
b,2,2.1
b,3,3.1
[root@sandbox ~]# hadoop fs -put collect* /apps/hive/warehouse/collect
&lt;/PRE&gt;&lt;PRE&gt;drop table IF EXISTS collect_sorted;
create table collect_sorted as 
select key, collect_list(value)
from 
(select * from collect sort by key, order_field, value desc) x
group by key
;
&lt;/PRE&gt;</description>
      <pubDate>Wed, 20 Jan 2016 08:07:26 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/103157#M15413</guid>
      <dc:creator>gbraccialli3</dc:creator>
      <dc:date>2016-01-20T08:07:26Z</dc:date>
    </item>
    <item>
      <title>Re: Hive original order of records using collect_list</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/362476#M15414</link>
      <description>&lt;P&gt;For me (working on CDP) this is still now working.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;create temporary table stage1 as SELECT *&lt;BR /&gt;FROM (&lt;BR /&gt;SELECT customer&lt;BR /&gt;, ts&lt;BR /&gt;, sum(enodmywindow) over (partition by customer order by ts desc rows between unbounded preceding and current row) mywindow&lt;BR /&gt;, sum(1) over (partition by customer order by ts desc rows between unbounded preceding and current row) row_count&lt;BR /&gt;FROM od_stage1&lt;BR /&gt;WHERE previous_day_ts is null -- to change when the next day ts is fixed&lt;BR /&gt;) a&lt;BR /&gt;DISTRIBUTE BY customer, mywindow sort by customer, mywindow, row_count&lt;BR /&gt;;&lt;/P&gt;&lt;P&gt;create temporary table stage2 as&lt;BR /&gt;SELECT customer&lt;BR /&gt;, mywindow&lt;BR /&gt;, collect_list(ts) ts_list&lt;BR /&gt;, collect_list(row_count) row_count_list&lt;BR /&gt;FROM (select *&lt;BR /&gt;from stage1&lt;BR /&gt;DISTRIBUTE BY customer, mywindow sort by customer, mywindow, row_count ) x&lt;BR /&gt;group by customer&lt;BR /&gt;, mywindow&lt;BR /&gt;;&lt;/P&gt;&lt;P&gt;+-------------------+--------------------------+-------------+----------------------------------------------------+&lt;BR /&gt;| customer | trip_count | row_count_list |&lt;BR /&gt;+-------------------+--------------------------+-------------+----------------------------------------------------+&lt;BR /&gt;| someone | 2023-01-23 17:18:30.89 | 8 | [110] |&lt;BR /&gt;| someone | 2023-01-24 01:34:19.052 | 7 | [109] |&lt;BR /&gt;| someone | 2023-01-24 03:53:09.388 | 6 | [108] |&lt;BR /&gt;| someone | 2023-01-24 09:32:03.172 | 5 | [71,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107] |&lt;BR /&gt;| someone | 2023-01-24 11:48:21.606 | 4 | [51,50] |&lt;BR /&gt;| someone | 2023-01-24 12:29:52.992 | 3 | [48,49] |&lt;BR /&gt;| someone | 2023-01-24 17:35:35.846 | 2 | [46,47] |&lt;BR /&gt;| someone | 2023-01-24 20:03:30.239 | 1 | [20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45] |&lt;BR /&gt;+-------------------+--------------------------+-------------+----------------------------------------------------+&lt;/P&gt;</description>
      <pubDate>Mon, 30 Jan 2023 09:19:12 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/362476#M15414</guid>
      <dc:creator>clumZZey</dc:creator>
      <dc:date>2023-01-30T09:19:12Z</dc:date>
    </item>
    <item>
      <title>Re: Hive original order of records using collect_list</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/362477#M15415</link>
      <description>&lt;P&gt;Sorry, pasted the wrong output:&lt;BR /&gt;&lt;BR /&gt;+---------------+-----------+----------------------------------------------------+&lt;BR /&gt;| customer | mywindow | row_count_list |&lt;BR /&gt;+---------------+-----------+----------------------------------------------------+&lt;BR /&gt;| somecustomer | 1 | [20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45] |&lt;BR /&gt;| somecustomer | 2 | [46,47] |&lt;BR /&gt;| somecustomer | 3 | [48,49] |&lt;BR /&gt;| somecustomer | 4 | [51,50] |&lt;BR /&gt;| somecustomer | 5 | [71,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107] |&lt;BR /&gt;| somecustomer | 6 | [108] |&lt;BR /&gt;| somecustomer | 7 | [109] |&lt;BR /&gt;| somecustomer | 8 | [110] |&lt;BR /&gt;+---------------+-----------+----------------------------------------------------+&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 30 Jan 2023 09:21:33 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Hive-original-order-of-records-using-collect-list/m-p/362477#M15415</guid>
      <dc:creator>clumZZey</dc:creator>
      <dc:date>2023-01-30T09:21:33Z</dc:date>
    </item>
  </channel>
</rss>

