Support Questions

Find answers, ask questions, and share your expertise

Why Impala return cross join on Array and struct data elements

avatar
New Contributor
-- I adapted from this example www.cloudera.com/documentation

-- impala side

CREATE TABLE flat_struct_array (continent STRING, country STRING, city STRING);

INSERT INTO flat_struct_array VALUES
    ('North America', 'Canada', 'Toronto') , ('North America', 'Canada', 'Vancouver')
  , ('North America', 'Canada', "St. John\'s") , ('North America', 'Canada', 'Saint John')
  , ('North America', 'Canada', 'Montreal') , ('North America', 'Canada', 'Halifax')
  , ('North America', 'Canada', 'Winnipeg') , ('North America', 'Canada', 'Calgary')
  , ('North America', 'Canada', 'Saskatoon') , ('North America', 'Canada', 'Ottawa')
  , ('North America', 'Canada', 'Yellowknife') , ('Europe', 'France', 'Paris')
  , ('Europe', 'France', 'Nice') , ('Europe', 'France', 'Marseilles')
  , ('Europe', 'France', 'Cannes') , ('Europe', 'Greece', 'Athens')
  , ('Europe', 'Greece', 'Piraeus') , ('Europe', 'Greece', 'Hania')
  , ('Europe', 'Greece', 'Heraklion') , ('Europe', 'Greece', 'Rethymnon')
  , ('Europe', 'Greece', 'Fira');

CREATE TABLE complex_struct_array2
(continent STRING, country array< STRUCT <name: STRING, city: ARRAY <string> >   > ) STORED AS PARQUET;

-- hive side

INSERT INTO complex_struct_array2
select continent, collect_list(struct1)
from (
SELECT continent, named_struct('name', country, 'city', collect_list(city)) as struct1 FROM flat_struct_array GROUP BY continent, country
) a group by continent

select * from complex_struct_array2
-- you'll see table with 2 records, and in Europe it has 2 countries
-- France has 4 cities and Greece has 6 cities

--back to impala side

select * 
from complex_struct_array2 t, t.country t2, t.country.city t3

-- you'll see the result that France contains 10 records (with Greece cities) and vice versa
continent     |name   |item        |
--------------|-------|------------|
Europe        |France |Paris       |
Europe        |France |Nice        |
Europe        |France |Marseilles  |
Europe        |France |Cannes      |
Europe        |France |Athens      |<-- should not be shown
Europe        |France |Piraeus     |<-- should not be shown
Europe        |France |Hania       |<-- should not be shown
Europe        |France |Heraklion   |<-- should not be shown
Europe        |France |Rethymnon   |<-- should not be shown
Europe        |France |Fira        |<-- should not be shown
...

any help please
1 ACCEPTED SOLUTION

avatar

Impala SQL treats nested collections essentially as tables. If you want to "join" the nested collection with the containing table or collection,  you need to use the same alias that you gave that table previously in the FROM list (otherwise it considers it a separate reference to the nested collection)

 

I.e. instead of

from complex_struct_array2 t, t.country t2, t.country.city t3

you want to write the following to do the implicit join:

from complex_struct_array2 t, t.country t2, t2.city t3

 

View solution in original post

2 REPLIES 2

avatar

Impala SQL treats nested collections essentially as tables. If you want to "join" the nested collection with the containing table or collection,  you need to use the same alias that you gave that table previously in the FROM list (otherwise it considers it a separate reference to the nested collection)

 

I.e. instead of

from complex_struct_array2 t, t.country t2, t.country.city t3

you want to write the following to do the implicit join:

from complex_struct_array2 t, t.country t2, t2.city t3

 

avatar
New Contributor

Thanks a lot TIM ..it now ok