Created on 12-28-2016 11:38 AM - edited 08-17-2019 06:34 AM
Of late, there have been many HCC questions on how to create Hive tables and lineage using the REST APIs in Atlas. This article will act as a step-by-step guide to creating Hive tables and lineage using the REST API.
As part of the solution to this FAQ, I will create two Hive tables and a lineage (CTAS) between them. I have tested these changes on the HDP-2.5 release, so make sure you have HDP version >= 2.5.
Step1: JSON for creating table1:
[{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425525",
"version":0,
"typeName":"hive_db",
"state":"ACTIVE"
},
"typeName":"hive_db",
"values":{
"name":"default",
"location":"hdfs://mycluster/apps/hive/warehouse",
"description":"Default Hive database",
"ownerType":2,
"qualifiedName":"default@cl1",
"owner":"public",
"clusterName":"cl1",
"parameters":{
}
},
"traitNames":[
],
"traits":{
}
},{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425524",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
},
"typeName":"hive_table",
"values":{
"tableType":"MANAGED_TABLE",
"name":"table1",
"createTime":"2016-12-28T09:34:53.000Z",
"temporary":false,
"db":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425525",
"version":0,
"typeName":"hive_db",
"state":"ACTIVE"
},
"typeName":"hive_db",
"values":{
"name":"default",
"location":"hdfs://mycluster/apps/hive/warehouse",
"description":"Default Hive database",
"ownerType":2,
"qualifiedName":"default@cl1",
"owner":"public",
"clusterName":"cl1",
"parameters":{
}
},
"traitNames":[
],
"traits":{
}
},
"retention":0,
"qualifiedName":"default.table1@cl1",
"columns":[
{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425522",
"version":0,
"typeName":"hive_column",
"state":"ACTIVE"
},
"typeName":"hive_column",
"values":{
"name":"abc",
"qualifiedName":"default.table1.abc@cl1",
"owner":"hive",
"type":"string",
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425524",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
}
],
"lastAccessTime":"2016-12-28T09:34:53.000Z",
"owner":"hive",
"sd":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425523",
"version":0,
"typeName":"hive_storagedesc",
"state":"ACTIVE"
},
"typeName":"hive_storagedesc",
"values":{
"location":"hdfs://mycluster/apps/hive/warehouse/table1",
"serdeInfo":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
"typeName":"hive_serde",
"values":{
"serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"parameters":{
"serialization.format":"1"
}
}
},
"qualifiedName":"default.table1@cl1_storage",
"outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed":false,
"numBuckets":-1,
"inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
"parameters":{
},
"storedAsSubDirectories":false,
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425524",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
},
"parameters":{
"rawDataSize":"0",
"numFiles":"0",
"transient_lastDdlTime":"1482917693",
"totalSize":"0",
"COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
"numRows":"0"
},
"partitionKeys":[
]
},
"traitNames":[
],
"traits":{
}
}]
Save the above JSON to a file (for example sample.json, the file name used in the curl command in Step2).
Step2: REST API call to create the Hive table entity:
curl -v -H 'Accept: application/json, text/plain, */*' -H 'Content-Type: application/json; charset=UTF-8' -u admin:admin -d @sample.json http://<IP_ADDRESS>:21000/api/atlas/entities
The above will help in creating a hive table entity.
Step3: JSON for creating table2:
[{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425525",
"version":0,
"typeName":"hive_db",
"state":"ACTIVE"
},
"typeName":"hive_db",
"values":{
"name":"default",
"location":"hdfs://mycluster/apps/hive/warehouse",
"description":"Default Hive database",
"ownerType":2,
"qualifiedName":"default@cl1",
"owner":"public",
"clusterName":"cl1",
"parameters":{
}
},
"traitNames":[
],
"traits":{
}
},{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425524",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
},
"typeName":"hive_table",
"values":{
"tableType":"MANAGED_TABLE",
"name":"table2",
"createTime":"2016-12-28T09:34:53.000Z",
"temporary":false,
"db":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425525",
"version":0,
"typeName":"hive_db",
"state":"ACTIVE"
},
"typeName":"hive_db",
"values":{
"name":"default",
"location":"hdfs://mycluster/apps/hive/warehouse",
"description":"Default Hive database",
"ownerType":2,
"qualifiedName":"default@cl1",
"owner":"public",
"clusterName":"cl1",
"parameters":{
}
},
"traitNames":[
],
"traits":{
}
},
"retention":0,
"qualifiedName":"default.table2@cl1",
"columns":[
{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425522",
"version":0,
"typeName":"hive_column",
"state":"ACTIVE"
},
"typeName":"hive_column",
"values":{
"name":"abc",
"qualifiedName":"default.table2.abc@cl1",
"owner":"hive",
"type":"string",
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425524",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
}
],
"lastAccessTime":"2016-12-28T09:34:53.000Z",
"owner":"hive",
"sd":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425523",
"version":0,
"typeName":"hive_storagedesc",
"state":"ACTIVE"
},
"typeName":"hive_storagedesc",
"values":{
"location":"hdfs://mycluster/apps/hive/warehouse/table2",
"serdeInfo":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
"typeName":"hive_serde",
"values":{
"serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"parameters":{
"serialization.format":"1"
}
}
},
"qualifiedName":"default.table2@cl1_storage",
"outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed":false,
"numBuckets":-1,
"inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
"parameters":{
},
"storedAsSubDirectories":false,
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425524",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
},
"parameters":{
"rawDataSize":"0",
"numFiles":"0",
"transient_lastDdlTime":"1482917693",
"totalSize":"0",
"COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
"numRows":"0"
},
"partitionKeys":[
]
},
"traitNames":[
],
"traits":{
}
}]
Save the above json to a file.
Step4: Repeat Step2, this time posting the JSON file saved in Step3.
Step5: JSON to create lineage between the above two Hive tables:
[{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425513",
"version":0,
"typeName":"hive_process",
"state":"ACTIVE"
},
"typeName":"hive_process",
"values":{
"queryId":"hive_20161228094619_81b13647-4f7f-4f1b-9c08-0f64eb8dbb34",
"name":"create table table2 as select * from table1",
"startTime":"2016-12-28T09:46:19.003Z",
"queryPlan":{
},
"operationType":"CREATETABLE_AS_SELECT",
"outputs":[
{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425516",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
},
"typeName":"hive_table",
"values":{
"tableType":"MANAGED_TABLE",
"name":"table2",
"createTime":"2016-12-28T09:46:30.000Z",
"temporary":false,
"db":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425517",
"version":0,
"typeName":"hive_db",
"state":"ACTIVE"
},
"typeName":"hive_db",
"values":{
"name":"default",
"location":"hdfs://mycluster/apps/hive/warehouse",
"description":"Default Hive database",
"ownerType":2,
"qualifiedName":"default@cl1",
"owner":"public",
"clusterName":"cl1",
"parameters":{
}
},
"traitNames":[
],
"traits":{
}
},
"retention":0,
"qualifiedName":"default.table2@cl1",
"columns":[
{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425514",
"version":0,
"typeName":"hive_column",
"state":"ACTIVE"
},
"typeName":"hive_column",
"values":{
"name":"abc",
"qualifiedName":"default.table2.abc@cl1",
"owner":"hive",
"type":"string",
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425516",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
}
],
"lastAccessTime":"2016-12-28T09:46:30.000Z",
"owner":"hive",
"sd":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425515",
"version":0,
"typeName":"hive_storagedesc",
"state":"ACTIVE"
},
"typeName":"hive_storagedesc",
"values":{
"location":"hdfs://mycluster/apps/hive/warehouse/table2",
"serdeInfo":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
"typeName":"hive_serde",
"values":{
"serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"parameters":{
"serialization.format":"1"
}
}
},
"qualifiedName":"default.table2@cl1_storage",
"outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed":false,
"numBuckets":-1,
"inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
"parameters":{
},
"storedAsSubDirectories":false,
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425516",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
},
"parameters":{
"rawDataSize":"0",
"numFiles":"0",
"transient_lastDdlTime":"1482918390",
"totalSize":"0",
"COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
"numRows":"0"
},
"partitionKeys":[
]
},
"traitNames":[
],
"traits":{
}
}
],
"endTime":"2016-12-28T09:46:31.211Z",
"recentQueries":[
"create table table2 as select * from table1"
],
"inputs":[
{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425520",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
},
"typeName":"hive_table",
"values":{
"tableType":"MANAGED_TABLE",
"name":"table1",
"createTime":"2016-12-28T09:34:53.000Z",
"temporary":false,
"db":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425521",
"version":0,
"typeName":"hive_db",
"state":"ACTIVE"
},
"typeName":"hive_db",
"values":{
"name":"default",
"location":"hdfs://mycluster/apps/hive/warehouse",
"description":"Default Hive database",
"ownerType":2,
"qualifiedName":"default@cl1",
"owner":"public",
"clusterName":"cl1",
"parameters":{
}
},
"traitNames":[
],
"traits":{
}
},
"retention":0,
"qualifiedName":"default.table1@cl1",
"columns":[
{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425518",
"version":0,
"typeName":"hive_column",
"state":"ACTIVE"
},
"typeName":"hive_column",
"values":{
"name":"abc",
"qualifiedName":"default.table1.abc@cl1",
"owner":"hive",
"type":"string",
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425520",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
}
],
"lastAccessTime":"2016-12-28T09:34:53.000Z",
"owner":"hive",
"sd":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
"id":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425519",
"version":0,
"typeName":"hive_storagedesc",
"state":"ACTIVE"
},
"typeName":"hive_storagedesc",
"values":{
"location":"hdfs://mycluster/apps/hive/warehouse/table1",
"serdeInfo":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
"typeName":"hive_serde",
"values":{
"serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"parameters":{
"serialization.format":"1"
}
}
},
"qualifiedName":"default.table1@cl1_storage",
"outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed":false,
"numBuckets":-1,
"inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
"parameters":{
},
"storedAsSubDirectories":false,
"table":{
"jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
"id":"-11893021824425520",
"version":0,
"typeName":"hive_table",
"state":"ACTIVE"
}
},
"traitNames":[
],
"traits":{
}
},
"parameters":{
"rawDataSize":"0",
"numFiles":"0",
"transient_lastDdlTime":"1482917693",
"totalSize":"0",
"COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
"numRows":"0"
},
"partitionKeys":[
]
},
"traitNames":[
],
"traits":{
}
}
],
"qualifiedName":"default.table2@cl1:1482918390000",
"queryText":"create table table2 as select * from table1",
"clusterName":"cl1",
"userName":"hive"
},
"traitNames":[
],
"traits":{
}
}]
Save the above json to a file.
Step6: Repeat Step2, this time posting the JSON file saved in Step5.
Step7: In the Atlas UI, the lineage between the two entities can be seen as below.
I hope this clarifies the questions on creating Hive tables and lineage using the REST API. If you have any further questions, please let me know in the comments; I will be more than happy to help.
Created on 12-28-2016 01:08 PM
Nicely written and very helpful. Thanks @Ayub Khan for writing this !
Created on 12-11-2018 01:24 PM
Hi Ayub,
As described in Step 1, is it required to create random ids such as "id":"-11893021824425525" for this JSON request to be successful?