Community Articles

Find and share helpful community-sourced technical articles.
Labels (1)
avatar

Problem:

Of late, there have been many HCC questions on how to create Hive tables and lineage using the REST APIs in Atlas. This article acts as a step-by-step guide to creating Hive tables and lineage using the REST API.

Solution:

As part of the solution to this FAQ, I will create two Hive tables and a lineage (CTAS) between them. I have tested these steps on the HDP-2.5 release, so make sure you have HDP version >= 2.5.

Step1: JSON for creating table1:
[{
  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
  "id":{
    "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
    "id":"-11893021824425525",
    "version":0,
    "typeName":"hive_db",
    "state":"ACTIVE"
  },
  "typeName":"hive_db",
  "values":{
    "name":"default",
    "location":"hdfs://mycluster/apps/hive/warehouse",
    "description":"Default Hive database",
    "ownerType":2,
    "qualifiedName":"default@cl1",
    "owner":"public",
    "clusterName":"cl1",
    "parameters":{
 
 
    }
  },
  "traitNames":[
 
 
  ],
  "traits":{
 
 
  }
},{
  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
  "id":{
    "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
    "id":"-11893021824425524",
    "version":0,
    "typeName":"hive_table",
    "state":"ACTIVE"
  },
  "typeName":"hive_table",
  "values":{
    "tableType":"MANAGED_TABLE",
    "name":"table1",
    "createTime":"2016-12-28T09:34:53.000Z",
    "temporary":false,
    "db":{
      "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
      "id":{
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
        "id":"-11893021824425525",
        "version":0,
        "typeName":"hive_db",
        "state":"ACTIVE"
      },
      "typeName":"hive_db",
      "values":{
        "name":"default",
        "location":"hdfs://mycluster/apps/hive/warehouse",
        "description":"Default Hive database",
        "ownerType":2,
        "qualifiedName":"default@cl1",
        "owner":"public",
        "clusterName":"cl1",
        "parameters":{
 
 
        }
      },
      "traitNames":[
 
 
      ],
      "traits":{
 
 
      }
    },
    "retention":0,
    "qualifiedName":"default.table1@cl1",
    "columns":[
      {
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
        "id":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
          "id":"-11893021824425522",
          "version":0,
          "typeName":"hive_column",
          "state":"ACTIVE"
        },
        "typeName":"hive_column",
        "values":{
          "name":"abc",
          "qualifiedName":"default.table1.abc@cl1",
          "owner":"hive",
          "type":"string",
          "table":{
            "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
            "id":"-11893021824425524",
            "version":0,
            "typeName":"hive_table",
            "state":"ACTIVE"
          }
        },
        "traitNames":[
 
 
        ],
        "traits":{
 
 
        }
      }
    ],
    "lastAccessTime":"2016-12-28T09:34:53.000Z",
    "owner":"hive",
    "sd":{
      "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
      "id":{
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
        "id":"-11893021824425523",
        "version":0,
        "typeName":"hive_storagedesc",
        "state":"ACTIVE"
      },
      "typeName":"hive_storagedesc",
      "values":{
        "location":"hdfs://mycluster/apps/hive/warehouse/table1",
        "serdeInfo":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
          "typeName":"hive_serde",
          "values":{
            "serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
            "parameters":{
              "serialization.format":"1"
            }
          }
        },
        "qualifiedName":"default.table1@cl1_storage",
        "outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "compressed":false,
        "numBuckets":-1,
        "inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
        "parameters":{
 
 
        },
        "storedAsSubDirectories":false,
        "table":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
          "id":"-11893021824425524",
          "version":0,
          "typeName":"hive_table",
          "state":"ACTIVE"
        }
      },
      "traitNames":[
 
 
      ],
      "traits":{
 
 
      }
    },
    "parameters":{
      "rawDataSize":"0",
      "numFiles":"0",
      "transient_lastDdlTime":"1482917693",
      "totalSize":"0",
      "COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
      "numRows":"0"
    },
    "partitionKeys":[
 
 
    ]
  },
  "traitNames":[
 
 
  ],
  "traits":{
 
 
  }
}]

Save the above JSON to a file (for example sample.json, the file name used by the curl command in Step 2).

Step2: REST API call to create the hive table entity.
curl -v -H 'Accept: application/json, text/plain, */*' -H 'Content-Type: application/json;  charset=UTF-8' -u admin:admin -d @sample.json http://<IP_ADDRESS>:21000/api/atlas/entities

The above call creates the Hive table entity in Atlas.

Step3: JSON for creating table2:
[{
  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
  "id":{
    "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
    "id":"-11893021824425525",
    "version":0,
    "typeName":"hive_db",
    "state":"ACTIVE"
  },
  "typeName":"hive_db",
  "values":{
    "name":"default",
    "location":"hdfs://mycluster/apps/hive/warehouse",
    "description":"Default Hive database",
    "ownerType":2,
    "qualifiedName":"default@cl1",
    "owner":"public",
    "clusterName":"cl1",
    "parameters":{
 
 
    }
  },
  "traitNames":[
 
 
  ],
  "traits":{
 
 
  }
},{
  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
  "id":{
    "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
    "id":"-11893021824425524",
    "version":0,
    "typeName":"hive_table",
    "state":"ACTIVE"
  },
  "typeName":"hive_table",
  "values":{
    "tableType":"MANAGED_TABLE",
    "name":"table2",
    "createTime":"2016-12-28T09:34:53.000Z",
    "temporary":false,
    "db":{
      "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
      "id":{
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
        "id":"-11893021824425525",
        "version":0,
        "typeName":"hive_db",
        "state":"ACTIVE"
      },
      "typeName":"hive_db",
      "values":{
        "name":"default",
        "location":"hdfs://mycluster/apps/hive/warehouse",
        "description":"Default Hive database",
        "ownerType":2,
        "qualifiedName":"default@cl1",
        "owner":"public",
        "clusterName":"cl1",
        "parameters":{
 
 
        }
      },
      "traitNames":[
 
 
      ],
      "traits":{
 
 
      }
    },
    "retention":0,
    "qualifiedName":"default.table2@cl1",
    "columns":[
      {
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
        "id":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
          "id":"-11893021824425522",
          "version":0,
          "typeName":"hive_column",
          "state":"ACTIVE"
        },
        "typeName":"hive_column",
        "values":{
          "name":"abc",
          "qualifiedName":"default.table2.abc@cl1",
          "owner":"hive",
          "type":"string",
          "table":{
            "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
            "id":"-11893021824425524",
            "version":0,
            "typeName":"hive_table",
            "state":"ACTIVE"
          }
        },
        "traitNames":[
 
 
        ],
        "traits":{
 
 
        }
      }
    ],
    "lastAccessTime":"2016-12-28T09:34:53.000Z",
    "owner":"hive",
    "sd":{
      "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
      "id":{
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
        "id":"-11893021824425523",
        "version":0,
        "typeName":"hive_storagedesc",
        "state":"ACTIVE"
      },
      "typeName":"hive_storagedesc",
      "values":{
        "location":"hdfs://mycluster/apps/hive/warehouse/table2",
        "serdeInfo":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
          "typeName":"hive_serde",
          "values":{
            "serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
            "parameters":{
              "serialization.format":"1"
            }
          }
        },
        "qualifiedName":"default.table2@cl1_storage",
        "outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "compressed":false,
        "numBuckets":-1,
        "inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
        "parameters":{
 
 
        },
        "storedAsSubDirectories":false,
        "table":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
          "id":"-11893021824425524",
          "version":0,
          "typeName":"hive_table",
          "state":"ACTIVE"
        }
      },
      "traitNames":[
 
 
      ],
      "traits":{
 
 
      }
    },
    "parameters":{
      "rawDataSize":"0",
      "numFiles":"0",
      "transient_lastDdlTime":"1482917693",
      "totalSize":"0",
      "COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
      "numRows":"0"
    },
    "partitionKeys":[
 
 
    ]
  },
  "traitNames":[
 
 
  ],
  "traits":{
 
 
  }
}]

Save the above json to a file.

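Step4: Repeat Step 2 with the Step 3 JSON, i.e. run the same curl command with -d @ pointing to the file in which you saved the Step 3 JSON.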

Step5: JSON to create lineage between above two hive tables:
[{
  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
  "id":{
    "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
    "id":"-11893021824425513",
    "version":0,
    "typeName":"hive_process",
    "state":"ACTIVE"
  },
  "typeName":"hive_process",
  "values":{
    "queryId":"hive_20161228094619_81b13647-4f7f-4f1b-9c08-0f64eb8dbb34",
    "name":"create table table2 as select * from table1",
    "startTime":"2016-12-28T09:46:19.003Z",
    "queryPlan":{
 
 
    },
    "operationType":"CREATETABLE_AS_SELECT",
    "outputs":[
      {
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
        "id":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
          "id":"-11893021824425516",
          "version":0,
          "typeName":"hive_table",
          "state":"ACTIVE"
        },
        "typeName":"hive_table",
        "values":{
          "tableType":"MANAGED_TABLE",
          "name":"table2",
          "createTime":"2016-12-28T09:46:30.000Z",
          "temporary":false,
          "db":{
            "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
            "id":{
              "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
              "id":"-11893021824425517",
              "version":0,
              "typeName":"hive_db",
              "state":"ACTIVE"
            },
            "typeName":"hive_db",
            "values":{
              "name":"default",
              "location":"hdfs://mycluster/apps/hive/warehouse",
              "description":"Default Hive database",
              "ownerType":2,
              "qualifiedName":"default@cl1",
              "owner":"public",
              "clusterName":"cl1",
              "parameters":{
 
 
              }
            },
            "traitNames":[
 
 
            ],
            "traits":{
 
 
            }
          },
          "retention":0,
          "qualifiedName":"default.table2@cl1",
          "columns":[
            {
              "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
              "id":{
                "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
                "id":"-11893021824425514",
                "version":0,
                "typeName":"hive_column",
                "state":"ACTIVE"
              },
              "typeName":"hive_column",
              "values":{
                "name":"abc",
                "qualifiedName":"default.table2.abc@cl1",
                "owner":"hive",
                "type":"string",
                "table":{
                  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
                  "id":"-11893021824425516",
                  "version":0,
                  "typeName":"hive_table",
                  "state":"ACTIVE"
                }
              },
              "traitNames":[
 
 
              ],
              "traits":{
 
 
              }
            }
          ],
          "lastAccessTime":"2016-12-28T09:46:30.000Z",
          "owner":"hive",
          "sd":{
            "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
            "id":{
              "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
              "id":"-11893021824425515",
              "version":0,
              "typeName":"hive_storagedesc",
              "state":"ACTIVE"
            },
            "typeName":"hive_storagedesc",
            "values":{
              "location":"hdfs://mycluster/apps/hive/warehouse/table2",
              "serdeInfo":{
                "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
                "typeName":"hive_serde",
                "values":{
                  "serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                  "parameters":{
                    "serialization.format":"1"
                  }
                }
              },
              "qualifiedName":"default.table2@cl1_storage",
              "outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
              "compressed":false,
              "numBuckets":-1,
              "inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
              "parameters":{
 
 
              },
              "storedAsSubDirectories":false,
              "table":{
                "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
                "id":"-11893021824425516",
                "version":0,
                "typeName":"hive_table",
                "state":"ACTIVE"
              }
            },
            "traitNames":[
 
 
            ],
            "traits":{
 
 
            }
          },
          "parameters":{
            "rawDataSize":"0",
            "numFiles":"0",
            "transient_lastDdlTime":"1482918390",
            "totalSize":"0",
            "COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
            "numRows":"0"
          },
          "partitionKeys":[
 
 
          ]
        },
        "traitNames":[
 
 
        ],
        "traits":{
 
 
        }
      }
    ],
    "endTime":"2016-12-28T09:46:31.211Z",
    "recentQueries":[
      "create table table2 as select * from table1"
    ],
    "inputs":[
      {
        "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
        "id":{
          "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
          "id":"-11893021824425520",
          "version":0,
          "typeName":"hive_table",
          "state":"ACTIVE"
        },
        "typeName":"hive_table",
        "values":{
          "tableType":"MANAGED_TABLE",
          "name":"table1",
          "createTime":"2016-12-28T09:34:53.000Z",
          "temporary":false,
          "db":{
            "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
            "id":{
              "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
              "id":"-11893021824425521",
              "version":0,
              "typeName":"hive_db",
              "state":"ACTIVE"
            },
            "typeName":"hive_db",
            "values":{
              "name":"default",
              "location":"hdfs://mycluster/apps/hive/warehouse",
              "description":"Default Hive database",
              "ownerType":2,
              "qualifiedName":"default@cl1",
              "owner":"public",
              "clusterName":"cl1",
              "parameters":{
 
 
              }
            },
            "traitNames":[
 
 
            ],
            "traits":{
 
 
            }
          },
          "retention":0,
          "qualifiedName":"default.table1@cl1",
          "columns":[
            {
              "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
              "id":{
                "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
                "id":"-11893021824425518",
                "version":0,
                "typeName":"hive_column",
                "state":"ACTIVE"
              },
              "typeName":"hive_column",
              "values":{
                "name":"abc",
                "qualifiedName":"default.table1.abc@cl1",
                "owner":"hive",
                "type":"string",
                "table":{
                  "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
                  "id":"-11893021824425520",
                  "version":0,
                  "typeName":"hive_table",
                  "state":"ACTIVE"
                }
              },
              "traitNames":[
 
 
              ],
              "traits":{
 
 
              }
            }
          ],
          "lastAccessTime":"2016-12-28T09:34:53.000Z",
          "owner":"hive",
          "sd":{
            "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Reference",
            "id":{
              "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
              "id":"-11893021824425519",
              "version":0,
              "typeName":"hive_storagedesc",
              "state":"ACTIVE"
            },
            "typeName":"hive_storagedesc",
            "values":{
              "location":"hdfs://mycluster/apps/hive/warehouse/table1",
              "serdeInfo":{
                "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Struct",
                "typeName":"hive_serde",
                "values":{
                  "serializationLib":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                  "parameters":{
                    "serialization.format":"1"
                  }
                }
              },
              "qualifiedName":"default.table1@cl1_storage",
              "outputFormat":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
              "compressed":false,
              "numBuckets":-1,
              "inputFormat":"org.apache.hadoop.mapred.TextInputFormat",
              "parameters":{
 
 
              },
              "storedAsSubDirectories":false,
              "table":{
                "jsonClass":"org.apache.atlas.typesystem.json.InstanceSerialization$_Id",
                "id":"-11893021824425520",
                "version":0,
                "typeName":"hive_table",
                "state":"ACTIVE"
              }
            },
            "traitNames":[
 
 
            ],
            "traits":{
 
 
            }
          },
          "parameters":{
            "rawDataSize":"0",
            "numFiles":"0",
            "transient_lastDdlTime":"1482917693",
            "totalSize":"0",
            "COLUMN_STATS_ACCURATE":"{\"BASIC_STATS\":\"true\"}",
            "numRows":"0"
          },
          "partitionKeys":[
 
 
          ]
        },
        "traitNames":[
 
 
        ],
        "traits":{
 
 
        }
      }
    ],
    "qualifiedName":"default.table2@cl1:1482918390000",
    "queryText":"create table table2 as select * from table1",
    "clusterName":"cl1",
    "userName":"hive"
  },
  "traitNames":[
 
 
  ],
  "traits":{
 
 
  }
}]

Save the above json to a file.

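Step6: Repeat Step 2 with the Step 5 JSON, i.e. run the same curl command with -d @ pointing to the file in which you saved the Step 5 JSON.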

Step7: In the Atlas UI, the lineage between the two entities can be seen as shown below.

10864-atlas-2016-12-28-16-47-13.png

I hope this clarifies the questions about creating Hive tables and lineage using the REST API. If you have any further questions, please let me know in the comments; I will be more than happy to help.

References:

6,013 Views
Comments

Nicely written and very helpful. Thanks @Ayub Khan for writing this !

avatar
New Contributor

Hi Ayub,

As described in Step 1, is it required to create random ids such as "id":"-11893021824425525" for this JSON request to be successful?