Created on 02-05-2016 06:59 PM
# The sandbox VM must use a Host-only network; the ZooKeeper quorum is sandbox.hortonworks.com.
# /etc/hosts entry: 192.168.56.101 sandbox.hortonworks.com
# create an hbase table from hive
-- Hive table backed by HBase: :key maps to the HBase row key,
-- cf:json holds the single value column.
CREATE TABLE IF NOT EXISTS hbase_hive_table (
    key   string,
    value string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:json"
)
TBLPROPERTIES (
    "hbase.table.name" = "hbase_hive_table"
);
# in hbase shell access the table
hbase(main):001:0> describe 'hbase_hive_table'
Table hbase_hive_table is ENABLED
hbase_hive_table
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0', TTL => 'FOREVER', KEEP_DELETED_CELLS => 'FALSE', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
1 row(s) in 0.2860 seconds
# insert into HBase table through Hive
-- Load the HBase-backed table from the sample_07 Hive table.
INSERT OVERWRITE TABLE hbase_hive_table
SELECT code, description
FROM sample_07;
# access data in HBase through Hive
-- Read the HBase data through Hive. Explicit column list instead of
-- SELECT * so the query is robust to future schema changes; note this
-- is still a full scan of the underlying HBase table.
SELECT key, value
FROM hbase_hive_table;
# access data in HBase through HBase shell
hbase(main):001:0> scan 'hbase_hive_table', {LIMIT => 10}
# create table in HBase first
hbase(main):001:0> create 'JsonTable', 'cf'
0 row(s) in 1.4450 seconds
=> Hbase::Table - JsonTable
hbase(main):002:0> describe 'JsonTable'
Table JsonTable is ENABLED
JsonTable
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0', TTL => 'FOREVER', KEEP_DELETED_CELLS => 'FALSE', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
1 row(s) in 0.1000 seconds
# run Java code to load data, code called HBaseLoad
# count rows in HBase
> count 'JsonTable' Current count: 139000, row: fe671e34-b723-4134-9317-7f31fe2715dd 139861 row(s) in 9.9540 seconds => 139861
# create hbase mapped external table
-- External Hive table over the existing HBase 'JsonTable':
-- :key is the HBase row key, cf:json is the raw JSON payload.
CREATE EXTERNAL TABLE hbase_json_table (
    key  string,
    json string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:json"
)
TBLPROPERTIES (
    "hbase.table.name" = "JsonTable"
);
# count using hive
-- Row count through Hive (launches a job that scans the HBase table).
SELECT COUNT(*)
FROM hbase_json_table;
# query using get_json_object
-- Extract individual JSON fields with get_json_object.
-- Note: each call re-parses the JSON string, so json_tuple (below) is
-- the cheaper option when pulling many fields from the same document.
SELECT
    get_json_object(json, '$.id')                       AS ID,
    get_json_object(json, '$.person.last_name')         AS LastName,
    get_json_object(json, '$.person.first_name')        AS FirstName,
    get_json_object(json, '$.person.email')             AS email,
    get_json_object(json, '$.person.location.address')  AS Address,
    get_json_object(json, '$.person.location.city')     AS City,
    get_json_object(json, '$.person.location.state')    AS State,
    get_json_object(json, '$.person.location.zipcode')  AS Zip,
    get_json_object(json, '$.person.text')              AS Text,
    get_json_object(json, '$.person.url')               AS URL
FROM hbase_json_table;
# query using json_tuple
-- Same extraction with json_tuple: each LATERAL VIEW parses its input
-- JSON once, peeling off one level of nesting per view.
SELECT id, lastName, firstName, email, city, state, text, url
FROM hbase_json_table A
LATERAL VIEW json_tuple(A.json, 'id', 'person') B
    AS id, person
LATERAL VIEW json_tuple(person, 'last_name', 'first_name', 'email', 'text', 'url', 'location') C
    AS lastName, firstName, email, text, url, loc
LATERAL VIEW json_tuple(loc, 'city', 'state') D
    AS city, state;
### Analytics over HBase snapshots ###
# create snapshot
hbase(main):006:0> snapshot 'hbase_hive_table', 'hbase_hive_table_snapshot'
0 row(s) in 0.3390 seconds
# use list_snapshots to list all available snapshots
# create a restore location
sudo -u hdfs hdfs dfs -mkdir /tmp/hbase_snapshots
# register snapshot in Hive and query, (TABLE MUST BE MAPPED IN HIVE ALREADY)
# NOTE: set command doesn't work in Ambari Views yet, run the following in a script
https://issues.apache.org/jira/browse/HIVE-6584
# To query against a snapshot instead of the online table, specify the snapshot name via hive.hbase.snapshot.name. The snapshot will be restored into a unique directory under /tmp. This location can be overridden by setting a path via hive.hbase.snapshot.restoredir.
-- Point Hive reads at the snapshot instead of the online HBase table,
-- restoring it under /tmp/hbase_snapshots rather than the default /tmp.
set hive.hbase.snapshot.name=hbase_hive_table_snapshot;
set hive.hbase.snapshot.restoredir=/tmp/hbase_snapshots;
select * from hbase_hive_table;
# set it back to point to table rather than snapshot and delete snapshot
hive -e "set hbase.table.name=hbase_hive_table;"
echo "delete_snapshot 'hbase_hive_table_snapshot'" | hbase shell
# create Hive table with HBase snapshot and reference the HBase timestamp for column family
# See HIVE-2828: the JIRA for referencing each cell's timestamp is still unresolved, so the statement below is illustrative only.
-- Variant of hbase_json_table that also exposes the HBase cell timestamp
-- via the :timestamp virtual column (requires HIVE-2828, which is
-- unresolved -- this DDL is illustrative only).
-- Fixes vs. the original one-liner: DROP IF EXISTS first (the table was
-- already created earlier in this article, so a bare CREATE would fail
-- with AlreadyExistsException), and whitespace restored before the
-- WITH SERDEPROPERTIES / TBLPROPERTIES clauses.
DROP TABLE IF EXISTS hbase_json_table;
CREATE EXTERNAL TABLE hbase_json_table (
    key  string,
    json string,
    time timestamp
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:json,:timestamp"
)
TBLPROPERTIES (
    "hbase.table.name" = "JsonTable"
);
# create hive table using a JSON SerDe
-- External table over raw JSON files in /tmp/json/, parsed by the
-- HCatalog JsonSerDe; the nested document maps to a struct column.
DROP TABLE IF EXISTS json_serde_table;
CREATE EXTERNAL TABLE json_serde_table (
    id     string,
    person struct<
        email:string,
        first_name:string,
        last_name:string,
        location:struct<
            address:string,
            city:string,
            state:string,
            zipcode:string>,
        text:string,
        url:string>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION '/tmp/json/';
# upload json file to a location on hdfs
hdfs dfs -put data.json /tmp/json/
# query the table as you normally would
-- Query the SerDe-backed table; nested fields are reached with dot
-- notation. Fix vs. the original one-liner: a missing space had fused
-- the last column with the FROM keyword ("person.urlFROM"), which
-- breaks parsing.
SELECT
    id,
    person.first_name,
    person.last_name,
    person.email,
    person.location.address,
    person.location.city,
    person.location.state,
    person.location.zipcode,
    person.text,
    person.url
FROM json_serde_table
LIMIT 5;
# hbase mapped table with multiple values
-- HBase-mapped external table with one column per JSON field.
-- Fix vs. the original: the columns mapping lacked the ":key," entry,
-- leaving 11 Hive columns against 10 mappings and no row-key mapping.
-- The HBase storage handler requires exactly one :key mapping and a 1:1
-- correspondence between Hive columns and mapping entries, so the
-- original DDL fails validation.
DROP TABLE IF EXISTS HBASE_TABLE_FROM_SERDE;
CREATE EXTERNAL TABLE HBASE_TABLE_FROM_SERDE (
    key     String,
    ID      string,
    fn      string,
    ln      string,
    email   string,
    address string,
    city    string,
    state   string,
    zip     string,
    text    string,
    url     string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:id,cf:fn,cf:ln,cf:e,cf:addr,cf:city,cf:state,cf:zip,cf:txt,cf:url"
)
TBLPROPERTIES (
    "hbase.table.name" = "serde_table"
);
# hbase mapped table with multiple values
-- Flatten five JSON documents into the HBase-mapped table; id doubles
-- as the HBase row key. Fix vs. the original one-liner: a missing space
-- had fused the last column with the FROM keyword ("person.urlFROM"),
-- which breaks parsing.
INSERT OVERWRITE TABLE hbase_table_from_serde
SELECT
    id AS key,
    id,
    person.first_name,
    person.last_name,
    person.email,
    person.location.address,
    person.location.city,
    person.location.state,
    person.location.zipcode,
    person.text,
    person.url
FROM json_serde_table
LIMIT 5;
# view in hive
-- Spot-check the loaded rows from the Hive side.
SELECT *
FROM hbase_table_from_serde
LIMIT 5;
# view in hbase
scan 'serde_table', {LIMIT => 10} get 'serde_table', '00043df9-7630-41c5-8b68-73fe5eb7d636'
Created on 02-05-2016 07:01 PM
@Enis @Devaraj Das @vrodionov @nmaillard @Guilherme Braccialli please review and advise
Created on 11-17-2016 03:30 PM
This is a great article, can we do ATLAS tagging to fields in Hbase, by tagging the external table. Can you apply Ranger policies to that??
Created on 11-20-2016 11:11 PM
@Satish Bomma it would make an awesome follow up to this article, I am not sure whether Ranger authorization can be applied to tables driven by HBaseStorageHandlers, native Atlas/Ranger integration is coming in the next release.
Created on 06-19-2017 08:58 AM
@Artem Ervits This is a great article Do we have performance comparison of HIVE on HDFS vs HIVE on HBase. Is it advisable to come up with Hive on Hbase in production for large datasets?
Created on 06-30-2017 08:05 PM
@riyer I'd avoid going against HBase with Hive. Generating a snapshot is so trivial that you should consider going that route first. On average, going against a snapshot should be 2.5x times better than going against HBase directly.