Created on 02-05-2016 06:59 PM
# The sandbox VM must use a Host-only network; the ZooKeeper quorum is sandbox.hortonworks.com.
# /etc/hosts entry: 192.168.56.101 sandbox.hortonworks.com
# create an hbase table from hive
-- Hive table backed by HBase: :key maps to the HBase row key,
-- cf:json holds the single value column.
CREATE TABLE IF NOT EXISTS hbase_hive_table (
    key   string,
    value string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:json"
)
TBLPROPERTIES (
    "hbase.table.name" = "hbase_hive_table"
);
# in hbase shell access the table
hbase(main):001:0> describe 'hbase_hive_table'
Table hbase_hive_table is ENABLED
hbase_hive_table
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0', TTL => 'FOREVER', KEEP_DELETED_CELLS => 'FALSE', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
1 row(s) in 0.2860 seconds
# insert into HBase table through Hive
-- Load the HBase-backed table from the sample_07 Hive table.
INSERT OVERWRITE TABLE hbase_hive_table
SELECT code, description
FROM sample_07;
# access data in HBase through Hive
-- Read the HBase data through Hive. Explicit column list instead of
-- SELECT * so the query is robust to future schema changes; note this
-- is still a full scan of the underlying HBase table.
SELECT key, value
FROM hbase_hive_table;
# access data in HBase through HBase shell
hbase(main):001:0> scan 'hbase_hive_table', {LIMIT => 10}
# create table in HBase first
hbase(main):001:0> create 'JsonTable', 'cf'
0 row(s) in 1.4450 seconds
=> Hbase::Table - JsonTable
hbase(main):002:0> describe 'JsonTable'
Table JsonTable is ENABLED
JsonTable
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0', TTL => 'FOREVER', KEEP_DELETED_CELLS => 'FALSE', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
1 row(s) in 0.1000 seconds
# run Java code to load data, code called HBaseLoad
# count rows in HBase
> count 'JsonTable' Current count: 139000, row: fe671e34-b723-4134-9317-7f31fe2715dd 139861 row(s) in 9.9540 seconds => 139861
# create hbase mapped external table
-- External Hive table over the existing HBase 'JsonTable':
-- :key is the HBase row key, cf:json is the raw JSON payload.
CREATE EXTERNAL TABLE hbase_json_table (
    key  string,
    json string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:json"
)
TBLPROPERTIES (
    "hbase.table.name" = "JsonTable"
);
# count using hive
-- Row count through Hive (launches a job that scans the HBase table).
SELECT COUNT(*)
FROM hbase_json_table;
# query using get_json_object
-- Extract individual JSON fields with get_json_object.
-- Note: each call re-parses the JSON string, so json_tuple (below) is
-- the cheaper option when pulling many fields from the same document.
SELECT
    get_json_object(json, '$.id')                       AS ID,
    get_json_object(json, '$.person.last_name')         AS LastName,
    get_json_object(json, '$.person.first_name')        AS FirstName,
    get_json_object(json, '$.person.email')             AS email,
    get_json_object(json, '$.person.location.address')  AS Address,
    get_json_object(json, '$.person.location.city')     AS City,
    get_json_object(json, '$.person.location.state')    AS State,
    get_json_object(json, '$.person.location.zipcode')  AS Zip,
    get_json_object(json, '$.person.text')              AS Text,
    get_json_object(json, '$.person.url')               AS URL
FROM hbase_json_table;
# query using json_tuple
-- Same extraction with json_tuple: each LATERAL VIEW parses its input
-- JSON once, peeling off one level of nesting per view.
SELECT id, lastName, firstName, email, city, state, text, url
FROM hbase_json_table A
LATERAL VIEW json_tuple(A.json, 'id', 'person') B
    AS id, person
LATERAL VIEW json_tuple(person, 'last_name', 'first_name', 'email', 'text', 'url', 'location') C
    AS lastName, firstName, email, text, url, loc
LATERAL VIEW json_tuple(loc, 'city', 'state') D
    AS city, state;
### Analytics over HBase snapshots ###
# create snapshot
hbase(main):006:0> snapshot 'hbase_hive_table', 'hbase_hive_table_snapshot'
0 row(s) in 0.3390 seconds
# use list_snapshots to list all available snapshots
# create a restore location
sudo -u hdfs hdfs dfs -mkdir /tmp/hbase_snapshots
# register snapshot in Hive and query, (TABLE MUST BE MAPPED IN HIVE ALREADY)
# NOTE: set command doesn't work in Ambari Views yet, run the following in a script
https://issues.apache.org/jira/browse/HIVE-6584
# To query against a snapshot instead of the online table, specify the snapshot name via hive.hbase.snapshot.name. The snapshot will be restored into a unique directory under /tmp. This location can be overridden by setting a path via hive.hbase.snapshot.restoredir.
-- Point Hive reads at the snapshot instead of the online HBase table,
-- restoring it under /tmp/hbase_snapshots rather than the default /tmp.
set hive.hbase.snapshot.name=hbase_hive_table_snapshot;
set hive.hbase.snapshot.restoredir=/tmp/hbase_snapshots;
select * from hbase_hive_table;
# set it back to point to table rather than snapshot and delete snapshot
hive -e "set hbase.table.name=hbase_hive_table;"
echo "delete_snapshot 'hbase_hive_table_snapshot'" | hbase shell
# create Hive table with HBase snapshot and reference the HBase timestamp for column family
# See HIVE-2828: the JIRA for referencing each cell's timestamp is still unresolved, so the statement below is illustrative only.
-- Variant of hbase_json_table that also exposes the HBase cell timestamp
-- via the :timestamp virtual column (requires HIVE-2828, which is
-- unresolved -- this DDL is illustrative only).
-- Fixes vs. the original one-liner: DROP IF EXISTS first (the table was
-- already created earlier in this article, so a bare CREATE would fail
-- with AlreadyExistsException), and whitespace restored before the
-- WITH SERDEPROPERTIES / TBLPROPERTIES clauses.
DROP TABLE IF EXISTS hbase_json_table;
CREATE EXTERNAL TABLE hbase_json_table (
    key  string,
    json string,
    time timestamp
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:json,:timestamp"
)
TBLPROPERTIES (
    "hbase.table.name" = "JsonTable"
);
# create hive table using a JSON SerDe
-- External table over raw JSON files in /tmp/json/, parsed by the
-- HCatalog JsonSerDe; the nested document maps to a struct column.
DROP TABLE IF EXISTS json_serde_table;
CREATE EXTERNAL TABLE json_serde_table (
    id     string,
    person struct<
        email:string,
        first_name:string,
        last_name:string,
        location:struct<
            address:string,
            city:string,
            state:string,
            zipcode:string>,
        text:string,
        url:string>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION '/tmp/json/';
# upload json file to a location on hdfs
hdfs dfs -put data.json /tmp/json/
# query the table as you normally would
-- Query the SerDe-backed table; nested fields are reached with dot
-- notation. Fix vs. the original one-liner: a missing space had fused
-- the last column with the FROM keyword ("person.urlFROM"), which
-- breaks parsing.
SELECT
    id,
    person.first_name,
    person.last_name,
    person.email,
    person.location.address,
    person.location.city,
    person.location.state,
    person.location.zipcode,
    person.text,
    person.url
FROM json_serde_table
LIMIT 5;
# hbase mapped table with multiple values
-- HBase-mapped external table with one column per JSON field.
-- Fix vs. the original: the columns mapping lacked the ":key," entry,
-- leaving 11 Hive columns against 10 mappings and no row-key mapping.
-- The HBase storage handler requires exactly one :key mapping and a 1:1
-- correspondence between Hive columns and mapping entries, so the
-- original DDL fails validation.
DROP TABLE IF EXISTS HBASE_TABLE_FROM_SERDE;
CREATE EXTERNAL TABLE HBASE_TABLE_FROM_SERDE (
    key     String,
    ID      string,
    fn      string,
    ln      string,
    email   string,
    address string,
    city    string,
    state   string,
    zip     string,
    text    string,
    url     string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf:id,cf:fn,cf:ln,cf:e,cf:addr,cf:city,cf:state,cf:zip,cf:txt,cf:url"
)
TBLPROPERTIES (
    "hbase.table.name" = "serde_table"
);
# hbase mapped table with multiple values
-- Flatten five JSON documents into the HBase-mapped table; id doubles
-- as the HBase row key. Fix vs. the original one-liner: a missing space
-- had fused the last column with the FROM keyword ("person.urlFROM"),
-- which breaks parsing.
INSERT OVERWRITE TABLE hbase_table_from_serde
SELECT
    id AS key,
    id,
    person.first_name,
    person.last_name,
    person.email,
    person.location.address,
    person.location.city,
    person.location.state,
    person.location.zipcode,
    person.text,
    person.url
FROM json_serde_table
LIMIT 5;
# view in hive
-- Spot-check the loaded rows from the Hive side.
SELECT *
FROM hbase_table_from_serde
LIMIT 5;
# view in hbase
scan 'serde_table', {LIMIT => 10} get 'serde_table', '00043df9-7630-41c5-8b68-73fe5eb7d636'
Created on 02-05-2016 07:01 PM
@Enis @Devaraj Das @vrodionov @nmaillard @Guilherme Braccialli please review and advise
Created on 11-17-2016 03:30 PM
This is a great article, can we do ATLAS tagging to fields in Hbase, by tagging the external table. Can you apply Ranger policies to that??
Created on 11-20-2016 11:11 PM
@Satish Bomma it would make an awesome follow up to this article, I am not sure whether Ranger authorization can be applied to tables driven by HBaseStorageHandlers, native Atlas/Ranger integration is coming in the next release.
Created on 06-19-2017 08:58 AM
@Artem Ervits This is a great article Do we have performance comparison of HIVE on HDFS vs HIVE on HBase. Is it advisable to come up with Hive on Hbase in production for large datasets?
Created on 06-30-2017 08:05 PM
@riyer I'd avoid going against HBase with Hive. Generating a snapshot is so trivial that you should consider going that route first. On average, going against a snapshot should be 2.5x times better than going against HBase directly.