02-05-2016 07:01 PM
@Enis @Devaraj Das @vrodionov @nmaillard @Guilherme Braccialli please review and advise
02-05-2016 06:59 PM
5 Kudos
# SANDBOX must have only a Host-only network; the quorum is sandbox.hortonworks.com
# /etc/hosts has 192.168.56.101 sandbox.hortonworks.com

# create an HBase table from Hive
CREATE TABLE IF NOT EXISTS hbase_hive_table(key string, value string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:json")
TBLPROPERTIES ("hbase.table.name" = "hbase_hive_table");
# in the hbase shell, access the table
hbase(main):001:0> describe 'hbase_hive_table'
Table hbase_hive_table is ENABLED
hbase_hive_table
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0', TTL => 'FOREVER', KEEP_DELETED_CELLS => 'FALSE', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
1 row(s) in 0.2860 seconds

# insert into the HBase table through Hive
INSERT OVERWRITE TABLE HBASE_HIVE_TABLE SELECT CODE, DESCRIPTION FROM SAMPLE_07;

# access data in HBase through Hive
SELECT * FROM HBASE_HIVE_TABLE;

# access data in HBase through the HBase shell
hbase(main):001:0> scan 'hbase_hive_table', {LIMIT => 10}

# create a table in HBase first
hbase(main):001:0> create 'JsonTable', 'cf'
0 row(s) in 1.4450 seconds
=> Hbase::Table - JsonTable
hbase(main):002:0> describe 'JsonTable'
Table JsonTable is ENABLED
JsonTable
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf', DATA_BLOCK_ENCODING => 'NONE', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0', TTL => 'FOREVER', KEEP_DELETED_CELLS => 'FALSE', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
1 row(s) in 0.1000 seconds

# run Java code to load data, code called HBaseLoad, source code here
# count rows in HBase
> count 'JsonTable'
Current count: 139000, row: fe671e34-b723-4134-9317-7f31fe2715dd
139861 row(s) in 9.9540 seconds
=> 139861
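If you'd rather not run the Java loader, you can hand-insert a test row from the HBase shell. This is just a sketch; the row key and JSON value are made up, shaped like the documents the queries below expect:
# put a single test row (hypothetical data)
hbase(main):004:0> put 'JsonTable', 'row-1', 'cf:json', '{"id": "row-1", "person": {"first_name": "Jane", "last_name": "Doe", "email": "jane@example.com", "location": {"address": "1 Main St", "city": "Springfield", "state": "IL", "zipcode": "62701"}, "text": "hello", "url": "http://example.com"}}'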
# create an HBase mapped external table
CREATE EXTERNAL TABLE hbase_json_table(key string, json string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:json")
TBLPROPERTIES ("hbase.table.name" = "JsonTable");
# count using Hive
SELECT COUNT(*) FROM HBASE_JSON_TABLE;

# query using get_json_object
SELECT get_json_object(json, '$.id') AS ID,
get_json_object(json, '$.person.last_name') AS LastName,
get_json_object(json, '$.person.first_name') AS FirstName,
get_json_object(json, '$.person.email') AS email,
get_json_object(json, '$.person.location.address') AS Address,
get_json_object(json, '$.person.location.city') AS City,
get_json_object(json, '$.person.location.state') AS State,
get_json_object(json, '$.person.location.zipcode') AS Zip,
get_json_object(json, '$.person.text') AS Text,
get_json_object(json, '$.person.url') AS URL
FROM HBASE_JSON_TABLE;
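get_json_object also works in a WHERE clause, so you can filter on a nested field without materializing it first. A minimal sketch against the same table (the state value 'CA' is just an example):
SELECT get_json_object(json, '$.id') AS ID
FROM HBASE_JSON_TABLE
WHERE get_json_object(json, '$.person.location.state') = 'CA'
LIMIT 10;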
# query using json_tuple
SELECT id, lastName, firstName, email, city, state, text, url FROM hbase_json_table A
LATERAL VIEW json_tuple(A.json, 'id', 'person') B AS id, person
LATERAL VIEW json_tuple(person, 'last_name', 'first_name', 'email',
'text', 'url', 'location') C as lastName, firstName, email, text, url, loc
LATERAL VIEW json_tuple(loc, 'city', 'state') D AS city, state;
### Analytics over HBase snapshots ###

# create a snapshot
hbase(main):006:0> snapshot 'hbase_hive_table', 'hbase_hive_table_snapshot'
0 row(s) in 0.3390 seconds

# use list_snapshots to list all available snapshots

# create a restore location
sudo -u hdfs hdfs dfs -mkdir /tmp/hbase_snapshots

# register the snapshot in Hive and query (the table must be mapped in Hive already)
# NOTE: the set command doesn't work in Ambari Views yet, run the following in a script
# https://issues.apache.org/jira/browse/HIVE-6584
# To query against a snapshot instead of the online table, specify the snapshot name via
# hive.hbase.snapshot.name. The snapshot will be restored into a unique directory under /tmp.
# This location can be overridden by setting a path via hive.hbase.snapshot.restoredir.
set hive.hbase.snapshot.name=hbase_hive_table_snapshot;
set hive.hbase.snapshot.restoredir=/tmp/hbase_snapshots;
select * from hbase_hive_table;

# set it back to point to the table rather than the snapshot, then delete the snapshot
hive -e "set hbase.table.name=hbase_hive_table;"
echo "delete_snapshot 'hbase_hive_table_snapshot'" | hbase shell
# create a Hive table over the HBase snapshot and reference the HBase timestamp for the column family
# HIVE-2828: the JIRA for referencing each cell's timestamp is still not patched
CREATE EXTERNAL TABLE hbase_json_table(key string, json string, time timestamp)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:json,:timestamp")
TBLPROPERTIES ("hbase.table.name" = "JsonTable");

# create a Hive table using the JSON SerDe
DROP TABLE IF EXISTS json_serde_table;
CREATE EXTERNAL TABLE json_serde_table (
  id string,
  person struct<email:string, first_name:string, last_name:string,
    location:struct<address:string, city:string, state:string, zipcode:string>,
    text:string, url:string>)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION '/tmp/json/';

# upload a JSON file to a location on hdfs
hdfs dfs -put data.json /tmp/json/

# query the table as you normally would
SELECT id, person.first_name, person.last_name, person.email,
  person.location.address, person.location.city, person.location.state, person.location.zipcode,
  person.text, person.url
FROM json_serde_table LIMIT 5;

# hbase mapped table with multiple values
DROP TABLE IF EXISTS HBASE_TABLE_FROM_SERDE;
CREATE EXTERNAL TABLE HBASE_TABLE_FROM_SERDE(key String, ID string, fn string, ln string, email string, address string, city string, state string, zip string, text string, url string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:id,cf:fn,cf:ln,cf:e,cf:addr,cf:city,cf:state,cf:zip,cf:txt,cf:url")
TBLPROPERTIES ("hbase.table.name" = "serde_table");

# populate the HBase mapped table from the JSON SerDe table
INSERT OVERWRITE TABLE hbase_table_from_serde
SELECT id as key, id, person.first_name, person.last_name, person.email,
  person.location.address, person.location.city, person.location.state, person.location.zipcode,
  person.text, person.url
FROM json_serde_table LIMIT 5;

# view in Hive
SELECT * FROM hbase_table_from_serde LIMIT 5;

# view in HBase
scan 'serde_table', {LIMIT => 10}
get 'serde_table', '00043df9-7630-41c5-8b68-73fe5eb7d636'
02-05-2016 06:36 PM
5 Kudos
WebHCat

# this will execute a Hive query and save the result to an hdfs file in your home directory called output
curl -s -d execute="select+*+from+sample_08;" \
  -d statusdir="output" \
  'http://localhost:50111/templeton/v1/hive?user.name=root'

# if you ls on the directory, it will have two files, stderr and stdout
hdfs dfs -ls output

# if the job succeeded, you can cat the stdout file and view the results
hdfs dfs -cat output/stdout

WebHDFS

# list the output directory, notice the webhdfs port
curl -i "http://sandbox.hortonworks.com:50070/webhdfs/v1/user/root/output/?op=LISTSTATUS"

# read the output file
curl -i -L "http://sandbox.hortonworks.com:50070/webhdfs/v1/user/root/output/stdout?op=OPEN"

# rename a file; if you get a "dr. who" error, add &user.name=root or any other user in the context
curl -i -X PUT "sandbox.hortonworks.com:50070/webhdfs/v1/user/root/output/stdout?op=RENAME&user.name=root&destination=/user/root/newname"

# read the output of the new file
curl -i -L "http://sandbox.hortonworks.com:50070/webhdfs/v1/user/root/newname?op=OPEN"
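The hive call above returns a small JSON payload containing a job id; you can poll WebHCat for the job state before reading stdout. A sketch, with a placeholder job id:
# the submission response looks like {"id":"job_..."}; poll that id
curl -s 'http://localhost:50111/templeton/v1/jobs/<job-id>?user.name=root'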
02-05-2016 06:27 PM
6 Kudos
Sqoop

# Sandbox 2.3.2
# using the postgres ambari database
# edit the pg_hba.conf file in /var/lib/pgsql/data
# add the following line as the first line of pg_hba.conf; it allows access to all databases
# for all users with an encrypted password:
********************************
# TYPE DATABASE USER CIDR-ADDRESS METHOD
host all all 0.0.0.0/0 md5
********************************
### POSTGRES ###
su postgres
psql
\c ambari
# list all tables
\dt ambari.*
select * from ambari.hosts;
### SQOOP ###

# check sqoop version
sqoop version

# get a list of commands
sqoop help

# more command-specific help
sqoop help import

### DEMO ###

# download the Postgres driver
curl -L 'http://jdbc.postgresql.org/download/postgresql-9.2-1002.jdbc4.jar' -o postgresql-9.2-1002.jdbc4.jar

# put it in the sqoop library
sudo cp postgresql-9.2-1002.jdbc4.jar /usr/hdp/current/sqoop-client/lib/

# list tables in the ambari database / username ambari, password bigdata
sqoop list-tables --connect jdbc:postgresql://127.0.0.1/ambari --username ambari -P

# sqoop the ambari hosts table into hdfs
su guest
sqoop import --connect jdbc:postgresql://127.0.0.1/ambari --username ambari -P --table hosts --target-dir /user/guest/ambari_hosts_table

# HDFS: view the file
hdfs dfs -cat /user/guest/ambari_hosts_table/part-m-00000

# sqoop the ambari hosts table into hdfs using --direct
sqoop import --connect jdbc:postgresql://127.0.0.1/ambari --username ambari --password bigdata --table hosts --target-dir /user/guest/ambari_hosts_table_using_direct --direct

# sqoop the ambari hosts table into hive
sqoop import --connect jdbc:postgresql://127.0.0.1/ambari --username ambari --password bigdata --table hosts --hive-import --create-hive-table --direct

# HIVE
select host_name, cpu_count from hosts;
select * from hosts;

# SQOOP EXPORT
# create a Hive table
drop table if exists export_table;
create table export_table (
key int,
value string
)
row format delimited
fields terminated by ",";
# populate Hive with dummy data
insert into export_table values(1, "ExportedValue");

# confirm the Hive table has data
select * from export_table;

# display the values of the Hive table as hdfs files
hdfs dfs -cat /apps/hive/warehouse/export_table/000000_0

# export the table to MySQL; the MySQL table must exist
su mysql
mysql -u root
create database export;
use export;
create table exported (rowkey int, value varchar(20));
exit;
# on the HDP 2.3.2 Sandbox there's a SQOOP-1400 bug; use --driver com.mysql.jdbc.Driver to overcome the problem
# sqoop export from a Hive table into MySQL
sqoop export --connect jdbc:mysql://127.0.0.1/export --username hive --password hive --table exported --direct --export-dir /apps/hive/warehouse/export_table --driver com.mysql.jdbc.Driver

# log in to MySQL and check the table
su mysql
mysql -u root
use export;
select * from exported;
exit;
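Once the initial import works, an incremental import avoids re-pulling the whole table on each run. A sketch, assuming the source table has a monotonically increasing numeric column to check against (host_id here is hypothetical, verify the actual column name first):
sqoop import --connect jdbc:postgresql://127.0.0.1/ambari --username ambari -P \
  --table hosts --target-dir /user/guest/ambari_hosts_increment \
  --incremental append --check-column host_id --last-value 0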
02-02-2016 12:04 AM
@Ritesh Chaman this is an article, not a question. Look at the last message in your log and change the permission to 644.
02-01-2016 06:53 PM
8 Kudos
It is not completely obvious, but you can certainly run Python scripts within Oozie workflows using the Shell action. Here's a sample job.properties file, nothing special about it:
nameNode=hdfs://sandbox.hortonworks.com:8020
jobTracker=sandbox.hortonworks.com:8050
queueName=default
examplesRoot=oozie
oozie.wf.application.path=${nameNode}/user/${user.name}/${examplesRoot}/apps/python

Here's a sample workflow that will look for a script called script.py inside the scripts folder:
<workflow-app xmlns="uri:oozie:workflow:0.4" name="python-wf">
<start to="python-node"/>
<action name="python-node">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>script.py</exec>
<file>scripts/script.py</file>
<capture-output/>
</shell>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>Python action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end"/>
</workflow-app>
Here's my sample script.py:
#! /usr/bin/env python
import os, pwd, sys
print "who am I? " + pwd.getpwuid(os.getuid())[0]
print "this is a Python script"
print "Python Interpreter Version: " + sys.version directory tree for my workflow assuming the workflow directory is called python is as such [root@sandbox python]# tree
.
├── job.properties
├── scripts
│ └── script.py
└── workflow.xml
1 directory, 3 files
Now you can execute the workflow like any other Oozie workflow. If you want to leverage Python3, make sure Python3 is installed on every node. My Python3 script.py looks like this:
#! /usr/bin/env /usr/local/bin/python3.3
import os, pwd, sys
print("who am I? " + pwd.getpwuid(os.getuid())[0])
print("this is a Python script")
print("Python Interpreter Version: " + sys.version)
Everything else above holds true. You can find my sample workflow source code at the following link, including Python3.
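For reference, submitting the workflow from the sandbox typically looks like the following. This is a sketch; it assumes the python directory sits in your current working directory and the Oozie server runs on the default port 11000:
# push the workflow directory to HDFS (matches oozie.wf.application.path above)
hdfs dfs -put python /user/root/oozie/apps/
# submit and start the workflow, note the returned job id
oozie job -oozie http://sandbox.hortonworks.com:11000/oozie -config python/job.properties -run
# check the workflow status
oozie job -oozie http://sandbox.hortonworks.com:11000/oozie -info <job-id>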
01-19-2016 02:00 AM
@Ali Bajwa big props for updating NiFi to 0.4.1. Can you update the step where you say to navigate to http://sandbox.hortonworks.com:9090/ so it points to http://sandbox.hortonworks.com:9090/nifi?
01-05-2016 08:25 PM
3 Kudos
Groovy UDF example. It can be compiled at run time. Currently it only works in the "hive" shell; it does not work in beeline.
su guest
hive
Paste the following code into the hive shell. This will use the Groovy String replace function to replace all instances of lower case 'e' with 'E':
compile `import org.apache.hadoop.hive.ql.exec.UDF \;
import org.apache.hadoop.io.Text \;
public class Replace extends UDF {
public Text evaluate(Text s){
if (s == null) return null \;
return new Text(s.toString().replace('e', 'E')) \;
}
} ` AS GROOVY NAMED Replace.groovy;
Now create a temporary function to leverage the Groovy UDF:
CREATE TEMPORARY FUNCTION Replace as 'Replace';
Now you can use the function in your SQL:
SELECT Replace(description) FROM sample_08 limit 5;
Full example:
hive> compile `import org.apache.hadoop.hive.ql.exec.UDF \;
> import org.apache.hadoop.io.Text \;
> public class Replace extends UDF {
> public Text evaluate(Text s){
> if (s == null) return null \;
> return new Text(s.toString().replace('e', 'E')) \;
> }
> } ` AS GROOVY NAMED Replace.groovy;
Added [/tmp/0_1452022176763.jar] to class path
Added resources: [/tmp/0_1452022176763.jar]
hive> CREATE TEMPORARY FUNCTION Replace as 'Replace';
OK
Time taken: 1.201 seconds
hive> SELECT Replace(description) FROM sample_08 limit 5;
OK
All Occupations
ManagEmEnt occupations
ChiEf ExEcutivEs
GEnEral and opErations managErs
LEgislators
Time taken: 6.373 seconds, Fetched: 5 row(s)
hive>
Another example: this will duplicate any String passed to the function.
compile `import org.apache.hadoop.hive.ql.exec.UDF \;
import org.apache.hadoop.io.Text \;
public class Duplicate extends UDF {
public Text evaluate(Text s){
if (s == null) return null \;
return new Text(s.toString() * 2) \;
}
} ` AS GROOVY NAMED Duplicate.groovy;
CREATE TEMPORARY FUNCTION Duplicate as 'Duplicate';
SELECT Duplicate(description) FROM sample_08 limit 5;
All OccupationsAll Occupations
Management occupationsManagement occupations
Chief executivesChief executives
General and operations managersGeneral and operations managers
LegislatorsLegislators
JSON Parsing UDF:
compile `import org.apache.hadoop.hive.ql.exec.UDF \;
import groovy.json.JsonSlurper \;
import org.apache.hadoop.io.Text \;
public class JsonExtract extends UDF {
public int evaluate(Text a){
def jsonSlurper = new JsonSlurper() \;
def obj = jsonSlurper.parseText(a.toString())\;
return obj.val1\;
}
} ` AS GROOVY NAMED json_extract.groovy;
CREATE TEMPORARY FUNCTION json_extract as 'JsonExtract';
SELECT json_extract('{"val1": 2}') from date_dim limit 1;
2
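Temporary functions only live for the current hive session. If you need to remove or redefine one within a session, standard HiveQL works:
DROP TEMPORARY FUNCTION IF EXISTS json_extract;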
12-30-2015 02:17 AM
4 Kudos
I’m going to show you a neat way to work with CSV files and Apache Hive. Usually, you’d have to do some preparatory work on CSV data before you can consume it with Hive, but I’d like to show you a built-in SerDe (Serializer/Deserializer) for Hive that makes it a lot more convenient to work with CSV. This work was merged in Hive 0.14, and there are no additional steps necessary to work with CSV from Hive. Suppose you have a CSV file with the following entries:
id  first_name  last_name  email                       gender  ip_address
1   James       Coleman    jcoleman0@cam.ac.uk         Male    136.90.241.52
2   Lillian     Lawrence   llawrence1@statcounter.com  Female  101.177.15.130
3   Theresa     Hall       thall2@sohu.com             Female  114.123.153.64
4   Samuel      Tucker     stucker3@sun.com            Male    89.60.227.31
5   Emily       Dixon      edixon4@surveymonkey.com    Female  119.92.21.19

To consume it from within Hive, you'll need to upload it to hdfs:
hdfs dfs -put sample.csv /tmp/serdes/
Now all it takes is to create a table schema on top of the file:
drop table if exists sample;
create external table sample(id int,first_name string,last_name string,email string,gender string,ip_address string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile
location '/tmp/serdes/';
Now you can query the table as is:
select * from sample limit 10;
But what if your CSV file was tab-delimited rather than comma-separated? Well, the SerDe has you covered there too:
drop table if exists sample;
create external table sample(id int,first_name string,last_name string,email string,gender string,ip_address string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties (
"separatorChar" = "\t"
)
stored as textfile
location '/tmp/serdes/';
Notice the separatorChar argument. In all, the SerDe accepts two more arguments: a custom quote character and a custom escape character; see the sketch below.
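A minimal sketch of a table using all three serde properties (the values shown are the ones the Hive wiki uses; adjust them to your data):
drop table if exists sample;
create external table sample(id int,first_name string,last_name string,email string,gender string,ip_address string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties (
  "separatorChar" = "\t",
  "quoteChar" = "'",
  "escapeChar" = "\\"
)
stored as textfile
location '/tmp/serdes/';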
Take a look at the wiki for more info https://cwiki.apache.org/confluence/display/Hive/CSV+Serde.
12-30-2015 02:14 AM
Thanks @azeltov and @Ali Bajwa, the latest changes fixed the problem with this tutorial.