Created on 05-10-2021 11:15 PM - edited on 05-10-2021 11:25 PM by VidyaSargur
In this article, we will learn how to integrate Spark with Hive JdbcStorageHandler in CDP.
By using the JdbcStorageHandler, we can connect Apache Hive to JDBC Data Sources (MySQL, PostgreSQL, Oracle, DB2, or Derby etc). Currently writing to a JDBC data source is not supported. To use JdbcStorageHandler, we need to create an external table using JdbcStorageHandler.
Example:
CREATE EXTERNAL TABLE employee
(
id BIGINT,
name STRING,
age INT,
salary DOUBLE
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
"hive.sql.database.type" = "MYSQL",
"hive.sql.jdbc.driver" = "com.mysql.jdbc.Driver",
"hive.sql.jdbc.url" = "jdbc:mysql://localhost/test",
"hive.sql.dbcp.username" = "username",
"hive.sql.dbcp.password" = "password",
"hive.sql.table" = "EMPLOYEE",
"hive.sql.dbcp.maxActive" = "1"
);
The column data type for a Hive JdbcStorageHandler table can be:
Numeric data type: byte, short, int, long, float, double
Decimal with scale and precision
String data type: string, char, varchar
Date
Timestamp
Note: Complex data type: struct, map, array are not supported.
The following are the steps to integrate with MySQL database:
mysql> create database test;
Query OK, 1 row affected (0.00 sec)
mysql> use test;
Database changed
mysql> drop table if EXISTS test.EMPLOYEE;
mysql> CREATE TABLE test.EMPLOYEE(
id INT,
name varchar(255),
salary DECIMAL,
dob DATE NOT NULL DEFAULT '2021-05-01',
doj TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP(),
PRIMARY KEY (id)
);
Query OK, 0 rows affected (0.01 sec)
mysql> INSERT INTO test.EMPLOYEE (id, name, salary, dob, doj) VALUES (1, "Ranga", 10000.00, '1988-06-01', '2020-03-16 09:00:01.000000');
Query OK, 1 row affected (0.01 sec)
mysql> INSERT INTO test.EMPLOYEE (id, name, salary,dob) VALUES (2, "Nishanth", 50000.00, '2018-05-29');
Query OK, 1 row affected (0.01 sec)
mysql> INSERT INTO test.EMPLOYEE (id, name, salary) VALUES (3, "Raja", 30000.00);
Query OK, 1 row affected (0.01 sec)
mysql> select * from test.EMPLOYEE;
+----+----------+--------+------------+---------------------+
| id | name | salary | dob | doj |
+----+----------+--------+------------+---------------------+
| 1 | Ranga | 10000 | 1988-06-01 | 2020-03-16 09:00:01 |
| 2 | Nishanth | 50000 | 2018-05-29 | 2021-05-01 09:02:33 |
| 3 | Raja | 30000 | 2021-05-01 | 2021-05-01 09:02:45 |
+----+----------+--------+------------+---------------------+
3 rows in set (0.00 sec)
#beeline -u jdbc:hive2://<hiveserver2_host>:<port>/<db>
beeline -u jdbc:hive2://localhost:10000/default
> create database if not exists db_test;
INFO : OK
> use db_test;
INFO : OK
> DROP TABLE IF EXISTS db_test.employee;
> CREATE EXTERNAL TABLE db_test.employee(
id INT,
name STRING,
salary DOUBLE,
dob DATE,
doj TIMESTAMP
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
"hive.sql.database.type" = "MYSQL",
"hive.sql.jdbc.driver" = "com.mysql.jdbc.Driver",
"hive.sql.jdbc.url" = "jdbc:mysql://localhost/test",
"hive.sql.dbcp.username" = "ranga",
"hive.sql.dbcp.password" = "ranga",
"hive.sql.query" = "select * from test.EMPLOYEE",
"hive.sql.dbcp.maxActive" = "1"
);
> show tables;
INFO : OK
+-----------+
| tab_name |
+-----------+
| employee |
+-----------+
> select * from db_test.employee;
INFO : OK
+--------------+----------------+------------------+---------------+------------------------+
| employee.id | employee.name | employee.salary | employee.dob | employee.doj |
+--------------+----------------+------------------+---------------+------------------------+
| 1 | Ranga | 10000.0 | 1988-06-01 | 2020-03-16 09:00:01.0 |
| 2 | Nishanth | 50000.0 | 2018-05-29 | 2021-05-01 09:02:33.0 |
| 3 | Raja | 30000.0 | 2021-05-01 | 2021-05-01 09:02:45.0 |
+--------------+----------------+------------------+---------------+------------------------+
3 rows selected (0.349 seconds)
> !exit
spark-shell --master yarn --jars /opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc-handler.jar,/usr/share/java/mysql-connector-java.jar
scala> spark.sql("select * from db_test.employee").show(truncate=false)
Hive Session ID = 4e572b40-76c2-4991-b41e-ee7830c9bff7
+---+--------+-------+----------+-------------------+
|id |name |salary |dob |doj |
+---+--------+-------+----------+-------------------+
|1 |Ranga |10000.0|1988-06-01|2020-03-16 09:00:01|
|2 |Nishanth|50000.0|2018-05-29|2021-05-01 09:02:33|
|3 |Raja |30000.0|2021-05-01|2021-05-01 09:02:45|
+---+--------+-------+----------+-------------------+
Thank You!