Created on 03-11-2017 02:22 AM
1. You can update the Oozie share lib with the following jars, or pass them directly in the Oozie workflow.xml. (Make sure you use the 3.2 versions of the DataNucleus jars, not 4.x.)
/usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar
/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar
/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar
# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar /user/oozie/share/lib/lib_*/spark/
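Copy the remaining two DataNucleus jars (listed above) the same way, for example:
# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar /user/oozie/share/lib/lib_*/spark/
# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar /user/oozie/share/lib/lib_*/spark/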
If you copy the jars to the sharelib, make sure you run an Oozie sharelib update afterwards:
# oozie admin -oozie http://<oozie-server>:11000/oozie -sharelibupdate
# oozie admin -oozie http://<oozie-server>:11000/oozie -shareliblist spark*
The hive-site.xml that is shipped with the Spark action (via --files in the workflow below) contains the metastore and Kerberos settings, for example:
<configuration>
  <property>
    <name>hive.metastore.kerberos.keytab.file</name>
    <value>/etc/security/keytabs/hive.service.keytab</value>
  </property>
  <property>
    <name>hive.metastore.kerberos.principal</name>
    <value>hive/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.metastore.sasl.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://vb-atlas-node1.hortonworks.com:9083</value>
  </property>
  <property>
    <name>hive.server2.authentication</name>
    <value>KERBEROS</value>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.keytab</name>
    <value>/etc/security/keytabs/hive.service.keytab</value>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.principal</name>
    <value>hive/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.server2.authentication.spnego.keytab</name>
    <value>/etc/security/keytabs/spnego.service.keytab</value>
  </property>
  <property>
    <name>hive.server2.authentication.spnego.principal</name>
    <value>HTTP/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/apps/hive/warehouse</value>
  </property>
  <property>
    <name>hive.metastore.cache.pinobjtypes</name>
    <value>Table,Database,Type,FieldSchema,Order</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://vb-atlas-node1.hortonworks.com/hive?createDatabaseIfNotExist=true</value>
  </property>
</configuration>
2. Create a workflow.xml. Make sure you replace the metastore URI and the jar locations with your own.
<workflow-app name="spark-wf" xmlns="uri:oozie:workflow:0.5">
<credentials>
<credential name='hcat_auth' type='hcat'>
<property>
<name>hcat.metastore.uri</name>
<value>thrift://vb-atlas-node1.hortonworks.com:9083</value>
</property>
<property>
<name>hcat.metastore.principal</name>
<value>hive/_HOST@SANDBOX.COM</value>
</property>
</credential>
</credentials>
<start to="spark-action"/>
<action name="spark-action" cred='hcat_auth'>
<spark xmlns="uri:oozie:spark-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<prepare>
<delete path="${nameNode}/user/spark/sparkOozie/output-data/spark"/>
</prepare>
<master>${master}</master>
<name>Spark Hive Example</name>
<class>com.hortonworks.vinod.SparkSqlExample</class>
<jar>${nameNode}/user/{User_You_run_as}/lib/Spark-Example-vinod-0.0.1-SNAPSHOT.jar</jar>
<spark-opts>--driver-memory 512m --executor-memory 512m --num-executors 1 --jars /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar,/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar,/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar --files /usr/hdp/current/spark-client/conf/hive-site.xml</spark-opts>
<arg>thrift://vb-atlas-node1.hortonworks.com:9083</arg>
</spark>
<ok to="end"/>
<error to="kill"/>
</action>
<kill name="kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end"/>
</workflow-app>
3. Upload the application jar and the input file to the HDFS home directory of the user you run the Oozie job as (a sample input.txt is shown after this step).
# hdfs dfs -put Spark-Example-vinod-0.0.1-SNAPSHOT.jar /user/{User_You_run_as}/lib/Spark-Example-vinod-0.0.1-SNAPSHOT.jar
# hdfs dfs -put input.txt /user/{User_You_run_as}/
- Upload workflow.xml to HDFS:
For example:
# hdfs dfs -put workflow.xml /user/{User_You_run_as}/
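The input.txt uploaded above is expected to contain comma-separated id,name pairs, matching the table definition in the example class (step 6); based on the output shown below, it would look like:
1,sample1
2,sample2
3,sample3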
4. Configure the job.properties:
nameNode=hdfs://<namenode_HOST>:8020
jobTracker=<Resource_Manager_Host>:8050
oozie.wf.application.path=/user/{User_You_run_as}/
oozie.use.system.libpath=true
master=yarn-cluster
5. Run the Oozie job with the properties file:
# oozie job -oozie http://<oozie-server>:11000/oozie/ -config job.properties -run
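The job id returned by -run can be used to monitor progress with the standard Oozie CLI, for example:
# oozie job -oozie http://<oozie-server>:11000/oozie -info <job-id>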
Once the job completes successfully, the output of df.show() appears in the stdout log of the YARN container running the Spark driver:
Log Type: stdout
Log Upload Time: Fri Mar 10 22:30:16 +0000 2017
Log Length: 99
+---+-------+
| id| name|
+---+-------+
| 1|sample1|
| 2|sample2|
| 3|sample3|
+---+-------+
6. SparkSqlExample.java (com.hortonworks.vinod.SparkSqlExample):
package com.hortonworks.vinod;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;

public class SparkSqlExample {

    public static void main(String[] args) throws IOException {
        // Load the cluster configuration so the job can reach HDFS and the Hive metastore.
        Configuration conf = new Configuration();
        conf.addResource("/etc/hadoop/conf/core-site.xml");
        conf.addResource("/etc/hadoop/conf/hdfs-site.xml");
        conf.addResource("/etc/hive/conf/hive-site.xml");
        FileSystem fs = FileSystem.get(conf); // verifies that HDFS is reachable with this configuration

        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
        SparkContext sc = new SparkContext(sparkConf);
        HiveContext hiveContext = new HiveContext(sc);

        // Create the table if it does not exist, load the uploaded input file, and print the rows.
        hiveContext.sql("create external table if not exists SparkHiveExample (id int, name string) "
                + "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TextFile");
        hiveContext.sql("LOAD DATA INPATH 'input.txt' OVERWRITE INTO TABLE SparkHiveExample");
        DataFrame df = hiveContext.sql("select * from SparkHiveExample");
        df.show();
    }
}
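Note that the workflow passes the metastore URI as the first <arg>, but the class above never reads it; it relies on the hive-site.xml shipped via --files. If you want to consume that argument instead, a minimal sketch (assuming args[0] carries the thrift URI) would be to add the following inside main() after creating the HiveContext:

        // Hypothetical addition: override the metastore URI from the workflow <arg>
        // rather than relying only on hive-site.xml.
        if (args.length > 0) {
            hiveContext.setConf("hive.metastore.uris", args[0]);
        }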
7. pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.hortonworks.sparkExample</groupId>
  <artifactId>Spark-Example-vinod</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Spark Examples</name>
  <description>Spark programs</description>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.10</artifactId>
    <version>1.6.2</version>
  </parent>
  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.6.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.10.6</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.0</version>
    </dependency>
  </dependencies>
</project>
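The jar uploaded in step 3 is produced by a normal Maven build of this project, for example:
# mvn clean package
This creates target/Spark-Example-vinod-0.0.1-SNAPSHOT.jar, the artifact referenced in the workflow's <jar> element.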
Created on 03-12-2017 01:51 AM
Is this needed even after the HDP 2.5 native Oozie Spark action?