1. Update the Oozie sharelib with the following jars, or pass them directly in the Oozie workflow.xml. (Make sure you use the 3.2.x DataNucleus jars, not 4.x.)

/usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar

/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar

/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar

  • To copy the jars to the sharelib:

# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar /user/oozie/share/lib/lib_*/spark/
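
The same command can be repeated for the other two DataNucleus jars (the exact lib_<timestamp> directory name depends on your sharelib version):

# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar /user/oozie/share/lib/lib_*/spark/

# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar /user/oozie/share/lib/lib_*/spark/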

If you copy the jars to the sharelib, make sure you run an Oozie sharelibupdate afterwards.

  • Update the Oozie sharelib:

# oozie admin -oozie http://<oozie-server>:11000/oozie -sharelibupdate

  • Verify that the current Spark action sharelib now lists all of the above files:

# oozie admin -oozie http://<oozie-server>:11000/oozie -shareliblist spark*

  • Make sure hive-site.xml is in the sharelib as well and contains the following properties (a copy command is shown after the configuration below). Replace the values with the ones from your own hive-site.xml.
<configuration>
  <property>
    <name>hive.metastore.kerberos.keytab.file</name>
    <value>/etc/security/keytabs/hive.service.keytab</value>
  </property>
  <property>
    <name>hive.metastore.kerberos.principal</name>
    <value>hive/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.metastore.sasl.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://vb-atlas-node1.hortonworks.com:9083</value>
  </property>
  <property>
    <name>hive.server2.authentication</name>
    <value>KERBEROS</value>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.keytab</name>
    <value>/etc/security/keytabs/hive.service.keytab</value>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.principal</name>
    <value>hive/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.server2.authentication.spnego.keytab</name>
    <value>/etc/security/keytabs/spnego.service.keytab</value>
  </property>
  <property>
    <name>hive.server2.authentication.spnego.principal</name>
    <value>HTTP/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/apps/hive/warehouse</value>
  </property>
  <property>
    <name>hive.metastore.cache.pinobjtypes</name>
    <value>Table,Database,Type,FieldSchema,Order</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://vb-atlas-node1.hortonworks.com/hive?createDatabaseIfNotExist=true</value>
  </property>
</configuration>
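
As a sketch (the source path is the Spark client copy of hive-site.xml referenced later in the workflow), the file can be copied into the Spark sharelib directory the same way as the jars, followed by another sharelibupdate:

# hdfs dfs -put /usr/hdp/current/spark-client/conf/hive-site.xml /user/oozie/share/lib/lib_*/spark/

# oozie admin -oozie http://<oozie-server>:11000/oozie -sharelibupdate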

2. Create a workflow.xml. Make sure you replace the metastore URI and the jar location with your own values.

<workflow-app name="spark-wf" xmlns="uri:oozie:workflow:0.5">
  <credentials>
    <credential name='hcat_auth' type='hcat'>
      <property>
        <name>hcat.metastore.uri</name>
        <value>thrift://vb-atlas-node1.hortonworks.com:9083</value>
      </property>
      <property>
        <name>hcat.metastore.principal</name>
        <value>hive/_HOST@SANDBOX.COM</value>
      </property>
    </credential>
  </credentials>
  <start to="spark-action"/>
  <action name="spark-action" cred='hcat_auth'>
    <spark xmlns="uri:oozie:spark-action:0.1">
      <job-tracker>${jobTracker}</job-tracker>
      <name-node>${nameNode}</name-node>
      <prepare>
        <delete path="${nameNode}/user/spark/sparkOozie/output-data/spark"/>
      </prepare>
      <master>${master}</master>
      <name>Spark Hive Example</name>
      <class>com.hortonworks.vinod.SparkSqlExample</class>
      <jar>${nameNode}/user/{User_You_run_as}/lib/Spark-Example-vinod-0.0.1-SNAPSHOT.jar</jar>
      <spark-opts>--driver-memory 512m --executor-memory 512m --num-executors 1 --jars /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar,/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar,/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar --files /usr/hdp/current/spark-client/conf/hive-site.xml</spark-opts>
      <arg>thrift://vb-atlas-node1.hortonworks.com:9083</arg>
    </spark>
    <ok to="end"/>
    <error to="kill"/>
  </action>
  <kill name="kill">
    <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
  </kill>
  <end name="end"/>
</workflow-app>

3. Upload the application jar and the input file to the HDFS home directory of the user you run the Oozie job as.

# hdfs dfs -put Spark-Example-vinod-0.0.1-SNAPSHOT.jar /user/{User_You_run_as}/lib/Spark-Example-vinod-0.0.1-SNAPSHOT.jar

# hdfs dfs -put input.txt /user/{User_You_run_as}/
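
The contents of input.txt are not shown in the original article; since the table created by the Spark program is comma-delimited with an int id and a string name, an illustrative input file that would produce the sample output shown in step 5 could be created like this:

# cat > input.txt << 'EOF'
1,sample1
2,sample2
3,sample3
EOF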

  • Upload workflow.xml to HDFS. For example:

# hdfs dfs -put workflow.xml /user/{User_You_run_as}/
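
Optionally, the workflow XML can be checked against the Oozie workflow schema before submitting the job (a quick sanity check using the standard Oozie CLI validate sub-command):

# oozie validate workflow.xml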

4. Configure the job.properties file.

nameNode=hdfs://<namenode_HOST>:8020

jobTracker=<Resource_Manager_Host>:8050

oozie.wf.application.path=/user/{User_You_run_as}/

oozie.use.system.libpath=true

master=yarn-cluster

5. Run the Oozie job with the properties file:

# oozie job -oozie http://<oozie-server>:11000/oozie/ -config job.properties -run
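
Once submitted, the job status can be checked with the standard Oozie CLI (the job ID below is a placeholder for the ID printed by the -run command):

# oozie job -oozie http://<oozie-server>:11000/oozie -info <job_id>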

  • You should see the "Spark Hive Example" application in the Resource Manager UI, and the query output will be in the container's stdout log:

Log Type: stdout

Log Upload Time: Fri Mar 10 22:30:16 +0000 2017

Log Length: 99

+---+-------+
| id|   name|
+---+-------+
|  1|sample1|
|  2|sample2|
|  3|sample3|
+---+-------+

6. com.hortonworks.vinod.SparkSqlExample:

package com.hortonworks.vinod;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;

public class SparkSqlExample {

    public static void main(String[] args) throws IOException {
        // Load the cluster configuration so the job can reach HDFS and the Hive metastore.
        Configuration conf = new Configuration();
        conf.addResource("/etc/hadoop/conf/core-site.xml");
        conf.addResource("/etc/hadoop/conf/hdfs-site.xml");
        conf.addResource("/etc/hive/conf/hive-site.xml");
        FileSystem fs = FileSystem.get(conf);

        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
        SparkContext sc = new SparkContext(sparkConf);
        HiveContext hiveContext = new HiveContext(sc);

        // Create a comma-delimited external table, load the uploaded input.txt into it, and print the rows.
        hiveContext.sql("create external table if not exists SparkHiveExample (id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TextFile");
        hiveContext.sql("LOAD DATA INPATH 'input.txt' OVERWRITE INTO TABLE SparkHiveExample");
        DataFrame df = hiveContext.sql("select * from SparkHiveExample");
        df.show();
    }
}
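
Before wiring the class into Oozie, it can be useful to test it directly with spark-submit using the same options as the <spark-opts> element above (a sketch; the paths and the metastore URI are the ones used throughout this article):

# spark-submit --class com.hortonworks.vinod.SparkSqlExample --master yarn-cluster \
    --driver-memory 512m --executor-memory 512m --num-executors 1 \
    --jars /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar,/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar,/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar \
    --files /usr/hdp/current/spark-client/conf/hive-site.xml \
    Spark-Example-vinod-0.0.1-SNAPSHOT.jar thrift://vb-atlas-node1.hortonworks.com:9083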

7. pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.hortonworks.sparkExample</groupId>
  <artifactId>Spark-Example-vinod</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Spark Examples</name>
  <description>Spark programs</description>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.10</artifactId>
    <version>1.6.2</version>
  </parent>
  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.6.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.10.6</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.0</version>
    </dependency>
  </dependencies>
</project>
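
The jar referenced in step 3 can then be built with a standard Maven invocation (the artifact name comes from the pom.xml above):

# mvn clean package
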
Comments

Is this needed even after the HDP 2.5 native Oozie Spark action?