1. You can update the Oozie sharelib with the following jars, or pass them directly in the Oozie workflow.xml. (Make sure you use the 3.2 versions of the DataNucleus jars, not the 4.x versions.)

/usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar

/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar

/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar

  • To copy the jars to the sharelib:

# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar /user/oozie/share/lib/lib_*/spark/
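The remaining two jars are copied the same way (same destination; replace lib_* with the actual timestamped sharelib directory on your cluster):

# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar /user/oozie/share/lib/lib_*/spark/

# hdfs dfs -put /usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar /user/oozie/share/lib/lib_*/spark/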

If you copy the jars to the sharelib, make sure you run an Oozie sharelib update afterward.

  • Update oozie sharelib:

# oozie admin -oozie http://<oozie-server>:11000/oozie -sharelibupdate

  • Verify that the current Spark action sharelib contains all of the above files:

# oozie admin -oozie http://<oozie-server>:11000/oozie -shareliblist spark*

  • Make sure hive-site.xml is in the sharelib as well and contains the following properties (an example copy command is shown after the configuration below). Replace the values with those from your own hive-site.xml.
<configuration>
  <property>
    <name>hive.metastore.kerberos.keytab.file</name>
    <value>/etc/security/keytabs/hive.service.keytab</value>
  </property>
  <property>
    <name>hive.metastore.kerberos.principal</name>
    <value>hive/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.metastore.sasl.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://vb-atlas-node1.hortonworks.com:9083</value>
  </property>
  <property>
    <name>hive.server2.authentication</name>
    <value>KERBEROS</value>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.keytab</name>
    <value>/etc/security/keytabs/hive.service.keytab</value>
  </property>
  <property>
    <name>hive.server2.authentication.kerberos.principal</name>
    <value>hive/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.server2.authentication.spnego.keytab</name>
    <value>/etc/security/keytabs/spnego.service.keytab</value>
  </property>
  <property>
    <name>hive.server2.authentication.spnego.principal</name>
    <value>HTTP/_HOST@SANDBOX.COM</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/apps/hive/warehouse</value>
  </property>
  <property>
    <name>hive.metastore.cache.pinobjtypes</name>
    <value>Table,Database,Type,FieldSchema,Order</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://vb-atlas-node1.hortonworks.com/hive?createDatabaseIfNotExist=true</value>
  </property>
</configuration>
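For example, to put hive-site.xml into the Spark action sharelib (the source path below assumes the Spark client's copy of hive-site.xml, the same file passed via --files in the workflow; replace lib_* with the actual timestamped sharelib directory):

# hdfs dfs -put /usr/hdp/current/spark-client/conf/hive-site.xml /user/oozie/share/lib/lib_*/spark/

Run the sharelibupdate command again after adding the file.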

2. Create a workflow.xml. Make sure you replace the metastore URI and the jar location with your own values.

<workflow-app name="spark-wf" xmlns="uri:oozie:workflow:0.5">
  <credentials>
    <credential name='hcat_auth' type='hcat'>
      <property>
        <name>hcat.metastore.uri</name>
        <value>thrift://vb-atlas-node1.hortonworks.com:9083</value>
      </property>
      <property>
        <name>hcat.metastore.principal</name>
        <value>hive/_HOST@SANDBOX.COM</value>
      </property>
    </credential>
  </credentials>
  <start to="spark-action"/>
  <action name="spark-action" cred='hcat_auth'>
    <spark xmlns="uri:oozie:spark-action:0.1">
      <job-tracker>${jobTracker}</job-tracker>
      <name-node>${nameNode}</name-node>
      <prepare>
        <delete path="${nameNode}/user/spark/sparkOozie/output-data/spark"/>
      </prepare>
      <master>${master}</master>
      <name>Spark Hive Example</name>
      <class>com.hortonworks.vinod.SparkSqlExample</class>
      <jar>${nameNode}/user/{User_You_run_as}/lib/Spark-Example-vinod-0.0.1-SNAPSHOT.jar</jar>
      <spark-opts>--driver-memory 512m --executor-memory 512m --num-executors 1 --jars /usr/hdp/current/spark-client/lib/datanucleus-rdbms-3.2.9.jar,/usr/hdp/current/spark-client/lib/datanucleus-core-3.2.10.jar,/usr/hdp/current/spark-client/lib/datanucleus-api-jdo-3.2.6.jar --files /usr/hdp/current/spark-client/conf/hive-site.xml</spark-opts>
      <arg>thrift://vb-atlas-node1.hortonworks.com:9083</arg>
    </spark>
    <ok to="end"/>
    <error to="kill"/>
  </action>
  <kill name="kill">
    <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
  </kill>
  <end name="end"/>
</workflow-app>
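Before uploading, you can optionally sanity-check the workflow syntax with the Oozie CLI's validate command (shown here against the local workflow.xml):

# oozie validate workflow.xml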

3. Upload the application jar and the input file to the HDFS home directory of the user you run the Oozie job as.

# hdfs dfs -put Spark-Example-vinod-0.0.1-SNAPSHOT.jar /user/{User_You_run_as}/lib/Spark-Example-vinod-0.0.1-SNAPSHOT.jar

# hdfs dfs -put input.txt /user/{User_You_run_as}/

  • Upload the workflow.xml to HDFS:

For example:

# hdfs dfs -put workflow.xml /user/{User_You_run_as}/

4. Configure job.properties and run the job.

nameNode=hdfs://<namenode_HOST>:8020
jobTracker=<Resource_Manager_Host>:8050
oozie.wf.application.path=/user/{User_You_run_as}/
oozie.use.system.libpath=true
master=yarn-cluster

5. Run the Oozie job with the properties file:

# oozie job -oozie http://<oozie-server>:11000/oozie/ -config job.properties -run
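The submit command prints a job ID; you can check the job's progress with the Oozie CLI info command (the job ID below is a placeholder):

# oozie job -oozie http://<oozie-server>:11000/oozie -info <job_id>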

  • You should see the Spark Hive Example application in the ResourceManager UI, and the output will be in stdout:

Log Type: stdout

Log Upload Time: Fri Mar 10 22:30:16 +0000 2017

Log Length: 99

+---+-------+
| id|   name|
+---+-------+
|  1|sample1|
|  2|sample2|
|  3|sample3|
+---+-------+

6. The com.hortonworks.vinod.SparkSqlExample class:

package com.hortonworks.vinod;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;

public class SparkSqlExample {
    public static void main(String[] args) throws IOException {
        // Load the cluster configuration so the job talks to the right HDFS and metastore.
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        conf.addResource("/etc/hadoop/conf/core-site.xml");
        conf.addResource("/etc/hadoop/conf/hdfs-site.xml");
        conf.addResource("/etc/hive/conf/hive-site.xml");
        FileSystem fs = FileSystem.get(conf);

        // Create the SparkContext and a HiveContext on top of it.
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
        SparkContext sc = new SparkContext(sparkConf);
        HiveContext hiveContext = new HiveContext(sc);

        // Create the comma-delimited external table, load input.txt into it, and print the rows.
        hiveContext.sql("create external table if not exists SparkHiveExample ( id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TextFile");
        hiveContext.sql("LOAD DATA INPATH 'input.txt' OVERWRITE INTO TABLE SparkHiveExample");
        DataFrame df = hiveContext.sql("select * from SparkHiveExample");
        df.show();
    }
}
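For reference, given the comma-delimited table definition above, an input.txt that produces the output shown in step 5 would contain rows like:

1,sample1
2,sample2
3,sample3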

7. pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.hortonworks.sparkExample</groupId>
  <artifactId>Spark-Example-vinod</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Spark Examples</name>
  <description>Spark programs</description>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.10</artifactId>
    <version>1.6.2</version>
  </parent>
  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive_2.10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.6.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.10.6</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.0</version>
    </dependency>
  </dependencies>
</project>
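To build the jar referenced in the workflow, a standard Maven package build is sufficient (run from the project root):

# mvn clean package

The resulting target/Spark-Example-vinod-0.0.1-SNAPSHOT.jar is the file uploaded to HDFS in step 3.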
Comments

Is this needed even after the HDP 2.5 native Oozie Spark action?