The PySpark code runs without errors via spark-submit in both yarn-client and yarn-cluster mode, but I couldn't make it work in an Oozie workflow. I have been trying to get this to work for a week now but can't figure out the exact error. I've attached pictures of the part of the YARN log where I suspect the error might be, but I'm not sure. I would appreciate it if someone could look it over.
I'm new to using the Spark action and Oozie workflows with a PySpark script. The .xml and .properties files are pasted below. Please also correct me if my code has mistakes or needs a workaround.
-
- workflow.xml
-
- <workflow-app name="oozie-pyspark-wf" xmlns="uri:oozie:workflow:0.5">
    <!-- Runs a single PySpark ingestion job through the Oozie spark action.
         Success goes to "end"; any action error goes to the "fail" kill node. -->
    <credentials>
      <!-- Credential attached to the spark action below through its cred attribute.
           Only the two hive2.* properties belong here; the YARN queue is set in
           the action's own configuration.
           NOTE(review): a hive2 credential takes a HiveServer2 JDBC URL and
           principal, but the variables are named metastoreUri / metastorePrincipal.
           Confirm they hold HiveServer2 values, or switch to an hcat credential
           (hcat.metastore.uri / hcat.metastore.principal) if the job talks to the
           metastore directly. -->
      <credential name="navHiveCreds" type="hive2">
        <property>
          <name>hive2.jdbc.url</name>
          <value>${metastoreUri}</value>
        </property>
        <property>
          <name>hive2.server.principal</name>
          <value>${metastorePrincipal}</value>
        </property>
      </credential>
    </credentials>

    <start to="pyspark-ingestion"/>

    <!-- cred links this action to the credential declared above; a declared
         credential that no action references is never used. -->
    <action name="pyspark-ingestion" cred="navHiveCreds">
      <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
          <!-- YARN queue for the job. -->
          <property>
            <name>mapreduce.job.queuename</name>
            <value>${queueName}</value>
          </property>
          <property>
            <name>mapred.compress.map.output</name>
            <value>true</value>
          </property>
        </configuration>
        <master>${master}</master>
        <!-- The script name is elided in the original post; name and jar should
             carry the real .py file name, with jar relative to the workflow
             application directory. -->
        <name>.py</name>
        <jar>lib/.py</jar>
        <!-- Arguments are passed to the script in the order listed here. -->
        <arg>${conn_string}</arg>
        <arg>${table_name}</arg>
        <arg>${column_name}</arg>
        <arg>${num_partitions}</arg>
        <arg>${process_date}</arg>
        <arg>${file_format}</arg>
      </spark>
      <ok to="end"/>
      <error to="fail"/>
    </action>

    <kill name="fail">
      <message>Workflow failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <end name="end"/>
  </workflow-app>
job.properties:
# Cluster endpoints substituted into <name-node> and <job-tracker> in workflow.xml.
nameNode=hdfs://url:8020
# The ResourceManager address is a plain host:port pair; only nameNode is an hdfs:// URI.
jobTracker=url:8050

# Spark master and YARN queue referenced by the spark action.
master=yarn-cluster
queueName=default

# Values forwarded to the PySpark script as <arg> entries in workflow.xml.
# NOTE(review): the next three entries reference themselves (placeholders from the
# post); replace each right-hand side with the real value before submitting.
conn_string=${conn_string}
table_name=${table_name}
column_name=${column_name}
num_partitions=50
process_date=2019-08-02
file_format=orc

# Oozie application location and sharelib selection.
oozie.use.system.libpath=true
# Placeholder path from the post: point this at the HDFS directory holding workflow.xml.
oozie.wf.application.path=${nameNode}/user/${user.name}/root folder containing xml, properties and .py
# NOTE(review): this targets a timestamped directory under the sharelib root while
# oozie.use.system.libpath is already true; confirm the extra libpath is needed.
oozie.libpath=${nameNode}/user/oozie/share/lib/lib_20181117153841
oozie.action.sharelib.for.spark=spark2

# Values consumed by the hive2 credential in workflow.xml.
# NOTE(review): both entries reference themselves; supply the real principal and URL.
metastorePrincipal=${metastorePrincipal}
metastoreUri=${metastoreUri}