
Py4JJavaError in IPython Notebook



I am using Hue 3.8.1 (the Hadoop UI) at my company for data science work. This is my first time using it, and I have been playing around with it for three weeks. I am able to run Hive queries successfully.

I also use Jupyter Notebook for data analysis, and it runs in the cloud. I am able to use PySpark directly without having to download Spark to my desktop.

I was reproducing some analysis my colleagues had done with PySpark. It involves reading a metastore table from Hue and creating a DataFrame. It ran fine a couple of times, but now I am getting a Py4JJavaError. I also tried this tutorial online, and I get the same Py4JJavaError at the same step:

https://www.codementor.io/spark/tutorial/python-spark-sql-dataframes
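
For context, the metastore part of the analysis follows roughly this pattern (the table name is a placeholder; on this Spark version it is a HiveContext, not a plain SQLContext, that exposes Hive metastore tables):

from pyspark.sql import HiveContext

# HiveContext can see tables registered in the Hive metastore
hive_context = HiveContext(sc)
table_df = hive_context.table("default.my_table")  # placeholder table name
table_df.show(5)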

Any help will be greatly appreciated.

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)  # sc is the SparkContext the notebook provides
interactions_df = sqlContext.createDataFrame(row_data)  # this is the line that fails
# interactions_df.registerTempTable("interactions")
# register as a table so that we can run SQL queries over it
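
For reference, row_data comes from the tutorial's earlier steps, roughly like this (reproduced from memory, so treat it as a sketch; note that the file name matches the missing path in the traceback below):

from pyspark.sql import Row

# The tutorial reads the KDD Cup 1999 10% sample and splits each CSV line
raw_data = sc.textFile("kddcup.data_10_percent.gz")
csv_data = raw_data.map(lambda l: l.split(","))

# Each line becomes a Row with a few typed fields
row_data = csv_data.map(lambda p: Row(
    duration=int(p[0]),
    protocol_type=p[1],
    service=p[2],
    flag=p[3],
    src_bytes=int(p[4]),
    dst_bytes=int(p[5])
))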

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-36-a75823d72f70> in <module>()
      1 from pyspark.sql import SQLContext
      2 sqlContext = SQLContext(sc)
----> 3 interactions_df = sqlContext.createDataFrame(row_data)
      4 #interactions_df.registerTempTable("interactions")
      5 #register as a table so that we can run sql queries over it

/usr/hdp/current/spark-client/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio)
    402 
    403         if isinstance(data, RDD):
--> 404             rdd, schema = self._createFromRDD(data, schema, samplingRatio)
    405         else:
    406             rdd, schema = self._createFromLocal(data, schema)

/usr/hdp/current/spark-client/python/pyspark/sql/context.py in _createFromRDD(self, rdd, schema, samplingRatio)
    283         """
    284         if schema is None or isinstance(schema, (list, tuple)):
--> 285             struct = self._inferSchema(rdd, samplingRatio)
    286             converter = _create_converter(struct)
    287             rdd = rdd.map(converter)

/usr/hdp/current/spark-client/python/pyspark/sql/context.py in _inferSchema(self, rdd, samplingRatio)
    227         :return: StructType
    228         """
--> 229         first = rdd.first()
    230         if not first:
    231             raise ValueError("The first row in RDD is empty, "

/usr/hdp/current/spark-client/python/pyspark/rdd.py in first(self)
   1315         ValueError: RDD is empty
   1316         """
-> 1317         rs = self.take(1)
   1318         if rs:
   1319             return rs[0]

/usr/hdp/current/spark-client/python/pyspark/rdd.py in take(self, num)
   1267         """
   1268         items = []
-> 1269         totalParts = self.getNumPartitions()
   1270         partsScanned = 0
   1271 

/usr/hdp/current/spark-client/python/pyspark/rdd.py in getNumPartitions(self)
   2370 
   2371     def getNumPartitions(self):
-> 2372         return self._prev_jrdd.partitions().size()
   2373 
   2374     @property

/usr/hdp/current/spark-client/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539 
    540         for temp_arg in temp_args:

/usr/hdp/current/spark-client/python/pyspark/sql/utils.py in deco(*a, **kw)
     34     def deco(*a, **kw):
     35         try:
---> 36             return f(*a, **kw)
     37         except py4j.protocol.Py4JJavaError as e:
     38             s = e.java_exception.toString()

/usr/hdp/current/spark-client/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(

Py4JJavaError: An error occurred while calling o292.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://ip-172-31-56-137.us-west-2.compute.internal:8020/user/jupyter/kddcup.data_10_percent.gz
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:65)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:47)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:207)
	at java.lang.Thread.run(Thread.java:745)
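
Reading the bottom of the traceback, the underlying failure looks like a missing input file rather than anything in createDataFrame() itself: the InvalidInputException says hdfs://ip-172-31-56-137.us-west-2.compute.internal:8020/user/jupyter/kddcup.data_10_percent.gz does not exist. Since sc.textFile() is lazy, the error only surfaces when createDataFrame() infers the schema and forces evaluation via rdd.first(). Does this mean I need to upload the tutorial's data file into my HDFS home directory first (e.g. hdfs dfs -put kddcup.data_10_percent.gz /user/jupyter/ from a shell on the cluster), or point sc.textFile() at a path that actually exists?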