Reply
New Contributor
Posts: 1
Registered: ‎03-23-2017

Pyspark: Socket Write Error

I have provided the code and error log. Not sure what the problem is.

Any help/guidance is appreciated. Thanks.

 

Code:

 

from pyspark import SparkConf, SparkContext

from pyspark.ml.linalg import Vectors

from pyspark.ml.classification import LogisticRegression

from pyspark.ml.param import Param,Params

from pyspark.sql import Row

from pyspark.sql import SQLContext

from pyspark.sql import SparkSession

 

conf = SparkConf().setMaster("local").setAppName("WineQuality")

sc = SparkContext(conf = conf)

sqlContext = SQLContext(sc)

 

if __name__ == "__main__":

 

spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("WineQuality").getOrCreate()

 

dataDir = "C:/SparkCourse/"

lines = sc.textFile(dataDir + "winequality-white.csv")

splitLines = lines.map(lambda l: l.split(";"))

 

wineClsRDD = splitLines.map(lambda p: (float(0) if (float(p[11]) <7) else float(1), Vectors.dense([float(p[0]), float(p[1]), float(p[2]),float(p[3]), float(p[4]), float(p[5]), float(p[6]), float(p[7]),float(p[8]), float(p[9]), float(p[10])])))

trainingDF = spark.createDataFrame(wineClsRDD, ['label','features'])

trainingDF.show()

 

Error log:


(Canopy 64bit) C:\SparkCourse>spark-submit WineClass.py
17/03/23 16:54:14 ERROR PythonRunner: Python worker exited unexpectedly (crashed)
java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at java.io.DataOutputStream.flush(DataOutputStream.java:123)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:331)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1857)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)
17/03/23 16:54:14 ERROR PythonRunner: This may have been caused by a prior exception:
java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at java.io.DataOutputStream.flush(DataOutputStream.java:123)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:331)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1857)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)
17/03/23 16:54:14 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at java.io.DataOutputStream.flush(DataOutputStream.java:123)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:331)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1857)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)
17/03/23 16:54:14 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Traceback (most recent call last):
File "C:/SparkCourse/WineClass.py", line 22, in <module>
trainingDF = spark.createDataFrame(wineClsRDD, ['label','features'])
File "c:\Spark\python\lib\pyspark.zip\pyspark\sql\session.py", line 520, in createDataFrame
File "c:\Spark\python\lib\pyspark.zip\pyspark\sql\session.py", line 360, in _createFromRDD
File "c:\Spark\python\lib\pyspark.zip\pyspark\sql\session.py", line 331, in _inferSchema
File "c:\Spark\python\lib\pyspark.zip\pyspark\rdd.py", line 1328, in first
File "c:\Spark\python\lib\pyspark.zip\pyspark\rdd.py", line 1310, in take
File "c:\Spark\python\lib\pyspark.zip\pyspark\context.py", line 941, in runJob
File "c:\Spark\python\lib\py4j-0.10.1-src.zip\py4j\java_gateway.py", line 933, in __call__
File "c:\Spark\python\lib\pyspark.zip\pyspark\sql\utils.py", line 63, in deco
File "c:\Spark\python\lib\py4j-0.10.1-src.zip\py4j\protocol.py", line 312, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at java.io.DataOutputStream.flush(DataOutputStream.java:123)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:331)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1857)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1897)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140)
at java.io.DataOutputStream.flush(DataOutputStream.java:123)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:331)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1857)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)