I am trying to isolate the loading step of my Spark script to work out why I repeatedly get the error below. I am required to use Spark RDDs, as well as Spark 3.1.2 and Python 3.6.8.
Here's the simple script I am trying to run:
import sys
import os
import time
import pyspark
from pyspark import SparkContext
import gc
gc.collect()
try:
    if sc:
        sc.stop()
        time.sleep(2)  # Short delay to prevent race conditions
except NameError:
    pass  # sc is not defined, so nothing to stop
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
sc = SparkContext('local[*]', 'itemRecYelp',
                  conf=pyspark.SparkConf()
                  .set("spark.python.worker.memory", "1g")
                  .set("spark.network.timeout", "600s")
                  .set("spark.executor.heartbeatInterval", "120s"))
sc.setLogLevel("ERROR")
input_train_file_path = sys.argv[1]
input_test_file_path = sys.argv[2]
output_file_path = sys.argv[3]
rdd = sc.textFile(input_train_file_path)
header = rdd.first()
rdd = rdd.filter(lambda line: line != header) \
         .map(lambda line: line.strip().split(","))
And here's a partial error printout:
Traceback (most recent call last):
  File "C:/Users/Chris/PycharmProjects/DSCI553/HW_2/scripts/load_in_test.py", line 42, in <module>
    header = rdd.first()
  File "C:\Spark_Hadoop\spark\python\lib\pyspark.zip\pyspark\rdd.py", line 1586, in first
  File "C:\Spark_Hadoop\spark\python\lib\pyspark.zip\pyspark\rdd.py", line 1566, in take
  File "C:\Spark_Hadoop\spark\python\lib\pyspark.zip\pyspark\context.py", line 1233, in runJob
  File "C:\Spark_Hadoop\spark\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1305, in __call__
  File "C:\Spark_Hadoop\spark\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (LAPTOP-63PERCQQ executor driver): java.net.SocketException: Connection reset by peer: socket write error
    at java.net.SocketOutputStream.socketWrite0(Native Method)
    at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
    at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
    at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
    at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
    at java.io.DataOutputStream.write(DataOutputStream.java:107)
    at java.io.FilterOutputStream.write(FilterOutputStream.java:97)
    at org.apache.spark.api.python.PythonRDD$.writeUTF(PythonRDD.scala:477)
    at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:297)
    at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
    at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
    at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:621)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:397)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:232)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
    at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
    at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:750)
Caused by: java.net.SocketException: Connection reset by peer: socket write error
    at java.net.SocketOutputStream.socketWrite0(Native Method)
    at java.net.SocketOutputStream.socketWrite(SocketOutputStream.java:111)
    at java.net.SocketOutputStream.write(SocketOutputStream.java:155)
    at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
    at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
    at java.io.DataOutputStream.write(DataOutputStream.java:107)
    at java.io.FilterOutputStream.write(FilterOutputStream.java:97)
    at org.apache.spark.api.python.PythonRDD$.writeUTF(PythonRDD.scala:477)
    at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:297)
    at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:307)
    at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:307)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
    at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:621)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:397)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:232)
I've been running the script with spark-submit, allocating 4g each of driver and executor memory. So far, the only way I've been able to consistently mitigate the issue is to load the file with a much larger minPartitions value (~150), but that seems like it shouldn't be necessary given that the file is not that large.
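For reference, here's a rough sketch of that workaround as I'm currently running it. The minPartitions value of 150 is just what I landed on by trial and error, and my guess is it only helps because each task then pushes a smaller slice of data over the local socket to the Python worker:

# Launched with something like:
#   spark-submit --driver-memory 4g --executor-memory 4g load_in_test.py <train> <test> <out>

# Workaround: force many small input partitions so each task writes
# less data per batch to the Python worker.
rdd = sc.textFile(input_train_file_path, minPartitions=150)
header = rdd.first()
rdd = rdd.filter(lambda line: line != header) \
         .map(lambda line: line.strip().split(","))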