<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>Question: unable to load data using Spark? in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67847#M79096</link>
    <description>Question: how should the path be given when loading a file from /home/cloudera/Documents with sc.textFile? Spark fails with "org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://quickstart.cloudera:8020/home/cloudera/Documents/ard.txt".</description>
    <pubDate>Fri, 16 Sep 2022 13:17:45 GMT</pubDate>
    <dc:creator>vb12</dc:creator>
    <dc:date>2022-09-16T13:17:45Z</dc:date>
    <item>
      <title>Unable to load data using Spark?</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67847#M79096</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;Please suggest how I should give the path. There is a file in /home/cloudera/Documents.&lt;/P&gt;
&lt;PRE&gt;scala&amp;gt; val textFile = sc.textFile("hdfs://quickstart.cloudera:8020/home/cloudera/Documents/ard.txt")
textFile: org.apache.spark.rdd.RDD[String] = hdfs://quickstart.cloudera:8020/home/cloudera/Documents/ard.txt MapPartitionsRDD[5] at textFile at &amp;lt;console&amp;gt;:27

scala&amp;gt; val count=textFile.flatMap(line=&amp;gt;line.split(" ").map(word=&amp;gt;(word,1)))
count: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[6] at flatMap at &amp;lt;console&amp;gt;:29

scala&amp;gt; count.reduceByKey(_+_).saveAsTextFile("hdfs://quickstart.cloudera.8020/home/cloudera/Documents/2");
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://quickstart.cloudera:8020/home/cloudera/Documents/ard.txt
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.Partitioner$.defaultPartitioner(Partitioner.scala:65)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$reduceByKey$3.apply(PairRDDFunctions.scala:331)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$reduceByKey$3.apply(PairRDDFunctions.scala:331)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.PairRDDFunctions.reduceByKey(PairRDDFunctions.scala:330)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:32)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:37)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:39)
	at $iwC$$iwC$$iwC$$iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:41)
	at $iwC$$iwC$$iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:43)
	at $iwC$$iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:45)
	at $iwC$$iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:47)
	at $iwC.&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:49)
	at &amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:51)
	at .&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:55)
	at .&amp;lt;clinit&amp;gt;(&amp;lt;console&amp;gt;)
	at .&amp;lt;init&amp;gt;(&amp;lt;console&amp;gt;:7)
	at .&amp;lt;clinit&amp;gt;(&amp;lt;console&amp;gt;)
	at $print(&amp;lt;console&amp;gt;)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
	at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
	at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
	at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
	at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
	at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
	at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
	at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
	at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
	at org.apache.spark.repl.Main$.main(Main.scala:35)
	at org.apache.spark.repl.Main.main(Main.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:730)
	at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
	at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)

&lt;BR /&gt;&lt;BR /&gt;scala&amp;gt; count.reduceByKey(_+_).saveAsTextFile("hdfs://quickstart.cloudera.8020/home/cloudera/Documents/2a.txt")&lt;BR /&gt;org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://quickstart.cloudera:8020/home/cloudera/Documents/ard.txt&lt;BR /&gt;[same stack trace as above]&lt;/PRE&gt;</description>
      <pubDate>Fri, 16 Sep 2022 13:17:45 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67847#M79096</guid>
      <dc:creator>vb12</dc:creator>
      <dc:date>2022-09-16T13:17:45Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to load data using Spark</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67848#M79097</link>
      <description>The path you are specifying (/home/cloudera/Documents/ard.txt) appears to be a&lt;BR /&gt;local filesystem path on your Linux machine, while the URI you are supplying&lt;BR /&gt;(hdfs://quickstart.cloudera:8020) tells the program to use the Hadoop&lt;BR /&gt;Distributed File System, not the local filesystem.&lt;BR /&gt;&lt;BR /&gt;First check whether the path actually exists on HDFS:&lt;BR /&gt;&lt;PRE&gt;hadoop fs -ls /home/cloudera/Documents/ard.txt&lt;/PRE&gt;If it does not, create the directory on HDFS and copy the local file up to the same path:&lt;BR /&gt;&lt;PRE&gt;hadoop fs -mkdir -p /home/cloudera/Documents/
hadoop fs -copyFromLocal /home/cloudera/Documents/ard.txt hdfs:///home/cloudera/Documents/ard.txt&lt;/PRE&gt;P.S. "Home" paths on HDFS are typically /user/{username}, not /home as on&lt;BR /&gt;Linux.</description>
      <pubDate>Mon, 04 Jun 2018 04:28:35 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67848#M79097</guid>
      <dc:creator>Harsh J</dc:creator>
      <dc:date>2018-06-04T04:28:35Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to load data using Spark</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67849#M79098</link>
      <description>&lt;P&gt;Thank you.&lt;/P&gt;&lt;P&gt;While saving, I did not give the hdfs://cloudera... URI.&lt;/P&gt;&lt;P&gt;Instead I gave:&lt;/P&gt;&lt;P&gt;count.reduceByKey(_+_).saveAsTextFile("/home/cloudera/Documents/re.txt")&lt;/P&gt;&lt;P&gt;It saved the output in a re.txt directory in HDFS.&lt;/P&gt;</description>
      <pubDate>Mon, 04 Jun 2018 04:51:36 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/unabel-to-load-data-in-using-spark/m-p/67849#M79098</guid>
      <dc:creator>vb12</dc:creator>
      <dc:date>2018-06-04T04:51:36Z</dc:date>
    </item>
  </channel>
</rss>

