<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Loading Local File to Apache Spark in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128808#M91505</link>
    <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/10163/clukasik.html" nodeid="10163"&gt;@clukasik&lt;/A&gt;, Thank You, I have had a look at broadcast variables. But I guess with the current requirement, I just require the RDD.&lt;/P&gt;</description>
    <pubDate>Wed, 08 Jun 2016 20:55:46 GMT</pubDate>
    <dc:creator>arunak</dc:creator>
    <dc:date>2016-06-08T20:55:46Z</dc:date>
    <item>
      <title>Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128796#M91493</link>
      <description>&lt;P&gt;Hi, &lt;/P&gt;&lt;P&gt;One of our Spark applications depends on a local file for some of its business logic. &lt;/P&gt;&lt;P&gt;We can read the file by referring to it as file:///. But for this to work, a copy of the file needs to be on every worker, or every worker needs to have access to a common shared drive such as an NFS mount. &lt;/P&gt;&lt;P&gt;Is there any other way of achieving this?&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:26:33 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128796#M91493</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:26:33Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128797#M91494</link>
      <description>&lt;A rel="user" href="https://community.cloudera.com/users/10529/akeezhadath.html" nodeid="10529"&gt;@akeezhadath&lt;/A&gt;&lt;P&gt;You can place the file on HDFS and access the file through "hdfs:///path/file".&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:30:22 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128797#M91494</guid>
      <dc:creator>jyadav</dc:creator>
      <dc:date>2016-06-08T20:30:22Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128798#M91495</link>
      <description>&lt;P&gt;Thanks for the suggestion &lt;A rel="user" href="https://community.cloudera.com/users/2528/jyadav.html" nodeid="2528"&gt;@Jitendra Yadav&lt;/A&gt;
But, the file being small &amp;lt;~ 500 KB, I was thinking if we need to have that loaded to HDFS. Was looking for some "hack" &lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:33:09 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128798#M91495</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:33:09Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128799#M91496</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/10529/akeezhadath.html" nodeid="10529"&gt;@akeezhadath&lt;/A&gt; Spark assumes your file is on HDFS by default if you have not specified a URI scheme (file:///, hdfs://, s3://), so if your file is on HDFS, you can reference it using an absolute path like &lt;/P&gt;&lt;P&gt;sc.textFile("/user/xyz/data.txt")&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:33:41 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128799#M91496</guid>
      <dc:creator>rajkumar_singh</dc:creator>
      <dc:date>2016-06-08T20:33:41Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128800#M91497</link>
      <description>&lt;P&gt;If you are using yarn-client mode and that file resides where the driver JVM is running, then it should work using "file://". Otherwise, as Jitendra suggests, copy the file to hdfs. &lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:33:52 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128800#M91497</guid>
      <dc:creator>clukasik</dc:creator>
      <dc:date>2016-06-08T20:33:52Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128801#M91498</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/8919/rajkumarsingh.html" nodeid="8919"&gt;@Rajkumar Singh&lt;/A&gt; : Yes, but here the file resides on the machine where we trigger the spark-submit. So I was looking if there is any way to read it in the driver without actually having to move it to all the workers or even to the HDFS.&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:36:24 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128801#M91498</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:36:24Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128802#M91499</link>
      <description>&lt;P&gt;is it single file or multiple small files?&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:40:38 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128802#M91499</guid>
      <dc:creator>rajkumar_singh</dc:creator>
      <dc:date>2016-06-08T20:40:38Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128803#M91500</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/10529/akeezhadath.html" nodeid="10529"&gt;@akeezhadath&lt;/A&gt; - depending on how you are using the file, you could consider broadcast variables (http://spark.apache.org/docs/latest/programming-guide.html#broadcast-variables). However, if the data fits well into the RDD construct, then you might be better with loading it as normal (sc.textFile("file://some-path")).&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:41:02 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128803#M91500</guid>
      <dc:creator>clukasik</dc:creator>
      <dc:date>2016-06-08T20:41:02Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128804#M91501</link>
      <description>&lt;P&gt;One single small file. &lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:44:39 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128804#M91501</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:44:39Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128805#M91502</link>
      <description>&lt;P&gt;with spark-submit you can try uploading the file to driver using -Dapplication.properties.file=&amp;lt;file path on location&amp;gt; &lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:46:37 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128805#M91502</guid>
      <dc:creator>rajkumar_singh</dc:creator>
      <dc:date>2016-06-08T20:46:37Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128806#M91503</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/10529/akeezhadath.html" nodeid="10529"&gt;@akeezhadath&lt;/A&gt; &lt;/P&gt;&lt;P&gt;Kindly use below API to cache the file on all the nodes.&lt;/P&gt;&lt;P&gt;SparkContext.addFile()&lt;/P&gt;&lt;P&gt;Add a file to be downloaded with this Spark job on every node. The path passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs, use SparkFiles.get(fileName) to find its download location.&lt;/P&gt;&lt;P&gt;A directory can be given if the recursive option is set to true. Currently directories are only supported for Hadoop-supported filesystems.&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:47:57 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128806#M91503</guid>
      <dc:creator>jyadav</dc:creator>
      <dc:date>2016-06-08T20:47:57Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128807#M91504</link>
      <description>&lt;P&gt;Thanks &lt;A rel="user" href="https://community.cloudera.com/users/10163/clukasik.html" nodeid="10163"&gt;@clukasik&lt;/A&gt;. Is there any performance difference in choosing client deploy-mode over cluster mode? If I use the default client deploy mode, I get control over where my driver program runs. However, I wanted to be sure that it does not cause any performance issues. &lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:53:37 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128807#M91504</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:53:37Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128808#M91505</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/10163/clukasik.html" nodeid="10163"&gt;@clukasik&lt;/A&gt;, Thank You, I have had a look at broadcast variables. But I guess with the current requirement, I just require the RDD.&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:55:46 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128808#M91505</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:55:46Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128809#M91506</link>
      <description>&lt;P&gt;Thanks &lt;A rel="user" href="https://community.cloudera.com/users/2528/jyadav.html" nodeid="2528"&gt;@Jitendra Yadav&lt;/A&gt;. I will take a look at the addFile API. I would like to try getting control on the driver as clukasik pointed out.&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 20:58:25 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128809#M91506</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T20:58:25Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128810#M91507</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/8919/rajkumarsingh.html" nodeid="8919"&gt;@Rajkumar Singh&lt;/A&gt;, doesn't the application.properties.file need to be in a key-value format? &lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 21:08:15 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128810#M91507</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T21:08:15Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128811#M91508</link>
      <description>&lt;P&gt;I don't think there would be a performance difference. Of course, if you are using "collect()" or some such method that aggregates data in the driver JVM you will have to be mindful of driver-related properties and settings (e.g. --driver-memory). &lt;A rel="user" href="https://community.cloudera.com/users/2528/jyadav.html" nodeid="2528"&gt;@Jitendra Yadav&lt;/A&gt; - do you see any performance concerns with client vs cluster?&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 21:19:10 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128811#M91508</guid>
      <dc:creator>clukasik</dc:creator>
      <dc:date>2016-06-08T21:19:10Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128812#M91509</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/10163/clukasik.html" nodeid="10163"&gt;@clukasik&lt;/A&gt; I don't see any performance issue when running in yarn-client mode; however, as per the initial info, they need a distributed-cache kind of thing in Spark, which they can achieve through SparkContext.addFile()&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 21:34:53 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128812#M91509</guid>
      <dc:creator>jyadav</dc:creator>
      <dc:date>2016-06-08T21:34:53Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128813#M91510</link>
      <description>&lt;P&gt;spark-submit provides the --files flag to upload files to the execution directories, which is useful if you have small files that do not change. &lt;/P&gt;&lt;P&gt;Alternatively, as the others have suggested, put it in HDFS&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 21:58:47 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128813#M91510</guid>
      <dc:creator>bleonhardi</dc:creator>
      <dc:date>2016-06-08T21:58:47Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128814#M91511</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/168/bleonhardi.html" nodeid="168"&gt;@Benjamin Leonhardi&lt;/A&gt;. Thanks for pointing this out. I overlooked this flag.&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 22:05:20 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128814#M91511</guid>
      <dc:creator>arunak</dc:creator>
      <dc:date>2016-06-08T22:05:20Z</dc:date>
    </item>
    <item>
      <title>Re: Loading Local File to Apache Spark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128815#M91512</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/168/bleonhardi.html" nodeid="168"&gt;@Benjamin Leonhardi&lt;/A&gt; how does --files differ from SparkContext.addFile(), apart from the way we use them?&lt;/P&gt;</description>
      <pubDate>Wed, 08 Jun 2016 22:14:35 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Loading-Local-File-to-Apache-Spark/m-p/128815#M91512</guid>
      <dc:creator>jyadav</dc:creator>
      <dc:date>2016-06-08T22:14:35Z</dc:date>
    </item>
  </channel>
</rss>

