<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question distributed processing operation of dataframe with Pyspark in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146929#M109482</link>
    <description>&lt;P&gt;Hello, &lt;/P&gt;&lt;P&gt;I would like to know, please, by what method (or line of code) I can confirm that processing is executed on all my cluster nodes with Pyspark?&lt;/P&gt;&lt;DIV&gt;thank you kindly for helping me&lt;/DIV&gt;&lt;P&gt;here is my code:&lt;/P&gt;&lt;PRE&gt;from pyspark.sql.types import *
from pyspark.sql import Row
           		   
rdd = sc.textFile('hdfs:../personne.txt') 
rdd_split = rdd.map(lambda x: x.split(','))
rdd_people = rdd_split.map(lambda x: Row(name=x[0],age=int(x[1]),ca=int(x[2])))
df_people = sqlContext.createDataFrame(rdd_people)
df_people.registerTempTable("people")
df_people.collect()


&lt;/PRE&gt;</description>
    <pubDate>Fri, 13 May 2016 21:45:24 GMT</pubDate>
    <dc:creator>nanyim_alain</dc:creator>
    <dc:date>2016-05-13T21:45:24Z</dc:date>
    <item>
      <title>distributed processing operation of dataframe with Pyspark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146929#M109482</link>
      <description>&lt;P&gt;Hello, &lt;/P&gt;&lt;P&gt;I would like to know, please, by what method (or line of code) I can confirm that processing is executed on all my cluster nodes with Pyspark?&lt;/P&gt;&lt;DIV&gt;thank you kindly for helping me&lt;/DIV&gt;&lt;P&gt;here is my code:&lt;/P&gt;&lt;PRE&gt;from pyspark.sql.types import *
from pyspark.sql import Row
           		   
rdd = sc.textFile('hdfs:../personne.txt') 
rdd_split = rdd.map(lambda x: x.split(','))
rdd_people = rdd_split.map(lambda x: Row(name=x[0],age=int(x[1]),ca=int(x[2])))
df_people = sqlContext.createDataFrame(rdd_people)
df_people.registerTempTable("people")
df_people.collect()


&lt;/PRE&gt;</description>
      <pubDate>Fri, 13 May 2016 21:45:24 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146929#M109482</guid>
      <dc:creator>nanyim_alain</dc:creator>
      <dc:date>2016-05-13T21:45:24Z</dc:date>
    </item>
    <item>
      <title>Re: distributed processing operation of dataframe with Pyspark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146930#M109483</link>
      <description>&lt;P&gt;If you are looking for a way to monitor the job and determine which nodes it ran on, how many executors, etc, you can see this in the Spark Web UI located at &amp;lt;sparkhost&amp;gt;:4040&lt;/P&gt;&lt;P&gt;&lt;A href="http://spark.apache.org/docs/latest/monitoring.html"&gt;http://spark.apache.org/docs/latest/monitoring.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="http://spark.apache.org/docs/latest/monitoring.html"&gt;&lt;/A&gt;&lt;A href="http://stackoverflow.com/questions/35059608/pyspark-on-cluster-make-sure-all-nodes-are-used"&gt;http://stackoverflow.com/questions/35059608/pyspark-on-cluster-make-sure-all-nodes-are-used&lt;/A&gt;&lt;/P&gt;&lt;P&gt;cheers,&lt;/P&gt;&lt;P&gt;Andrew&lt;/P&gt;</description>
      <pubDate>Sat, 14 May 2016 02:27:02 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146930#M109483</guid>
      <dc:creator>andrew_sears</dc:creator>
      <dc:date>2016-05-14T02:27:02Z</dc:date>
    </item>
    <item>
      <title>Re: distributed processing operation of dataframe with Pyspark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146931#M109484</link>
      <description>&lt;P style="margin-left: 20px;"&gt;&lt;A rel="user" href="https://community.cloudera.com/users/1456/andrewsears.html" nodeid="1456" target="_blank"&gt;@Andrew Sears&lt;/A&gt; answer is correct, and once you bring up the Spark History Server URL (http://{driver-node}:4040), you can navigate to the Executors tab, which will show you lots of statistics about the driver and each executor, as shown below. Note that when running Hortonworks Data Platform (HDP), you can get here from the Spark services page, clicking on "Quick Links", and then clicking on the "Spark History Server UI" button. Following that, you will need to find your specific job under "App ID".&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="4217-sparkhistoryserver-executors.png" style="width: 1196px;"&gt;&lt;img src="https://community.cloudera.com/t5/image/serverpage/image-id/21399i2C50E0A293ECBA6C/image-size/medium?v=v2&amp;amp;px=400" role="button" title="4217-sparkhistoryserver-executors.png" alt="4217-sparkhistoryserver-executors.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 18 Aug 2019 13:21:59 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146931#M109484</guid>
      <dc:creator>phargis</dc:creator>
      <dc:date>2019-08-18T13:21:59Z</dc:date>
    </item>
    <item>
      <title>Re: distributed processing operation of dataframe with Pyspark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146932#M109485</link>
      <description>&lt;P&gt;Thank you very much&lt;/P&gt;</description>
      <pubDate>Thu, 19 May 2016 15:04:27 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146932#M109485</guid>
      <dc:creator>nanyim_alain</dc:creator>
      <dc:date>2016-05-19T15:04:27Z</dc:date>
    </item>
    <item>
      <title>Re: distributed processing operation of dataframe with Pyspark</title>
      <link>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146933#M109486</link>
      <description>&lt;P&gt;Very big thank you&lt;/P&gt;</description>
      <pubDate>Thu, 19 May 2016 15:05:48 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/distributed-processing-operation-of-dataframe-with-Pyspark/m-p/146933#M109486</guid>
      <dc:creator>nanyim_alain</dc:creator>
      <dc:date>2016-05-19T15:05:48Z</dc:date>
    </item>
  </channel>
</rss>

