<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question How to process a word count on zipped files in spark in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/How-to-process-a-word-count-on-zipped-files-in-spark/m-p/118953#M42965</link>
    <description>&lt;P&gt;I am working on a aws dataset(email dataset -enron) . I just wanted to do a word count on all of the emails and find out the average. The files are zipped (Please see the screen shot attachment which shows how the actual data set looks like). Please if some one could help me by looking at the &lt;A href="https://community.cloudera.com/legacyfs/online/attachments/8308-screen-shot-2016-10-07-at-090457.png"&gt;screen-shot-2016-10-07-at-090457.png&lt;/A&gt; screen shot that how I can do the word count processing using spark (scala preferably). I would really appreciate . &lt;/P&gt;&lt;P&gt;Note: The actual datasize is 210 GB. I am planning to run an EMR cluster then perform the processing. &lt;/P&gt;</description>
    <pubDate>Fri, 07 Oct 2016 15:10:45 GMT</pubDate>
    <dc:creator>sanjeevanmahaja</dc:creator>
    <dc:date>2016-10-07T15:10:45Z</dc:date>
    <item>
      <title>How to process a word count on zipped files in spark</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/How-to-process-a-word-count-on-zipped-files-in-spark/m-p/118953#M42965</link>
      <description>&lt;P&gt;I am working on a aws dataset(email dataset -enron) . I just wanted to do a word count on all of the emails and find out the average. The files are zipped (Please see the screen shot attachment which shows how the actual data set looks like). Please if some one could help me by looking at the &lt;A href="https://community.cloudera.com/legacyfs/online/attachments/8308-screen-shot-2016-10-07-at-090457.png"&gt;screen-shot-2016-10-07-at-090457.png&lt;/A&gt; screen shot that how I can do the word count processing using spark (scala preferably). I would really appreciate . &lt;/P&gt;&lt;P&gt;Note: The actual datasize is 210 GB. I am planning to run an EMR cluster then perform the processing. &lt;/P&gt;</description>
      <pubDate>Fri, 07 Oct 2016 15:10:45 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/How-to-process-a-word-count-on-zipped-files-in-spark/m-p/118953#M42965</guid>
      <dc:creator>sanjeevanmahaja</dc:creator>
      <dc:date>2016-10-07T15:10:45Z</dc:date>
    </item>
    <item>
      <title>Re: How to process a word count on zipped files in spark</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/How-to-process-a-word-count-on-zipped-files-in-spark/m-p/118954#M42966</link>
      <description>&lt;P&gt;These should be helpful&lt;/P&gt;&lt;P&gt;&lt;A href="http://stackoverflow.com/questions/32080475/how-to-read-a-zip-containing-multiple-files-in-apache-spark" target="_blank"&gt;http://stackoverflow.com/questions/32080475/how-to-read-a-zip-containing-multiple-files-in-apache-spark&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="http://stackoverflow.com/questions/28569788/how-to-open-stream-zip-files-through-spark" target="_blank"&gt;http://stackoverflow.com/questions/28569788/how-to-open-stream-zip-files-through-spark&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 07 Oct 2016 20:19:54 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/How-to-process-a-word-count-on-zipped-files-in-spark/m-p/118954#M42966</guid>
      <dc:creator>gkeys</dc:creator>
      <dc:date>2016-10-07T20:19:54Z</dc:date>
    </item>
  </channel>
</rss>

