<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Record count and Duplicate check - using Spark in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Record-count-and-Duplicate-check-using-Spark/m-p/231113#M192957</link>
    <description>&lt;P&gt;Hi Sandeep Nemuri,&lt;/P&gt;&lt;P&gt;Thanks for the answer; I will try this. But should the data be in an RDD, or can I use text files in HDFS? Can I do the record count and duplicate check using files, or only with data frames, instead of having both RDDs and data frames? We are going to have a huge number of files and a huge data volume, so performance is very important. Can you comment on that, please?&lt;/P&gt;</description>
    <pubDate>Fri, 20 Oct 2017 20:52:56 GMT</pubDate>
    <dc:creator>kpk_ds</dc:creator>
    <dc:date>2017-10-20T20:52:56Z</dc:date>
  </channel>
</rss>

