<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Record count and Duplicate check - using Spark in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Record-count-and-Duplicate-check-using-Spark/m-p/231114#M192958</link>
    <description>&lt;A rel="user" href="https://community.cloudera.com/users/15722/kpkds.html" nodeid="15722"&gt;@karthick baskaran&lt;/A&gt;&lt;P&gt;Here is the command to get the number of lines in a file. Spark will internally load your text file and keep it in an RDD/DataFrame/Dataset.&lt;/P&gt;&lt;PRE&gt;spark-shell (spark 1.6.x)
scala&amp;gt; val textFile = sc.textFile("README.md")
scala&amp;gt; textFile.count() // Number of items in this RDD&lt;/PRE&gt;</description>
    <pubDate>Fri, 20 Oct 2017 22:16:05 GMT</pubDate>
    <dc:creator>sandyy006</dc:creator>
    <dc:date>2017-10-20T22:16:05Z</dc:date>
  </channel>
</rss>

