<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: PostgreSQL count higher than Spark dataframe in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180770#M80463</link>
    <description>&lt;P&gt;Solved it. &lt;/P&gt;&lt;P&gt;I noticed that the write to PostgreSQL was accurate when I read the parquet files using the second option below.&lt;/P&gt;&lt;P&gt;parquet("/user-data/xyz/input/TABLE/*") // WRONG numbers in PostgreSQL&lt;/P&gt;&lt;P&gt;parquet("/user-data/xyz/input/TABLE/evnt_month=*")  // Correct numbers in PostgreSQL&lt;/P&gt;&lt;P&gt;If anyone is aware of this problem, please comment.&lt;/P&gt;</description>
    <pubDate>Sun, 05 Aug 2018 13:56:36 GMT</pubDate>
    <dc:creator>learnsqoop</dc:creator>
    <dc:date>2018-08-05T13:56:36Z</dc:date>
    <item>
      <title>PostgreSQL count higher than Spark dataframe</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180767#M80460</link>
      <description>&lt;P&gt;When I write a DataFrame to PostgreSQL using Spark Scala, I have noticed that the count in PostgreSQL is always higher than what I get in Spark Scala. The count in the Spark DataFrame is correct &amp;amp; expected. &lt;/P&gt;&lt;P&gt;I have even tried to load the data in parts on a monthly basis, but the count in PostgreSQL is still higher than in the Spark DataFrame.&lt;/P&gt;&lt;PRE&gt;
df=sqlContext.read.option("compression","snappy").parquet("/user-data/xyz/input/TABLE/")
val connection="jdbc:postgresql://localhost:5449/adb?user=aschema&amp;amp;password=abc" 
val prop = new java.util.Properties 
prop.setProperty("driver", "org.postgresql.Driver") 
df.write.mode("Overwrite").jdbc(url=  connection, table = "adb.aschema.TABLE", connectionProperties  = prop)&lt;/PRE&gt;</description>
      <pubDate>Tue, 10 Jul 2018 13:56:45 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180767#M80460</guid>
      <dc:creator>learnsqoop</dc:creator>
      <dc:date>2018-07-10T13:56:45Z</dc:date>
    </item>
    <item>
      <title>Re: PostgreSQL count higher than Spark dataframe</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180768#M80461</link>
      <description>&lt;P&gt; &lt;A rel="user" href="https://community.cloudera.com/users/48176/voxjungle.html" nodeid="48176"&gt;@Team Spark&lt;/A&gt;&lt;/P&gt;&lt;P&gt;I recommend you try to find a small subset of data where the count does not match. For example, compare by month, then by day, and then by hour to narrow it down and hopefully find which rows are perhaps missing in PostgreSQL. This will provide more information, as you can review the row data and hopefully find something. &lt;/P&gt;&lt;P&gt;HTH&lt;/P&gt;</description>
      <pubDate>Tue, 10 Jul 2018 19:40:42 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180768#M80461</guid>
      <dc:creator>falbani</dc:creator>
      <dc:date>2018-07-10T19:40:42Z</dc:date>
    </item>
    <item>
      <title>Re: PostgreSQL count higher than Spark dataframe</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180769#M80462</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/11048/falbani.html" nodeid="11048"&gt;@Felix Albani&lt;/A&gt; The table has millions of records, so it's very difficult to identify the missing or extra rows in PostgreSQL.&lt;/P&gt;&lt;P&gt;Is there any known issue in Spark that causes the PostgreSQL count not to match?&lt;/P&gt;</description>
      <pubDate>Wed, 11 Jul 2018 17:11:13 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180769#M80462</guid>
      <dc:creator>learnsqoop</dc:creator>
      <dc:date>2018-07-11T17:11:13Z</dc:date>
    </item>
    <item>
      <title>Re: PostgreSQL count higher than Spark dataframe</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180770#M80463</link>
      <description>&lt;P&gt;Solved it. &lt;/P&gt;&lt;P&gt;I noticed that the write to PostgreSQL was accurate when I read the parquet files using the second option below.&lt;/P&gt;&lt;P&gt;parquet("/user-data/xyz/input/TABLE/*") // WRONG numbers in PostgreSQL&lt;/P&gt;&lt;P&gt;parquet("/user-data/xyz/input/TABLE/evnt_month=*")  // Correct numbers in PostgreSQL&lt;/P&gt;&lt;P&gt;If anyone is aware of this problem, please comment.&lt;/P&gt;</description>
      <pubDate>Sun, 05 Aug 2018 13:56:36 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/PostgreSQL-count-higher-than-Spark-dataframe/m-p/180770#M80463</guid>
      <dc:creator>learnsqoop</dc:creator>
      <dc:date>2018-08-05T13:56:36Z</dc:date>
    </item>
  </channel>
</rss>

