<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Kudu to HDFS data load timestamp issue. in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/347451#M235176</link>
    <description>&lt;P&gt;hi &lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/50448"&gt;@pszabados&lt;/a&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;FONT face="georgia,palatino,serif"&gt;- via Spark: timestamp will be converted to UTC in Kudu (however you can change this behavior in spark.conf)&lt;/FONT&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;please, can you share the option to set?&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 13 Jul 2022 07:10:40 GMT</pubDate>
    <dc:creator>loacker88</dc:creator>
    <dc:date>2022-07-13T07:10:40Z</dc:date>
    <item>
      <title>Kudu to HDFS data load timestamp issue.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/93646#M57297</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;
&lt;P&gt;I am trying to load tables from Kudu to HDFS using Spark2 and I have noticed that the timestamp is off by 8 hours between Kudu and HDFS.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;df=spark_session.read.format('org.apache.kudu.spark.kudu')&lt;BR /&gt;.option('kudu.master','dcaldd163:7051,dcaldd162:7051,dcaldd161:7051')&lt;BR /&gt;.option('kudu.table',"impala::DB.kudu_table_name").load()&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;df.write.format("parquet").mode('overwrite').saveAsTable("db_name.kudu_table_name")&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I have tried setting the timezone locally for the session in Spark2, but it still does not solve the issue.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Can someone give a hint on how to solve this issue?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 02 Aug 2019 16:43:27 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/93646#M57297</guid>
      <dc:creator>GopiG</dc:creator>
      <dc:date>2019-08-02T16:43:27Z</dc:date>
    </item>
    <item>
      <title>Re: Kudu to HDFS data load timestamp issue.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/93657#M57306</link>
      <description>&lt;P&gt;hello&amp;nbsp;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/35518"&gt;@GopiG&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;have you tried setting the executor's and the driver's params in&amp;nbsp;&lt;SPAN&gt;spark-defaults.conf?&lt;BR /&gt;&lt;BR /&gt;spark.driver.extraJavaOptions -Duser.timezone=UTC&lt;BR /&gt;spark.executor.extraJavaOptions -Duser.timezone=UTC&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;you can set the default time zone to UTC or to any other zone you want, such as GMT+8.&lt;BR /&gt;&lt;BR /&gt;&amp;nbsp;cheers.&lt;/P&gt;</description>
      <pubDate>Fri, 02 Aug 2019 09:35:56 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/93657#M57306</guid>
      <dc:creator>Bildervic</dc:creator>
      <dc:date>2019-08-02T09:35:56Z</dc:date>
    </item>
    <item>
      <title>Re: Kudu to HDFS data load timestamp issue.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/93768#M57360</link>
      <description>&lt;P&gt;Thank you so much for your response.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Unfortunately the solution did not work for me.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Cloudera Version -&amp;gt; CDH-5.16.1-1.cdh5.16.1.p0.3&lt;/P&gt;&lt;P&gt;spark version&amp;nbsp; -&amp;gt;&amp;nbsp; 2.3.0&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;Instead of making changes in&amp;nbsp;spark-defaults.conf file, i have passed&amp;nbsp;the executor's and the driver's params along spark2 submit command.&amp;nbsp;&lt;/P&gt;&lt;P&gt;I have tried it with UTC, UTC+8, GMT+8 and America/Los_Angeles timezone , but none of them changed the time in date portion.&lt;/P&gt;&lt;P&gt;I have copied the entire spark2 submit command for your reference.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;===========================================================================&lt;/P&gt;&lt;P&gt;command = "spark2-submit --deploy-mode cluster --master yarn --executor-memory " + executor_memory + \&lt;BR /&gt;" --name " + job_name + " --executor-cores " + executor_cores + " --driver-memory " + driver_memory \&lt;BR /&gt;+ " --conf spark.dynamicAllocation.initialExecutors=" + num_executors \&lt;BR /&gt;+ " --conf spark.dynamicAllocation.minExecutors=2" \&lt;BR /&gt;+ " --conf spark.dynamicAllocation.maxExecutors=" + str(max_executor) \&lt;BR /&gt;+ " --py-files " + utils_file + "," + module_name \&lt;BR /&gt;+ " --conf spark.dynamicAllocation.executorIdleTimeout=10" \&lt;BR /&gt;+ " --conf spark.serializer=org.apache.spark.serializer.KryoSerializer" \&lt;BR /&gt;+ " --conf spark.task.maxFailures=14" \&lt;BR /&gt;+ " --conf spark.port.maxRetries=50" \&lt;BR /&gt;+ " --conf spark.yarn.max.executor.failures=14" \&lt;BR /&gt;+ " --conf spark.executor.memoryOverhead=2000" \&lt;BR /&gt;+ " --conf spark.yarn.maxAppAttempts=1" \&lt;BR /&gt;+ " --packages org.apache.kudu:kudu-spark2_2.11:1.6.0 "&lt;/P&gt;&lt;P&gt;command += " --files {4},{1},{5},{7} --conf 
spark.executor.extraJavaOptions=\'-Dlog4j.configuration={6} -Duser.timezone=UTC+8\' --conf spark.driver.extraJavaOptions=\'-Dlog4j.configuration={6} -Duser.timezone=UTC+8\' {0} {3} {2}".format(PROCESS_HANDLER_FILE_PATH, CONFIG_FILE_PATH, job_name, os.path.basename(CONFIG_FILE_PATH), process_csv, log4j_file, os.path.basename(log4j_file), module_base_table_path)&lt;/P&gt;&lt;P&gt;===========================================================================&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;After submitting the above command, I could see the params being set properly in the Spark properties in YARN. The lines below are copied from the Spark properties while the job was running.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;spark.executor.extraJavaOptions -Dlog4j.configuration=spark2_log4j.properties -Duser.timezone=UTC+8&lt;BR /&gt;spark.executor.extraJavaOptions -Dlog4j.configuration=spark2_log4j.properties -Duser.timezone=UTC+8&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Appreciate your response.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 06 Aug 2019 20:11:48 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/93768#M57360</guid>
      <dc:creator>GopiG</dc:creator>
      <dc:date>2019-08-06T20:11:48Z</dc:date>
    </item>
    <item>
      <title>Re: Kudu to HDFS data load timestamp issue.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/308779#M223675</link>
      <description>&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;Hi &lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/35518"&gt;@GopiG&lt;/a&gt;,&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;There are several issues you have to consider.&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="georgia,palatino,serif"&gt;1. How the data has been written to the Kudu table&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;- via Impala: timestamp remains local&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;- via Spark: timestamp will be converted to UTC in Kudu (however you can change this behavior in spark.conf)&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="georgia,palatino,serif"&gt;2. Reading the Kudu table in Spark&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;Timestamp will be converted from UTC to local, so you have local times in your data frame.&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="georgia,palatino,serif"&gt;3. 
Writing the DataFrame to Hive parquet&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;Local timestamp is converted to UTC.&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;You have to check the following configuration options:&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;- spark.sql.parquet.int96TimestampConversion&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;- use_local_tz_for_unix_timestamp_conversions&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;- convert_legacy_hive_parquet_utc_timestamps&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&lt;FONT face="georgia,palatino,serif"&gt;&lt;A href="https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/impala_timestamp.html" target="_blank" rel="noopener"&gt;https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/impala_timestamp.html&lt;/A&gt;&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 05 Jan 2021 10:42:52 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/308779#M223675</guid>
      <dc:creator>pszabados</dc:creator>
      <dc:date>2021-01-05T10:42:52Z</dc:date>
    </item>
    <item>
      <title>Re: Kudu to HDFS data load timestamp issue.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/347451#M235176</link>
      <description>&lt;P&gt;hi &lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/50448"&gt;@pszabados&lt;/a&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;FONT face="georgia,palatino,serif"&gt;- via Spark: timestamp will be converted to UTC in Kudu (however you can change this behavior in spark.conf)&lt;/FONT&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;please, can you share the option to set?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 13 Jul 2022 07:10:40 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Kudu-to-HDFS-data-load-timestamp-issue/m-p/347451#M235176</guid>
      <dc:creator>loacker88</dc:creator>
      <dc:date>2022-07-13T07:10:40Z</dc:date>
    </item>
  </channel>
</rss>

