<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Retrieving Timestamps in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397685#M249947</link>
    <description>&lt;P&gt;I am trying to retrieve data from a table in HDFS in which a column contains timestamps.&lt;/P&gt;&lt;P&gt;I am connected in hdfs using CDSW and&amp;nbsp; running a python script which opens a spark session and run an sql query to retrieve some rows from the table. Although running the same sql query in HUE Impala i get the proper values, in CDSW using the python script i get None values only from the timestamp column. How can i retrieve my data properly. It's a huge table so i cannot just export the csv file from the impala editor. I want to retrieve data for migration to another database. The script i run in CDSW is the following:&lt;/P&gt;&lt;P&gt;import pandas as pd&lt;/P&gt;&lt;P&gt;import numpy as np&lt;/P&gt;&lt;P&gt;import sys&lt;/P&gt;&lt;P&gt;from pyspark import SparkContext&lt;/P&gt;&lt;P&gt;from pyspark import SparkConf&lt;/P&gt;&lt;P&gt;from pyspark.sql.functions import lit&lt;BR /&gt;from pyspark.sql.functions import unix_timestamp&lt;BR /&gt;from pyspark.sql import SparkSession&lt;BR /&gt;from pyspark.sql.functions import to_utc_timestamp&lt;BR /&gt;from pyspark.sql import SQLContext&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;os.environ['PYSPARK_PYTHON'] = '/opt/cloudera/parcels/Anaconda-5.1.0.1/bin/python3.6'&lt;BR /&gt;os.environ['PROJ_LIB']='/home/cdsw/.conda/envs/python3.6/share/proj'&lt;BR /&gt;os.environ['PYSPARK_DRIVER_PYTHON'] = '/opt/cloudera/parcels/Anaconda-5.1.0.1/bin/python3.6'&lt;/P&gt;&lt;P&gt;spark = SparkSession.builder\&lt;BR /&gt;.master("yarn")\&lt;BR /&gt;.config("spark.sql.session.timeZone","UTC")\&lt;BR /&gt;.config("spark.submit.deployMode", "client")\&lt;BR /&gt;.config("spark.eventLog.enabled", "true")\&lt;BR /&gt;.config("spark.executor.instances", "30")\&lt;BR /&gt;.config("spark.executor.cores", "2")\&lt;BR /&gt;.config("spark.executor.memory", "4g")\&lt;BR /&gt;.config("spark.rpc.message.maxSize", "1024")\&lt;BR /&gt;.config("spark.executor.memoryOverhead", "800")\&lt;BR /&gt;.config("spark.driver.memory", 
"4g")\&lt;BR /&gt;.config("spark.driver.memoryOverhead", "800")\&lt;BR /&gt;.config("spark.spark.driver.maxResultSize", "4g")\&lt;BR /&gt;.config("spark.executor.dynamicAllocation.initialExecutors", "false")\&lt;BR /&gt;.config("spark.executor.dynamicAllocation.minExecutors", "false")\&lt;BR /&gt;.config("spark.executor.dynamicAllocation.maxExecutors", "false")\&lt;BR /&gt;.config("spark.sql.broadcastTimeout", "1000")\&lt;BR /&gt;.config("spark.kryoserializer.buffer.max", "1024m")\&lt;BR /&gt;.config("spark.sql.execution.arrow.pyspark.enabled", "true")\&lt;BR /&gt;.config("spark.sql.legacy.parquet.int96RebaseModeInRead", "LEGACY") \&lt;BR /&gt;.config("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "LEGACY") \&lt;BR /&gt;.config("spark.sql.legacy.avro.datetimeRebaseModeInRead", "LEGACY") \&lt;BR /&gt;.config("spark.sql.legacy.timeParserPolicy", "LEGACY") \&lt;BR /&gt;.getOrCreate()&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;sc = spark.sparkContext&lt;/P&gt;&lt;P&gt;sqlContext = SQLContext(sc)&lt;/P&gt;&lt;P&gt;hiveaction = sqlContext.sql('SET hive.exec.dynamic.partition = true')&lt;BR /&gt;hiveaction = sqlContext.sql('SET hive.exec.dynamic.partition.mode = nonstrict')&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;# Show all columns&lt;BR /&gt;pd.set_option("display.max_rows", None, "display.max_columns", None)&lt;/P&gt;&lt;P&gt;qry ="""SELECT parameter_id, measurement_time , value, par_dt FROM aums.eems_archive_data WHERE par_dt = '20240101' LIMIT 10"""&lt;/P&gt;&lt;P&gt;spark_df = spark.sql(qry)&lt;BR /&gt;data_df = spark_df.toPandas()&lt;BR /&gt;print(data_df.head(1))&lt;BR /&gt;&lt;BR /&gt;where 'measurement_time ' returns None values instead of dates and times&lt;/P&gt;</description>
    <pubDate>Tue, 21 Apr 2026 06:23:23 GMT</pubDate>
    <dc:creator>Nikitas</dc:creator>
    <dc:date>2026-04-21T06:23:23Z</dc:date>
    <item>
      <title>Retrieving Timestamps</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397685#M249947</link>
      <description>&lt;P&gt;I am trying to retrieve data from a table in HDFS in which a column contains timestamps.&lt;/P&gt;&lt;P&gt;I am connected in hdfs using CDSW and&amp;nbsp; running a python script which opens a spark session and run an sql query to retrieve some rows from the table. Although running the same sql query in HUE Impala i get the proper values, in CDSW using the python script i get None values only from the timestamp column. How can i retrieve my data properly. It's a huge table so i cannot just export the csv file from the impala editor. I want to retrieve data for migration to another database. The script i run in CDSW is the following:&lt;/P&gt;&lt;P&gt;import pandas as pd&lt;/P&gt;&lt;P&gt;import numpy as np&lt;/P&gt;&lt;P&gt;import sys&lt;/P&gt;&lt;P&gt;from pyspark import SparkContext&lt;/P&gt;&lt;P&gt;from pyspark import SparkConf&lt;/P&gt;&lt;P&gt;from pyspark.sql.functions import lit&lt;BR /&gt;from pyspark.sql.functions import unix_timestamp&lt;BR /&gt;from pyspark.sql import SparkSession&lt;BR /&gt;from pyspark.sql.functions import to_utc_timestamp&lt;BR /&gt;from pyspark.sql import SQLContext&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;os.environ['PYSPARK_PYTHON'] = '/opt/cloudera/parcels/Anaconda-5.1.0.1/bin/python3.6'&lt;BR /&gt;os.environ['PROJ_LIB']='/home/cdsw/.conda/envs/python3.6/share/proj'&lt;BR /&gt;os.environ['PYSPARK_DRIVER_PYTHON'] = '/opt/cloudera/parcels/Anaconda-5.1.0.1/bin/python3.6'&lt;/P&gt;&lt;P&gt;spark = SparkSession.builder\&lt;BR /&gt;.master("yarn")\&lt;BR /&gt;.config("spark.sql.session.timeZone","UTC")\&lt;BR /&gt;.config("spark.submit.deployMode", "client")\&lt;BR /&gt;.config("spark.eventLog.enabled", "true")\&lt;BR /&gt;.config("spark.executor.instances", "30")\&lt;BR /&gt;.config("spark.executor.cores", "2")\&lt;BR /&gt;.config("spark.executor.memory", "4g")\&lt;BR /&gt;.config("spark.rpc.message.maxSize", "1024")\&lt;BR /&gt;.config("spark.executor.memoryOverhead", "800")\&lt;BR /&gt;.config("spark.driver.memory", 
"4g")\&lt;BR /&gt;.config("spark.driver.memoryOverhead", "800")\&lt;BR /&gt;.config("spark.spark.driver.maxResultSize", "4g")\&lt;BR /&gt;.config("spark.executor.dynamicAllocation.initialExecutors", "false")\&lt;BR /&gt;.config("spark.executor.dynamicAllocation.minExecutors", "false")\&lt;BR /&gt;.config("spark.executor.dynamicAllocation.maxExecutors", "false")\&lt;BR /&gt;.config("spark.sql.broadcastTimeout", "1000")\&lt;BR /&gt;.config("spark.kryoserializer.buffer.max", "1024m")\&lt;BR /&gt;.config("spark.sql.execution.arrow.pyspark.enabled", "true")\&lt;BR /&gt;.config("spark.sql.legacy.parquet.int96RebaseModeInRead", "LEGACY") \&lt;BR /&gt;.config("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "LEGACY") \&lt;BR /&gt;.config("spark.sql.legacy.avro.datetimeRebaseModeInRead", "LEGACY") \&lt;BR /&gt;.config("spark.sql.legacy.timeParserPolicy", "LEGACY") \&lt;BR /&gt;.getOrCreate()&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;sc = spark.sparkContext&lt;/P&gt;&lt;P&gt;sqlContext = SQLContext(sc)&lt;/P&gt;&lt;P&gt;hiveaction = sqlContext.sql('SET hive.exec.dynamic.partition = true')&lt;BR /&gt;hiveaction = sqlContext.sql('SET hive.exec.dynamic.partition.mode = nonstrict')&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;# Show all columns&lt;BR /&gt;pd.set_option("display.max_rows", None, "display.max_columns", None)&lt;/P&gt;&lt;P&gt;qry ="""SELECT parameter_id, measurement_time , value, par_dt FROM aums.eems_archive_data WHERE par_dt = '20240101' LIMIT 10"""&lt;/P&gt;&lt;P&gt;spark_df = spark.sql(qry)&lt;BR /&gt;data_df = spark_df.toPandas()&lt;BR /&gt;print(data_df.head(1))&lt;BR /&gt;&lt;BR /&gt;where 'measurement_time ' returns None values instead of dates and times&lt;/P&gt;</description>
      <pubDate>Tue, 21 Apr 2026 06:23:23 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397685#M249947</guid>
      <dc:creator>Nikitas</dc:creator>
      <dc:date>2026-04-21T06:23:23Z</dc:date>
    </item>
    <item>
      <title>Re: Retrieving Timestamps</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397764#M249974</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/121312"&gt;@Nikitas&lt;/a&gt;,&amp;nbsp;Welcome to our community! To help you get the best possible answer, I have tagged in our experts&amp;nbsp; &lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/75213"&gt;@ChethanYM&lt;/a&gt;&amp;nbsp;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/27216"&gt;@satz&lt;/a&gt;&amp;nbsp;&amp;nbsp;who may be able to assist you further.&lt;BR /&gt;&lt;BR /&gt;Please feel free to provide any additional information or details about your query. We hope that you will find a satisfactory solution to your question.&lt;/P&gt;</description>
      <pubDate>Wed, 20 Nov 2024 07:16:11 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397764#M249974</guid>
      <dc:creator>VidyaSargur</dc:creator>
      <dc:date>2024-11-20T07:16:11Z</dc:date>
    </item>
    <item>
      <title>Re: Retrieving Timestamps</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397769#M249975</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/121312"&gt;@Nikitas&lt;/a&gt;&amp;nbsp;Thank you for posting your query with us.&amp;nbsp;&lt;/P&gt;&lt;P&gt;We would just like to check the following items&lt;/P&gt;&lt;P&gt;1. Is the data format of the files stored in the table parquet / avro / any other format ?&lt;/P&gt;&lt;P&gt;2. Were you able to remove the following configurations and able to read the timestamps?&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;config("spark.sql.legacy.parquet.int96RebaseModeInRead", "LEGACY") \&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.config("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "LEGACY") \&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.config("spark.sql.legacy.avro.datetimeRebaseModeInRead", "LEGACY") \&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.config("spark.sql.legacy.timeParserPolicy", "LEGACY") \&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;3. Are these files written by Hive / Spark / external applications ?&lt;/P&gt;&lt;P&gt;4. Have you tried setting these above modes to "CORRECTED" (for parquet files generated by latest Hive or other application) ? and did that work for that column&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 20 Nov 2024 09:29:52 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397769#M249975</guid>
      <dc:creator>satz</dc:creator>
      <dc:date>2024-11-20T09:29:52Z</dc:date>
    </item>
    <item>
      <title>Re: Retrieving Timestamps</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397770#M249976</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/27216"&gt;@satz&lt;/a&gt;1. I think that the files stored in the table are parquet, but I will check it.&lt;/P&gt;&lt;P&gt;2. I have already run the script without these configurations and get None values. That is why I added these configurations to&amp;nbsp; my spark session in CDSW.&lt;/P&gt;&lt;P&gt;3. The files are fed in the hdfs&amp;nbsp;from an sftp server, using handmade bash scripts that transform raw data.&lt;/P&gt;&lt;P&gt;4. Changing the "LEGACY" to "CORRECTED" did change nothing. I expect&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;PRE&gt;parameter_id measurement_time  value    par_dt
0 d7cc8e82-7ad1     &lt;SPAN class="hljs-number"&gt;2024&lt;/SPAN&gt;-01-01 01:&lt;SPAN class="hljs-number"&gt;34&lt;/SPAN&gt;:&lt;SPAN class="hljs-number"&gt;24&lt;/SPAN&gt;  13.45 20240101 &lt;BR /&gt;1 d7caa072-7ad1 &lt;SPAN class="hljs-number"&gt;&amp;nbsp;   2024&lt;/SPAN&gt;-01-01 01:4&lt;SPAN class="hljs-number"&gt;4&lt;/SPAN&gt;:50  28.00 20240101&lt;/PRE&gt;&lt;P&gt;and i get&lt;/P&gt;&lt;PRE&gt;parameter_id measurement_time  value    par_dt
0  d7cc8e82-7ad1            None  13.45  20240101
1  d7caa072-7ad1            None  28.00  20240101&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 20 Nov 2024 09:51:34 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397770#M249976</guid>
      <dc:creator>Nikitas</dc:creator>
      <dc:date>2024-11-20T09:51:34Z</dc:date>
    </item>
    <item>
      <title>Re: Retrieving Timestamps</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397825#M249989</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/27216"&gt;@satz&lt;/a&gt;I just noticed that when I run the same query in Hive instead of the Impala editor, the measurement_time column shows only Null values. Does that mean that the files are written by Hive? I would really appreciate any further suggestions!&lt;/P&gt;</description>
      <pubDate>Thu, 21 Nov 2024 11:53:45 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/397825#M249989</guid>
      <dc:creator>Nikitas</dc:creator>
      <dc:date>2024-11-21T11:53:45Z</dc:date>
    </item>
    <item>
      <title>Re: Retrieving Timestamps</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/404569#M252338</link>
      <description>&lt;P&gt;To identify which user is writing the files, use HDFS CLI commands such as &lt;STRONG&gt;ls or getfacl&lt;/STRONG&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 21 Mar 2025 13:29:31 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Retrieving-Timestamps/m-p/404569#M252338</guid>
      <dc:creator>ggangadharan</dc:creator>
      <dc:date>2025-03-21T13:29:31Z</dc:date>
    </item>
  </channel>
</rss>

