<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Insert data from Spark DataFrame to Hive ORC table is slow and uses much memory in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Insert-data-from-spark-data-frame-to-hive-orc-table-slow-and/m-p/347617#M235223</link>
    <description>Support Questions topic: loading flat files with PySpark is fast, but the follow-on Parquet-to-ORC insert in Hive is slow and memory-heavy.</description>
    <pubDate>Thu, 14 Jul 2022 10:36:58 GMT</pubDate>
    <dc:creator>mala_etl</dc:creator>
    <dc:date>2022-07-14T10:36:58Z</dc:date>
    <item>
      <title>Insert data from Spark DataFrame to Hive ORC table is slow and uses much memory</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Insert-data-from-spark-data-frame-to-hive-orc-table-slow-and/m-p/347617#M235223</link>
      <description>&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;I am new to Spark processing and need help with a problem transforming data from flat files into a Hive ORC table. Below is my process flow using PySpark:&lt;/P&gt;&lt;P&gt;1 - Use PySpark to load the flat files into a DataFrame&lt;/P&gt;&lt;P&gt;2 - Transform the data in the DataFrame and insert it into a Hive table (Parquet)&lt;/P&gt;&lt;P&gt;3 - Insert the data from Hive (Parquet) into ORC format&lt;/P&gt;&lt;P&gt;Steps 1 and 2 are fast, but step 3 is very slow and uses a lot of memory; sometimes it gets stuck and cannot continue.&lt;/P&gt;&lt;P&gt;Please advise and recommend a better flow.&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Here is the sample code:&lt;/P&gt;&lt;P&gt;-- loading.py&lt;/P&gt;&lt;P&gt;from pyspark.sql import SparkSession&lt;BR /&gt;from pyspark.sql.types import StructType, StructField, StringType&lt;BR /&gt;from pyspark.sql.functions import input_file_name&lt;/P&gt;&lt;P&gt;spark = SparkSession.builder.appName("testing..").enableHiveSupport().getOrCreate()&lt;/P&gt;&lt;P&gt;# filename is a placeholder column; it is overwritten with input_file_name() below&lt;BR /&gt;df_schema = StructType([&lt;BR /&gt;StructField("col1", StringType(), True)&lt;BR /&gt;, StructField("col2", StringType(), True)&lt;BR /&gt;, StructField("col3", StringType(), True)&lt;BR /&gt;, StructField("col4", StringType(), True)&lt;BR /&gt;, StructField("col5", StringType(), True)&lt;BR /&gt;, StructField("filename", StringType(), True)&lt;BR /&gt;, StructField("yearkey", StringType(), True)&lt;BR /&gt;, StructField("monthkey", StringType(), True)&lt;BR /&gt;, StructField("daykey", StringType(), True)&lt;BR /&gt;])&lt;/P&gt;&lt;P&gt;dsCSV = spark.read.format("csv").options(header="false", delimiter=";").schema(df_schema).load("/user/test/processing/data/out").withColumn("filename", input_file_name())&lt;BR /&gt;dsCSV.createOrReplaceTempView("cdr_data")  # registerTempTable is deprecated&lt;/P&gt;&lt;P&gt;df_insert = spark.sql("select * from cdr_data")&lt;/P&gt;&lt;P&gt;df_insert.write.option("compression", "snappy").mode("append").format("parquet").partitionBy("yearkey", "monthkey", "daykey").saveAsTable("landing.test_loading")&lt;/P&gt;&lt;P&gt;# these are no-ops unless the DataFrames were cached&lt;BR /&gt;dsCSV.unpersist()&lt;BR /&gt;df_insert.unpersist()&lt;/P&gt;&lt;P&gt;-- cdr_hivesql.sh&lt;/P&gt;&lt;P&gt;v_history_records="insert into staging.test_loading select * from landing.test_loading"&lt;/P&gt;&lt;P&gt;echo "====================&amp;gt;&amp;gt;&amp;gt;$(date +%Y%m%d%H%M%S)&amp;lt;&amp;lt;&amp;lt;====================="&lt;BR /&gt;echo ""&lt;BR /&gt;echo "$v_history_records"&lt;BR /&gt;hive -e "$v_history_records;"&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Note:&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;-- landing.test_loading (Parquet format)&lt;BR /&gt;-- staging.test_loading (ORC format)&lt;/P&gt;</description>
      <pubDate>Thu, 14 Jul 2022 10:36:58 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Insert-data-from-spark-data-frame-to-hive-orc-table-slow-and/m-p/347617#M235223</guid>
      <dc:creator>mala_etl</dc:creator>
      <dc:date>2022-07-14T10:36:58Z</dc:date>
    </item>
    <item>
      <title>Re: Insert data from Spark DataFrame to Hive ORC table is slow and uses much memory</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Insert-data-from-spark-data-frame-to-hive-orc-table-slow-and/m-p/377162#M243163</link>
      <description>&lt;P&gt;&lt;SPAN&gt;It appears that you're currently following a two-step process: writing the data to a Parquet table and then reading that Parquet table to write an ORC table. You can streamline this by writing the data directly into the ORC-format table, eliminating the need to write the same data to Parquet and read it back.&lt;BR /&gt;&lt;BR /&gt;Ref -&lt;BR /&gt;&lt;A href="https://spark.apache.org/docs/2.4.0/sql-data-sources-hive-tables.html" target="_blank"&gt;https://spark.apache.org/docs/2.4.0/sql-data-sources-hive-tables.html&lt;/A&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;A href="https://docs.cloudera.com/cdp-private-cloud-base/7.1.9/developing-spark-applications/topics/spark-loading-orc-data-predicate-push-down.html" target="_blank"&gt;https://docs.cloudera.com/cdp-private-cloud-base/7.1.9/developing-spark-applications/topics/spark-loading-orc-data-predicate-push-down.html&lt;/A&gt;&lt;/SPAN&gt;&lt;/P&gt;
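&lt;P&gt;For example, here is a minimal sketch of the single-step flow (assuming a Spark session with Hive support, the same df_schema and input path as in loading.py above; the app name and the "snappy" ORC compression option are illustrative choices, not tested settings):&lt;/P&gt;&lt;P&gt;from pyspark.sql import SparkSession&lt;BR /&gt;from pyspark.sql.functions import input_file_name&lt;/P&gt;&lt;P&gt;spark = SparkSession.builder.appName("csv_to_orc").enableHiveSupport().getOrCreate()&lt;/P&gt;&lt;P&gt;# read the flat files once, tagging each row with its source file&lt;BR /&gt;dsCSV = spark.read.format("csv").options(header="false", delimiter=";").schema(df_schema).load("/user/test/processing/data/out").withColumn("filename", input_file_name())&lt;/P&gt;&lt;P&gt;# write straight to the ORC-backed table, skipping landing.test_loading entirely&lt;BR /&gt;dsCSV.write.mode("append").format("orc").option("compression", "snappy").partitionBy("yearkey", "monthkey", "daykey").saveAsTable("staging.test_loading")&lt;/P&gt;&lt;P&gt;This roughly halves the I/O: each row is written once as ORC instead of being written to Parquet, read back by Hive, and written again.&lt;/P&gt;</description>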
      <pubDate>Tue, 03 Oct 2023 09:07:45 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Insert-data-from-spark-data-frame-to-hive-orc-table-slow-and/m-p/377162#M243163</guid>
      <dc:creator>ggangadharan</dc:creator>
      <dc:date>2023-10-03T09:07:45Z</dc:date>
    </item>
  </channel>
</rss>

