<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleane in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49502#M51665</link>
    <description>&lt;P&gt;It seems like , every batch of streaming create a .hive-staging directory. Any suggestions ??&lt;/P&gt;</description>
    <pubDate>Tue, 17 Jan 2017 06:54:59 GMT</pubDate>
    <dc:creator>terry19850289</dc:creator>
    <dc:date>2017-01-17T06:54:59Z</dc:date>
    <item>
      <title>CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleaned</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49501#M51664</link>
      <description>&lt;P&gt;This seems a bug of spark, see &lt;A href="https://github.com/apache/spark/pull/16325" target="_self"&gt;https://github.com/apache/spark/pull/16325&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;my code is here,it's work.But after a while,there are a lot of .hive-stagingXXXX directory.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;a photo of my warehouse as attachment.&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="hive-staging.png" style="width: 586px;"&gt;&lt;img src="https://community.cloudera.com/t5/image/serverpage/image-id/2524i6836DF837356DFD4/image-size/large?v=v2&amp;amp;px=999" role="button" title="hive-staging.png" alt="hive-staging.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;SPAN&gt;val &lt;/SPAN&gt;sparkConf = &lt;SPAN&gt;new &lt;/SPAN&gt;SparkConf()&lt;BR /&gt;    .setAppName(&lt;SPAN&gt;"SendHistoryConsumer"&lt;/SPAN&gt;)&lt;BR /&gt;&lt;SPAN&gt;// .setMaster("local[2]") // for debug&lt;BR /&gt;&lt;/SPAN&gt;&lt;SPAN&gt;val &lt;/SPAN&gt;ssc = &lt;SPAN&gt;new &lt;/SPAN&gt;StreamingContext(sparkConf, &lt;SPAN&gt;Seconds&lt;/SPAN&gt;(batchInterval.toInt))&lt;BR /&gt;ssc.checkpoint(&lt;SPAN&gt;"/opt/cloudera/checkPoint"&lt;/SPAN&gt;)&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;// kafka configuration&lt;BR /&gt;&lt;/SPAN&gt;&lt;SPAN&gt;val &lt;/SPAN&gt;topicsSet = &lt;SPAN&gt;Set&lt;/SPAN&gt;(KafkaConfigConstant.&lt;SPAN&gt;PRIORITY_10&lt;/SPAN&gt;)&lt;BR /&gt;&lt;SPAN&gt;val &lt;/SPAN&gt;kafkaParams = &lt;SPAN&gt;Map&lt;/SPAN&gt;[&lt;SPAN&gt;String&lt;/SPAN&gt;, &lt;SPAN&gt;String&lt;/SPAN&gt;](&lt;BR /&gt;    &lt;SPAN&gt;"metadata.broker.list" &lt;/SPAN&gt;-&amp;gt; brokers,&lt;BR /&gt;    &lt;SPAN&gt;"serializer.class"&lt;/SPAN&gt;-&amp;gt;&lt;SPAN&gt;"kafka.serializer.StringEncoder"&lt;/SPAN&gt;,&lt;BR /&gt;    &lt;SPAN&gt;"group.id" &lt;/SPAN&gt;-&amp;gt; groupId,&lt;BR /&gt;    &lt;SPAN&gt;"auto.offset.reset" &lt;/SPAN&gt;-&amp;gt; 
&lt;SPAN&gt;"smallest"&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;val &lt;/SPAN&gt;messages = KafkaUtils.&lt;SPAN&gt;createDirectStream&lt;/SPAN&gt;[&lt;SPAN&gt;String&lt;/SPAN&gt;, &lt;SPAN&gt;String&lt;/SPAN&gt;, StringDecoder, StringDecoder](ssc, kafkaParams,&lt;BR /&gt;topicsSet)&lt;BR /&gt;&lt;SPAN&gt;//messages.print()&lt;BR /&gt;&lt;/SPAN&gt;&lt;SPAN&gt;val &lt;/SPAN&gt;dStream = messages&lt;BR /&gt;    .map(_._2)&lt;BR /&gt;    .filter(message =&amp;gt;&lt;BR /&gt;        !message.isEmpty&lt;BR /&gt;    )&lt;BR /&gt;    .map(&lt;SPAN&gt;parseTopic10&lt;/SPAN&gt;)&lt;BR /&gt;&lt;BR /&gt;dStream.foreachRDD(rdd =&amp;gt;&lt;BR /&gt;&lt;BR /&gt;    &lt;SPAN&gt;if &lt;/SPAN&gt;(!rdd.isEmpty &amp;amp;&amp;amp; !rdd.partitions.isEmpty) {&lt;BR /&gt;&lt;BR /&gt;        &lt;SPAN&gt;val &lt;/SPAN&gt;hiveCtx = &lt;SPAN&gt;new &lt;/SPAN&gt;HiveContext(rdd.sparkContext)&lt;BR /&gt;        &lt;SPAN&gt;val &lt;/SPAN&gt;sendCallBackPoolDF = hiveCtx.createDataFrame(rdd, &lt;SPAN&gt;classOf&lt;/SPAN&gt;[SmsSendHistDataSede])&lt;BR /&gt;&lt;BR /&gt;        hiveCtx.sql(&lt;SPAN&gt;"use db"&lt;/SPAN&gt;)&lt;BR /&gt;        sendCallBackPoolDF.registerTempTable(&lt;SPAN&gt;"sendCallBack"&lt;/SPAN&gt;)&lt;BR /&gt;&lt;BR /&gt;        hiveCtx.sql(&lt;SPAN&gt;"set hive.exec.dynamic.partition=true"&lt;/SPAN&gt;)&lt;BR /&gt;        hiveCtx.sql(&lt;SPAN&gt;"set hive.exec.dynamic.partition.mode=nonstrict"&lt;/SPAN&gt;)&lt;BR /&gt;&lt;SPAN&gt;&lt;BR /&gt;&lt;/SPAN&gt;        &lt;SPAN&gt;val &lt;/SPAN&gt;smTrgPart = hiveCtx.sql(&lt;SPAN&gt;"insert into table SmsSendHistData_orc partition" &lt;/SPAN&gt;+&lt;BR /&gt;            &lt;SPAN&gt;"(partittime) select userid,city,mobile,msgcount,msgbody,sender,accepttype,sendcode," &lt;/SPAN&gt;+&lt;BR /&gt;            &lt;SPAN&gt;"msgid,bmsgid,usermsgid,userbulkmsgid,channelmsgid,channelid,fileid,notifyurl,smscode ,smsstat," &lt;/SPAN&gt;+&lt;BR /&gt;            
&lt;SPAN&gt;"notifycallnum,sendsmsbody,parentmsgid,errormessage,queuename,startdeliveryid,acctime,sendtime," &lt;/SPAN&gt;+&lt;BR /&gt;            &lt;SPAN&gt;"notifytime,smstype,from_unixtime(floor(acctime/1000),'yyyyMMdd') as partittime from sendCallBack"&lt;/SPAN&gt;)&lt;BR /&gt;        smTrgPart.saveAsTable(&lt;SPAN&gt;"SmsSendHistData_orc"&lt;/SPAN&gt;, SaveMode.&lt;SPAN&gt;Append&lt;/SPAN&gt;)&lt;BR /&gt;&lt;BR /&gt;    }&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 16 Sep 2022 10:55:03 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49501#M51664</guid>
      <dc:creator>terry19850289</dc:creator>
      <dc:date>2022-09-16T10:55:03Z</dc:date>
    </item>
    <item>
      <title>Re: CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleane</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49502#M51665</link>
      <description>&lt;P&gt;It seems like , every batch of streaming create a .hive-staging directory. Any suggestions ??&lt;/P&gt;</description>
      <pubDate>Tue, 17 Jan 2017 06:54:59 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49502#M51665</guid>
      <dc:creator>terry19850289</dc:creator>
      <dc:date>2017-01-17T06:54:59Z</dc:date>
    </item>
    <item>
      <title>Re: CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleane</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49504#M51666</link>
      <description>Yeah, that is expected behavior. Each batch writes to the staging directory and when it is done, the data is moved to the actual table/partition directory. I have experienced these same staging directories being left behind. In general, if the data is successfully moved then there will be no data left behind. I ended up having a separate process that would check for entries since the last run (regular Spark jobs, not streaming) and then check if the directory was empty; remove, and repeat. I also employed these directory checks to see if something had gone wrong in a job, as the data would remain.</description>
      <pubDate>Tue, 17 Jan 2017 07:08:59 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49504#M51666</guid>
      <dc:creator>mbigelow</dc:creator>
      <dc:date>2017-01-17T07:08:59Z</dc:date>
    </item>
    <item>
      <title>Re: CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleane</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49505#M51667</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/18127"&gt;@mbigelow&lt;/a&gt;&amp;nbsp;Some of these directories only have a _success file, which is a result of a batch. In a spark-streaming scenario, should I periodically remove the empty directories, since there are too many?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Could you explain your solution in more detail, I'm a stupid guy.&lt;/P&gt;</description>
      <pubDate>Tue, 17 Jan 2017 07:51:08 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49505#M51667</guid>
      <dc:creator>terry19850289</dc:creator>
      <dc:date>2017-01-17T07:51:08Z</dc:date>
    </item>
    <item>
      <title>Re: CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleane</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49507#M51668</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/18127"&gt;@mbigelow&lt;/a&gt;&amp;nbsp;my active partition's data is also split into small files; it's really strange&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="smallfiles.png" style="width: 600px;"&gt;&lt;img src="https://community.cloudera.com/t5/image/serverpage/image-id/2525i97D41DEB92A26598/image-size/large?v=v2&amp;amp;px=999" role="button" title="smallfiles.png" alt="smallfiles.png" /&gt;&lt;/span&gt;.&lt;/P&gt;</description>
      <pubDate>Tue, 17 Jan 2017 07:26:24 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49507#M51668</guid>
      <dc:creator>terry19850289</dc:creator>
      <dc:date>2017-01-17T07:26:24Z</dc:date>
    </item>
    <item>
      <title>Re: CDH 5.4.7  spark-streaming(1.3.0) kafka message into hive, too many staging directory not cleane</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49585#M51669</link>
      <description>Added two background threads: one to delete empty directories, another for Hive Concatenate. But it's really an ugly way.</description>
      <pubDate>Wed, 18 Jan 2017 03:45:54 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/CDH-5-4-7-spark-streaming-1-3-0-kafka-message-into-hive-too/m-p/49585#M51669</guid>
      <dc:creator>terry19850289</dc:creator>
      <dc:date>2017-01-18T03:45:54Z</dc:date>
    </item>
  </channel>
</rss>

