<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Flume: HDFS sink: Can't write large files in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23507#M4513</link>
    <description>&lt;P&gt;It seems that this error is caused by the file channel:&amp;nbsp;I changed the sink to a null sink and the error still occurs.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;2015-01-09 14:37:28,928 ERROR org.apache.flume.SinkRunner: Unable to deliver event. Exception follows.
java.lang.IllegalStateException: Log is closed
	at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
	at org.apache.flume.channel.file.Log.rollback(Log.java:701)
	at org.apache.flume.channel.file.FileChannel$FileBackedTransaction.doRollback(FileChannel.java:614)
	at org.apache.flume.channel.BasicTransactionSemantics.rollback(BasicTransactionSemantics.java:168)
	at org.apache.flume.sink.NullSink.process(NullSink.java:104)
	at org.apache.flume.sink.DefaultSinkProcessor.process(DefaultSinkProcessor.java:68)
	at org.apache.flume.SinkRunner$PollingRunner.run(SinkRunner.java:147)
	at java.lang.Thread.run(Thread.java:745)&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Fri, 09 Jan 2015 14:59:49 GMT</pubDate>
    <dc:creator>Adagyo</dc:creator>
    <dc:date>2015-01-09T14:59:49Z</dc:date>
    <item>
      <title>Flume: HDFS sink: Can't write large files</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23456#M4512</link>
      <description>&lt;P&gt;I'm trying to do the following:&amp;nbsp;[SpoolDir source] --&amp;gt; [Custom Interceptor] --&amp;gt; [FileChannel] --&amp;gt; [HDFS Sink]&lt;/P&gt;&lt;P&gt;My flume.conf looks like this:&lt;/P&gt;&lt;PRE&gt;# source / channel / sink
sebanalytics.sources = spooldir-source
sebanalytics.channels = file-channel
sebanalytics.sinks = hdfs-sink

# source definition
sebanalytics.sources.spooldir-source.type = spooldir
sebanalytics.sources.spooldir-source.spoolDir = /var/flume/in
sebanalytics.sources.spooldir-source.basenameHeader = true
sebanalytics.sources.spooldir-source.basenameHeaderKey = basename
# Max blob size: 1.5 GB
sebanalytics.sources.spooldir-source.deserializer = org.apache.flume.sink.solr.morphline.BlobDeserializer$Builder
sebanalytics.sources.spooldir-source.deserializer.maxBlobLength = 1610000000
# Attach the interceptor to the source
sebanalytics.sources.spooldir-source.interceptors = json-interceptor
sebanalytics.sources.spooldir-source.interceptors.json-interceptor.type = com.adagyo.flume.interceptor.JsonInterceptor$Builder
# Define the event's headers. basenameHeader must be the same as source.basenameHeaderKey (default is basename)
sebanalytics.sources.spooldir-source.interceptors.json-interceptor.basenameHeader = basename
sebanalytics.sources.spooldir-source.interceptors.json-interceptor.resourceHeader = resources
sebanalytics.sources.spooldir-source.interceptors.json-interceptor.ssidHeader = ssid

# channel definition
sebanalytics.channels.file-channel.type = file
sebanalytics.channels.file-channel.checkpointDir = /var/flume/checkpoints
sebanalytics.channels.file-channel.dataDirs = /var/flume/datadirs

# sink definition
sebanalytics.sinks.hdfs-sink.type = hdfs
sebanalytics.sinks.hdfs-sink.hdfs.path = hdfs://xx.xx.xx.xx/user/adagyo/flume/in
sebanalytics.sinks.hdfs-sink.hdfs.filePrefix = %{basename}
sebanalytics.sinks.hdfs-sink.hdfs.fileSuffix = .json
sebanalytics.sinks.hdfs-sink.hdfs.fileType = DataStream
sebanalytics.sinks.hdfs-sink.hdfs.writeFormat = Text
sebanalytics.sinks.hdfs-sink.hdfs.rollInterval = 0
sebanalytics.sinks.hdfs-sink.hdfs.rollSize = 0
sebanalytics.sinks.hdfs-sink.hdfs.rollCount = 0
sebanalytics.sinks.hdfs-sink.hdfs.batchSize = 1

# connect source and sink to channel
sebanalytics.sources.spooldir-source.channels = file-channel
sebanalytics.sinks.hdfs-sink.channel = file-channel&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;A file in the spoolDir looks like this:&lt;/P&gt;&lt;PRE&gt;{ "objectType" : [ { JSON Object 1 } , { JSON Object 2 }, ... ] }&lt;/PRE&gt;&lt;P&gt;My custom interceptor modifies the event body to this:&lt;/P&gt;&lt;PRE&gt;{JSON Object 1}
{JSON Object 2}&lt;BR /&gt;...&lt;/PRE&gt;&lt;P&gt;Everything works fine with small files (less than 10MB).&lt;/P&gt;&lt;P&gt;When I try with a bigger one (175MB), the spoolDir source, interceptor and channel work (the file in the spoolDir is renamed with the .COMPLETED suffix), but I get this error afterwards:&lt;/P&gt;&lt;PRE&gt;2015-01-08 15:49:30,351 ERROR org.apache.flume.SinkRunner: Unable to deliver event. Exception follows.
java.lang.IllegalStateException: Log is closed
	at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
	at org.apache.flume.channel.file.Log.getFlumeEventQueue(Log.java:575)
	at org.apache.flume.channel.file.FileChannel$FileBackedTransaction.&amp;lt;init&amp;gt;(FileChannel.java:431)
	at org.apache.flume.channel.file.FileChannel.createTransaction(FileChannel.java:351)
	at org.apache.flume.channel.BasicChannelSemantics.getTransaction(BasicChannelSemantics.java:122)
	at org.apache.flume.sink.hdfs.HDFSEventSink.process(HDFSEventSink.java:376)
	at org.apache.flume.sink.DefaultSinkProcessor.process(DefaultSinkProcessor.java:68)
	at org.apache.flume.SinkRunner$PollingRunner.run(SinkRunner.java:147)
	at java.lang.Thread.run(Thread.java:745)&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks for your help!&lt;/P&gt;</description>
      <pubDate>Fri, 16 Sep 2022 09:18:15 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23456#M4512</guid>
      <dc:creator>Adagyo</dc:creator>
      <dc:date>2022-09-16T09:18:15Z</dc:date>
    </item>
    <item>
      <title>Re: Flume: HDFS sink: Can't write large files</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23507#M4513</link>
      <description>&lt;P&gt;It seems that this error is caused by the file channel:&amp;nbsp;I changed the sink to a null sink and the error still occurs.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;2015-01-09 14:37:28,928 ERROR org.apache.flume.SinkRunner: Unable to deliver event. Exception follows.
java.lang.IllegalStateException: Log is closed
	at com.google.common.base.Preconditions.checkState(Preconditions.java:145)
	at org.apache.flume.channel.file.Log.rollback(Log.java:701)
	at org.apache.flume.channel.file.FileChannel$FileBackedTransaction.doRollback(FileChannel.java:614)
	at org.apache.flume.channel.BasicTransactionSemantics.rollback(BasicTransactionSemantics.java:168)
	at org.apache.flume.sink.NullSink.process(NullSink.java:104)
	at org.apache.flume.sink.DefaultSinkProcessor.process(DefaultSinkProcessor.java:68)
	at org.apache.flume.SinkRunner$PollingRunner.run(SinkRunner.java:147)
	at java.lang.Thread.run(Thread.java:745)&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 09 Jan 2015 14:59:49 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23507#M4513</guid>
      <dc:creator>Adagyo</dc:creator>
      <dc:date>2015-01-09T14:59:49Z</dc:date>
    </item>
    <item>
      <title>Re: Flume: HDFS sink: Can't write large files</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23510#M4514</link>
      <description>&lt;P&gt;You probably need to adjust the maxFileSize and minimumRequiredSpace settings on the file channel [1].&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;FWIW, transferring large files with Flume is an anti-pattern. Flume is designed for event/log transport, not large file transport. You might want to check out a new Apache project called Apache NiFi [2] that is better suited to large file transfer. There's a quick how-to blog post here to get you started:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A target="_blank" href="http://ingest.tips/2014/12/22/getting-started-with-apache-nifi/"&gt;http://ingest.tips/2014/12/22/getting-started-with-apache-nifi/&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;-Joey&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;[1]&amp;nbsp;&lt;A target="_blank" href="http://flume.apache.org/FlumeUserGuide.html#file-channel"&gt;http://flume.apache.org/FlumeUserGuide.html#file-channel&lt;/A&gt;&lt;/P&gt;&lt;P&gt;[2]&amp;nbsp;&lt;A target="_blank" href="http://nifi.incubator.apache.org"&gt;http://nifi.incubator.apache.org&lt;/A&gt;&lt;/P&gt;</description>
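      <!-- Editor's note: a minimal, untested sketch of the file channel tuning
           suggested above, using the property names documented in the Flume User
           Guide (link [1]). The values shown are the documented defaults and are
           assumptions to adapt to your disk layout, not verified settings.

           # Max size (in bytes) of a single file channel data file
           sebanalytics.channels.file-channel.maxFileSize = 2146435071
           # Minimum free space (in bytes) on the data directories; the channel
           # refuses writes once free space drops below this threshold
           sebanalytics.channels.file-channel.minimumRequiredSpace = 524288000
      -->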
      <pubDate>Fri, 09 Jan 2015 15:09:41 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23510#M4514</guid>
      <dc:creator>joey</dc:creator>
      <dc:date>2015-01-09T15:09:41Z</dc:date>
    </item>
    <item>
      <title>Re: Flume: HDFS sink: Can't write large files</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23513#M4515</link>
      <description>&lt;P&gt;Thanks for your answer. I tried the memory channel and it works, so I can confirm the channel issue.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I understand (and already knew) that my use case is an anti-pattern. Usually the events we receive are below 1MB (and that's already too big). I'm migrating an old workflow to a new one and I can't change the incoming data at the moment.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I managed to make it work with the memory channel and it's OK for me now.&lt;/P&gt;</description>
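      <!-- Editor's note: a minimal, untested sketch of the memory channel setup the
           poster switched to. The property names are standard MemoryChannel settings
           from the Flume User Guide; the sizing values are assumptions for a handful
           of large events (byteCapacity must exceed the largest event body, and the
           JVM heap must be sized to match).

           sebanalytics.channels = memory-channel
           sebanalytics.channels.memory-channel.type = memory
           # Max number of events held in the channel
           sebanalytics.channels.memory-channel.capacity = 100
           # Max events per transaction; the sink batchSize must not exceed this
           sebanalytics.channels.memory-channel.transactionCapacity = 10
           # Max total bytes of event bodies in the channel (2 GB, an assumption)
           sebanalytics.channels.memory-channel.byteCapacity = 2000000000
           sebanalytics.sources.spooldir-source.channels = memory-channel
           sebanalytics.sinks.hdfs-sink.channel = memory-channel
      -->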
      <pubDate>Fri, 09 Jan 2015 15:24:38 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23513#M4515</guid>
      <dc:creator>Adagyo</dc:creator>
      <dc:date>2015-01-09T15:24:38Z</dc:date>
    </item>
    <item>
      <title>Re: Flume: HDFS sink: Can't write large files</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23514#M4516</link>
      <description>&lt;P&gt;Keep in mind that with the MemoryChannel you lose any records in the channel if Flume crashes or the system reboots.&lt;/P&gt;</description>
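      <!-- Editor's note: one possible middle ground for this durability trade-off is
           the SpillableMemoryChannel from the Flume User Guide (experimental at the
           time of this thread): it holds events in memory and spills to disk when the
           in-memory queue fills. A minimal, untested sketch; every value and path
           below is an illustrative assumption.

           sebanalytics.channels.spillable-channel.type = SPILLABLEMEMORY
           # Events kept in the in-memory queue before overflowing to disk
           sebanalytics.channels.spillable-channel.memoryCapacity = 100
           # Events allowed in the on-disk overflow (backed by a file channel)
           sebanalytics.channels.spillable-channel.overflowCapacity = 1000
           sebanalytics.channels.spillable-channel.checkpointDir = /var/flume/spill-checkpoints
           sebanalytics.channels.spillable-channel.dataDirs = /var/flume/spill-datadirs
      -->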
      <pubDate>Fri, 09 Jan 2015 15:28:31 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Flume-HDFS-sink-Can-t-write-large-files/m-p/23514#M4516</guid>
      <dc:creator>joey</dc:creator>
      <dc:date>2015-01-09T15:28:31Z</dc:date>
    </item>
  </channel>
</rss>