<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Performance diff between single big file vs multiple smaller files in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376962#M243084</link>
    <description>&lt;P&gt;Where is this graph available on ui? And does it get updated after every run?&lt;/P&gt;</description>
    <pubDate>Thu, 28 Sep 2023 02:52:38 GMT</pubDate>
    <dc:creator>manishg</dc:creator>
    <dc:date>2023-09-28T02:52:38Z</dc:date>
    <item>
      <title>Performance diff between single big file vs multiple smaller files</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376926#M243065</link>
      <description>&lt;P&gt;I have a NiFi flow. I observe that if the input file is split into smaller files and fed into the flow one by one, then the overall time taken (sum of the time taken for individual files) is considerably lower than when I feed a single big file.&lt;/P&gt;&lt;P&gt;What can be a possible cause for this performance difference?&lt;/P&gt;&lt;P&gt;Note:&lt;/P&gt;&lt;P&gt;The flow has many processors that use Avro readers/writers.&lt;/P&gt;&lt;P&gt;I calculate the time using the following in a LogMessage processor:&lt;/P&gt;&lt;P&gt;${now():toNumber():minus(${lineageStartDate}):format("HH:mm:ss", "GMT")}&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 27 Sep 2023 09:36:15 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376926#M243065</guid>
      <dc:creator>manishg</dc:creator>
      <dc:date>2023-09-27T09:36:15Z</dc:date>
    </item>
    <item>
      <title>Re: Performance diff between single big file vs multiple smaller files</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376935#M243069</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/106206"&gt;@manishg&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;The first thing that comes to mind is JVM heap.&amp;nbsp; &amp;nbsp;You may want to collect and look at garbage collection data with large files versus small files.&amp;nbsp;&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Second would be identifying which processor(s) the largest FlowFiles spend the most time at.&amp;nbsp; For this I would suggest looking at the provenance lineage for the large FlowFiles. There is a slider bar at the bottom of that lineage graph that you can scroll to see the progression of the FlowFile through the lineage tree.&amp;nbsp; &amp;nbsp;Which processor(s) did the FlowFile spend the longest at, and how are they configured?&lt;/P&gt;&lt;P&gt;Hope this helps,&lt;/P&gt;&lt;P&gt;Matt&lt;/P&gt;</description>
      <pubDate>Wed, 27 Sep 2023 12:54:32 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376935#M243069</guid>
      <dc:creator>MattWho</dc:creator>
      <dc:date>2023-09-27T12:54:32Z</dc:date>
    </item>
    <item>
      <title>Re: Performance diff between single big file vs multiple smaller files</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376962#M243084</link>
      <description>&lt;P&gt;Where is this graph available on ui? And does it get updated after every run?&lt;/P&gt;</description>
      <pubDate>Thu, 28 Sep 2023 02:52:38 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376962#M243084</guid>
      <dc:creator>manishg</dc:creator>
      <dc:date>2023-09-28T02:52:38Z</dc:date>
    </item>
    <item>
      <title>Re: Performance diff between single big file vs multiple smaller files</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376963#M243085</link>
      <description>&lt;P&gt;Got it. It's in the Data Provenance dialog box.&lt;/P&gt;</description>
      <pubDate>Thu, 28 Sep 2023 02:57:16 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Performance-diff-between-single-big-file-vs-multiple-smaller/m-p/376963#M243085</guid>
      <dc:creator>manishg</dc:creator>
      <dc:date>2023-09-28T02:57:16Z</dc:date>
    </item>
  </channel>
</rss>

