<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: NIFI GetSplunk JSON  to PutParquet seeing duplicate records. in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/NIFI-GetSplunk-JSON-to-PutParquet-seeing-duplicate-records/m-p/297995#M218926</link>
    <description>&lt;P&gt;Hello All,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;After long research, I found that if I use the cron-driven strategy, the query itself triggers 3 times, so I changed this to the timer-driven strategy. Now the query triggers only one time, so I get only one flow file, and this avoids duplicates for me. Thanks!&lt;/P&gt;</description>
    <pubDate>Sat, 13 Jun 2020 02:12:25 GMT</pubDate>
    <dc:creator>NBharadwaj</dc:creator>
    <dc:date>2020-06-13T02:12:25Z</dc:date>
    <item>
      <title>NIFI GetSplunk JSON  to PutParquet seeing duplicate records.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/NIFI-GetSplunk-JSON-to-PutParquet-seeing-duplicate-records/m-p/294194#M217121</link>
      <description>&lt;P&gt;&lt;STRONG&gt;Hi All,&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;I am newbie to NIFI!&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Today I got one problem of duplicate records, Below is the following &lt;/STRONG&gt;&lt;STRONG&gt;scenario:&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;We need to get data from Splunk to HDFS in Parquet format so we created a Data Flow with NIFI &lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;- I am getting data from Splunk using GetSplunk in JSON format and later putting in HDFS using the PutParquet processor with JsonTreeReader, AvroSchema.&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;- I am successful in this but later I see there are duplicate records of each record, So seeking help here to fix this issue and below is the sample JSON record and PFA for the NIFI DataFlow screenshot. &lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Thanks In Advance.&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Example Data Record:&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;[ {&lt;/P&gt;&lt;P&gt;&amp;nbsp; "preview" : true,&lt;/P&gt;&lt;P&gt;&amp;nbsp; "offset" : 0,&lt;/P&gt;&lt;P&gt;&amp;nbsp; "result" : {&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "action" : "allowed",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "app" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dest" : "xxxx.xx.xxx.xxx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dest_bunit" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dest_category" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dest_ip" : "xxx.xx.xxx.xxx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dest_port" : "xxx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dest_priority" : 
"",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "direction" : "N/A",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "duration" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dvc" : "xxx.xx.xxx.xxx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dvc_ip" : "xxx.xx.xxx.xxx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "protocol" : "HTTPS",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "response_time" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "rule" : "/Common/ds_policy_2",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "session_id" : "ad240f0634150d02",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "src" : "xx.xxx.xxx.xx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "src_bunit" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "src_category" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "src_ip" : "xx.xxx.xxx.xx",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "src_port" : "62858",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "src_priority" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "tag" : "proxy,web",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "usr" : "N/A",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "user_bunit" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "user_category" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "user_priority" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "vendor_product" : "ASM",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "vendor_product_uuid" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "ts" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "description" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "action_reason" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "severity" : "Informational",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "user_type" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "service_type" : "",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "dt" : 
"20200331",&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; "hr" : "15"&lt;/P&gt;&lt;P&gt;&amp;nbsp; },&lt;/P&gt;&lt;P&gt;&amp;nbsp; "lastrow" : null&lt;/P&gt;&lt;P&gt;} ]&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="NIFIDATAFLOW.PNG" style="width: 999px;"&gt;&lt;img src="https://community.cloudera.com/t5/image/serverpage/image-id/27151iEAA77DFAB7552141/image-size/large?v=v2&amp;amp;px=999" role="button" title="NIFIDATAFLOW.PNG" alt="NIFIDATAFLOW.PNG" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 17 Apr 2020 03:23:31 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/NIFI-GetSplunk-JSON-to-PutParquet-seeing-duplicate-records/m-p/294194#M217121</guid>
      <dc:creator>NBharadwaj</dc:creator>
      <dc:date>2020-04-17T03:23:31Z</dc:date>
    </item>
    <item>
      <title>Re: NIFI GetSplunk JSON  to PutParquet seeing duplicate records.</title>
      <link>https://community.cloudera.com/t5/Support-Questions/NIFI-GetSplunk-JSON-to-PutParquet-seeing-duplicate-records/m-p/297995#M218926</link>
      <description>&lt;P&gt;Hello All,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;After long research, I found that if I use the cron-driven strategy, the query itself triggers 3 times, so I changed this to the timer-driven strategy. Now the query triggers only one time, so I get only one flow file, and this avoids duplicates for me. Thanks!&lt;/P&gt;</description>
      <pubDate>Sat, 13 Jun 2020 02:12:25 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/NIFI-GetSplunk-JSON-to-PutParquet-seeing-duplicate-records/m-p/297995#M218926</guid>
      <dc:creator>NBharadwaj</dc:creator>
      <dc:date>2020-06-13T02:12:25Z</dc:date>
    </item>
  </channel>
</rss>

