<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Putting SequenceFile key value into a data frame in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341947#M233655</link>
    <description>&lt;P&gt;&lt;SPAN&gt;André,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Thanks for the elegant solution. &lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;</description>
    <pubDate>Fri, 22 Apr 2022 00:07:30 GMT</pubDate>
    <dc:creator>Seaport</dc:creator>
    <dc:date>2022-04-22T00:07:30Z</dc:date>
    <item>
      <title>Putting SequenceFile key value into a data frame</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341836#M233620</link>
      <description>&lt;P&gt;I saved thousands of small JSON files in SequenceFile format to resolve the "small file issue". I use the following PySpark code to parse the JSON data from the saved sequence files.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;reader= sc.sequenceFile("/mysequencefile_dir", "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
rdd=reader.map(lambda x: x[1])
mydf=spark.read.schema(myschema).json(rdd)
mydf.show(truncate=False)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The code worked. However, I do not know how to put the key value from the sequence file, which is actually the original JSON file name, into the mydf DataFrame. Please advise. Thank you.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;</description>
      <pubDate>Tue, 21 Apr 2026 07:57:23 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341836#M233620</guid>
      <dc:creator>Seaport</dc:creator>
      <dc:date>2026-04-21T07:57:23Z</dc:date>
    </item>
    <item>
      <title>Re: Putting SequenceFile key value into a data frame</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341837#M233621</link>
      <description>&lt;P&gt;I did a workaround by injecting the myfilepath element into the json string.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;rdd=reader.map(lambda x: str(x[1])[0]+'"myfilepath":"'+x[0]+'",'+str(x[1])[1:])&lt;/LI-CODE&gt;&lt;P&gt;It does not look like a very clean solution. Is there a better one? Thanks.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Regards&lt;/P&gt;</description>
      <pubDate>Thu, 21 Apr 2022 00:46:58 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341837#M233621</guid>
      <dc:creator>Seaport</dc:creator>
      <dc:date>2022-04-21T00:46:58Z</dc:date>
    </item>
    <item>
      <title>Re: Putting SequenceFile key value into a data frame</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341861#M233626</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/45630"&gt;@Seaport&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Please try the below:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import json

def jsonize(k, v):
    ret = json.loads(v)
    ret.update({'key': k})
    return ret

...
rdd = reader.map(lambda x: jsonize(*x))
...&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;You need to make sure your schema includes the added key column.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Cheers,&lt;/P&gt;&lt;P&gt;André&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 21 Apr 2022 05:27:15 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341861#M233626</guid>
      <dc:creator>araujo</dc:creator>
      <dc:date>2022-04-21T05:27:15Z</dc:date>
    </item>
    <item>
      <title>Re: Putting SequenceFile key value into a data frame</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341947#M233655</link>
      <description>&lt;P&gt;&lt;SPAN&gt;André,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Thanks for the elegant solution. &lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;</description>
      <pubDate>Fri, 22 Apr 2022 00:07:30 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Putting-SequenceFile-key-value-into-a-data-frame/m-p/341947#M233655</guid>
      <dc:creator>Seaport</dc:creator>
      <dc:date>2022-04-22T00:07:30Z</dc:date>
    </item>
  </channel>
</rss>

