<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Parquet as Array[Byte] to DataFrame without writing to disk in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/Parquet-as-Array-Byte-to-DataFrame-without-writing-to-disk/m-p/342678#M233793</link>
    <description>&lt;P&gt;Hi&lt;/P&gt;&lt;P&gt;From an API I receive a parquet as an Array[Byte] (stored here as parquetPayload), and want to convert that into a DataFrame. My current function is below, and includes a write to disk&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="java"&gt;val tempFilePath = new Path("/tmp/", java.util.UUID.randomUUID().toString + ".tmp")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)

val stream = fs.create(tempFilePath, true)
try {
  stream.write(parquetPayload, 0, parquetPayload.length)
} finally {
  stream.close()
}

val df = spark.read.parquet(tempFilePath.toString) &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This works, but I would like to avoid the write to disk, and keep this all in memory. Is this possible?&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Fri, 29 Apr 2022 08:47:14 GMT</pubDate>
    <dc:creator>JoeR</dc:creator>
    <dc:date>2022-04-29T08:47:14Z</dc:date>
    <item>
      <title>Parquet as Array[Byte] to DataFrame without writing to disk</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Parquet-as-Array-Byte-to-DataFrame-without-writing-to-disk/m-p/342678#M233793</link>
      <description>&lt;P&gt;Hi&lt;/P&gt;&lt;P&gt;From an API I receive a parquet as an Array[Byte] (stored here as parquetPayload), and want to convert that into a DataFrame. My current function is below, and includes a write to disk&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="java"&gt;val tempFilePath = new Path("/tmp/", java.util.UUID.randomUUID().toString + ".tmp")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)

val stream = fs.create(tempFilePath, true)
try {
  stream.write(parquetPayload, 0, parquetPayload.length)
} finally {
  stream.close()
}

val df = spark.read.parquet(tempFilePath.toString) &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This works, but I would like to avoid the write to disk, and keep this all in memory. Is this possible?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 29 Apr 2022 08:47:14 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Parquet-as-Array-Byte-to-DataFrame-without-writing-to-disk/m-p/342678#M233793</guid>
      <dc:creator>JoeR</dc:creator>
      <dc:date>2022-04-29T08:47:14Z</dc:date>
    </item>
    <item>
      <title>Re: Parquet as Array[Byte] to DataFrame without writing to disk</title>
      <link>https://community.cloudera.com/t5/Support-Questions/Parquet-as-Array-Byte-to-DataFrame-without-writing-to-disk/m-p/342713#M233801</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/97624"&gt;@JoeR&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Spark will support reading files with multiple file formats like parquet, orc, json, xml, avro, csv etc. I think there is no direct mechanism to read the data from the payload.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;If I find a different solution, I will share it with you.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 29 Apr 2022 12:29:57 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/Parquet-as-Array-Byte-to-DataFrame-without-writing-to-disk/m-p/342713#M233801</guid>
      <dc:creator>RangaReddy</dc:creator>
      <dc:date>2022-04-29T12:29:57Z</dc:date>
    </item>
  </channel>
</rss>

