<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: How to transpose a pyspark dataframe? in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131473#M94157</link>
    <description>&lt;P&gt;Is there a way to get the row labels (_1, _2, _3, _4, _5, _6, _7) as the first column of the transposed dataframe?&lt;/P&gt;</description>
    <pubDate>Wed, 07 Jun 2017 20:28:43 GMT</pubDate>
    <dc:creator>arora_jyoti</dc:creator>
    <dc:date>2017-06-07T20:28:43Z</dc:date>
    <item>
      <title>How to transpose a pyspark dataframe?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131471#M94155</link>
      <description>&lt;P&gt;dt1 = {'one':[0.3, 1.2, 1.3, 1.5, 1.4, 1],'two':[0.6, 1.2, 1.7, 1.5,1.4, 2]}&lt;/P&gt;&lt;P&gt;dt = sc.parallelize([ (k,) + tuple(v[0:]) for k,v in dt1.items()]).toDF()&lt;/P&gt;&lt;P&gt;dt.show()&lt;/P&gt;&lt;P&gt;I want 'one' and 'two' to be the column headers, and all the list values should be the column values.&lt;/P&gt;</description>
      <pubDate>Tue, 17 Jan 2017 13:56:40 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131471#M94155</guid>
      <dc:creator>subhrajit</dc:creator>
      <dc:date>2017-01-17T13:56:40Z</dc:date>
    </item>
    <item>
      <title>Re: How to transpose a pyspark dataframe?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131472#M94156</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/2903/subhrmohanty.html" nodeid="2903"&gt;@subhrajit mohanty&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Here's an option using RDDs, and I'd like to see if anyone comes up with a good DataFrame solution.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Input Format:&lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;+---+---+---+---+---+---+---+
| _1| _2| _3| _4| _5| _6| _7|
+---+---+---+---+---+---+---+
|two|0.6|1.2|1.7|1.5|1.4|2.0|
|one|0.3|1.2|1.3|1.5|1.4|1.0|
+---+---+---+---+---+---+---+
&lt;/PRE&gt;&lt;P&gt;&lt;STRONG&gt;Output Format:&lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;+---+---+
|two|one|
+---+---+
|0.6|0.3|
|1.2|1.2|
|1.7|1.3|
|1.5|1.5|
|1.4|1.4|
|2.0|1.0|
+---+---+
&lt;/PRE&gt;&lt;P&gt;&lt;B&gt;Code:&lt;/B&gt;&lt;/P&gt;&lt;PRE&gt;import numpy as np
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit

# Sample data: each dict key becomes a row label, each list that row's values.
dt1 = {'one':[0.3, 1.2, 1.3, 1.5, 1.4, 1.0],'two':[0.6, 1.2, 1.7, 1.5, 1.4, 2.0]}
# NOTE: `sc` is assumed to be an already-created SparkContext.
dt = sc.parallelize([ (k,) + tuple(v[0:]) for k,v in dt1.items()]).toDF()
dt.show()


#--- Start of my Transpose Code ---
# Written without tuple-unpacking lambdas and without sorted(cmp=...), both of
# which were removed in Python 3, so this runs on Python 2 and Python 3 alike.


# Grab data from the first column, since it will be transposed to new column headers
label_col = dt.columns[0]
new_header = [row[0] for row in dt.select(label_col).collect()]

# Remove first column from dataframe
dt2 = dt.select([c for c in dt.columns if c != label_col])

# Convert DataFrame to RDD
rdd = dt2.rdd.map(tuple)

# Transpose Data
# Step 1: tag every cell as (row index i, column index j, value e).
rddT1 = rdd.zipWithIndex().flatMap(lambda xi: [(xi[1], j, e) for (j, e) in enumerate(xi[0])])
# Step 2: key each cell by its column index j, group, and order the groups by j.
rddT2 = rddT1.map(lambda ije: (ije[1], (ije[0], ije[2]))).groupByKey().sortByKey()
# Step 3: inside each group, order the (i, e) pairs by original row index i.
rddT3 = rddT2.map(lambda jx: sorted(jx[1], key=lambda ie: ie[0]))
# Step 4: drop the row index, leaving a plain list of values per output row
# (a real list, not a lazy map object, so toDF can infer the schema).
rddT4 = rddT3.map(lambda pairs: [e for (_, e) in pairs])

# Convert back to DataFrame (along with header)
df = rddT4.toDF(new_header)

df.show()
&lt;/PRE&gt;&lt;P&gt;Let me know if this helps.&lt;/P&gt;&lt;P&gt;Reference: &lt;A href="http://www.data-intuitive.com/2015/01/transposing-a-spark-rdd/" target="_blank"&gt;http://www.data-intuitive.com/2015/01/transposing-a-spark-rdd/&lt;/A&gt; &lt;/P&gt;</description>
      <pubDate>Fri, 20 Jan 2017 10:32:26 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131472#M94156</guid>
      <dc:creator>dzaratsian</dc:creator>
      <dc:date>2017-01-20T10:32:26Z</dc:date>
    </item>
    <item>
      <title>Re: How to transpose a pyspark dataframe?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131473#M94157</link>
      <description>&lt;P&gt;Is there a way to get the row labels (_1, _2, _3, _4, _5, _6, _7) as the first column of the transposed dataframe?&lt;/P&gt;</description>
      <pubDate>Wed, 07 Jun 2017 20:28:43 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-to-transpose-a-pyspark-dataframe/m-p/131473#M94157</guid>
      <dc:creator>arora_jyoti</dc:creator>
      <dc:date>2017-06-07T20:28:43Z</dc:date>
    </item>
  </channel>
</rss>

