<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Index/Rank a Grouped Rdd in Spark Scala in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129438#M51595</link>
    <description>&lt;P&gt;I have been really looking to index/rank a grouped rdd. The RDD was grouped by a key as you can see below and I want to index it starting from &lt;STRONG&gt;number 2 for each of the arrays under a key.&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;What I have &lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;scala.collection.immutable.Map[String,Array[(String, String, String)]] = 
Map( 
        394 -&amp;gt; Array((394,394,0), (394,362,7), (394,368,7)), 
        328 -&amp;gt; Array((328,328,0), (328,324,7), (328,325,7), (328,326,7), (328,327,7), (328,329,7)),
        368 -&amp;gt; Array((368,368,0), (368,394,7), (368,396,7), (368,397,7), (368,479896,7)),
        278 -&amp;gt; Array((278,278,0), (278,371,7), (278,372,7))
)&lt;/PRE&gt;&lt;P&gt;&lt;STRONG&gt;What I want (Notice the new 4th element of each Array; it's an index starting from 2)
&lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;
        394 -&amp;gt; Array((394,394,0,2), (394,362,7,3), (394,368,7,4)), 
        328 -&amp;gt; Array((328,328,0,2), (328,324,7,3), (328,325,7,4), (328,326,7,5), (328,327,7,6), (328,329,7,7)),
        368 -&amp;gt; Array((368,368,0,2), (368,394,7,3), (368,396,7,4), (368,397,7,5), (368,479896,7,6)),
        278 -&amp;gt; Array((278,278,0,2), (278,371,7,3), (278,372,7,4))
  &lt;/PRE&gt;</description>
    <pubDate>Sun, 15 Jan 2017 07:12:01 GMT</pubDate>
    <dc:creator>adnanalvee</dc:creator>
    <dc:date>2017-01-15T07:12:01Z</dc:date>
    <item>
      <title>Index/Rank a Grouped Rdd in Spark Scala</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129438#M51595</link>
      <description>&lt;P&gt;I have been really looking to index/rank a grouped rdd. The RDD was grouped by a key as you can see below and I want to index it starting from &lt;STRONG&gt;number 2 for each of the arrays under a key.&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;What I have &lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;scala.collection.immutable.Map[String,Array[(String, String, String)]] = 
Map( 
        394 -&amp;gt; Array((394,394,0), (394,362,7), (394,368,7)), 
        328 -&amp;gt; Array((328,328,0), (328,324,7), (328,325,7), (328,326,7), (328,327,7), (328,329,7)),
        368 -&amp;gt; Array((368,368,0), (368,394,7), (368,396,7), (368,397,7), (368,479896,7)),
        278 -&amp;gt; Array((278,278,0), (278,371,7), (278,372,7))
)&lt;/PRE&gt;&lt;P&gt;&lt;STRONG&gt;What I want (Notice the new 4th element of each Array; it's an index starting from 2)
&lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;
        394 -&amp;gt; Array((394,394,0,2), (394,362,7,3), (394,368,7,4)), 
        328 -&amp;gt; Array((328,328,0,2), (328,324,7,3), (328,325,7,4), (328,326,7,5), (328,327,7,6), (328,329,7,7)),
        368 -&amp;gt; Array((368,368,0,2), (368,394,7,3), (368,396,7,4), (368,397,7,5), (368,479896,7,6)),
        278 -&amp;gt; Array((278,278,0,2), (278,371,7,3), (278,372,7,4))
  &lt;/PRE&gt;</description>
      <pubDate>Sun, 15 Jan 2017 07:12:01 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129438#M51595</guid>
      <dc:creator>adnanalvee</dc:creator>
      <dc:date>2017-01-15T07:12:01Z</dc:date>
    </item>
    <item>
      <title>Re: Index/Rank a Grouped Rdd in Spark Scala</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129439#M51596</link>
      <description>&lt;P&gt;&lt;A rel="user" href="https://community.cloudera.com/users/472/jwiden.html" nodeid="472"&gt;@Joe Widen&lt;/A&gt; &lt;A rel="user" href="https://community.cloudera.com/users/9304/tspann.html" nodeid="9304"&gt;@Timothy Spann&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 16 Jan 2017 08:45:14 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129439#M51596</guid>
      <dc:creator>adnanalvee</dc:creator>
      <dc:date>2017-01-16T08:45:14Z</dc:date>
    </item>
    <item>
      <title>Re: Index/Rank a Grouped Rdd in Spark Scala</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129440#M51597</link>
      <description>&lt;P&gt;Use the mapValues API.  I made an example of doing what you wanted below.  You'll have to update the ListBuffer to use the types you have, but it looks like it's doing what you want.&lt;/P&gt;&lt;PRE&gt;val rdd1 = sc.parallelize(Array((1,2),(2,3),(1,3),(2,4)))
val gRdd = rdd1.groupByKey()
val indxRdd = gRdd.mapValues(a =&amp;gt; {
  val b = a.toArray
  var indx = 2
  val lb = new ListBuffer[(Int, Int)]
  for(i &amp;lt;- 0 to b.size-1) {
    lb.append((b(i), indx))
    indx += 1
  }
  lb.toArray
}).collectAsMap()
&lt;/PRE&gt;&lt;PRE&gt;indxRdd.collectAsMap()
res8: scala.collection.Map[Int,Array[(Int, Int)]] = 
Map(
	2 -&amp;gt; Array((3,2), (4,3)), 
	1 -&amp;gt; Array((2,2), (3,3))
)&lt;/PRE&gt;</description>
      <pubDate>Mon, 23 Jan 2017 23:11:51 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129440#M51597</guid>
      <dc:creator>jwiden</dc:creator>
      <dc:date>2017-01-23T23:11:51Z</dc:date>
    </item>
    <item>
      <title>Re: Index/Rank a Grouped Rdd in Spark Scala</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129441#M51598</link>
      <description>&lt;P&gt;Thank you so much!&lt;/P&gt;</description>
      <pubDate>Mon, 23 Jan 2017 23:35:46 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Index-Rank-a-Grouped-Rdd-in-Spark-Scala/m-p/129441#M51598</guid>
      <dc:creator>adnanalvee</dc:creator>
      <dc:date>2017-01-23T23:35:46Z</dc:date>
    </item>
  </channel>
</rss>

