# TypeError: 'GroupedData' object is not iterable in pyspark

Labels: Apache Spark

New Contributor
Created on 10-17-2017 09:22 AM (edited 09-16-2022 05:24 AM)

I'm using PySpark 2.0.1 and Python 2.7. I'm running the following code and getting the error message `TypeError: 'GroupedData' object is not iterable`. Can you please help me?

```python
from pyspark.sql.functions import monotonically_increasing_id
import numpy as np

# This will return a new DF with all the columns + id
data1 = data.withColumn("id", monotonically_increasing_id())  # Create an integer index
data1.show()

def create_indexes(df, fields=['country', 'state_id', 'airport', 'airport_id']):
    """Create indexes for the different element ids for CMRs. This allows us to
    select CMRs that match a given element and element value very quickly."""
    if fields is None:
        print("No fields specified, returning")
        return
    for field in fields:
        if field not in df.columns:
            print('field: ', field, " is not in the data...")
            return
    indexes = {}
    for field in fields:
        print(field)
        res = df.groupby(field)
        # The next line raises the TypeError: GroupedData cannot be iterated
        index = {label: np.array(vals['id'], np.int32) for label, vals in res}
        indexes[field] = index
    return indexes

# Create indexes. Some of them take a lot of time!
# Changed dom_client_id to gbl_buy_grp_id as it was changed in Line Number
indexes = create_indexes(data1, fields=['country', 'state_id', 'airport', 'airport_id'])
print type(indexes)
```
1 Reply
New Contributor
Created 10-18-2017 06:00 AM
Any help please?
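
For anyone who hits this: the error comes from the dictionary comprehension inside `create_indexes`. `df.groupby(field)` returns a `GroupedData` object, which cannot be iterated directly; an aggregation has to be applied to it first. Below is a minimal sketch of one way to build the same per-field index, assuming the `id` column created with `monotonically_increasing_id()` in the question; it gathers the ids for each group with `collect_list` and then iterates the collected rows on the driver.

```python
import numpy as np
from pyspark.sql.functions import collect_list

def create_indexes(df, fields=['country', 'state_id', 'airport', 'airport_id']):
    """Build {field: {group value: array of row ids}} without iterating GroupedData."""
    indexes = {}
    for field in fields:
        # Aggregate first: the result of .agg() is a plain DataFrame,
        # which can be collected and iterated, unlike GroupedData itself.
        rows = df.groupby(field).agg(collect_list('id').alias('ids')).collect()
        # monotonically_increasing_id() produces 64-bit values, so use int64
        indexes[field] = {row[field]: np.array(row['ids'], np.int64) for row in rows}
    return indexes

indexes = create_indexes(data1, fields=['country', 'state_id', 'airport', 'airport_id'])
```

Note that this collects one id per row of the DataFrame to the driver, so it is only practical for moderately sized data (as was true of the original approach); caching `data1` beforehand also avoids recomputing it for each field's aggregation.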
