Support Questions

VidyaSargur · ‎12-18-2024

Hi Everyone,

I have the below Input JSON and would like to flatten the data into the below mentioned output JSON format. I am new to the JOLT, I tried to transform (its incomplete) but could not achieve the result.

Kindly help me to get the desired output, thanks in advance.

Input JSON:

{
  "data": {
    "getItemListing": {
      "edges": [
        {
          "node": {
            "id": "2940126",
            "identifier": 3119650,
            "parentItems": [
              {
                "id": "364270",
                "parentIdentifier": 371153
              },
              {
                "id": "364258",
                "parentIdentifier": 371147
              }
            ],
            "treatmentClusterIds": [
              {
                "metadata": [
                  {
                    "treatmentClusterIDs": "528"
                  }
                ],
                "element": {
                  "name": "GOAT02",
                  "labelingClusters": [
                    {
                      "labelingCluster": "1000"
                    }
                  ]
                }
              },
              {
                "metadata": [
                  {
                    "treatmentClusterIDs": "529"
                  }
                ],
                "element": {
                  "name": "GOAT03",
                  "labelingClusters": [
                    {
                      "labelingCluster": "4045"
                    }
                  ]
                }
              },
              {
                "metadata": [
                  {
                    "treatmentClusterIDs": "5581"
                  }
                ],
                "element": {
                  "name": "GOAT04",
                  "labelingClusters": [
                    {
                      "labelingCluster": "2000"
                    },
                    {
                      "labelingCluster": "3000"
                    },
                    {
                      "labelingCluster": "4000"
                    }
                  ]
                }
              }
            ]
          }
        },
        {
          "node": {
            "id": "2940127",
            "identifier": 3119651,
            "parentItems": [
              {
                "id": "364270",
                "parentIdentifier": 371154
              }
            ],
            "treatmentClusterIds": [
              {
                "metadata": [
                  {
                    "treatmentClusterIDs": "527"
                  }
                ],
                "element": {
                  "name": "GOAT01",
                  "labelingClusters": [
                    {
                      "labelingCluster": "1000"
                    },
                    {
                      "labelingCluster": "4045"
                    },
                    {
                      "labelingCluster": "2000"
                    },
                    {
                      "labelingCluster": "3000"
                    },
                    {
                      "labelingCluster": "4000"
                    }
                  ]
                }
              }
            ]
          }
        }
      ]
    }
  }
}

Output JSON:

[ {
  "identifier": "3119650",
  "parentIdentifier": "371153",
  "Element" : "GOAT02",
  "TreatmentId" : "528",
  "LabelingCluster" : "1000"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371153",
  "Element" : "GOAT03",
  "TreatmentId" : "529",
  "LabelingCluster" : "4045"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371153",
  "Element" : "GOAT04",
  "TreatmentId" : "5581",
  "LabelingCluster" : "2000"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371153",
  "Element" : "GOAT04",
  "TreatmentId" : "5581",
  "LabelingCluster" : "3000"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371153",
  "Element" : "GOAT04",
  "TreatmentId" : "5581",
  "LabelingCluster" : "4000"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371147",
  "Element" : "GOAT02",
  "TreatmentId" : "528",
  "LabelingCluster" : "1000"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371147",
  "Element" : "GOAT03",
  "TreatmentId" : "529",
  "LabelingCluster" : "4045"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371147",
  "Element" : "GOAT04",
  "TreatmentId" : "5581",
  "LabelingCluster" : "2000"
}, {
  "identifier": "3119650",
  "parentIdentifier": "371147",
  "Element" : "GOAT04",
  "TreatmentId" : "5581",
  "LabelingCluster" : "3000"
}, {
  "identifier": "300119650",
  "parentIdentifier": "371147",
  "Element" : "GOAT04",
  "TreatmentId" : "5581",
  "LabelingCluster" : "4000"
}, {
  "identifier": "3119651",
  "parentIdentifier": "371154",
  "Element" : "GOAT01",
  "TreatmentId" : "527",
  "LabelingCluster" : "1000"
}, {
  "identifier": "3119651",
  "parentIdentifier": "371154",
  "Element" : "GOAT01",
  "TreatmentId" : "527",
  "LabelingCluster" : "4045"
}, {
  "identifier": "3119651",
  "parentIdentifier": "371154",
  "Element" : "GOAT01",
  "TreatmentId" : "527",
  "LabelingCluster" : "2000"
}, {
  "identifier": "3119651",
  "parentIdentifier": "371154",
  "Element" : "GOAT01",
  "TreatmentId" : "527",
  "LabelingCluster" : "3000"
}, {
  "identifier": "3119651",
  "parentIdentifier": "371154",
  "Element" : "GOAT01",
  "TreatmentId" : "527",
  "LabelingCluster" : "4000"
} ]

JOLT spec I tired (incomplete):

[
  {
    "operation": "shift",
    "spec": {
      "data": {
        "getItemListing": {
          "edges": {
            "*": {
              "node": {
                "treatmentClusterIds": {
                  "*": "[]"
                }
              }
            }
          }
        }
      }
    }
 }, {
    "operation": "shift",
    "spec": {
      "*": {
        "element": {
          "labelingClusters": {
            "*": {
              "labelingCluster": "[&4].[&1].LabelingCluster",
              "@(2,name)": "[&4].[&1].Element",
              "@3,metadata": {
                "*": {
                  "treatmentClusterIDs": "[&6].[&2].TreatmentId"
                }
              }
            }
          }
        }
      }
    }
 },
  {
    "operation": "shift",
    "spec": {
      "*": {
        "*": "[]"
      }
    }
 }
]

SAMSAL · ‎12-19-2024

Hi @Velankanni ,

I got to say that this is one of the most complex jolt I have ever written. Im starting to question if its even worth doing it because it will make maintaining the the spec very hard. I would urge you to look at this post which has similar request of flattening complex json and there I recommended to use JSTL transformation spec instead as it would simplify things by a lot.

As far as the jolt spec , I was able to do it in 4 transformation:

1- Start form the leaf "labelingCluster" and work your way up to collect common data for each node and assign arrays for the values of metadata & parent identifiers.

2- traverse the metadata array from step 1 to replicate the values from above against each metadata array item.

3- traverse the parent identifier array from step 1 to replicate the values from step2 against each parent identifier array item.

4- Bucket result from step3 into single array

It important to maintain parent\sub array index position during each transformation to help you group fields correctly.

[
  {
    "operation": "shift",
    "spec": {
      "data": {
        "getItemListing": {
          "edges": {
            "*": {
              "node": {
                "treatmentClusterIds": {
                  "*": {
                    "element": {
                      "labelingClusters": {
                        "*": {
                          "labelingCluster": {
                            "@(3,name)": "[&8].treatmentClusterIds[&5].labelingClusters[&2].Element",
                            "@(6,identifier)": "[&8].treatmentClusterIds[&5].labelingClusters[&2].identifier",
                            "@": "[&8].treatmentClusterIds[&5].labelingClusters[&2].labelingCluster",
                            "@(4,metadata)": {
                              "*": {
                                "treatmentClusterIDs": "[&10].treatmentClusterIds[&7].labelingClusters[&4].metadata[]"
                              }
                            },
                            "@(6,parentItems)": {
                              "*": {
                                "parentIdentifier": "[&10].treatmentClusterIds[&7].labelingClusters[&4].parentId[]"
                              }
                            }
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  {
    "operation": "shift",
    "spec": {
      "*": {
        "treatmentClusterIds": {
          "*": {
            "labelingClusters": {
              "*": {
                "metadata": {
                  "*": {
                    "@(2,identifier)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].identifier",
                    "@": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].TreatmentId",
                    "@(2,Element)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].Element",
                    "@(2,labelingCluster)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].labelingCluster",
                    "@(2,parentId)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].parentId"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  ,
  {
    "operation": "shift",
    "spec": {
      "*": {
        "treatmentClusterIds": {
          "*": {
            "labelingClusters": {
              "*": {
                "*": {
                  "parentId": {
                    "*": {
                      "@(2,identifier)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].identifier",
                      "@": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].parentIdentifier",
                      "@(2,Element)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].Element",
                      "@(2,TreatmentId)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].TreatmentId",
                      "@(2,labelingCluster)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].labelingCluster"
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  {
    "operation": "shift",
    "spec": {
      "*": {
        "treatmentClusterIds": {
          "*": {
            "labelingClusters": {
              "*": {
                "*": {
                  "*": "[]"
                }
              }
            }
          }
        }
      }
    }
  }

]

Again, look into JSTL spec to simplify, or try to break up the json and store into database where you can easily perform sql queries to flatten the data using join

If this helps please accept the solution

Thanks

View solution in original post

VidyaSargur · ‎12-18-2024

@Velankanni, Welcome to our community! To help you get the best possible answer, I have tagged in our NiFi experts @SAMSAL @MattWho who may be able to assist you further.

Please feel free to provide any additional information or details about your query. We hope that you will find a satisfactory solution to your question.

Regards,

Vidya Sargur,
Community Manager

Was your question answered? Make sure to mark the answer as the accepted solution.
If you find a reply useful, say thanks by clicking on the thumbs up button.
Learn more about the Cloudera Community:
Community Guidelines
How to use the forum

Velankanni · ‎12-19-2024

Hi @VidyaSargur , Thank you.

SAMSAL · ‎12-19-2024

Hi @Velankanni ,

Can you explain what are you trying to accomplish with the transformation?

I was having a hard time understanding how you came up with output given that you are pulling fields from different arrays with different cardinality.

Velankanni · ‎12-19-2024

Hi @SAMSAL , Thank you, this JSON is for Bill Of Materials and we have to flatten the entire array at each node and repeat the array values to other columns.

Shelton · ‎12-19-2024

@Velankanni
If you are still having the problem you can you try this JOLT remember the spoiler tag distorts the JSON

JOLT Spec

Spoiler

[
{
"operation": "shift",
"spec": {
"data": {
"getItemListing": {
"edges": {
"*": {
"node": {
"identifier": "[&1].identifier",
"parentItems": {
"*": {
"parentIdentifier": "[&3].[&1].parentIdentifier"
}
},
"treatmentClusterIds": {
"*": {
"metadata": {
"*": {
"treatmentClusterIDs": "[&4].[&2].[&1].TreatmentId"
}
},
"element": {
"name": "[&4].[&2].[&1].Element",
"labelingClusters": {
"*": {
"labelingCluster": "[&5].[&3].[&2].LabelingCluster"
}
}
}
}
}
}
}
}
}
}
}
},
{
"operation": "shift",
"spec": {
"*": {
"*": {
"*": {
"*": {
"*": {
"*": {
"$": "[#6].identifier",
"parentIdentifier": "[#6].parentIdentifier",
"Element": "[#6].Element",
"TreatmentId": "[#6].TreatmentId",
"LabelingCluster": "[#6].LabelingCluster"
}
}
}
}
}
}
}
}
]

[{"operation": "shift","spec": {"data": {"getItemListing": {"edges": {"*": {"node": {"identifier": "[&1].identifier","parentItems": {"*": {"parentIdentifier": "[&3].[&1].parentIdentifier"}},"treatmentClusterIds": {"*": {"metadata": {"*": {"treatmentClusterIDs": "[&4].[&2].[&1].TreatmentId"}},"element": {"name": "[&4].[&2].[&1].Element","labelingClusters": {"*": {"labelingCluster": "[&5].[&3].[&2].LabelingCluster"}}}}}}}}}}}},{"operation": "shift","spec": {"*": {"*": {"*": {"*": {"*": {"*": {"$": "[#6].identifier","parentIdentifier": "[#6].parentIdentifier","Element": "[#6].Element","TreatmentId": "[#6].TreatmentId","LabelingCluster": "[#6].LabelingCluster"}}}}}}}}]

Place the Input JSON in a file input.json.
Use a JOLT processor
Apply the spec above and verify the output matches the Output JSON format.

Happy hadooping

SAMSAL · ‎12-19-2024

Hi @Velankanni ,

I got to say that this is one of the most complex jolt I have ever written. Im starting to question if its even worth doing it because it will make maintaining the the spec very hard. I would urge you to look at this post which has similar request of flattening complex json and there I recommended to use JSTL transformation spec instead as it would simplify things by a lot.

As far as the jolt spec , I was able to do it in 4 transformation:

1- Start form the leaf "labelingCluster" and work your way up to collect common data for each node and assign arrays for the values of metadata & parent identifiers.

2- traverse the metadata array from step 1 to replicate the values from above against each metadata array item.

3- traverse the parent identifier array from step 1 to replicate the values from step2 against each parent identifier array item.

4- Bucket result from step3 into single array

It important to maintain parent\sub array index position during each transformation to help you group fields correctly.

[
  {
    "operation": "shift",
    "spec": {
      "data": {
        "getItemListing": {
          "edges": {
            "*": {
              "node": {
                "treatmentClusterIds": {
                  "*": {
                    "element": {
                      "labelingClusters": {
                        "*": {
                          "labelingCluster": {
                            "@(3,name)": "[&8].treatmentClusterIds[&5].labelingClusters[&2].Element",
                            "@(6,identifier)": "[&8].treatmentClusterIds[&5].labelingClusters[&2].identifier",
                            "@": "[&8].treatmentClusterIds[&5].labelingClusters[&2].labelingCluster",
                            "@(4,metadata)": {
                              "*": {
                                "treatmentClusterIDs": "[&10].treatmentClusterIds[&7].labelingClusters[&4].metadata[]"
                              }
                            },
                            "@(6,parentItems)": {
                              "*": {
                                "parentIdentifier": "[&10].treatmentClusterIds[&7].labelingClusters[&4].parentId[]"
                              }
                            }
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  {
    "operation": "shift",
    "spec": {
      "*": {
        "treatmentClusterIds": {
          "*": {
            "labelingClusters": {
              "*": {
                "metadata": {
                  "*": {
                    "@(2,identifier)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].identifier",
                    "@": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].TreatmentId",
                    "@(2,Element)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].Element",
                    "@(2,labelingCluster)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].labelingCluster",
                    "@(2,parentId)": "[&7].treatmentClusterIds[&5].labelingClusters[&3][&1].parentId"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  ,
  {
    "operation": "shift",
    "spec": {
      "*": {
        "treatmentClusterIds": {
          "*": {
            "labelingClusters": {
              "*": {
                "*": {
                  "parentId": {
                    "*": {
                      "@(2,identifier)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].identifier",
                      "@": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].parentIdentifier",
                      "@(2,Element)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].Element",
                      "@(2,TreatmentId)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].TreatmentId",
                      "@(2,labelingCluster)": "[&8].treatmentClusterIds[&6].labelingClusters[&4][&3][&1].labelingCluster"
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  {
    "operation": "shift",
    "spec": {
      "*": {
        "treatmentClusterIds": {
          "*": {
            "labelingClusters": {
              "*": {
                "*": {
                  "*": "[]"
                }
              }
            }
          }
        }
      }
    }
  }

]

Again, look into JSTL spec to simplify, or try to break up the json and store into database where you can easily perform sql queries to flatten the data using join

If this helps please accept the solution

Thanks

Velankanni · ‎12-19-2024

Hi @SAMSAL ,

This works really fine. Thank you so much for your solution.

I got your idea of splitting the JSON and perform the transformation in SQL table. I will work on that.

Thank you again.

Support Questions

Jolt spec to flatten the nested JSON