<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Spark connecting two hadoop clusters in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150116#M40547</link>
    <description>&lt;P&gt;Hi &lt;/P&gt;&lt;P&gt;I want to source data from two hadoop clusters and join in Spark. Will it be possible, as shown below?&lt;/P&gt;&lt;P&gt;//data from cluster1&lt;/P&gt;&lt;P&gt;val erorDF = spark.read.json("hdfs://master:8020//user/ubuntu/error.json") &lt;/P&gt;&lt;P&gt;erorDF.registerTempTable("erorDFTBL") &lt;/P&gt;&lt;P&gt;//data from cluster2&lt;/P&gt;&lt;P&gt;val erorDF2 = spark.read.json("hdfs://master2:8020//user/ubuntu/error.json") &lt;/P&gt;&lt;P&gt;erorDF2.registerTempTable("erorDFTBL2")&lt;/P&gt;</description>
    <pubDate>Wed, 14 Sep 2016 02:46:43 GMT</pubDate>
    <dc:creator>ambariCloud</dc:creator>
    <dc:date>2016-09-14T02:46:43Z</dc:date>
    <item>
      <title>Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150116#M40547</link>
      <description>&lt;P&gt;Hi &lt;/P&gt;&lt;P&gt;I want to source data from two hadoop clusters and join in Spark. Will it be possible, as shown below?&lt;/P&gt;&lt;P&gt;//data from cluster1&lt;/P&gt;&lt;P&gt;val erorDF = spark.read.json("hdfs://master:8020//user/ubuntu/error.json") &lt;/P&gt;&lt;P&gt;erorDF.registerTempTable("erorDFTBL") &lt;/P&gt;&lt;P&gt;//data from cluster2&lt;/P&gt;&lt;P&gt;val erorDF2 = spark.read.json("hdfs://master2:8020//user/ubuntu/error.json") &lt;/P&gt;&lt;P&gt;erorDF2.registerTempTable("erorDFTBL2")&lt;/P&gt;</description>
      <pubDate>Wed, 14 Sep 2016 02:46:43 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150116#M40547</guid>
      <dc:creator>ambariCloud</dc:creator>
      <dc:date>2016-09-14T02:46:43Z</dc:date>
    </item>
    <item>
      <title>Re: Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150117#M40548</link>
      <description>&lt;P&gt;Sure! I just did it (with PySpark in Zeppelin, though):&lt;/P&gt;&lt;P&gt;&lt;IMG src="https://community.cloudera.com/legacyfs/online/attachments/7614-screen-shot-2016-09-14-at-41323-pm.png" /&gt;&lt;/P&gt;&lt;P&gt;For my test, I spun up two instances of HDP sandbox on Azure, and put a file into HDFS on each cluster. The code snippet reads each file and counts lines in each file individually, then concatenates the data sets and counts the lines of the union. &lt;/P&gt;</description>
      <pubDate>Wed, 14 Sep 2016 21:16:52 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150117#M40548</guid>
      <dc:creator>Former Member</dc:creator>
      <dc:date>2016-09-14T21:16:52Z</dc:date>
    </item>
    <item>
      <title>Re: Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150118#M40549</link>
      <description>&lt;P&gt;Thank you Becker. Will there be any setup I need to do in Zeppelin? I am running my Zeppelin in cluster 1. &lt;/P&gt;</description>
      <pubDate>Wed, 14 Sep 2016 21:38:37 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150118#M40549</guid>
      <dc:creator>ambariCloud</dc:creator>
      <dc:date>2016-09-14T21:38:37Z</dc:date>
    </item>
    <item>
      <title>Re: Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150119#M40550</link>
      <description>&lt;P&gt;No additional setup required - the Spark libraries are automatically imported and the Spark context is provided implicitly by Zeppelin. For any additional dependencies that your project needs, use %dep - see the documentation in &lt;A href="https://zeppelin.apache.org/docs/latest/interpreter/spark.html" target="_blank"&gt;https://zeppelin.apache.org/docs/latest/interpreter/spark.html&lt;/A&gt;.&lt;/P&gt;</description>
      <pubDate>Wed, 14 Sep 2016 21:43:46 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150119#M40550</guid>
      <dc:creator>Former Member</dc:creator>
      <dc:date>2016-09-14T21:43:46Z</dc:date>
    </item>
    <item>
      <title>Re: Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150120#M40551</link>
      <description>&lt;P&gt;Tested below in AWS. Looks good. Thank you&lt;/P&gt;&lt;P&gt;//read error JSON file from cluster 1 &lt;/P&gt;&lt;P&gt;val erorDF = spark.read.json("hdfs://master:8020/user/ubuntu/error.json") &lt;/P&gt;&lt;P&gt;erorDF.registerTempTable("erorDFTBL") &lt;/P&gt;&lt;P&gt;//read file from cluster 2 &lt;/P&gt;&lt;P&gt;val erorDF2 = spark.read.json("hdfs://master2:8020/user/ubuntu/errors") &lt;/P&gt;&lt;P&gt;erorDF2.registerTempTable("erorDFTBL2")&lt;/P&gt;</description>
      <pubDate>Wed, 14 Sep 2016 22:36:12 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/150120#M40551</guid>
      <dc:creator>ambariCloud</dc:creator>
      <dc:date>2016-09-14T22:36:12Z</dc:date>
    </item>
    <item>
      <title>Re: Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/346506#M40552</link>
      <description>&lt;P&gt;Please let me know if it is possible to access the hive table present across multiple clusters (On Hortonworks on-premises cluster)&lt;/P&gt;</description>
      <pubDate>Wed, 29 Jun 2022 07:05:19 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/346506#M40552</guid>
      <dc:creator>Chandraprabu</dc:creator>
      <dc:date>2022-06-29T07:05:19Z</dc:date>
    </item>
    <item>
      <title>Re: Spark connecting two hadoop clusters</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/346528#M40553</link>
      <description>&lt;P&gt;&lt;a href="https://community.cloudera.com/t5/user/viewprofilepage/user-id/98886"&gt;@Chandraprabu&lt;/a&gt;&amp;nbsp;As this is an older post, we recommend starting a new thread. The new thread will provide the opportunity to provide details specific to your environment that could aid others in providing a more accurate answer to your question.&lt;/P&gt;</description>
      <pubDate>Wed, 29 Jun 2022 11:54:05 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Spark-connecting-two-hadoop-clusters/m-p/346528#M40553</guid>
      <dc:creator>cjervis</dc:creator>
      <dc:date>2022-06-29T11:54:05Z</dc:date>
    </item>
  </channel>
</rss>

