<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: Kafka DirectStream and Python in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/82101#M84639</link>
    <description>&lt;P&gt;As of currently, Spark Structured Streaming is not supported:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://www.cloudera.com/documentation/spark2/latest/topics/spark2_known_issues.html#ki_structured_streaming" target="_blank"&gt;https://www.cloudera.com/documentation/spark2/latest/topics/spark2_known_issues.html#ki_structured_streaming&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 08 Nov 2018 09:01:48 GMT</pubDate>
    <dc:creator>manuelschipper</dc:creator>
    <dc:date>2018-11-08T09:01:48Z</dc:date>
    <item>
      <title>Kafka DirectStream and Python</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/81669#M84636</link>
      <description>&lt;P&gt;Hello all,&amp;nbsp;&lt;/P&gt;&lt;P&gt;Does Cloudera support Kafka direct stream with Python?&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm facing some issues with missing methods.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 16 Sep 2022 13:50:55 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/81669#M84636</guid>
      <dc:creator>ozw1z5rd</dc:creator>
      <dc:date>2022-09-16T13:50:55Z</dc:date>
    </item>
    <item>
      <title>Re: Kafka DirectStream and Python</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/81882#M84637</link>
      <description>&lt;P&gt;Could you elaborate on what you are trying to achieve and how? What kind of errors are you receiving?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm assuming you are talking about setting up a Spark/Kafka integration using Python as the Spark language. Something as follows:&lt;/P&gt;&lt;P&gt;&lt;A href="http://spark.apache.org/docs/latest/streaming-kafka-0-8-integration.html#approach-2-direct-approach-no-receivers" target="_blank"&gt;http://spark.apache.org/docs/latest/streaming-kafka-0-8-integration.html#approach-2-direct-approach-no-receivers&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Could you confirm or amend?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Cloudera's distribution of Kafka officially supports Flume, Spark, and Java clients [1] as these have been tested by our development team. However, Spark Structured Streaming is currently untested and unsupported.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;[1]:&amp;nbsp;&lt;A href="http://www.cloudera.com/documentation/enterprise/latest/topics/kafka_end_to_end.html#kafka_end_to_end" target="_blank"&gt;http://www.cloudera.com/documentation/enterprise/latest/topics/kafka_end_to_end.html#kafka_end_to_end&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 04 Nov 2018 11:10:17 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/81882#M84637</guid>
      <dc:creator>manuelschipper</dc:creator>
      <dc:date>2018-11-04T11:10:17Z</dc:date>
    </item>
    <item>
      <title>Re: Kafka DirectStream and Python</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/81907#M84638</link>
      <description>&lt;P&gt;Hello &amp;nbsp;Manuel,&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm trying to use Structured Streaming with Python, but all attempts made were unsuccessful.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Please read this post:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://community.hortonworks.com/articles/197922/spark-23-structured-streaming-integration-with-apa.html" target="_blank"&gt;https://community.hortonworks.com/articles/197922/spark-23-structured-streaming-integration-with-apa.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;This error was the first I got, so I attempted to change the jar to&amp;nbsp;spark-sql-kafka-0-10_2.11-2.1.0.cloudera1.jar and the error now is the one shown in the attached picture.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;There is no way to move a step forward from this point. So we started to migrate to Direct Stream; with this last option we also found other issues with Python, so now we are using Scala/Java code. But this is not a problem.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;If we can make Structured Streaming available, I think that it will be a great option for all the Cloudera users.&lt;/P&gt;&lt;BR /&gt;&lt;IMG src="http://community.cloudera.com/t5/image/serverpage/image-id/4959iC4A4D0CBAAD214D3/image-size/large?v=1.0&amp;amp;px=600" border="0" alt="Screen Shot 2018-10-29 at 10.52.13 AM.png" title="Screen Shot 2018-10-29 at 10.52.13 AM.png" /&gt;</description>
      <pubDate>Mon, 05 Nov 2018 07:37:30 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/81907#M84638</guid>
      <dc:creator>ozw1z5rd</dc:creator>
      <dc:date>2018-11-05T07:37:30Z</dc:date>
    </item>
    <item>
      <title>Re: Kafka DirectStream and Python</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/82101#M84639</link>
      <description>&lt;P&gt;As of currently, Spark Structured Streaming is not supported:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://www.cloudera.com/documentation/spark2/latest/topics/spark2_known_issues.html#ki_structured_streaming" target="_blank"&gt;https://www.cloudera.com/documentation/spark2/latest/topics/spark2_known_issues.html#ki_structured_streaming&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 08 Nov 2018 09:01:48 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/82101#M84639</guid>
      <dc:creator>manuelschipper</dc:creator>
      <dc:date>2018-11-08T09:01:48Z</dc:date>
    </item>
    <item>
      <title>Re: Kafka DirectStream and Python</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/82356#M84640</link>
      <description>&lt;P&gt;&lt;SPAN&gt;I switched from Python to Scala, which is a better supported language since Spark itself has been written in Scala.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I kept trying, on a remote friend's suggestion, and found that this works fine, so Structured Streaming is not supported but it is working.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;scala&amp;gt; import org.apache.spark.sql.Encoders

scala&amp;gt; case class Amazon(EventId:String, DOCOMOEntitlementId:String, AmazonSubscriptionId:String, AmazonPlanId:String, DOCOMOUserId:String, MerchantAccountKey:String, ResellerKey:String, Status:String, CreatedDate:String, EndDate:String, ActivatedDate:String, FailedDate:String, ExpiryDate:String, LastUpdated:String, dateTimeStart:String, dateTimeEnd:String, referrerSource:String, reasonCode:String)

scala&amp;gt; val schema = Encoders.product[Amazon].schema

scala &amp;gt; val data = spark.readStream.schema(schema).csv("/user/ale/csv.csv").as[Amazon]
data: org.apache.spark.sql.Dataset[Amazon] = [EventId: string, DOCOMOEntitlementId: string ... 

scala&amp;gt; data.isStreaming 
res0: Boolean = true

scala&amp;gt; data.writeStream.outputMode("append").format("console")

res1: org.apache.spark.sql.streaming.DataStreamWriter[Amazon] = org.apache.spark.sql.streaming.DataStreamWriter@7dc56c0a

scala&amp;gt; res1.start()&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;On the other hand, using&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;PRE&gt;spark.readStream.format("kafka") .... &lt;/PRE&gt;&lt;P&gt;&lt;SPAN&gt;leads to this issue:&lt;/SPAN&gt;&lt;/P&gt;&lt;PRE&gt;java.lang.ClassNotFoundException: Failed to find data source: kafka. Please find packages at http://spark.apache.org/third-party-projects.html
  at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:635)
  at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:159)
  ... 49 elided
Caused by: java.lang.ClassNotFoundException: kafka.DefaultSource
  at scala.reflect.internal.util.AbstractFileClassLoader.findClass(AbstractFileClassLoader.scala:62)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23$$anonfun$apply$15.apply(DataSource.scala:618)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23$$anonfun$apply$15.apply(DataSource.scala:618)
  at scala.util.Try$.apply(Try.scala:192)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23.apply(DataSource.scala:618)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$23.apply(DataSource.scala:618)
  at scala.util.Try.orElse(Try.scala:84)
  at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:618)
  ... 50 more&lt;/PRE&gt;&lt;P&gt;&lt;SPAN&gt;Are there any&amp;nbsp;Jars from Cloudera that can solve this issue?&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;If there are no Jars from Cloudera, what is the best match?&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Well, after a while I found that the library I need is&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://repository.cloudera.com/cloudera/cloudera-repos/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0.cloudera2/spark-sql-kafka-0-10_2.11-2.3.0.cloudera2.jar" target="_self"&gt;https://repository.cloudera.com/cloudera/cloudera-repos/org/apache/spark/spark-sql-kafka-0-10_2.11/2.3.0.cloudera2/spark-sql-kafka-0-10_2.11-2.3.0.cloudera2.jar&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;And everything goes almost fine, but...&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;org.apache.kafka.common.errors.GroupAuthorizationException: Not authorized to access group: spark-kafka-source-707ab780-c71c-408b-80dd-be1960a03dd6-360506181-driver-0
18/11/15 10:51:27 WARN kafka010.KafkaOffsetReader: Error in attempt 2 getting Kafka offsets: 
org.apache.kafka.common.errors.GroupAuthorizationException: Not authorized to access group: spark-kafka-source-707ab780-c71c-408b-80dd-be1960a03dd6-360506181-driver-1
18/11/15 10:51:28 WARN kafka010.KafkaOffsetReader: Error in attempt 3 getting Kafka offsets: 
org.apache.kafka.common.errors.GroupAuthorizationException: Not authorized to access group: spark-kafka-source-707ab780-c71c-408b-80dd-be1960a03dd6-360506181-driver-2&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;and this depends on how the Kafka code works, and there is no option to avoid it.&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screen Shot 2018-11-15 at 12.42.20 PM.png" style="width: 600px;"&gt;&lt;img src="https://community.cloudera.com/t5/image/serverpage/image-id/4994i957561DF0C418E88/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screen Shot 2018-11-15 at 12.42.20 PM.png" alt="Screen Shot 2018-11-15 at 12.42.20 PM.png" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This, of course, is a problem when Kafka is under Sentry. Is there any option to have it working without disabling Sentry?&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 15 Nov 2018 11:45:06 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/82356#M84640</guid>
      <dc:creator>ozw1z5rd</dc:creator>
      <dc:date>2018-11-15T11:45:06Z</dc:date>
    </item>
    <item>
      <title>Re: Kafka DirectStream and Python</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/84895#M84641</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Structured Streaming is now supported in 6.1&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://blog.cloudera.com/blog/2018/12/cloudera-enterprise-6-1-0-is-now-available/" target="_blank"&gt;https://blog.cloudera.com/blog/2018/12/cloudera-enterprise-6-1-0-is-now-available/&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Could you retry testing it in this version? Should you face the same issue in that version let us know.&lt;/P&gt;</description>
      <pubDate>Mon, 14 Jan 2019 10:43:24 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/Kafka-DirectStream-and-Python/m-p/84895#M84641</guid>
      <dc:creator>manuelschipper</dc:creator>
      <dc:date>2019-01-14T10:43:24Z</dc:date>
    </item>
  </channel>
</rss>

