<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question pyspark streaming writing data in to hive using foreachbatch method in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/pyspark-streaming-writing-data-in-to-hive-using-foreachbatch/m-p/311744#M224808</link>
    <description>&lt;P&gt;Hi everybody,&lt;/P&gt;&lt;P&gt;I am trying the following approach to write data in to hive table.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import logging
import datetime
from os.path import *

from pyspark import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import lit,unix_timestamp
from pyspark.sql.functions import current_timestamp, date_format
from pyspark.streaming.kafka import KafkaUtils

warehouseLocation = abspath("spark-warehouse")
spark = SparkSession.builder.appName("spark_streaming").config("spark.sql.warehouse.dir", warehouseLocation).enableHiveSupport().getOrCreate()

kafka = "kafka"
offsets = "earliest"
servers = "server_1:port,server_2:port"
security_protocol = "SSL"
keystore_location = "keystore"
keystore_password = "keystore_password"
kafka_topic = "kafka_topic"
checkpoint_location ="/checkpoint/location"

def hiveInsert(df, batchId):
        df.createOrReplaceTempView("updates")
        spark.sql("insert into hive_db.hive_table select value, time_stamp from updates")

df = spark.readStream.format(kafka).option("startingoffsets", offsets).option("kafka.bootstrap.servers", servers).option("kafka.security.protocol", security_protocol).option("kafka.ssl.keystore.location", keystore_location).option("kafka.ssl.keystore.password", keystore_password).option("subscribe",kafka_topic).load().selectExpr("CAST(value AS STRING)").select('value').withColumn('time_stamp',lit(datetime.datetime.now().strftime('%Y%m%d%H%M')))

query = df.writeStream.foreachBatch(hiveInsert).start()


query.awaitTermination()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The above code is not working&lt;/P&gt;&lt;P&gt;Any pointers would be of great help!&lt;/P&gt;</description>
    <pubDate>Thu, 18 Feb 2021 17:21:06 GMT</pubDate>
    <dc:creator>mark_hadoop</dc:creator>
    <dc:date>2021-02-18T17:21:06Z</dc:date>
  </channel>
</rss>

