HDP-2.6.3.0
/usr/hdp/current/spark2-client/bin/spark-submit --queue load --driver-memory 10g --num-executors 6 --executor-memory 30G dpi_test.py
dpi_test.py :
import re
import time
from os.path import abspath

from pyspark.sql import HiveContext
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.types import BooleanType
# Absolute path of ./spark-warehouse, resolved against the directory the
# driver is started from. `abspath` comes from os.path and must be imported
# at the top of the file; without that import this line raises NameError.
warehouse_location = abspath('spark-warehouse')

# Build (or reuse, via getOrCreate) a SparkSession with Hive support enabled
# and the warehouse directory set to the path computed above.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)
def _contains_token(token, text):
    """Return whether *token* occurs in *text* as a standalone token.

    A match counts only when the character directly before it and the
    character directly after it are not ASCII letters or digits (the start
    and end of *text* also qualify as boundaries).

    *token* is inserted into the regular expression as a pattern fragment,
    not as a literal string, so regex metacharacters in it keep their
    special meaning. It is wrapped in a non-capturing group so that an
    alternation inside *token* cannot escape the boundary checks.
    NOTE(review): if callers only ever pass literal values, wrap *token*
    in re.escape() -- confirm against the call sites.

    Returns None when either argument is None, so a NULL input column
    yields a NULL result instead of raising TypeError inside the task.
    """
    if token is None or text is None:
        return None
    # Lookarounds express the "non-alphanumeric neighbour or string edge"
    # rule directly and avoid nested quantifiers, which can backtrack
    # exponentially on long non-matching inputs.
    pattern = '(?<![0-9A-Za-z])(?:' + token + ')(?![0-9A-Za-z])'
    return re.search(pattern, text) is not None


# Spark UDF taking (token, text) columns and returning a boolean column.
cont = udf(_contains_token, BooleanType())
# Fetch msisdn / end_time for the single subscriber under investigation.
dpi_rows = spark.sql(
    "select d.msisdn, d.end_time from other_sources.stg_dpi_other_day d "
    "where msisdn='380988526911'"
)

# Aggregate with explicit column expressions. A dict such as
# {'end_time': 'min', 'end_time': 'max', 'end_time': 'count'} repeats the
# same key, so Python keeps only the last entry and only the count would be
# computed; listing the aggregates separately yields min, max and count.
dpi_rows.groupby('msisdn').agg(
    F.min('end_time'),
    F.max('end_time'),
    F.count('end_time'),
).show(n=50)