Here is my code:
%spark.spark
val file = sc.textFile("/apps/hive/migration/unique_visitors_RT_craigieburn.csv")

case class RT_craigieburn(
  event_year: Integer,
  event_month: Integer,
  event_day: Integer,
  event_hour: Integer,
  event_minute: Integer,
  event_second: Integer,
  mac_address: String)

// Split each line on commas, skip the header row, and map the
// fields into the case class (stripping quotes from the MAC address).
val realtimeTable = file.map(s => s.split(","))
  .filter(s => s(0) != "event_year")
  .map(s => RT_craigieburn(
    s(0).toInt,
    s(1).toInt,
    s(2).toInt,
    s(3).toInt,
    s(4).toInt,
    s(5).toInt,
    s(6).replaceAll("\"", "")))
  .toDF()

realtimeTable.registerTempTable("craigieburn")
When I run this paragraph, the output is:
file: org.apache.spark.rdd.RDD[String] = /apps/hive/migration/unique_visitors_RT_craigieburn.csv MapPartitionsRDD[46] at textFile at <console>:29
defined class RT_craigieburn
realtimeTable: org.apache.spark.sql.DataFrame = [event_year: int, event_month: int, event_day: int, event_hour: int, event_minute: int, event_second: int, mac_address: string]
So the DataFrame is created without any error (I suppose because the transformations are lazy and nothing is actually parsed yet). But when I try to run a Spark SQL query against the temp table:
%spark.sql
select event_year
from craigieburn
java.lang.NumberFormatException: For input string: ""
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:592)
at java.lang.Integer.parseInt(Integer.java:615)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
at $line115675017136.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$3.apply(<console>:34)
at $line115675017136.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$3.apply(<console>:34)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$5.apply(SparkPlan.scala:212)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$5.apply(SparkPlan.scala:212)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1857)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1857)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
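Reading the trace, StringOps.toInt is failing on an empty string, so I guess some rows in the CSV have empty numeric fields, and the map only blows up now because it isn't evaluated until the query runs. As a workaround I sketched the guard below, which prints a few suspect rows and then drops any row whose first six fields aren't all non-blank. The blank-field check is just my own guess, not something I'm sure is the right way to do it:
%spark.spark
// My own sketch of a guard (untested): find rows that would make toInt
// fail, then filter them out before building the DataFrame.
val rows = file.map(_.split(",")).filter(s => s(0) != "event_year")

// Show a sample of the rows that would throw NumberFormatException.
rows.filter(s => s.length < 7 || s.take(6).exists(_.trim.isEmpty))
    .take(5).foreach(r => println(r.mkString(",")))

// Keep only rows where all six numeric fields are present and non-blank.
val realtimeTable = rows
  .filter(s => s.length >= 7 && s.take(6).forall(_.trim.nonEmpty))
  .map(s => RT_craigieburn(
    s(0).toInt, s(1).toInt, s(2).toInt,
    s(3).toInt, s(4).toInt, s(5).toInt,
    s(6).replaceAll("\"", "")))
  .toDF()

realtimeTable.registerTempTable("craigieburn")
Would that be reasonable, or is there a cleaner way, e.g. making the columns nullable or using the spark-csv package?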
Can someone help me?

Thanks