
Spark SQL fails on empty ORC table, HDP 2.4.2

Contributor

A simple scenario: create an empty ORC table using Hive, then try to query the table using Spark.

Hive:
create table tbl(name string) stored as orc;
Spark:
sqlContext.sql("select * from tbl") // even collect is not needed to see the error!

Here is the error:

16/07/11 15:09:21 INFO ParseDriver: Parsing command: select * from tbl
16/07/11 15:09:22 INFO ParseDriver: Parse Completed
java.lang.IllegalArgumentException: orcFileOperator: path hdfs://dobbindata/apps/hive/warehouse/tbl does not have valid orc files matching the pattern
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.listOrcFiles(OrcFileOperator.scala:104)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.getFileReader(OrcFileOperator.scala:69)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.readSchema(OrcFileOperator.scala:77)
	at org.apache.spark.sql.hive.orc.OrcRelation$$anonfun$2.apply(OrcRelation.scala:185)
	at org.apache.spark.sql.hive.orc.OrcRelation$$anonfun$2.apply(OrcRelation.scala:185)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.sql.hive.orc.OrcRelation.<init>(OrcRelation.scala:184)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$$anonfun$20.apply(HiveMetastoreCatalog.scala:580)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$$anonfun$20.apply(HiveMetastoreCatalog.scala:578)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog.org$apache$spark$sql$hive$HiveMetastoreCatalog$$convertToOrcRelation(HiveMetastoreCatalog.scala:578)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:647)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:643)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:334)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:332)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:332)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:281)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
	at scala.collection.AbstractIterator.to(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:321)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:332)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:643)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:637)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:83)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:80)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:111)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:72)
	at scala.collection.immutable.List.foreach(List.scala:318)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:72)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:36)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:36)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
	at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133)
	at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
	at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:817)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:31)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:33)
	at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
	at $iwC$$iwC$$iwC$$iwC.<init>(<console>:37)
	at $iwC$$iwC$$iwC.<init>(<console>:39)
	at $iwC$$iwC.<init>(<console>:41)
	at $iwC.<init>(<console>:43)
	at <init>(<console>:45)
	at .<init>(<console>:49)
	at .<clinit>(<console>)
	at .<init>(<console>:7)
	at .<clinit>(<console>)
	at $print(<console>)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
	at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
	at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
	at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
	at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
	at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
	at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
	at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
	at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
	at org.apache.spark.repl.Main$.main(Main.scala:31)
	at org.apache.spark.repl.Main.main(Main.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
	at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
	at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
1 ACCEPTED SOLUTION

Contributor

Now I know what is going on: there is a ticket in progress at the moment called "Enable OrcRelation even when connecting via spark thrift server", https://issues.apache.org/jira/browse/SPARK-12998

But you can already find it listed as a release improvement in HDP 2.4.2: https://github.com/hortonworks/spark-release/blob/HDP-2.4.2.0-tag/HDP-CHANGES.txt

To conclude, the Spark that ships with Hortonworks can be slightly different from mainline Apache Spark!

Here is a workaround that disables the improvement and reads the schema directly from the Hive metastore:

sqlContext.setConf("spark.sql.hive.convertMetastoreOrc", "false")
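
For example, in spark-shell on HDP 2.4.2 (Spark 1.6) the workaround can be applied like this; a minimal sketch, reusing the empty ORC table tbl from the question:

sqlContext.setConf("spark.sql.hive.convertMetastoreOrc", "false") // read the table schema from the Hive metastore instead of the ORC files
val df = sqlContext.sql("select * from tbl") // no longer throws for the empty table
df.printSchema() // prints the schema taken from the metastore (name: string)
df.show()        // returns an empty result set, as expected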


9 REPLIES

Contributor

There is also a JIRA ticket that has been open since March:

https://issues.apache.org/jira/browse/SPARK-14286

Contributor

I think this bug fix is also related to the issue

https://issues.apache.org/jira/browse/SPARK-8501

New Contributor

Is there any workaround/patch available for this in HDP 2.4.2? Because on HDP 2.3.6 the same scenario works fine on Spark 1.5.2. Below is a sample:

[root@****-1316 ~]# spark-sql
SET hive.support.sql11.reserved.keywords=false
SET spark.sql.hive.version=1.2.1
SET spark.sql.hive.version=1.2.1
spark-sql> create table customer1(id int ,name string, email string) clustered by (id) into 2 buckets stored as orc ;
OK
Time taken: 5.225 seconds
spark-sql> select * from customer1;
Time taken: 0.402 seconds
spark-sql>

Contributor

Now I know what is going on: there is a ticket in progress at the moment called "Enable OrcRelation even when connecting via spark thrift server", https://issues.apache.org/jira/browse/SPARK-12998

But you can already find it listed as a release improvement in HDP 2.4.2: https://github.com/hortonworks/spark-release/blob/HDP-2.4.2.0-tag/HDP-CHANGES.txt

To conclude, the Spark that ships with Hortonworks can be slightly different from mainline Apache Spark!

Here is a workaround that disables the improvement and reads the schema directly from the Hive metastore:

sqlContext.setConf("spark.sql.hive.convertMetastoreOrc", "false")

Expert Contributor

@Mahan Hosseinzadeh In which configuration file should I set

"spark.sql.hive.convertMetastoreOrc", "false" ?

It does not work in hive-site.xml.

Contributor

@Huahua Wei This is a Spark-specific configuration; it does not belong in hive-site.xml! Set it in your application or from Ambari.
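
If you go the application route, here is a minimal sketch for a standalone Spark 1.6 job (the application name is made up; the same setting can also be passed to spark-submit with --conf spark.sql.hive.convertMetastoreOrc=false):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

val conf = new SparkConf()
  .setAppName("orc-empty-table-workaround")            // hypothetical application name
  .set("spark.sql.hive.convertMetastoreOrc", "false")  // spark.sql.* entries are picked up by the SQL/Hive context at startup
val sc = new SparkContext(conf)
val hiveContext = new HiveContext(sc)

hiveContext.sql("select * from tbl").show()            // schema is now resolved from the Hive metastore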

Expert Contributor

@Mahan Hosseinzadeh In Ambari, in which section should I set

spark.sql.hive.convertMetastoreOrc ?

[attached screenshot of one Ambari configuration section]

or

[attached screenshot of another Ambari configuration section]

or elsewhere?

New Contributor

In Ambari it is Custom spark-defaults.

Value: spark.sql.hive.convertMetastoreOrc=false

New Contributor

Thanks @Mahan Hosseinzadeh!

I had a similar issue and your solution worked for me. I was trying to read from a Hive table using the HiveContext.

This was the error I got, in case it's useful for someone else.

scala> val outputHiveTableSchema = hiveContext.read.table("table_name").schema
java.util.NoSuchElementException: next on empty iterator
	at scala.collection.Iterator$anon$2.next(Iterator.scala:39)
	at scala.collection.Iterator$anon$2.next(Iterator.scala:37)
	at scala.collection.IndexedSeqLike$Elements.next(IndexedSeqLike.scala:64)
	at scala.collection.IterableLike$class.head(IterableLike.scala:91)
	at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$super$head(ArrayOps.scala:108)
	at scala.collection.IndexedSeqOptimized$class.head(IndexedSeqOptimized.scala:120)
	at scala.collection.mutable.ArrayOps$ofRef.head(ArrayOps.scala:108)
	at org.apache.spark.sql.hive.orc.OrcRelation$anonfun$2.apply(OrcRelation.scala:186)
	at org.apache.spark.sql.hive.orc.OrcRelation$anonfun$2.apply(OrcRelation.scala:185)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.sql.hive.orc.OrcRelation.<init>(OrcRelation.scala:184)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$anonfun$19.apply(HiveMetastoreCatalog.scala:567)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$anonfun$19.apply(HiveMetastoreCatalog.scala:565)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog.org$apache$spark$sql$hive$HiveMetastoreCatalog$convertToOrcRelation(HiveMetastoreCatalog.scala:565)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:647)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:643)
	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:335)
	at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:335)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:334)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:643)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:637)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1$anonfun$apply$1.apply(RuleExecutor.scala:83)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1$anonfun$apply$1.apply(RuleExecutor.scala:80)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:111)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1.apply(RuleExecutor.scala:72)
	at scala.collection.immutable.List.foreach(List.scala:318)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:72)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:36)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:36)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
	at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133)
	at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
	at org.apache.spark.sql.DataFrameReader.table(DataFrameReader.scala:337)
	at $iwC$iwC$iwC$iwC$iwC$iwC$iwC$iwC.<init>(<console>:29)
	at $iwC$iwC$iwC$iwC$iwC$iwC$iwC.<init>(<console>:34)
	at $iwC$iwC$iwC$iwC$iwC$iwC.<init>(<console>:36)
	at $iwC$iwC$iwC$iwC$iwC.<init>(<console>:38)
	at $iwC$iwC$iwC$iwC.<init>(<console>:40)
	at $iwC$iwC$iwC.<init>(<console>:42)
	at $iwC$iwC.<init>(<console>:44)
	at $iwC.<init>(<console>:46)
	at <init>(<console>:48)
	at .<init>(<console>:52)
	at .<clinit>(<console>)
	at .<init>(<console>:7)
	at .<clinit>(<console>)
	at $print(<console>)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
	at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
	at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
	at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
	at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
	at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
	at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
	at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$loop(SparkILoop.scala:670)
	at org.apache.spark.repl.SparkILoop$anonfun$org$apache$spark$repl$SparkILoop$process$1.apply$mcZ$sp(SparkILoop.scala:997)
	at org.apache.spark.repl.SparkILoop$anonfun$org$apache$spark$repl$SparkILoop$process$1.apply(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop$anonfun$org$apache$spark$repl$SparkILoop$process$1.apply(SparkILoop.scala:945)
	at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$process(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
	at org.apache.spark.repl.Main$.main(Main.scala:31)
	at org.apache.spark.repl.Main.main(Main.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at