Support Questions
Find answers, ask questions, and share your expertise
Announcements
Alert: Welcome to the Unified Cloudera Community. Former HCC members be sure to read and learn how to activate your account here.

Spark SQL fails on empty ORC table, HDP 2.4.2

SOLVED Go to solution
Highlighted

Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

A simple scenario, create an empty orc table using Hive, then try to query the table using Spark

Hive:
create table tbl(name string) stored as orc;
Spark:
sqlContext.sql("select * from tbl") // even collect is not needed to see the error!

Here is the error:

16/07/11 15:09:21 INFO ParseDriver: Parsing command: select * from tbl
16/07/11 15:09:22 INFO ParseDriver: Parse Completed
java.lang.IllegalArgumentException: orcFileOperator: path hdfs://dobbindata/apps/hive/warehouse/tbl does not have valid orc files matching the pattern
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.listOrcFiles(OrcFileOperator.scala:104)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.getFileReader(OrcFileOperator.scala:69)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.readSchema(OrcFileOperator.scala:77)
	at org.apache.spark.sql.hive.orc.OrcRelation$$anonfun$2.apply(OrcRelation.scala:185)
	at org.apache.spark.sql.hive.orc.OrcRelation$$anonfun$2.apply(OrcRelation.scala:185)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.sql.hive.orc.OrcRelation.<init>(OrcRelation.scala:184)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$$anonfun$20.apply(HiveMetastoreCatalog.scala:580)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$$anonfun$20.apply(HiveMetastoreCatalog.scala:578)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog.org$apache$spark$sql$hive$HiveMetastoreCatalog$$convertToOrcRelation(HiveMetastoreCatalog.scala:578)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:647)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:643)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:334)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:332)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:332)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:281)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
	at scala.collection.AbstractIterator.to(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:321)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:332)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:643)
	at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:637)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:83)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:80)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:111)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:72)
	at scala.collection.immutable.List.foreach(List.scala:318)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:72)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:36)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:36)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34)
	at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133)
	at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
	at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:817)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:31)
	at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:33)
	at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
	at $iwC$$iwC$$iwC$$iwC.<init>(<console>:37)
	at $iwC$$iwC$$iwC.<init>(<console>:39)
	at $iwC$$iwC.<init>(<console>:41)
	at $iwC.<init>(<console>:43)
	at <init>(<console>:45)
	at .<init>(<console>:49)
	at .<clinit>(<console>)
	at .<init>(<console>:7)
	at .<clinit>(<console>)
	at $print(<console>)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
	at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
	at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
	at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
	at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
	at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
	at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
	at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
	at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
	at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
	at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
	at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
	at org.apache.spark.repl.Main$.main(Main.scala:31)
	at org.apache.spark.repl.Main.main(Main.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
	at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
	at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
1 ACCEPTED SOLUTION

Accepted Solutions

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

Now I know what is going on: there is a ticket in progress at the moment called "Enable OrcRelation even when connecting via spark thrift server", https://issues.apache.org/jira/browse/SPARK-12998

But you can find it as a release improvement on HDP 2.4.2, https://github.com/hortonworks/spark-release/blob/HDP-2.4.2.0-tag/HDP-CHANGES.txt

To conclude, the Spark shipped by Hortonworks can differ slightly from mainline Apache Spark!

Now this is a workaround to disable the improvement and directly read the schema from Hive metastore:

sqlContext.setConf("spark.sql.hive.convertMetastoreOrc", "false")
9 REPLIES 9

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

There is also an open jira ticket since March

https://issues.apache.org/jira/browse/SPARK-14286

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

I think this bug fix is also related to the issue

https://issues.apache.org/jira/browse/SPARK-8501

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

Is there any workaround/patch available for this in HDP 2.4.2? Because on HDP 2.3.6 the same scenario works fine with Spark 1.5.2. Below is a sample:

[root@****-1316 ~]# spark-sql SET hive.support.sql11.reserved.keywords=false SET spark.sql.hive.version=1.2.1 SET spark.sql.hive.version=1.2.1 spark-sql> create table customer1(id int ,name string, email string) clustered by (id) into 2 buckets stored as orc ; OK Time taken: 5.225 seconds spark-sql> select * from customer1; Time taken: 0.402 seconds spark-sql>

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

Now I know what is going on: there is a ticket in progress at the moment called "Enable OrcRelation even when connecting via spark thrift server", https://issues.apache.org/jira/browse/SPARK-12998

But you can find it as a release improvement on HDP 2.4.2, https://github.com/hortonworks/spark-release/blob/HDP-2.4.2.0-tag/HDP-CHANGES.txt

To conclude, the Spark shipped by Hortonworks can differ slightly from mainline Apache Spark!

Now this is a workaround to disable the improvement and directly read the schema from Hive metastore:

sqlContext.setConf("spark.sql.hive.convertMetastoreOrc", "false")

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

Rising Star

@Mahan Hosseinzadeh In which configuration file should I set

"spark.sql.hive.convertMetastoreOrc","false" ?

It does not work when set in hive-site.xml.

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

@Huahua Wei This is a Spark-specific configuration, so it does not belong in hive-site.xml! Set it in your application or from Ambari.

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

Rising Star

@Mahan Hosseinzadeh in Ambari, which section should I set

spark.sql.hive.convertMetastoreOrc ?

or elsewhere?

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

In Ambari it is under Custom spark-defaults.

Value : spark.sql.hive.convertMetastoreOrc=false

Re: Spark SQL fails on empty ORC table, HDP 2.4.2

New Contributor

Thanks @Mahan Hosseinzadeh!

I had a similar issue and your solution worked for me. I was trying to read from a hive table using the hive context.

This was the error I got, in case it's useful for someone else.

scala> val outputHiveTableSchema = hiveContext.read.table("table_name").schema java.util.NoSuchElementException: next on empty iterator at scala.collection.Iterator$anon$2.next(Iterator.scala:39) at scala.collection.Iterator$anon$2.next(Iterator.scala:37) at scala.collection.IndexedSeqLike$Elements.next(IndexedSeqLike.scala:64) at scala.collection.IterableLike$class.head(IterableLike.scala:91) at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$super$head(ArrayOps.scala:108) at scala.collection.IndexedSeqOptimized$class.head(IndexedSeqOptimized.scala:120) at scala.collection.mutable.ArrayOps$ofRef.head(ArrayOps.scala:108) at org.apache.spark.sql.hive.orc.OrcRelation$anonfun$2.apply(OrcRelation.scala:186) at org.apache.spark.sql.hive.orc.OrcRelation$anonfun$2.apply(OrcRelation.scala:185) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.sql.hive.orc.OrcRelation.<init>(OrcRelation.scala:184) at org.apache.spark.sql.hive.HiveMetastoreCatalog$anonfun$19.apply(HiveMetastoreCatalog.scala:567) at org.apache.spark.sql.hive.HiveMetastoreCatalog$anonfun$19.apply(HiveMetastoreCatalog.scala:565) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.sql.hive.HiveMetastoreCatalog.org$apache$spark$sql$hive$HiveMetastoreCatalog$convertToOrcRelation(HiveMetastoreCatalog.scala:565) at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:647) at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$anonfun$apply$2.applyOrElse(HiveMetastoreCatalog.scala:643) at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:335) at org.apache.spark.sql.catalyst.trees.TreeNode$anonfun$transformUp$1.apply(TreeNode.scala:335) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69) at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:334) at 
org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:643) at org.apache.spark.sql.hive.HiveMetastoreCatalog$OrcConversions$.apply(HiveMetastoreCatalog.scala:637) at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1$anonfun$apply$1.apply(RuleExecutor.scala:83) at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1$anonfun$apply$1.apply(RuleExecutor.scala:80) at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:111) at scala.collection.immutable.List.foldLeft(List.scala:84) at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1.apply(RuleExecutor.scala:80) at org.apache.spark.sql.catalyst.rules.RuleExecutor$anonfun$execute$1.apply(RuleExecutor.scala:72) at scala.collection.immutable.List.foreach(List.scala:318) at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:72) at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:36) at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:36) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34) at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133) at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52) at org.apache.spark.sql.DataFrameReader.table(DataFrameReader.scala:337) at $iwC$iwC$iwC$iwC$iwC$iwC$iwC$iwC.<init>(<console>:29) at $iwC$iwC$iwC$iwC$iwC$iwC$iwC.<init>(<console>:34) at $iwC$iwC$iwC$iwC$iwC$iwC.<init>(<console>:36) at $iwC$iwC$iwC$iwC$iwC.<init>(<console>:38) at $iwC$iwC$iwC$iwC.<init>(<console>:40) at $iwC$iwC$iwC.<init>(<console>:42) at $iwC$iwC.<init>(<console>:44) at $iwC.<init>(<console>:46) at <init>(<console>:48) at .<init>(<console>:52) at .<clinit>(<console>) at .<init>(<console>:7) at .<clinit>(<console>) at $print(<console>) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065) at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346) at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840) at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871) at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819) at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857) at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902) at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814) at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657) at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665) at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$loop(SparkILoop.scala:670) at org.apache.spark.repl.SparkILoop$anonfun$org$apache$spark$repl$SparkILoop$process$1.apply$mcZ$sp(SparkILoop.scala:997) at org.apache.spark.repl.SparkILoop$anonfun$org$apache$spark$repl$SparkILoop$process$1.apply(SparkILoop.scala:945) at org.apache.spark.repl.SparkILoop$anonfun$org$apache$spark$repl$SparkILoop$process$1.apply(SparkILoop.scala:945) at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135) at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$process(SparkILoop.scala:945) at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059) at org.apache.spark.repl.Main$.main(Main.scala:31) at org.apache.spark.repl.Main.main(Main.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at