Contributor
Posts: 83
Registered: 05-19-2016

How to avoid "Timed out waiting for client connection" with Hive on Spark

I have my Hive jobs scheduled through Oozie, and I randomly get:

ERROR : Failed to execute spark task, with exception 'org.apache.hadoop.hive.ql.metadata.HiveException(Failed to create spark client.)'
org.apache.hadoop.hive.ql.metadata.HiveException: Failed to create spark client.
	at org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionImpl.open(SparkSessionImpl.java:64)
	at org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManagerImpl.getSession(SparkSessionManagerImpl.java:114)
	at org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.getSparkSession(SparkUtilities.java:111)
	at org.apache.hadoop.hive.ql.exec.spark.SparkTask.execute(SparkTask.java:99)
	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:214)
	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100)
	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1977)
	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1690)
	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1422)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1206)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1201)
	at org.apache.hive.service.cli.operation.SQLOperation.runQuery(SQLOperation.java:237)
	at org.apache.hive.service.cli.operation.SQLOperation.access$300(SQLOperation.java:88)
	at org.apache.hive.service.cli.operation.SQLOperation$3$1.run(SQLOperation.java:293)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
	at org.apache.hive.service.cli.operation.SQLOperation$3.run(SQLOperation.java:306)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.util.concurrent.TimeoutException: Timed out waiting for client connection.
	at com.google.common.base.Throwables.propagate(Throwables.java:156)
	at org.apache.hive.spark.client.SparkClientImpl.<init>(SparkClientImpl.java:124)
	at org.apache.hive.spark.client.SparkClientFactory.createClient(SparkClientFactory.java:80)
	at org.apache.hadoop.hive.ql.exec.spark.RemoteHiveSparkClient.createRemoteClient(RemoteHiveSparkClient.java:100)
	at org.apache.hadoop.hive.ql.exec.spark.RemoteHiveSparkClient.<init>(RemoteHiveSparkClient.java:96)
	at org.apache.hadoop.hive.ql.exec.spark.HiveSparkClientFactory.createHiveSparkClient(HiveSparkClientFactory.java:68)
	at org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionImpl.open(SparkSessionImpl.java:62)
	... 22 more
Caused by: java.util.concurrent.ExecutionException: java.util.concurrent.TimeoutException: Timed out waiting for client connection.
	at io.netty.util.concurrent.AbstractFuture.get(AbstractFuture.java:37)
	at org.apache.hive.spark.client.SparkClientImpl.<init>(SparkClientImpl.java:108)
	... 27 more
Caused by: java.util.concurrent.TimeoutException: Timed out waiting for client connection.
	at org.apache.hive.spark.client.rpc.RpcServer$2.run(RpcServer.java:141)
	at io.netty.util.concurrent.PromiseTask$RunnableAdapter.call(PromiseTask.java:38)
	at io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:120)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:357)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:357)
	at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
	... 1 more

ERROR : Failed to execute spark task, with exception 'org.apache.hadoop.hive.ql.metadata.HiveException(Failed to create spark client.)'
org.apache.hadoop.hive.ql.metadata.HiveException: Failed to create spark client.
	at org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionImpl.open(SparkSessionImpl.java:64)
	at org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManagerImpl.getSession(SparkSessionManagerImpl.java:114)
	at org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.getSparkSession(SparkUtilities.java:111)
	at org.apache.hadoop.hive.ql.exec.spark.SparkTask.execute(SparkTask.java:99)
	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:214)
	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100)
	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1977)
	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1690)
	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1422)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1206)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1201)
	at org.apache.hive.service.cli.operation.SQLOperation.runQuery(SQLOperation.java:237)
	at org.apache.hive.service.cli.operation.SQLOperation.access$300(SQLOperation.java:88)
	at org.apache.hive.service.cli.operation.SQLOperation$3$1.run(SQLOperation.java:293)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
	at org.apache.hive.service.cli.operation.SQLOperation$3.run(SQLOperation.java:306)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.util.concurrent.TimeoutException: Timed out waiting for client connection.
	at com.google.common.base.Throwables.propagate(Throwables.java:156)
	at org.apache.hive.spark.client.SparkClientImpl.<init>(SparkClientImpl.java:124)
	at org.apache.hive.spark.client.SparkClientFactory.createClient(SparkClientFactory.java:80)
	at org.apache.hadoop.hive.ql.exec.spark.RemoteHiveSparkClient.createRemoteClient(RemoteHiveSparkClient.java:100)
	at org.apache.hadoop.hive.ql.exec.spark.RemoteHiveSparkClient.<init>(RemoteHiveSparkClient.java:96)
	at org.apache.hadoop.hive.ql.exec.spark.HiveSparkClientFactory.createHiveSparkClient(HiveSparkClientFactory.java:68)
	at org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionImpl.open(SparkSessionImpl.java:62)
	... 22 more
Caused by: java.util.concurrent.ExecutionException: java.util.concurrent.TimeoutException: Timed out waiting for client connection.
	at io.netty.util.concurrent.AbstractFuture.get(AbstractFuture.java:37)
	at org.apache.hive.spark.client.SparkClientImpl.<init>(SparkClientImpl.java:108)
	... 27 more
Caused by: java.util.concurrent.TimeoutException: Timed out waiting for client connection.
	at org.apache.hive.spark.client.rpc.RpcServer$2.run(RpcServer.java:141)
	at io.netty.util.concurrent.PromiseTask$RunnableAdapter.call(PromiseTask.java:38)
	at io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:120)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:357)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:357)
	at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
	... 1 more
ERROR : FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.spark.SparkTask
INFO  : Completed executing command(queryId=hive_20180506233232_2a9bc92f-d830-435b-91df-eafd2236a9fc); Time taken: 90.051 seconds
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.spark.SparkTask (state=08S01,code=1)
Closing: 0: jdbc:hive2://ip-172-31-4-192.ap-south-1.compute.internal:10000/default
Intercepting System.exit(2)
Failing Oozie Launcher, Main class [org.apache.oozie.action.hadoop.Hive2Main], exit code [2]

What can I do to fix this? It happens randomly, even at times when the workload is light.

Cluster config:

Total vCores: 60

Total memory: 205 GB

The 3 worker nodes in my cluster are m4.4xlarge EC2 instances, plus 2 m4.2xlarge EC2 instances running YARN (NodeManager).

Container memory is set to 8 GB in the YARN config.

spark.executor.cores=1 (since I will have multiple users running queries, and also multiple queries running at the same time)

Spark Java heap size = 5 GB

Memory overhead = 2 GB
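
For reference, this sizing corresponds to roughly the following Hive on Spark properties (a sketch using the standard Spark-on-YARN property names; the overhead property takes its value in MB):

-- Executor sizing as described above, settable per session in Beeline
-- or cluster-wide in hive-site.xml
set spark.executor.cores=1;
set spark.executor.memory=5g;
set spark.yarn.executor.memoryOverhead=2048;  -- in MB; 5 GB heap + 2 GB overhead fits the 8 GB YARN container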

Is there something I can do to improve the situation?

Cloudera Employee
Posts: 54
Registered: 11-16-2015

Re: How to avoid "Timed out waiting for client connection" with Hive on Spark

Hi @sim6

 

Caused by: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.util.concurrent.TimeoutException: Timed out waiting for client connection.

It looks like the RPC times out while waiting for resources to become available on the Spark side. The fact that the failure is random suggests that it happens when the cluster does not have enough free resources at that moment, and that nothing is permanently wrong with the cluster as such.

 

For testing, you can try increasing the following timeout values and see if that helps:

 

hive.spark.client.connect.timeout=30000ms (default 1000ms)
hive.spark.client.server.connect.timeout=300000ms (default 90000ms)

You'd need to set these in the Hive Safety Valve using the steps below, so that they take effect for all Spark queries:

Go to the Cloudera Manager home page
Click through to the "Hive" service
Click "Configuration"
Search for "Hive Service Advanced Configuration Snippet (Safety Valve) for hive-site.xml"
Enter the following in the XML text field:
<property>
  <name>hive.spark.client.connect.timeout</name>
  <value>30000ms</value>
</property>
<property>
  <name>hive.spark.client.server.connect.timeout</name>
  <value>300000ms</value>
</property>

Restart the Hive service to allow the changes to take effect, then run the query again to test.
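
If you want to verify the effect before committing to the cluster-wide change, the same values can usually also be set per session in Beeline right before the query; depending on your authorization setup, these properties may need to be whitelisted (e.g. via hive.security.authorization.sqlstd.confwhitelist) before users can set them:

-- Per-session test in Beeline (a sketch for verification only; the
-- Safety Valve change above is still needed to cover all queries)
set hive.spark.client.connect.timeout=30000ms;
set hive.spark.client.server.connect.timeout=300000ms;
-- then run the problem query in the same session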

 

Let us know how it goes.

Contributor
Posts: 83
Registered: 05-19-2016

Re: How to avoid "Timed out waiting for client connection" with Hive on Spark

Hello, 

Thank you for your response.

 

I have set the timeouts to the recommended values as directed in your response. However, I still get:

 

ERROR : Failed to monitor Job[ 8] with exception 'org.apache.hadoop.hive.ql.metadata.HiveException(java.util.concurrent.TimeoutException)'
org.apache.hadoop.hive.ql.metadata.HiveException: java.util.concurrent.TimeoutException
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus.getSparkJobInfo(RemoteSparkJobStatus.java:153)
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus.getState(RemoteSparkJobStatus.java:82)
	at org.apache.hadoop.hive.ql.exec.spark.status.RemoteSparkJobMonitor.startMonitor(RemoteSparkJobMonitor.java:80)
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobRef.monitorJob(RemoteSparkJobRef.java:60)
	at org.apache.hadoop.hive.ql.exec.spark.SparkTask.execute(SparkTask.java:109)
	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:214)
	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100)
	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1977)
	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1690)
	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1422)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1206)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1201)
	at org.apache.hive.service.cli.operation.SQLOperation.runQuery(SQLOperation.java:237)
	at org.apache.hive.service.cli.operation.SQLOperation.access$300(SQLOperation.java:88)
	at org.apache.hive.service.cli.operation.SQLOperation$3$1.run(SQLOperation.java:293)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
	at org.apache.hive.service.cli.operation.SQLOperation$3.run(SQLOperation.java:306)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.TimeoutException
	at io.netty.util.concurrent.AbstractFuture.get(AbstractFuture.java:49)
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus.getSparkJobInfo(RemoteSparkJobStatus.java:150)
	... 23 more
ERROR : Failed to monitor Job[ 8] with exception 'org.apache.hadoop.hive.ql.metadata.HiveException(java.util.concurrent.TimeoutException)'
org.apache.hadoop.hive.ql.metadata.HiveException: java.util.concurrent.TimeoutException
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus.getSparkJobInfo(RemoteSparkJobStatus.java:153)
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus.getState(RemoteSparkJobStatus.java:82)
	at org.apache.hadoop.hive.ql.exec.spark.status.RemoteSparkJobMonitor.startMonitor(RemoteSparkJobMonitor.java:80)
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobRef.monitorJob(RemoteSparkJobRef.java:60)
	at org.apache.hadoop.hive.ql.exec.spark.SparkTask.execute(SparkTask.java:109)
	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:214)
	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100)
	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1977)
	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1690)
	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1422)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1206)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1201)
	at org.apache.hive.service.cli.operation.SQLOperation.runQuery(SQLOperation.java:237)
	at org.apache.hive.service.cli.operation.SQLOperation.access$300(SQLOperation.java:88)
	at org.apache.hive.service.cli.operation.SQLOperation$3$1.run(SQLOperation.java:293)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
	at org.apache.hive.service.cli.operation.SQLOperation$3.run(SQLOperation.java:306)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.TimeoutException
	at io.netty.util.concurrent.AbstractFuture.get(AbstractFuture.java:49)
	at org.apache.hadoop.hive.ql.exec.spark.status.impl.RemoteSparkJobStatus.getSparkJobInfo(RemoteSparkJobStatus.java:150)
	... 23 more

ERROR : FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.spark.SparkTask
INFO  : Completed executing command(queryId=hive_20180518120808_12c3175f-865c-47d4-9748-29853259b4aa); Time taken: 120.892 seconds
Error: Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.spark.SparkTask (state=08S01,code=1)
Closing: 0: jdbc:hive2://ip-172-31-4-192.ap-south-1.compute.internal:10000/default
Intercepting System.exit(2)
Failing Oozie Launcher, Main class [org.apache.oozie.action.hadoop.Hive2Main], exit code [2]

 
