Support Questions
Find answers, ask questions, and share your expertise
Announcements
Alert: Welcome to the Unified Cloudera Community. Former HCC members be sure to read and learn how to activate your account here.

Yarn is not stable and getting SIGTERM 15 and connection refused errors in resource manager and job history server

Re: Yarn is not stable and getting SIGTERM 15 and connection refused errors in resource manager and job history server

Rising Star

Hi @EricL 

 

Yes, I can see the same issue when I launch the jobs in local mode.

 

Please find the mapred-site.xml and yarn-site.xml below.

 

mapred-site.xml

============

 

-<configuration>


-<property>

<name>mapreduce.job.split.metainfo.maxsize</name>

<value>10000000</value>

</property>


-<property>

<name>mapreduce.job.counters.max</name>

<value>120</value>

</property>


-<property>

<name>mapreduce.job.counters.groups.max</name>

<value>50</value>

</property>


-<property>

<name>mapreduce.output.fileoutputformat.compress</name>

<value>false</value>

</property>


-<property>

<name>mapreduce.output.fileoutputformat.compress.type</name>

<value>BLOCK</value>

</property>


-<property>

<name>mapreduce.output.fileoutputformat.compress.codec</name>

<value>org.apache.hadoop.io.compress.DefaultCodec</value>

</property>


-<property>

<name>mapreduce.map.output.compress.codec</name>

<value>org.apache.hadoop.io.compress.SnappyCodec</value>

</property>


-<property>

<name>mapreduce.map.output.compress</name>

<value>true</value>

</property>


-<property>

<name>zlib.compress.level</name>

<value>DEFAULT_COMPRESSION</value>

</property>


-<property>

<name>mapreduce.task.io.sort.factor</name>

<value>64</value>

</property>


-<property>

<name>mapreduce.map.sort.spill.percent</name>

<value>0.8</value>

</property>


-<property>

<name>mapreduce.reduce.shuffle.parallelcopies</name>

<value>10</value>

</property>


-<property>

<name>mapreduce.task.timeout</name>

<value>600000</value>

</property>


-<property>

<name>mapreduce.client.submit.file.replication</name>

<value>3</value>

</property>


-<property>

<name>mapreduce.job.reduces</name>

<value>1036</value>

</property>


-<property>

<name>mapreduce.task.io.sort.mb</name>

<value>256</value>

</property>


-<property>

<name>mapreduce.map.speculative</name>

<value>false</value>

</property>


-<property>

<name>mapreduce.reduce.speculative</name>

<value>false</value>

</property>


-<property>

<name>mapreduce.job.reduce.slowstart.completedmaps</name>

<value>0.8</value>

</property>


-<property>

<name>mapreduce.jobhistory.address</name>

<value>hostname.enterprisenet.org:10020</value>

</property>


-<property>

<name>mapreduce.jobhistory.webapp.address</name>

<value>hostname.enterprisenet.org:19888</value>

</property>


-<property>

<name>mapreduce.jobhistory.webapp.https.address</name>

<value>hostname.enterprisenet.org:19890</value>

</property>


-<property>

<name>mapreduce.jobhistory.admin.address</name>

<value>hostname.enterprisenet.org:10033</value>

</property>


-<property>

<name>mapreduce.framework.name</name>

<value>yarn</value>

</property>


-<property>

<name>yarn.app.mapreduce.am.staging-dir</name>

<value>/user</value>

</property>


-<property>

<name>mapreduce.am.max-attempts</name>

<value>2</value>

</property>


-<property>

<name>yarn.app.mapreduce.am.resource.mb</name>

<value>1536</value>

</property>


-<property>

<name>yarn.app.mapreduce.am.resource.cpu-vcores</name>

<value>1</value>

</property>


-<property>

<name>mapreduce.job.ubertask.enable</name>

<value>false</value>

</property>


-<property>

<name>yarn.app.mapreduce.am.command-opts</name>

<value>-Djava.net.preferIPv4Stack=true -Xmx1073741824</value>

</property>


-<property>

<name>mapreduce.map.java.opts</name>

<value>-Djava.net.preferIPv4Stack=true -Xmx2147483648</value>

</property>


-<property>

<name>mapreduce.reduce.java.opts</name>

<value>-Djava.net.preferIPv4Stack=true -Xmx825955249</value>

</property>


-<property>

<name>yarn.app.mapreduce.am.admin.user.env</name>

<value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>

</property>


-<property>

<name>mapreduce.map.memory.mb</name>

<value>2560</value>

</property>


-<property>

<name>mapreduce.map.cpu.vcores</name>

<value>1</value>

</property>


-<property>

<name>mapreduce.reduce.memory.mb</name>

<value>1024</value>

</property>


-<property>

<name>mapreduce.reduce.cpu.vcores</name>

<value>1</value>

</property>


-<property>

<name>mapreduce.application.classpath</name>

<value>$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$MR2_CLASSPATH</value>

</property>


-<property>

<name>mapreduce.admin.user.env</name>

<value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>

</property>

</configuration>

 

 

 

yarn-site.xml

==========

 

-<configuration>


-<property>

<name>yarn.acl.enable</name>

<value>true</value>

</property>


-<property>

<name>yarn.admin.acl</name>

<value>*</value>

</property>


-<property>

<name>yarn.resourcemanager.ha.enabled</name>

<value>true</value>

</property>


-<property>

<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>

<value>true</value>

</property>


-<property>

<name>yarn.resourcemanager.ha.automatic-failover.embedded</name>

<value>true</value>

</property>


-<property>

<name>yarn.resourcemanager.recovery.enabled</name>

<value>true</value>

</property>


-<property>

<name>yarn.resourcemanager.zk-address</name>

<value>hostname_2.enterprisenet.org:2181,hostname.enterprisenet.org:2181,hostname_1.enterprisenet.org:2181</value>

</property>


-<property>

<name>yarn.resourcemanager.store.class</name>

<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>

</property>


-<property>

<name>yarn.client.failover-sleep-base-ms</name>

<value>100</value>

</property>


-<property>

<name>yarn.client.failover-sleep-max-ms</name>

<value>2000</value>

</property>


-<property>

<name>yarn.resourcemanager.cluster-id</name>

<value>yarnRM</value>

</property>


-<property>

<name>yarn.resourcemanager.address.rm648</name>

<value>hostname_1.enterprisenet.org:8032</value>

</property>


-<property>

<name>yarn.resourcemanager.scheduler.address.rm648</name>

<value>hostname_1.enterprisenet.org:8030</value>

</property>


-<property>

<name>yarn.resourcemanager.resource-tracker.address.rm648</name>

<value>hostname_1.enterprisenet.org:8031</value>

</property>


-<property>

<name>yarn.resourcemanager.admin.address.rm648</name>

<value>hostname_1.enterprisenet.org:8033</value>

</property>


-<property>

<name>yarn.resourcemanager.webapp.address.rm648</name>

<value>hostname_1.enterprisenet.org:8088</value>

</property>


-<property>

<name>yarn.resourcemanager.webapp.https.address.rm648</name>

<value>hostname_1.enterprisenet.org:8090</value>

</property>


-<property>

<name>yarn.resourcemanager.address.rm647</name>

<value>hostname.enterprisenet.org:8032</value>

</property>


-<property>

<name>yarn.resourcemanager.scheduler.address.rm647</name>

<value>hostname.enterprisenet.org:8030</value>

</property>


-<property>

<name>yarn.resourcemanager.resource-tracker.address.rm647</name>

<value>hostname.enterprisenet.org:8031</value>

</property>


-<property>

<name>yarn.resourcemanager.admin.address.rm647</name>

<value>hostname.enterprisenet.org:8033</value>

</property>


-<property>

<name>yarn.resourcemanager.webapp.address.rm647</name>

<value>hostname.enterprisenet.org:8088</value>

</property>


-<property>

<name>yarn.resourcemanager.webapp.https.address.rm647</name>

<value>hostname.enterprisenet.org:8090</value>

</property>


-<property>

<name>yarn.resourcemanager.ha.rm-ids</name>

<value>rm648,rm647</value>

</property>


-<property>

<name>yarn.resourcemanager.client.thread-count</name>

<value>50</value>

</property>


-<property>

<name>yarn.resourcemanager.scheduler.client.thread-count</name>

<value>50</value>

</property>


-<property>

<name>yarn.resourcemanager.admin.client.thread-count</name>

<value>1</value>

</property>


-<property>

<name>yarn.scheduler.minimum-allocation-mb</name>

<value>1024</value>

</property>


-<property>

<name>yarn.scheduler.increment-allocation-mb</name>

<value>512</value>

</property>


-<property>

<name>yarn.scheduler.maximum-allocation-mb</name>

<value>102400</value>

</property>


-<property>

<name>yarn.scheduler.minimum-allocation-vcores</name>

<value>1</value>

</property>


-<property>

<name>yarn.scheduler.increment-allocation-vcores</name>

<value>1</value>

</property>


-<property>

<name>yarn.scheduler.maximum-allocation-vcores</name>

<value>56</value>

</property>


-<property>

<name>yarn.resourcemanager.amliveliness-monitor.interval-ms</name>

<value>1000</value>

</property>


-<property>

<name>yarn.am.liveness-monitor.expiry-interval-ms</name>

<value>600000</value>

</property>


-<property>

<name>yarn.resourcemanager.am.max-attempts</name>

<value>2</value>

</property>


-<property>

<name>yarn.resourcemanager.container.liveness-monitor.interval-ms</name>

<value>600000</value>

</property>


-<property>

<name>yarn.resourcemanager.nm.liveness-monitor.interval-ms</name>

<value>1000</value>

</property>


-<property>

<name>yarn.nm.liveness-monitor.expiry-interval-ms</name>

<value>600000</value>

</property>


-<property>

<name>yarn.resourcemanager.resource-tracker.client.thread-count</name>

<value>50</value>

</property>


-<property>

<name>yarn.application.classpath</name>

<value>$HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*</value>

</property>


-<property>

<name>yarn.resourcemanager.scheduler.class</name>

<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>

</property>


-<property>

<name>yarn.resourcemanager.max-completed-applications</name>

<value>10000</value>

</property>


-<property>

<name>yarn.nodemanager.remote-app-log-dir</name>

<value>/tmp/logs</value>

</property>


-<property>

<name>yarn.nodemanager.remote-app-log-dir-suffix</name>

<value>logs</value>

</property>

</configuration>

 

Let me know if you need any other information.

 

Thanks,

Vinod

Highlighted

Re: Yarn is not stable and getting SIGTERM 15 and connection refused errors in resource manager and job history server

Rising Star

Hi @EricL ,

 

Could you please share your comments? The strange thing is that I have cleaned up everything, done a fresh install, and set up a new cluster.

But we are still facing the same issue: the jobs are running in local mode, not in YARN mode.

I have run it on the active ResourceManager server and on the edge node, but with no luck.

 

Can someone please let us know where I can debug and fix this issue?

Is this an OS-level issue, a YARN issue, or something else?

When I ran the host inspector, I didn't find any issues such as firewall, SELinux, or other problems.

 

Please do the needful and help us.

 

Best Regards,

Vinod

Don't have an account?
Coming from Hortonworks? Activate your account here