Created 10-06-2016 06:31 PM
Hi,
I'm experimenting with Cloudbreak to set up an Ambari 2.4/HDP 2.5 cluster quickly. There are 3 blueprints out of the box, but none of them is for HA.
I followed this wiki page https://cwiki.apache.org/confluence/display/AMBARI/Blueprint+Support+for+HA+Clusters#BlueprintSuppor...
I can only make the NameNode HA work. Here is the working blueprint.
{ "Blueprints": { "blueprint_name": "hdp-ha", "stack_name": "HDP", "stack_version": "2.5" }, "host_groups": [ { "name": "gateway", "components": [ { "name": "ZOOKEEPER_CLIENT" }, { "name": "KNOX_GATEWAY" }, { "name": "METRICS_MONITOR" }, { "name": "TEZ_CLIENT" }, { "name": "SLIDER" }, { "name": "SQOOP" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "METRICS_COLLECTOR" }, { "name": "MAPREDUCE2_CLIENT" } ], "cardinality": "1" }, { "name": "master_3", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "ZKFC" }, { "name": "JOURNALNODE" }, { "name": "METRICS_MONITOR" }, { "name": "NAMENODE" } ], "cardinality": "1" }, { "name": "slave_1", "components": [ { "name": "NODEMANAGER" }, { "name": "METRICS_MONITOR" }, { "name": "DATANODE" } ], "cardinality": "3" }, { "name": "master_2", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "JOURNALNODE" }, { "name": "METRICS_MONITOR" }, { "name": "RESOURCEMANAGER" }, { "name": "APP_TIMELINE_SERVER" } ], "cardinality": "1" }, { "name": "master_1", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "JOURNALNODE" }, { "name": "HISTORYSERVER" }, { "name": "NAMENODE" }, { "name": "ZKFC" }, { "name": "METRICS_MONITOR" } ], "cardinality": "1" } ], "configurations": [ { "core-site": { "properties": { "fs.defaultFS": "hdfs://mycluster", "ha.zookeeper.quorum": "%HOSTGROUP::master_1%:2181,%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_3%:2181" } } }, { "hdfs-site": { "properties": { "dfs.client.failover.proxy.provider.mycluster": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider", "dfs.ha.automatic-failover.enabled": "true", "dfs.ha.fencing.methods": "shell(/bin/true)", "dfs.ha.namenodes.mycluster": "nn1,nn2", "dfs.namenode.http-address": "%HOSTGROUP::master_1%:50070", "dfs.namenode.http-address.mycluster.nn1": "%HOSTGROUP::master_1%:50070", "dfs.namenode.http-address.mycluster.nn2": "%HOSTGROUP::master_3%:50070", "dfs.namenode.https-address": "%HOSTGROUP::master_1%:50470", 
"dfs.namenode.https-address.mycluster.nn1": "%HOSTGROUP::master_1%:50470", "dfs.namenode.https-address.mycluster.nn2": "%HOSTGROUP::master_3%:50470", "dfs.namenode.rpc-address.mycluster.nn1": "%HOSTGROUP::master_1%:8020", "dfs.namenode.rpc-address.mycluster.nn2": "%HOSTGROUP::master_3%:8020", "dfs.namenode.shared.edits.dir": "qjournal://%HOSTGROUP::master_1%:8485;%HOSTGROUP::master_2%:8485;%HOSTGROUP::master_3%:8485/mycluster", "dfs.nameservices": "mycluster" } } } ] }
Enabling ResourceManager HA always fails with the error "java.lang.IllegalArgumentException: Unable to update configuration property 'yarn.resourcemanager.admin.address' with topology information. Component 'RESOURCEMANAGER' is mapped to an invalid number of hosts '2'"
The blueprint for NN+RM HA is
{ "Blueprints": { "blueprint_name": "hdp-ha", "stack_name": "HDP", "stack_version": "2.5" }, "host_groups": [ { "name": "gateway", "components": [ { "name": "ZOOKEEPER_CLIENT" }, { "name": "KNOX_GATEWAY" }, { "name": "METRICS_MONITOR" }, { "name": "TEZ_CLIENT" }, { "name": "SLIDER" }, { "name": "SQOOP" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "METRICS_COLLECTOR" }, { "name": "MAPREDUCE2_CLIENT" } ], "cardinality": "1" }, { "name": "slave_1", "components": [ { "name": "NODEMANAGER" }, { "name": "METRICS_MONITOR" }, { "name": "DATANODE" } ], "cardinality": "3" }, { "name": "master_1", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "JOURNALNODE" }, { "name": "HISTORYSERVER" }, { "name": "NAMENODE" }, { "name": "ZKFC" }, { "name": "METRICS_MONITOR" } ], "cardinality": "1" }, { "name": "master_2", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "JOURNALNODE" }, { "name": "METRICS_MONITOR" }, { "name": "RESOURCEMANAGER" }, { "name": "APP_TIMELINE_SERVER" } ], "cardinality": "1" }, { "name": "master_3", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "ZKFC" }, { "name": "JOURNALNODE" }, { "name": "METRICS_MONITOR" }, { "name": "NAMENODE" }, { "name": "RESOURCEMANAGER" } ], "cardinality": "1" } ], "configurations": [ { "core-site": { "properties": { "fs.defaultFS": "hdfs://mycluster", "ha.zookeeper.quorum": "%HOSTGROUP::master_1%:2181,%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_3%:2181", "hadoop.proxyuser.yarn.hosts": "%HOSTGROUP::master_2%,%HOSTGROUP::master_3%" } } }, { "hdfs-site": { "properties": { "dfs.client.failover.proxy.provider.mycluster": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider", "dfs.ha.automatic-failover.enabled": "true", "dfs.ha.fencing.methods": "shell(/bin/true)", "dfs.ha.namenodes.mycluster": "nn1,nn2", 
"dfs.namenode.http-address": "%HOSTGROUP::master_1%:50070", "dfs.namenode.http-address.mycluster.nn1": "%HOSTGROUP::master_1%:50070", "dfs.namenode.http-address.mycluster.nn2": "%HOSTGROUP::master_3%:50070", "dfs.namenode.https-address": "%HOSTGROUP::master_1%:50470", "dfs.namenode.https-address.mycluster.nn1": "%HOSTGROUP::master_1%:50470", "dfs.namenode.https-address.mycluster.nn2": "%HOSTGROUP::master_3%:50470", "dfs.namenode.rpc-address.mycluster.nn1": "%HOSTGROUP::master_1%:8020", "dfs.namenode.rpc-address.mycluster.nn2": "%HOSTGROUP::master_3%:8020", "dfs.namenode.shared.edits.dir": "qjournal://%HOSTGROUP::master_1%:8485;%HOSTGROUP::master_2%:8485;%HOSTGROUP::master_3%:8485/mycluster", "dfs.nameservices": "mycluster" } } }, { "yarn-site": { "properties": { "yarn.resourcemanager.ha.enabled": "true", "yarn.resourcemanager.ha.rm-ids": "rm1,rm2", "yarn.resourcemanager.hostname.rm1": "%HOSTGROUP::master_2%", "yarn.resourcemanager.hostname.rm2": "%HOSTGROUP::master_3%", "yarn.resourcemanager.webapp.address.rm1": "%HOSTGROUP::master_2%:8088", "yarn.resourcemanager.webapp.address.rm2": "%HOSTGROUP::master_3%:8088", "yarn.resourcemanager.webapp.https.address.rm1": "%HOSTGROUP::master_2%:8090", "yarn.resourcemanager.webapp.https.address.rm2": "%HOSTGROUP::master_3%:8090", "yarn.resourcemanager.recovery.enabled": "true", "yarn.resourcemanager.store.class": "org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore", "yarn.resourcemanager.zk-address": "%HOSTGROUP::master_1%:2181,%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_3%:2181", "yarn.client.failover-proxy-provider": "org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider", "yarn.resourcemanager.cluster-id": "yarn-cluster", "yarn.resourcemanager.ha.automatic-failover.zk-base-path": "/yarn-leader-election" } } } ] }
Has anyone had experience with an RM HA blueprint? Or is it a bug in Ambari 2.4?
Thanks in advance.
Wendell
Created 10-10-2016 10:02 PM
Hello @wbu, you have missed a few of the properties in `yarn-site`. Please replace it with the following snippet:
"properties": { "hadoop.registry.rm.enabled": "false", "hadoop.registry.zk.quorum": "%HOSTGROUP::master_3%:2181,%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_1%:2181", "yarn.log.server.url": "http://%HOSTGROUP::master_2%:19888/jobhistory/logs", "yarn.resourcemanager.address": "%HOSTGROUP::master_2%:8050", "yarn.resourcemanager.admin.address": "%HOSTGROUP::master_2%:8141", "yarn.resourcemanager.cluster-id": "yarn-cluster", "yarn.resourcemanager.ha.automatic-failover.zk-base-path": "/yarn-leader-election", "yarn.resourcemanager.ha.enabled": "true", "yarn.resourcemanager.ha.rm-ids": "rm1,rm2", "yarn.resourcemanager.hostname": "%HOSTGROUP::master_2%", "yarn.resourcemanager.recovery.enabled": "true", "yarn.resourcemanager.resource-tracker.address": "%HOSTGROUP::master_2%:8025", "yarn.resourcemanager.scheduler.address": "%HOSTGROUP::master_2%:8030", "yarn.resourcemanager.store.class": "org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore", "yarn.resourcemanager.webapp.address": "%HOSTGROUP::master_2%:8088", "yarn.resourcemanager.webapp.https.address": "%HOSTGROUP::master_2%:8090", "yarn.timeline-service.address": "%HOSTGROUP::master_2%:10200", "yarn.timeline-service.webapp.address": "%HOSTGROUP::master_2%:8188", "yarn.timeline-service.webapp.https.address": "%HOSTGROUP::master_2%:8190", "yarn.resourcemanager.zk-address": "%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_1%:2181,%HOSTGROUP::master_3%:2181", "yarn.resourcemanager.hostname.rm1": "%HOSTGROUP::master_2%", "yarn.resourcemanager.hostname.rm2": "%HOSTGROUP::master_3%" }
Created 10-10-2016 10:02 PM
Hello @wbu, you have missed a few of the properties in `yarn-site`. Please replace it with the following snippet:
"properties": { "hadoop.registry.rm.enabled": "false", "hadoop.registry.zk.quorum": "%HOSTGROUP::master_3%:2181,%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_1%:2181", "yarn.log.server.url": "http://%HOSTGROUP::master_2%:19888/jobhistory/logs", "yarn.resourcemanager.address": "%HOSTGROUP::master_2%:8050", "yarn.resourcemanager.admin.address": "%HOSTGROUP::master_2%:8141", "yarn.resourcemanager.cluster-id": "yarn-cluster", "yarn.resourcemanager.ha.automatic-failover.zk-base-path": "/yarn-leader-election", "yarn.resourcemanager.ha.enabled": "true", "yarn.resourcemanager.ha.rm-ids": "rm1,rm2", "yarn.resourcemanager.hostname": "%HOSTGROUP::master_2%", "yarn.resourcemanager.recovery.enabled": "true", "yarn.resourcemanager.resource-tracker.address": "%HOSTGROUP::master_2%:8025", "yarn.resourcemanager.scheduler.address": "%HOSTGROUP::master_2%:8030", "yarn.resourcemanager.store.class": "org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore", "yarn.resourcemanager.webapp.address": "%HOSTGROUP::master_2%:8088", "yarn.resourcemanager.webapp.https.address": "%HOSTGROUP::master_2%:8090", "yarn.timeline-service.address": "%HOSTGROUP::master_2%:10200", "yarn.timeline-service.webapp.address": "%HOSTGROUP::master_2%:8188", "yarn.timeline-service.webapp.https.address": "%HOSTGROUP::master_2%:8190", "yarn.resourcemanager.zk-address": "%HOSTGROUP::master_2%:2181,%HOSTGROUP::master_1%:2181,%HOSTGROUP::master_3%:2181", "yarn.resourcemanager.hostname.rm1": "%HOSTGROUP::master_2%", "yarn.resourcemanager.hostname.rm2": "%HOSTGROUP::master_3%" }
Created 10-11-2016 02:00 PM
Thanks, @Bhavin Tandel
It works.
Created 11-16-2016 06:19 PM
I am trying to set up an NN+RM HA blueprint as shown below, but I am unable to launch it successfully — I get errors such as failures to start the NameNode and ResourceManager services. Please advise.
{ "Blueprints": { "blueprint_name": "NN_RM_HA", "stack_name": "HDP", "stack_version": "2.5" }, "host_groups": [ { "name": "edge", "components": [ { "name": "ZOOKEEPER_CLIENT" }, { "name": "KNOX_GATEWAY" }, { "name": "METRICS_MONITOR" }, { "name": "TEZ_CLIENT" }, { "name": "SLIDER" }, { "name": "SQOOP" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "METRICS_COLLECTOR" }, { "name": "MAPREDUCE2_CLIENT" } ], "cardinality": "1" }, { "name": "datanodes", "components": [ { "name": "NODEMANAGER" }, { "name": "METRICS_MONITOR" }, { "name": "DATANODE" } ], "cardinality": "4" }, { "name": "mgmt_1", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "JOURNALNODE" }, { "name": "HISTORYSERVER" }, { "name": "NAMENODE" }, { "name": "ZKFC" }, { "name": "METRICS_MONITOR" } ], "cardinality": "1" }, { "name": "mgmt_2", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "JOURNALNODE" }, { "name": "METRICS_MONITOR" }, { "name": "RESOURCEMANAGER" }, { "name": "APP_TIMELINE_SERVER" } ], "cardinality": "1" }, { "name": "mgmt_3", "components": [ { "name": "ZOOKEEPER_SERVER" }, { "name": "HDFS_CLIENT" }, { "name": "YARN_CLIENT" }, { "name": "ZKFC" }, { "name": "JOURNALNODE" }, { "name": "METRICS_MONITOR" }, { "name": "NAMENODE" }, { "name": "RESOURCEMANAGER" } ], "cardinality": "1" } ], "configurations": [ { "core-site": { "properties": { "fs.defaultFS": "hdfs://mycluster", "ha.zookeeper.quorum": "%HOSTGROUP::mgmt_1%:2181,%HOSTGROUP::mgmt_2%:2181,%HOSTGROUP::mgmt_3%:2181", "hadoop.proxyuser.yarn.hosts": "%HOSTGROUP::mgmt_2%,%HOSTGROUP::mgmt_3%" } } }, { "hdfs-site": { "properties": { "dfs.client.failover.proxy.provider.mycluster": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider", "dfs.ha.automatic-failover.enabled": "true", "dfs.ha.fencing.methods": "shell(/bin/true)", "dfs.ha.namenodes.mycluster": "nn1,nn2", 
"dfs.namenode.http-address": "%HOSTGROUP::mgmt_1%:50070", "dfs.namenode.http-address.mycluster.nn1": "%HOSTGROUP::mgmt_1%:50070", "dfs.namenode.http-address.mycluster.nn2": "%HOSTGROUP::mgmt_3%:50070", "dfs.namenode.https-address": "%HOSTGROUP::mgmt_1%:50470", "dfs.namenode.https-address.mycluster.nn1": "%HOSTGROUP::mgmt_1%:50470", "dfs.namenode.https-address.mycluster.nn2": "%HOSTGROUP::mgmt_3%:50470", "dfs.namenode.rpc-address.mycluster.nn1": "%HOSTGROUP::mgmt_1%:8020", "dfs.namenode.rpc-address.mycluster.nn2": "%HOSTGROUP::mgmt_3%:8020", "dfs.namenode.shared.edits.dir": "qjournal://%HOSTGROUP::mgmt_1%:8485;%HOSTGROUP::mgmt_2%:8485;%HOSTGROUP::mgmt_3%:8485/mycluster", "dfs.nameservices": "mycluster" } } }, { "yarn-site": { "properties": { "hadoop.registry.rm.enabled": "false", "hadoop.registry.zk.quorum": "%HOSTGROUP::mgmt_3%:2181,%HOSTGROUP::mgmt_2%:2181,%HOSTGROUP::mgmt_1%:2181", "yarn.log.server.url": "http://%HOSTGROUP::mgmt_2%:19888/jobhistory/logs", "yarn.resourcemanager.address": "%HOSTGROUP::mgmt_2%:8050", "yarn.resourcemanager.admin.address": "%HOSTGROUP::mgmt_2%:8141", "yarn.resourcemanager.cluster-id": "yarn-cluster", "yarn.resourcemanager.ha.automatic-failover.zk-base-path": "/yarn-leader-election", "yarn.resourcemanager.ha.enabled": "true", "yarn.resourcemanager.ha.rm-ids": "rm1,rm2", "yarn.resourcemanager.hostname": "%HOSTGROUP::mgmt_2%", "yarn.resourcemanager.recovery.enabled": "true", "yarn.resourcemanager.resource-tracker.address": "%HOSTGROUP::mgmt_2%:8025", "yarn.resourcemanager.scheduler.address": "%HOSTGROUP::mgmt_2%:8030", "yarn.resourcemanager.store.class": "org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore", "yarn.resourcemanager.webapp.address": "%HOSTGROUP::mgmt_2%:8088", "yarn.resourcemanager.webapp.https.address": "%HOSTGROUP::mgmt_2%:8090", "yarn.timeline-service.address": "%HOSTGROUP::mgmt_2%:10200", "yarn.timeline-service.webapp.address": "%HOSTGROUP::mgmt_2%:8188", "yarn.timeline-service.webapp.https.address": 
"%HOSTGROUP::mgmt_2%:8190", "yarn.resourcemanager.zk-address": "%HOSTGROUP::mgmt_2%:2181,%HOSTGROUP::mgmt_1%:2181,%HOSTGROUP::mgmt_3%:2181", "yarn.resourcemanager.hostname.rm1": "%HOSTGROUP::mgmt_2%", "yarn.resourcemanager.hostname.rm2": "%HOSTGROUP::mgmt_3%" } } } ] }