Goals:
Stand up an HDP 2.4 cluster with HAWQ (HDB 2.0.0), PXF, and Zeppelin using ambari-bootstrap, then query Hive tables from HAWQ through the HCatalog/PXF integration.
Notes:
The commands below assume a four-node cluster (node1-node4) with the Ambari server on node1 and the PXF service on node2; adjust hostnames to your environment.
On every node that will run only an Ambari agent, point ambari_server at the Ambari host and run the bootstrap:
export ambari_server=node1
curl -sSL https://raw.githubusercontent.com/seanorama/ambari-bootstrap/master/ambari-bootstrap.sh | sudo -E sh
On the node that will host the Ambari server, run the same script with install_ambari_server set:
export install_ambari_server=true
curl -sSL https://raw.githubusercontent.com/seanorama/ambari-bootstrap/master/ambari-bootstrap.sh | sudo -E sh
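Rather than logging in to each agent node, the agent bootstrap can also be pushed out from the Ambari host. This is only a sketch and assumes passwordless ssh as root and agent hostnames node2-node4 (adjust to your cluster):
# hypothetical loop; requires passwordless ssh as root to node2-node4
for host in node2 node3 node4; do
  ssh root@${host} 'export ambari_server=node1; curl -sSL https://raw.githubusercontent.com/seanorama/ambari-bootstrap/master/ambari-bootstrap.sh | sudo -E sh'
done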
yum install -y git
git clone https://github.com/hortonworks-gallery/ambari-zeppelin-service.git /var/lib/ambari-server/resources/stacks/HDP/2.4/services/ZEPPELIN
sed -i.bak '/dependencies for all/a \ "ZEPPELIN_MASTER-START": ["NAMENODE-START", "DATANODE-START"],' /var/lib/ambari-server/resources/stacks/HDP/2.4/role_command_order.json
mkdir /staging
chmod a+rx /staging
tar -xvzf /staging/hdb-2.0.0.0-*.tar.gz -C /staging/
tar -xvzf /staging/hdb-ambari-plugin-2.0.0-*.tar.gz -C /staging/
yum install -y httpd
service httpd start
cd /staging/hdb*
./setup_repo.sh
cd /staging/hdb-ambari-plugin*
./setup_repo.sh
yum install -y hdb-ambari-plugin
service ambari-server restart
service ambari-agent restart
curl -u admin:admin -H X-Requested-By:ambari http://localhost:8080/api/v1/hosts
service ambari-agent status
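Before moving on, confirm that all nodes have registered. As a quick sketch (assuming Ambari's default pretty-printed JSON and the four-node cluster used here), counting the "host_name" entries should return the expected node count:
# expect this to print 4 for a four-node cluster
curl -s -u admin:admin -H X-Requested-By:ambari http://localhost:8080/api/v1/hosts | grep -c '"host_name"'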
#install bootstrap
yum install -y python-argparse
cd
git clone https://github.com/seanorama/ambari-bootstrap.git
#decide which services to deploy and set the number of nodes in the cluster
export ambari_services="HDFS MAPREDUCE2 YARN ZOOKEEPER HIVE ZEPPELIN SPARK HAWQ PXF"
export host_count=4
cd ./ambari-bootstrap/deploy/
cat << EOF > configuration-custom.json
{
  "configurations" : {
    "hdfs-site": {
        "dfs.allow.truncate": "true",
        "dfs.block.access.token.enable": "false",
        "dfs.block.local-path-access.user": "gpadmin",
        "dfs.client.read.shortcircuit": "true",
        "dfs.client.socket-timeout": "300000000",
        "dfs.client.use.legacy.blockreader.local": "false",
        "dfs.datanode.handler.count": "60",
        "dfs.datanode.socket.write.timeout": "7200000",                                
        "dfs.namenode.handler.count": "600",
        "dfs.support.append": "true"               
    },
    "hawq-site":{
        "hawq_master_address_port":"5432"
    },
    "hawq-env":{
        "hawq_password":"gpadmin"
    },
    "core-site": {
        "ipc.client.connection.maxidletime": "3600000",
        "ipc.client.connect.timeout": "300000",
        "ipc.server.listen.queue.size": "3300"
    }
  }
}
EOF
#optional - if you want to review the BP before deploying it
#export deploy=false
#./deploy-recommended-cluster.bash
#more temp*/blueprint.json
#generate BP including customizations and start cluster deployment
export deploy=true
./deploy-recommended-cluster.bash
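The blueprint deployment runs asynchronously; progress can be followed in the Ambari UI, or polled through the requests API. The call below is only a sketch: substitute the cluster name that the blueprint deployment registered (visible in the UI).
curl -s -u admin:admin -H X-Requested-By:ambari "http://localhost:8080/api/v1/clusters/<clustername>/requests?fields=Requests/request_status,Requests/progress_percent"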
If the deployment needs to be re-run, delete the previously registered blueprint first:
curl -u admin:admin -H X-Requested-By:ambari -X DELETE http://localhost:8080/api/v1/blueprints/recommended
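A partially created cluster can be removed the same way before retrying; this is a sketch (substitute the registered cluster name, and note that Ambari requires services to be stopped before it will delete a cluster):
curl -u admin:admin -H X-Requested-By:ambari -X DELETE http://localhost:8080/api/v1/clusters/<clustername>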
su - gpadmin
source /usr/local/hawq/greenplum_path.sh
export PGPORT=5432
psql -d postgres
create database contoso;
ALTER ROLE gpadmin WITH PASSWORD 'gpadmin';
\q
echo "host all all 172.17.0.2/32 trust" >> /data/hawq/master/pg_hba.conf
hawq stop cluster --reload
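To confirm that the new pg_hba.conf entry took effect, a quick connection test can be run from the trusted address (e.g. the Zeppelin container at 172.17.0.2); this is a sketch and assumes a psql client is available there and that node1 is the HAWQ master:
psql -h node1 -p 5432 -U gpadmin -d contoso -c "select current_user, current_database();"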
%psql.sql
create table tt (i int);
insert into tt select generate_series(1,1000000);
%psql.sql
select avg(i) from tt where i>5000;
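To see how HAWQ distributes the scan and aggregation across segments, the same query can be explained in another paragraph (a sketch using the tt table created above):
%psql.sql
explain select avg(i) from tt where i>5000;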
#Import sample tables
cd /tmp
wget https://raw.githubusercontent.com/abajwa-hw/security-workshops/master/data/sample_07.csv
beeline -u "jdbc:hive2://localhost:10000/default"
use default;
CREATE TABLE `sample_07` (
  `code` string,
  `description` string,
  `total_emp` int,
  `salary` int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TextFile;
load data local inpath '/tmp/sample_07.csv' into table sample_07;
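A quick count in beeline confirms the load before the table is queried from HAWQ (the exact count depends on the sample file):
beeline -u "jdbc:hive2://localhost:10000/default" -e "select count(*) from default.sample_07;"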
su - gpadmin
source /usr/local/hawq/greenplum_path.sh
export PGPORT=5432
psql -d postgres
SET hcatalog_enable = true;
SET pxf_service_address TO "node2:51200";
select count(*) from hcatalog.default.sample_07;
%psql.sql
SET hcatalog_enable = true;
SET pxf_service_address TO "node2:51200";
%psql.sql
select description, salary from hcatalog.default.sample_07;
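As an alternative to the on-the-fly hcatalog integration, the same Hive table can be exposed through an explicit PXF external table. The statement below is a sketch that assumes the PXF Hive profile and the node2:51200 PXF address configured above:
%psql.sql
CREATE EXTERNAL TABLE sample_07_pxf (code text, description text, total_emp int, salary int)
LOCATION ('pxf://node2:51200/default.sample_07?PROFILE=Hive')
FORMAT 'CUSTOM' (formatter='pxfwritable_import');
select count(*) from sample_07_pxf;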
A retail dataset with sample queries can be found at https://github.com/pivotalsoftware/pivotal-samples
cd /tmp
git clone https://github.com/pivotalsoftware/pivotal-samples.git
cd /tmp/pivotal-samples/sample-data/
sudo -u hdfs ./load_data_to_HDFS.sh
sudo -u hdfs hdfs dfs -chmod -R 777 /retail_demo
hive -f /tmp/pivotal-samples/hive/create_hive_tables.sql
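Before running the sample queries, confirm that the loader placed the files under the directory the Hive DDL expects (the directory layout comes from the pivotal-samples scripts):
sudo -u hdfs hdfs dfs -ls /retail_demo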
%psql.sql
SELECT product_id
,      product_category_id
,      product_count
,      category_rank
FROM (SELECT product_id, product_category_id
      ,      SUM(item_quantity::int) AS product_count
      ,      row_number() OVER (PARTITION BY product_category_id ORDER BY SUM(item_quantity::int) DESC) AS category_rank
      FROM   hcatalog.retail_demo.order_lineitems_hive
      GROUP BY product_id, product_category_id
     ) AS lineitems
WHERE category_rank <= 10
ORDER BY product_category_id, category_rank
;
%psql.sql
SELECT CASE WHEN order_datetime::timestamp < timestamp '2010-10-08' THEN date_trunc('day', order_datetime::timestamp + interval ' 1 week')
            ELSE date_trunc('day', order_datetime::timestamp)
       END::date AS order_day
,      SUM(CASE WHEN order_datetime >= timestamp '2010-10-08' THEN 1 ELSE 0 END) AS num_orders_current
,      SUM(CASE WHEN order_datetime <  timestamp '2010-10-08' THEN 1 ELSE 0 END) AS num_orders_last_week
FROM   hcatalog.retail_demo.order_lineitems_hive
WHERE  order_datetime BETWEEN timestamp '2010-10-01' AND timestamp '2010-10-15 23:59:59'
GROUP BY 1
ORDER BY 1
;
