From 79d25a50f98685d4f9ccb7127a705fd16f3afb29 Mon Sep 17 00:00:00 2001 From: lhoss Date: Wed, 19 May 2021 18:52:49 +0200 Subject: [PATCH 01/23] static inventory with TF-deployed hosts using ansible_host for ssh-access via pub-IP --- examples/sandbox/inventory_static.ini | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 examples/sandbox/inventory_static.ini diff --git a/examples/sandbox/inventory_static.ini b/examples/sandbox/inventory_static.ini new file mode 100644 index 0000000..ff4f5eb --- /dev/null +++ b/examples/sandbox/inventory_static.ini @@ -0,0 +1,55 @@ +# Copyright 2021 Cloudera, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +[cloudera_manager] +eval-cdp-public0.internal.cloudapp.net ansible_host=13.92.192.89 + +[cluster_master_nodes] +eval-cdp-public0.internal.cloudapp.net host_template=Master1 ansible_host=13.92.192.89 + +[cluster_worker_nodes] +eval-cdp-public1.internal.cloudapp.net ansible_host=13.92.254.225 +eval-cdp-public2.internal.cloudapp.net ansible_host=13.92.249.4 +eval-cdp-public3.internal.cloudapp.net ansible_host=13.92.254.136 + + +[cluster_worker_nodes:vars] +host_template=Workers + +[cluster:children] +cluster_master_nodes +cluster_worker_nodes + +[db_server:children] +cloudera_manager + +[deployment:children] +cluster +db_server + + + +[deployment:vars] +# Ansible will defer to the running SSH Agent for relevant keys +# Set the following to hardcode the SSH private key for the instances +# ansible_ssh_private_key_file=~/.ssh/mykey.pem +#ansible_user=centos +ansible_user=adminuser + +#TODO enhance TF deploy with: +# -SSH access via ssh keys +# -passwordless ssh-key based internode login + +ansible_password= From c5402c9b3efdc162fd6c169f2ea6cc0689e448d8 Mon Sep 17 00:00:00 2001 From: lhoss Date: Wed, 19 May 2021 18:54:04 +0200 Subject: [PATCH 02/23] HACK: set init__call_cloud_role=False to skip cloud roles --- roles/cloudera_deploy/tasks/init.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/roles/cloudera_deploy/tasks/init.yml b/roles/cloudera_deploy/tasks/init.yml index cd6f08b..85812fd 100644 --- a/roles/cloudera_deploy/tasks/init.yml +++ b/roles/cloudera_deploy/tasks/init.yml @@ -351,6 +351,8 @@ __no_log_globals: admin_password: "{{ admin_password | mandatory }}" +# TODO undo hack. How to ensure below used vars are all undefined (empty is not enough) ?! 
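+# One hedged option for the TODO above (expression illustrative, not applied here):
+# treat empty values as "unset" by filtering on truthiness instead of `is defined`, e.g.
+#   init__call_cloud_role: "{{ [infra | default(''), env | default(''), ml | default(''),
+#                               de | default(''), datahub | default(''), opdb | default(''),
+#                               dw | default('')] | select | list | length > 0 }}"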
 - name: Determine if Cloud Roles should be called
   ansible.builtin.set_fact:
-    init__call_cloud_role: "{{ infra is defined or env is defined or ml is defined or de is defined or datahub is defined or opdb is defined or dw is defined | default(False) }}"
+#    init__call_cloud_role: "{{ infra is defined or env is defined or ml is defined or de is defined or datahub is defined or opdb is defined or dw is defined | default(False) }}"
+    init__call_cloud_role: False

From 7712385cab7cac3b03556623e2455bfca71970d5 Mon Sep 17 00:00:00 2001
From: lhoss
Date: Wed, 19 May 2021 19:25:22 +0200
Subject: [PATCH 03/23] CDH_playbook_run_20200519_02: use custom data dirs for
 HDFS and zk

---
 roles/cloudera_deploy/defaults/basic_cluster.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/roles/cloudera_deploy/defaults/basic_cluster.yml b/roles/cloudera_deploy/defaults/basic_cluster.yml
index 22daff5..d198e57 100644
--- a/roles/cloudera_deploy/defaults/basic_cluster.yml
+++ b/roles/cloudera_deploy/defaults/basic_cluster.yml
@@ -21,14 +21,16 @@ clusters:
     services: [HDFS, YARN, ZOOKEEPER]
     repositories:
       - https://archive.cloudera.com/cdh7/7.1.4.0/parcels/
+    # Changed the dir configs below to move them off the root partition, which is too small (30GB by default)
+    # TODO Ensure that the mount point is not on an ephemeral disk. If it is, it is easier to just provision a larger root partition.
     configs:
       HDFS:
         DATANODE:
-          dfs_data_dir_list: /dfs/dn
+          dfs_data_dir_list: /mnt/resource/dfs/dn
         NAMENODE:
-          dfs_name_dir_list: /dfs/nn
+          dfs_name_dir_list: /mnt/resource/dfs/nn
         SECONDARYNAMENODE:
-          fs_checkpoint_dir_list: /dfs/snn
+          fs_checkpoint_dir_list: /mnt/resource/dfs/snn
       YARN:
         RESOURCEMANAGER:
           yarn_scheduler_maximum_allocation_mb: 4096
@@ -44,6 +46,11 @@ clusters:
       ZOOKEEPER:
         SERVICEWIDE:
           zookeeper_datadir_autocreate: true
+        # Added the following entries (inspired by defaults.j2) for the custom "zk" dirs
+        SERVER:
+          dataDir: /mnt/resource/zookeeper/dataDir
+          dataLogDir: /mnt/resource/zookeeper/dataLogDir
+
 host_templates:
   Master1:
     HDFS: [NAMENODE, SECONDARYNAMENODE]

From 9d95d6c7f3764f4295632f40cb8f97b970932312 Mon Sep 17 00:00:00 2001
From: lhoss
Date: Tue, 25 May 2021 12:42:39 +0200
Subject: [PATCH 04/23] CDH_playbook_run_20200525a: new VMs; undo usage of
 mount point /mnt/resource (on temporary disk)

---
 examples/sandbox/inventory_static.ini            | 10 +++++-----
 roles/cloudera_deploy/defaults/basic_cluster.yml | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/sandbox/inventory_static.ini b/examples/sandbox/inventory_static.ini
index ff4f5eb..88efc2a 100644
--- a/examples/sandbox/inventory_static.ini
+++ b/examples/sandbox/inventory_static.ini
@@ -14,15 +14,15 @@
 
 [cloudera_manager]
-eval-cdp-public0.internal.cloudapp.net ansible_host=13.92.192.89
+eval-cdp-public0.internal.cloudapp.net ansible_host=40.88.26.110
 
 [cluster_master_nodes]
-eval-cdp-public0.internal.cloudapp.net host_template=Master1 ansible_host=13.92.192.89
+eval-cdp-public0.internal.cloudapp.net host_template=Master1 ansible_host=40.88.26.110
 
 [cluster_worker_nodes]
-eval-cdp-public1.internal.cloudapp.net ansible_host=13.92.254.225
-eval-cdp-public2.internal.cloudapp.net ansible_host=13.92.249.4
-eval-cdp-public3.internal.cloudapp.net ansible_host=13.92.254.136
+eval-cdp-public1.internal.cloudapp.net ansible_host=40.88.26.103
+eval-cdp-public2.internal.cloudapp.net ansible_host=40.88.26.108
+eval-cdp-public3.internal.cloudapp.net ansible_host=40.87.20.33
 
 
 [cluster_worker_nodes:vars]
 host_template=Workers
diff --git
a/roles/cloudera_deploy/defaults/basic_cluster.yml b/roles/cloudera_deploy/defaults/basic_cluster.yml index d198e57..7e4ba43 100644 --- a/roles/cloudera_deploy/defaults/basic_cluster.yml +++ b/roles/cloudera_deploy/defaults/basic_cluster.yml @@ -26,11 +26,11 @@ clusters: configs: HDFS: DATANODE: - dfs_data_dir_list: /mnt/resource/dfs/dn + dfs_data_dir_list: /app/data/dfs/dn NAMENODE: - dfs_name_dir_list: /mnt/resource/dfs/nn + dfs_name_dir_list: /app/data/dfs/nn SECONDARYNAMENODE: - fs_checkpoint_dir_list: /mnt/resource/dfs/snn + fs_checkpoint_dir_list: /app/data/dfs/snn YARN: RESOURCEMANAGER: yarn_scheduler_maximum_allocation_mb: 4096 @@ -48,8 +48,8 @@ clusters: zookeeper_datadir_autocreate: true # Added following entries (inspired from defaults.j2) also for custom "zk" dirs SERVER: - dataDir: /mnt/resource/zookeeper/dataDir - dataLogDir: /mnt/resource/zookeeper/dataLogDir + dataDir: /app/data/zookeeper/dataDir + dataLogDir: /app/data/zookeeper/dataLogDir host_templates: Master1: From 5a3fc0f9c8a73228fc7f9ae30776218f69019028 Mon Sep 17 00:00:00 2001 From: lhoss Date: Tue, 25 May 2021 18:42:23 +0200 Subject: [PATCH 05/23] workaround to fix "cloudera_manager_admin_password" fact missing on localhost --- .../tasks/distribute_facts_to_inventory.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/roles/cloudera_deploy/tasks/distribute_facts_to_inventory.yml b/roles/cloudera_deploy/tasks/distribute_facts_to_inventory.yml index 4c74f6b..65dc20f 100644 --- a/roles/cloudera_deploy/tasks/distribute_facts_to_inventory.yml +++ b/roles/cloudera_deploy/tasks/distribute_facts_to_inventory.yml @@ -26,6 +26,14 @@ loop_var: __play_host label: __play_host +- name: LOCAL Set Sensitive Facts with no-log for later use in Cluster Deployment + ansible.builtin.set_fact: + cloudera_manager_admin_password: "{{ globals.admin_password }}" + cloudera_manager_license_file: "{{ globals.cloudera_license_file | default(omit) }}" + delegate_to: "localhost" + delegate_facts: true + #no_log: true + - name: Set Sensitive Facts with no-log for later use in Cluster Deployment ansible.builtin.set_fact: cloudera_manager_admin_password: "{{ globals.admin_password }}" From 223df7ad4548c615bc1977678de839a1143cf8a6 Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 12:20:36 +0200 Subject: [PATCH 06/23] initial config duplicated from basic_cluster --- .../defaults/cluster_3M2E3W.yml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 roles/cloudera_deploy/defaults/cluster_3M2E3W.yml diff --git a/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml b/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml new file mode 100644 index 0000000..6f85f36 --- /dev/null +++ b/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml @@ -0,0 +1,55 @@ +--- + +# +cloudera_manager_version: 7.1.4 + +clusters: + - name: Basic Cluster + services: [HDFS, YARN, ZOOKEEPER] + repositories: + - https://archive.cloudera.com/cdh7/7.1.4.0/parcels/ + configs: + HDFS: + DATANODE: + dfs_data_dir_list: /dfs/dn + NAMENODE: + dfs_name_dir_list: /dfs/nn + SECONDARYNAMENODE: + fs_checkpoint_dir_list: /dfs/snn + YARN: + RESOURCEMANAGER: + yarn_scheduler_maximum_allocation_mb: 4096 + yarn_scheduler_maximum_allocation_vcores: 4 + NODEMANAGER: + yarn_nodemanager_resource_memory_mb: 4096 + yarn_nodemanager_resource_cpu_vcores: 4 + yarn_nodemanager_local_dirs: /tmp/nm + yarn_nodemanager_log_dirs: /var/log/nm + GATEWAY: + mapred_submit_replication: 3 + mapred_reduce_tasks: 6 + ZOOKEEPER: + SERVICEWIDE: + zookeeper_datadir_autocreate: true + 
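+# How the host_templates below are consumed (sketch; hostname/IP are illustrative
+# placeholders): each inventory host carries a host_template variable, e.g.
+#   [cluster_master_nodes]
+#   master0.example.internal host_template=Master1 ansible_host=203.0.113.10
+# and the host is then assigned every role listed under the matching template key.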
host_templates: + Master1: + HDFS: [NAMENODE, SECONDARYNAMENODE] + YARN: [RESOURCEMANAGER, JOBHISTORY] + ZOOKEEPER: [SERVER] + Workers: + HDFS: [DATANODE] + YARN: [NODEMANAGER] + +mgmt: + name: Cloudera Management Service + services: [ALERTPUBLISHER, EVENTSERVER, HOSTMONITOR, REPORTSMANAGER, SERVICEMONITOR] + +hosts: + configs: + host_default_proc_memswap_thresholds: + warning: never + critical: never + host_memswap_thresholds: + warning: never + critical: never + host_config_suppression_agent_system_user_group_validator: true \ No newline at end of file From 3b71310f83f2183ca326f1891a7a42ab961ccd84 Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 12:32:16 +0200 Subject: [PATCH 07/23] config for an 8-node cluster (3 master, 2 edge, 3 worker) --- .../defaults/cluster_3M2E3W.yml | 485 +++++++++++++++++- 1 file changed, 466 insertions(+), 19 deletions(-) diff --git a/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml b/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml index 6f85f36..84b4d74 100644 --- a/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml +++ b/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml @@ -5,40 +5,487 @@ cloudera_manager_version: 7.1.4 clusters: - name: Basic Cluster - services: [HDFS, YARN, ZOOKEEPER] repositories: - https://archive.cloudera.com/cdh7/7.1.4.0/parcels/ + services: + - ZOOKEEPER + - HDFS + - YARN + - SPARK_ON_YARN + - RANGER + - HUE + - INFRA_SOLR + - ATLAS + - HIVE + - HIVE_ON_TEZ + - TEZ + - ZEPPELIN + - KAFKA + - KNOX + - HBASE + - STREAMS_MESSAGING_MANAGER + - QUEUEMANAGER + + ### DSS_DEV service configs + # Note: The config was derived from a cluster export JSON (converted to YML via custom J2-based script) configs: + # Service + RANGER: + RANGER_ADMIN: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + RANGER_TAGSYNC: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + RANGER_USERSYNC: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + ranger.usersync.sleeptimeinmillisbetweensynccycle: "14400000" + + #SERVICEWIDE: + + + # Service + HUE: + HUE_LOAD_BALANCER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + HUE_SERVER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + KT_RENEWER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + SERVICEWIDE: + service_config_suppression_hue_load_balancer_count_validator: "true" + time_zone: "Europe/Zurich" + + + # Service HDFS: + #BALANCER: + DATANODE: - dfs_data_dir_list: /dfs/dn + datanode_data_directories_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + # DONE custom dirs (all 9 disks) + dfs_data_dir_list: "/app/data/disk01/dfs/dn,/app/data/disk02/dfs/dn,/app/data/disk03/dfs/dn,/app/data/disk04/dfs/dn,/app/data/disk05/dfs/dn,/app/data/disk06/dfs/dn,/app/data/disk07/dfs/dn,/app/data/disk08/dfs/dn,/app/data/disk09/dfs/dn" + dfs_datanode_data_dir_perm: "700" + dfs_datanode_du_reserved: "10732175360" + # proposed value by xlaho 2 (vs '3'): + dfs_datanode_failed_volumes_tolerated: "2" + 
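+          # Note: dfs_datanode_failed_volumes_tolerated must stay below the number of
+          # configured data dirs (9 above); with "2", the DataNode keeps serving until a
+          # third disk fails.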
heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + role_config_suppression_dfs_datanode_http_port: "true" + role_config_suppression_dfs_datanode_port: "true" + + FAILOVERCONTROLLER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + GATEWAY: + dfs_client_use_trash: "true" + + HTTPFS: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + JOURNALNODE: + # DONE custom dirs + dfs_journalnode_edits_dir: "/app/data/journalnode/dfs/jn" + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + journalnode_edits_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + NAMENODE: - dfs_name_dir_list: /dfs/nn + # DONE custom dirs + dfs_name_dir_list: "/app/data/namenode/dfs/nn" + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + namenode_data_directories_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + role_health_suppression_name_node_safe_mode: "true" + + NFSGATEWAY: + dfs_nfs3_dump_dir: "/app/tmp/.hdfs-nfs" + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + nfsgateway_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + SECONDARYNAMENODE: - fs_checkpoint_dir_list: /dfs/snn - YARN: - RESOURCEMANAGER: - yarn_scheduler_maximum_allocation_mb: 4096 - yarn_scheduler_maximum_allocation_vcores: 4 - NODEMANAGER: - yarn_nodemanager_resource_memory_mb: 4096 - yarn_nodemanager_resource_cpu_vcores: 4 - yarn_nodemanager_local_dirs: /tmp/nm - yarn_nodemanager_log_dirs: /var/log/nm + # DONE custom dirs + fs_checkpoint_dir_list: "/app/data/namenode/dfs/snn" + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + secondarynamenode_checkpoint_directories_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + SERVICEWIDE: + dfs_encrypt_data_transfer_algorithm: "AES/CTR/NoPadding" + service_health_suppression_hdfs_verify_ec_with_topology: "true" + + + # Service + # Custom renamed to INFRA_SOLR (since the script put 2 SOLR_SERVER elements here, 1 I removed) + INFRA_SOLR: GATEWAY: - mapred_submit_replication: 3 - mapred_reduce_tasks: 6 + client_config_root_dir: "/etc/solr-infra" + + SOLR_SERVER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":524288000,"critical":262144000}' + process_auto_restart: "true" + role_config_suppression_solr_http_port: "true" + role_config_suppression_solr_https_port: "true" + # DONE custom dir + solr_data_dir: "/app/data/solr-infra" + 
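+          # The two JVM sizes below are in bytes (~10.7 GiB direct memory, ~8.2 GiB heap)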
solr_java_direct_memory_size: "11471421440" + solr_java_heapsize: "8823767040" + + SERVICEWIDE: + hdfs_data_dir: "/solr-infra" + rm_dirty: "true" + zookeeper_znode: "/solr-infra" + + + # Service + ATLAS: + ATLAS_SERVER: + atlas_max_heap_size: "4096" + #SERVICEWIDE: + + + # Service ZOOKEEPER: + SERVER: + # DONE custom dirs + dataDir: "/app/data/zookeeper" + dataLogDir: "/app/data/zookeeper" + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + maxClientCnxns: "300" + maxSessionTimeout: "60000" + zookeeper_server_data_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + zookeeper_server_data_log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + + # Service + QUEUEMANAGER: + QUEUEMANAGER_STORE: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_threshold: "DEBUG" + process_auto_restart: "true" + QUEUEMANAGER_WEBAPP: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + + + # Service + HIVE_ON_TEZ: + #GATEWAY: + + HIVESERVER2: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + hive_on_tez_hs2_downloaded_resources_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + hive_on_tez_hs2_exec_local_scratch_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + + SERVICEWIDE: + hms_connector: "hive" + + + # Service + ZEPPELIN: + ZEPPELIN_SERVER: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + # DONE custom dirs + zeppelin.dep.localrepo: "/app/data/zeppelin/local-repo" + zeppelin.interpreter.localRepo: "/app/data/zeppelin/local-repo" + zeppelin.war.tempdir: "/app/data/zeppelin/webapps" + + #SERVICEWIDE: + + # TODO test install with SPARK3 (requires Spark3 parcel repo setup?!) 
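+      # Hedged sketch for the SPARK3 TODO above (untested; URL/version illustrative, and
+      # the /p/ archive needs paywall credentials): add a parcel repository entry such as
+      #   - https://archive.cloudera.com/p/spark3/3.0.7110.0/parcels/
+      # under clusters[0].repositories, then uncomment the service and roles below.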
+ # Service + #SPARK3_ON_YARN: + # GATEWAY: + # SPARK3_YARN_HISTORY_SERVER: + + + # Service + KAFKA: + KAFKA_BROKER: + # DONE custom dirs + log.dirs: "/app/data/kafka" + log.retention.ms: "259200000" + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + role_config_suppression_port: "true" + + KAFKA_CONNECT: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + + KAFKA_MIRROR_MAKER: + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + + SERVICEWIDE: + log.cleaner.delete.retention.ms: "259200000" + offsets.retention.minutes: "4320" + + + # Service + HIVE: + #GATEWAY: + + HIVEMETASTORE: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + hive_metastore_server_max_message_size: "858993459" + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + HIVESERVER2: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + hiveserver2_downloaded_resources_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + hiveserver2_exec_local_scratch_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + WEBHCAT: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + #SERVICEWIDE: + # set via varlib*j2: + #hive_metastore_derby_path: "/app/opt/hive/cloudera_manager/derby/metastore_db" + + + # Service °17 is KNOX + KNOX: + #GATEWAY: + + IDBROKER: + # set via varlib*j2: + #idbroker_conf_dir: "/app/opt/knox/idbroker/conf" + #idbroker_data_dir: "/app/opt/knox/idbroker/data" + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + + KNOX_GATEWAY: + #gateway_conf_dir: "/app/opt/knox/gateway/conf" + #gateway_data_dir: "/app/opt/knox/gateway/data" + #gateway_ranger_knox_plugin_conf_path: "/app/opt/knox/ranger-knox-plugin" + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + + #SERVICEWIDE: + + + # Service °18 is TEZ + TEZ: {} + # #GATEWAY: + # SERVICEWIDE: + # tez_version_uploaded: "0.9.1.7.1.5.0-257" + + + # Service °19 is YARN + YARN: + GATEWAY: + mapred_reduce_tasks: "12" + mapred_submit_replication: "3" + + JOBHISTORY: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + + NODEMANAGER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + nodemanager_local_data_directories_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + nodemanager_log_directories_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + nodemanager_recovery_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + yarn_nodemanager_heartbeat_interval_ms: "100" + # DONE 
custom dirs. Note: The nodes have a custom dir for it: /app/data/nodemanager + yarn_nodemanager_local_dirs: "/app/data/disk01/yarn/nm,/app/data/disk02/yarn/nm,/app/data/disk03/yarn/nm,/app/data/disk04/yarn/nm,/app/data/disk05/yarn/nm,/app/data/disk06/yarn/nm,/app/data/disk07/yarn/nm,/app/data/disk08/yarn/nm,/app/data/disk09/yarn/nm" + # set via varlib*j2: + #yarn_nodemanager_recovery_dir: "/app/opt/hadoop-yarn/yarn-nm-recovery" + yarn_nodemanager_resource_memory_mb: "32768" + + RESOURCEMANAGER: + heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + yarn_resourcemanager_max_completed_applications: "1000" + yarn_scheduler_maximum_allocation_mb: "10067" + yarn_scheduler_maximum_allocation_vcores: "8" + + SERVICEWIDE: + rm_dirty: "true" + + + # Service °20 is LIVY + # LIVY: + # GATEWAY: + # LIVY_SERVER: + # log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + # process_auto_restart: "true" + # SERVICEWIDE: + + + # Service °21 is SPARK_ON_YARN + SPARK_ON_YARN: + #GATEWAY: + SPARK_YARN_HISTORY_SERVER: + # set via varlib*j2: + #local_storage_dir: "/app/opt/spark/history" + log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + process_auto_restart: "true" + #SERVICEWIDE: + + + # Service °22 is HBASE + HBASE: + # #GATEWAY: + # HBASERESTSERVER: + # HBASETHRIFTSERVER: + # MASTER: + REGIONSERVER: + hbase_bucketcache_size: "1" + hbase_regionserver_java_heapsize: "52428800" SERVICEWIDE: - zookeeper_datadir_autocreate: true + rm_dirty: "true" + + + # Service °23 is STREAMS_MESSAGING_MANAGER + STREAMS_MESSAGING_MANAGER: + STREAMS_MESSAGING_MANAGER_SERVER: + SMM_HEAP_SIZE: "5928" + log_directory_free_space_absolute_thresholds: '{"warning":5368709120,"critical":2147483648}' + STREAMS_MESSAGING_MANAGER_UI: + log_directory_free_space_absolute_thresholds: '{"warning":5368709120,"critical":2147483648}' + + #SERVICEWIDE: + + + ### DSS_DEV host_templates host_templates: Master1: - HDFS: [NAMENODE, SECONDARYNAMENODE] - YARN: [RESOURCEMANAGER, JOBHISTORY] - ZOOKEEPER: [SERVER] + ZOOKEEPER: [SERVER] + HDFS: + - NAMENODE + - JOURNALNODE + - FAILOVERCONTROLLER + YARN: + - JOBHISTORY + KAFKA: + - KAFKA_BROKER + HBASE: + - MASTER + # 1.try leave out the "GATEWAY". 2. add on ALL nodes?! + #- GATEWAY + + Master2: + ZOOKEEPER: [SERVER] + HDFS: + # 1.try using 2 NameNodes. 2.try with SECONDARYNAMENODE + - NAMENODE + #- SECONDARYNAMENODE + - JOURNALNODE + - FAILOVERCONTROLLER + YARN: + - RESOURCEMANAGER + #- JOBHISTORY # HA? + KAFKA: + - KAFKA_BROKER + - KAFKA_CONNECT + - KAFKA_MIRROR_MAKER + SCHEMA_REGISTRY: + - SCHEMA_REGISTRY_SERVER + + Master3: + ZOOKEEPER: [SERVER] + HDFS: + - FAILOVERCONTROLLER + - BALANCER + YARN: + - RESOURCEMANAGER + SPARK_ON_YARN: + - SPARK_YARN_HISTORY_SERVER + - GATEWAY + KAFKA: + - KAFKA_BROKER + - KAFKA_CONNECT + - KAFKA_MIRROR_MAKER + SCHEMA_REGISTRY: + - SCHEMA_REGISTRY_SERVER + HBASE: + - MASTER + #- GATEWAY + Edge1: + STREAMS_MESSAGING_MANAGER: + - STREAMS_MESSAGING_MANAGER_SERVER + - STREAMS_MESSAGING_MANAGER_UI + HBASE: + - HBASERESTSERVER + #- HBASETHRIFTSERVER # HA? 
+
+    # TODO later: what about "HIVE_LLAP" (roles: HIVESERVER2, LLAPPROXY)
+    HIVE:
+      - HIVEMETASTORE
+      - HIVESERVER2
+      - WEBHCAT
+    HIVE_ON_TEZ:
+      - HIVESERVER2
+    ATLAS:
+      - ATLAS_SERVER
+    INFRA_SOLR:
+      - SOLR_SERVER
+    HUE:
+      - HUE_LOAD_BALANCER
+      - HUE_SERVER
+    RANGER:
+      - RANGER_ADMIN
+      - RANGER_TAGSYNC
+      - RANGER_USERSYNC
+    ZEPPELIN:
+      - ZEPPELIN_SERVER
+  Edge2:
+    HDFS:
+      - NFSGATEWAY
+      - HTTPFS
+    HBASE:
+      #- HBASERESTSERVER # HA?
+      - HBASETHRIFTSERVER
+    HIVE:
+      - HIVEMETASTORE
+      - HIVESERVER2
+      - WEBHCAT
+    HIVE_ON_TEZ:
+      - HIVESERVER2
+    ATLAS:
+      - ATLAS_SERVER
+    INFRA_SOLR:
+      - SOLR_SERVER
+    HUE:
+      - HUE_LOAD_BALANCER
+      - HUE_SERVER
+    RANGER:
+      - RANGER_ADMIN
+      - RANGER_TAGSYNC
+      - RANGER_USERSYNC
+    KNOX:
+      - IDBROKER
+      - KNOX_GATEWAY
+    ZEPPELIN:
+      - ZEPPELIN_SERVER
+    QUEUEMANAGER:
+      - QUEUEMANAGER_STORE
+      - QUEUEMANAGER_WEBAPP
+
   Workers:
     HDFS: [DATANODE]
     YARN: [NODEMANAGER]
+    HBASE: [REGIONSERVER]
 
 mgmt:
   name: Cloudera Management Service
   services: [ALERTPUBLISHER, EVENTSERVER, HOSTMONITOR, REPORTSMANAGER, SERVICEMONITOR]

From 4ba1f1977f5a966fb7cceebb9de9f653c3c9eafd Mon Sep 17 00:00:00 2001
From: lhoss
Date: Fri, 18 Jun 2021 16:29:51 +0200
Subject: [PATCH 08/23] prepare new inventory/profile dir incl. the moved
 3M2E3W cfg

---
 inventory/cdpAz3M2E3W/application.yml      | 11 ++++
 .../cdpAz3M2E3W}/cluster_3M2E3W.yml        |  0
 inventory/cdpAz3M2E3W/definition.yml       | 28 ++++++++++
 inventory/cdpAz3M2E3W/inventory_static.ini | 54 +++++++++++++++++++
 4 files changed, 93 insertions(+)
 create mode 100644 inventory/cdpAz3M2E3W/application.yml
 rename {roles/cloudera_deploy/defaults => inventory/cdpAz3M2E3W}/cluster_3M2E3W.yml (100%)
 create mode 100644 inventory/cdpAz3M2E3W/definition.yml
 create mode 100644 inventory/cdpAz3M2E3W/inventory_static.ini

diff --git a/inventory/cdpAz3M2E3W/application.yml b/inventory/cdpAz3M2E3W/application.yml
new file mode 100644
index 0000000..dd8a16e
--- /dev/null
+++ b/inventory/cdpAz3M2E3W/application.yml
@@ -0,0 +1,11 @@
+---
+# Just a dummy playbook, called from the main playbook after the cluster install is done
+- name: Coda
+  hosts: localhost
+  connection: local
+  gather_facts: no
+  become: no
+  tasks:
+    - name: Deployment results
+      debug:
+        msg: Success!
\ No newline at end of file
diff --git a/roles/cloudera_deploy/defaults/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml
similarity index 100%
rename from roles/cloudera_deploy/defaults/cluster_3M2E3W.yml
rename to inventory/cdpAz3M2E3W/cluster_3M2E3W.yml
diff --git a/inventory/cdpAz3M2E3W/definition.yml b/inventory/cdpAz3M2E3W/definition.yml
new file mode 100644
index 0000000..ea24836
--- /dev/null
+++ b/inventory/cdpAz3M2E3W/definition.yml
@@ -0,0 +1,28 @@
+---
+## [LH] Commented out the following; AFAIU it is only used for CDP public-cloud deploys:
+# datahub:
+#   definitions:
+#     - include: "datahub_streams_messaging_light.j2"
+
+use_default_cluster_definition: no
+use_download_mirror: no
+preload_cm_parcel_repo: no
+
+## [LH] Moved vars here from ~/.config/cloudera-deploy/profiles,
+# but specifying it here does not seem to work, so I provide it via cmdline --extra-vars:
+#admin_password: ""
+
+## Specifies the Cloud Infrastructure provider; CDP presently supports GCP, AWS and Azure.
+## These should not be necessary when using a static Ansible inventory:
+infra_type:
+infra:
+datahub:
+env:
+
+## Vars to configure the "teardown" playbook tag
+teardown_preserve_parcels: true
+# Teardown the "cluster" but not CM, nor CMS (the CM services, under the "Cloudera Management Service" link!)
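+# Invocation sketch for these teardown flags (command and tag illustrative,
+# adjust to the actual entrypoint):
+#   ansible-playbook main.yml -i inventory_static.ini -t teardown \
+#     -e "definition_path=inventory/cdpAz3M2E3W" -e "admin_password=..."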
+#teardown_everything: true
+teardown_everything: false
+teardown_cms: false
+teardown_cluster: "all"
diff --git a/inventory/cdpAz3M2E3W/inventory_static.ini b/inventory/cdpAz3M2E3W/inventory_static.ini
new file mode 100644
index 0000000..fb15583
--- /dev/null
+++ b/inventory/cdpAz3M2E3W/inventory_static.ini
@@ -0,0 +1,54 @@
+# Inventory for an 8-node cluster, to be used e.g. with config "cluster_3M2E3W"
+
+# IDEA: add localhost to the inventory so that it is part of "groups.all" (and also receives the important "set_fact" results);
+# this only works when avoiding SSH, via ansible_connection=local
+[local]
+localhost ansible_connection=local
+
+
+[edge1]
+eval-cdp-public0.internal.cloudapp.net host_template=Edge1 ansible_host=52.170.192.213
+
+[edge2]
+eval-cdp-public1.internal.cloudapp.net host_template=Edge2 ansible_host=52.170.193.21
+
+[cloudera_manager:children]
+edge1
+
+[cloudera_edge_nodes:children]
+edge1
+edge2
+
+[cluster_master_nodes]
+eval-cdp-public2.internal.cloudapp.net host_template=Master1 ansible_host=52.170.192.205
+eval-cdp-public3.internal.cloudapp.net host_template=Master2 ansible_host=52.170.192.236
+eval-cdp-public4.internal.cloudapp.net host_template=Master3 ansible_host=52.170.192.172
+
+[cluster_worker_nodes]
+eval-cdp-public5.internal.cloudapp.net host_template=Workers ansible_host=52.170.197.255
+eval-cdp-public6.internal.cloudapp.net host_template=Workers ansible_host=52.170.198.178
+eval-cdp-public7.internal.cloudapp.net host_template=Workers ansible_host=52.170.86.155
+
+[cluster_worker_nodes:vars]
+host_template=Workers
+
+[cluster:children]
+cloudera_edge_nodes
+cluster_master_nodes
+cluster_worker_nodes
+
+
+[db_server:children]
+cloudera_manager
+
+[deployment:children]
+cluster
+db_server
+
+# Note: TF deploys our OS users incl. their SSH public key, so there is no longer any need to set "ansible_user".
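+# Quick connectivity check before a full run (illustrative):
+#   ansible -i inventory_static.ini deployment -m ansible.builtin.ping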
+#[deployment:vars] +# Ansible will defer to the running SSH Agent for relevant keys +# Set the following to hardcode the SSH private key for the instances +# ansible_ssh_private_key_file=~/.ssh/mykey.pem +#ansible_user=centos +#ansible_user=adminuser From 41ce9012ab2bc6863d4abe33434ecfbde2e7e60d Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 17:38:03 +0200 Subject: [PATCH 09/23] fix TEZ missing in hostTemplate --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 84b4d74..641755b 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -435,6 +435,9 @@ clusters: - WEBHCAT HIVE_ON_TEZ: - HIVESERVER2 + # Note: I just explicitly added the GATEWAY role here, since TEZ has no other role + TEZ: + - GATEWAY ATLAS: - ATLAS_SERVER INFRA_SOLR: @@ -461,6 +464,8 @@ clusters: - WEBHCAT HIVE_ON_TEZ: - HIVESERVER2 + TEZ: + - GATEWAY ATLAS: - ATLAS_SERVER INFRA_SOLR: From 1f2ab5894545ef348c57a4c7e33bd972c9548e07 Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 17:40:01 +0200 Subject: [PATCH 10/23] fix service name SCHEMAREGISTRY --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 641755b..b52f074 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -24,6 +24,7 @@ clusters: - KNOX - HBASE - STREAMS_MESSAGING_MANAGER + - SCHEMAREGISTRY - QUEUEMANAGER ### DSS_DEV service configs @@ -366,6 +367,10 @@ clusters: #SERVICEWIDE: + ## Services newly added (since not present on the cluster we derived above cfg from) + # Service + #SCHEMAREGISTRY: + ### DSS_DEV host_templates host_templates: Master1: @@ -398,7 +403,7 @@ clusters: - KAFKA_BROKER - KAFKA_CONNECT - KAFKA_MIRROR_MAKER - SCHEMA_REGISTRY: + SCHEMAREGISTRY: - SCHEMA_REGISTRY_SERVER Master3: @@ -415,7 +420,7 @@ clusters: - KAFKA_BROKER - KAFKA_CONNECT - KAFKA_MIRROR_MAKER - SCHEMA_REGISTRY: + SCHEMAREGISTRY: - SCHEMA_REGISTRY_SERVER HBASE: - MASTER From fcbcba5e1b0e4d1812d84e53444a8977fbb4c807 Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 17:50:45 +0200 Subject: [PATCH 11/23] disable unsupported role HIVE/WEBHCAT --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index b52f074..aebc4ab 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -252,9 +252,10 @@ clusters: hiveserver2_exec_local_scratch_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - WEBHCAT: - heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + ## TODO fix bug to get "WEBHCAT" enabled (or test on 7.1.6 ?!) 
+ # WEBHCAT: + # heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + # log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' #SERVICEWIDE: # set via varlib*j2: @@ -365,8 +366,8 @@ clusters: log_directory_free_space_absolute_thresholds: '{"warning":5368709120,"critical":2147483648}' #SERVICEWIDE: + - ## Services newly added (since not present on the cluster we derived above cfg from) # Service #SCHEMAREGISTRY: @@ -437,7 +438,7 @@ clusters: HIVE: - HIVEMETASTORE - HIVESERVER2 - - WEBHCAT + #- WEBHCAT #TODO fix bug (or test on 7.1.6 ?!) HIVE_ON_TEZ: - HIVESERVER2 # Note: I just explicitly added the GATEWAY role here, since TEZ has no other role @@ -466,7 +467,7 @@ clusters: HIVE: - HIVEMETASTORE - HIVESERVER2 - - WEBHCAT + #- WEBHCAT #TODO fix bug (or test on 7.1.6 ?!) HIVE_ON_TEZ: - HIVESERVER2 TEZ: From 6517445b3d45edd295f5c4ad3ef39de18e2e95de Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 18:15:27 +0200 Subject: [PATCH 12/23] fix: only 1 RANGER_USERSYNC allowed --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index aebc4ab..ba82d66 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -4,7 +4,7 @@ cloudera_manager_version: 7.1.4 clusters: - - name: Basic Cluster + - name: DSS Preview Cluster repositories: - https://archive.cloudera.com/cdh7/7.1.4.0/parcels/ services: @@ -481,8 +481,9 @@ clusters: - HUE_SERVER RANGER: - RANGER_ADMIN - - RANGER_TAGSYNC - - RANGER_USERSYNC + ## Only 1 RANGER_USERSYNC allowed, see Edge1 (not sure about TagSync), but also commenting out now + #- RANGER_TAGSYNC + #- RANGER_USERSYNC KNOX: - IDBROKER - KNOX_GATEWAY From 1b1c9de920d6832e60c7cef7b1cac8b00cc308f1 Mon Sep 17 00:00:00 2001 From: lhoss Date: Fri, 18 Jun 2021 18:26:51 +0200 Subject: [PATCH 13/23] fix: only 1 ZEPPELIN instance allowed --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index ba82d66..b2a33de 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -487,8 +487,9 @@ clusters: KNOX: - IDBROKER - KNOX_GATEWAY - ZEPPELIN: - - ZEPPELIN_SERVER + ## Only 1 ZEPPELIN allowed! 
+ # ZEPPELIN: + # - ZEPPELIN_SERVER QUEUEMANAGER: - QUEUEMANAGER_STORE - QUEUEMANAGER_WEBAPP From b79bea4639d147e39ca6257e241b87aa260a371a Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 09:56:09 +0200 Subject: [PATCH 14/23] fix missing 3rd Hdfs journalnode --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index b2a33de..6d725a1 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -410,6 +410,7 @@ clusters: Master3: ZOOKEEPER: [SERVER] HDFS: + - JOURNALNODE - FAILOVERCONTROLLER - BALANCER YARN: From 6a8290227a0f99363b13eab17a29f932b9457ecc Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 10:25:47 +0200 Subject: [PATCH 15/23] disable kafka connect & mirrormaker (require extra configs) --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 6d725a1..634ba09 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -223,14 +223,13 @@ clusters: log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' process_auto_restart: "true" role_config_suppression_port: "true" - - KAFKA_CONNECT: - log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - process_auto_restart: "true" - - KAFKA_MIRROR_MAKER: - log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - process_auto_restart: "true" + ## Disable following 2 because they require extra custom configs like a "Broker List" to work + # KAFKA_CONNECT: + # log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + # process_auto_restart: "true" + # KAFKA_MIRROR_MAKER: + # log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + # process_auto_restart: "true" SERVICEWIDE: log.cleaner.delete.retention.ms: "259200000" @@ -402,8 +401,9 @@ clusters: #- JOBHISTORY # HA? 
KAFKA: - KAFKA_BROKER - - KAFKA_CONNECT - - KAFKA_MIRROR_MAKER + ## disabled + #- KAFKA_CONNECT + #- KAFKA_MIRROR_MAKER SCHEMAREGISTRY: - SCHEMA_REGISTRY_SERVER @@ -420,8 +420,9 @@ clusters: - GATEWAY KAFKA: - KAFKA_BROKER - - KAFKA_CONNECT - - KAFKA_MIRROR_MAKER + ## disabled + #- KAFKA_CONNECT + #- KAFKA_MIRROR_MAKER SCHEMAREGISTRY: - SCHEMA_REGISTRY_SERVER HBASE: From 4ff4af063238c1cac911e46cb40c6db598453d69 Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 10:30:05 +0200 Subject: [PATCH 16/23] define HIVESERVER2 role only in service "HIVE_ON_TEZ" --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 634ba09..d1ccf93 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -244,12 +244,9 @@ clusters: heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' hive_metastore_server_max_message_size: "858993459" log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - - HIVESERVER2: - heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - hiveserver2_downloaded_resources_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - hiveserver2_exec_local_scratch_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' - log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' + ## Note: we only need "HIVESERVER2" role in service "HIVE_ON_TEZ" + #HIVESERVER2: + # heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' ## TODO fix bug to get "WEBHCAT" enabled (or test on 7.1.6 ?!) # WEBHCAT: @@ -439,7 +436,8 @@ clusters: # TODO later: what about "HIVE_LLAP" (roles: HIVESERVER2,LLAPPROXY) HIVE: - HIVEMETASTORE - - HIVESERVER2 + # we only need "HIVESERVER2" role in service "HIVE_ON_TEZ" + #- HIVESERVER2 #- WEBHCAT #TODO fix bug (or test on 7.1.6 ?!) HIVE_ON_TEZ: - HIVESERVER2 @@ -468,7 +466,6 @@ clusters: - HBASETHRIFTSERVER HIVE: - HIVEMETASTORE - - HIVESERVER2 #- WEBHCAT #TODO fix bug (or test on 7.1.6 ?!) 
HIVE_ON_TEZ: - HIVESERVER2 From deff909922c23059bce9459f3978d46d6e27bbee Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 11:57:00 +0200 Subject: [PATCH 17/23] configure the 2 missing Knox secrets/configs --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 5 +++++ inventory/cdpAz3M2E3W/definition.yml | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index d1ccf93..ce6a9f6 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -1,4 +1,7 @@ --- +## Below "clusters" config depends on extra secret vars to be set in the profile or inventory: +# gateway_master_secret +# idbroker_master_secret # cloudera_manager_version: 7.1.4 @@ -263,6 +266,7 @@ clusters: #GATEWAY: IDBROKER: + idbroker_master_secret: "{{ idbroker_master_secret }}" # set via varlib*j2: #idbroker_conf_dir: "/app/opt/knox/idbroker/conf" #idbroker_data_dir: "/app/opt/knox/idbroker/data" @@ -270,6 +274,7 @@ clusters: process_auto_restart: "true" KNOX_GATEWAY: + gateway_master_secret: "{{ gateway_master_secret }}" #gateway_conf_dir: "/app/opt/knox/gateway/conf" #gateway_data_dir: "/app/opt/knox/gateway/data" #gateway_ranger_knox_plugin_conf_path: "/app/opt/knox/ranger-knox-plugin" diff --git a/inventory/cdpAz3M2E3W/definition.yml b/inventory/cdpAz3M2E3W/definition.yml index ea24836..4dc5c4c 100644 --- a/inventory/cdpAz3M2E3W/definition.yml +++ b/inventory/cdpAz3M2E3W/definition.yml @@ -12,6 +12,10 @@ preload_cm_parcel_repo: no # but specifying it here seems not working, so I provide it via cmdline --extra-vars #admin_password: "" +# Secret vars used in the "clusters" config: +gateway_master_secret: "{{ admin_password }}" +idbroker_master_secret: "{{ admin_password }}" + ## Specifies the Cloud Infrastructure provider, CDP presently supports GCP, AWS and Azure ## Those should not necessary when using a static Ansible inventory infra_type: From eaa0fd5fdf427299ebb96cdd7e6ab2a664090e3e Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 12:13:32 +0200 Subject: [PATCH 18/23] undo HDFS HA (2 Namenodes); add SecondaryNamenode --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index ce6a9f6..0e03fc1 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -379,8 +379,8 @@ clusters: ZOOKEEPER: [SERVER] HDFS: - NAMENODE - - JOURNALNODE - - FAILOVERCONTROLLER + #- JOURNALNODE + #- FAILOVERCONTROLLER YARN: - JOBHISTORY KAFKA: @@ -393,11 +393,11 @@ clusters: Master2: ZOOKEEPER: [SERVER] HDFS: - # 1.try using 2 NameNodes. 2.try with SECONDARYNAMENODE - - NAMENODE - #- SECONDARYNAMENODE - - JOURNALNODE - - FAILOVERCONTROLLER + # 1.try using 2 NameNodes did not work -> 2.try with SECONDARYNAMENODE + #- NAMENODE + - SECONDARYNAMENODE + #- JOURNALNODE + #- FAILOVERCONTROLLER YARN: - RESOURCEMANAGER #- JOBHISTORY # HA? 
@@ -412,8 +412,8 @@ clusters: Master3: ZOOKEEPER: [SERVER] HDFS: - - JOURNALNODE - - FAILOVERCONTROLLER + #- JOURNALNODE + #- FAILOVERCONTROLLER - BALANCER YARN: - RESOURCEMANAGER From dbc9a486e5b3e6d5e215946f90a26e122cd55fe0 Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 13:37:57 +0200 Subject: [PATCH 19/23] added missing GATEWAY roles on both Edge Nodes --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 42 +++++++++++++++++++++--- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 0e03fc1..4f30427 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -383,6 +383,8 @@ clusters: #- FAILOVERCONTROLLER YARN: - JOBHISTORY + HIVE_ON_TEZ: + - GATEWAY KAFKA: - KAFKA_BROKER HBASE: @@ -401,6 +403,8 @@ clusters: YARN: - RESOURCEMANAGER #- JOBHISTORY # HA? + HIVE_ON_TEZ: + - GATEWAY KAFKA: - KAFKA_BROKER ## disabled @@ -420,6 +424,9 @@ clusters: SPARK_ON_YARN: - SPARK_YARN_HISTORY_SERVER - GATEWAY + # required by Spark history server + HIVE_ON_TEZ: + - GATEWAY KAFKA: - KAFKA_BROKER ## disabled @@ -441,18 +448,18 @@ clusters: # TODO later: what about "HIVE_LLAP" (roles: HIVESERVER2,LLAPPROXY) HIVE: - HIVEMETASTORE + - GATEWAY # we only need "HIVESERVER2" role in service "HIVE_ON_TEZ" #- HIVESERVER2 #- WEBHCAT #TODO fix bug (or test on 7.1.6 ?!) HIVE_ON_TEZ: - HIVESERVER2 - # Note: I just explicitly added the GATEWAY role here, since TEZ has no other role - TEZ: - GATEWAY ATLAS: - ATLAS_SERVER INFRA_SOLR: - SOLR_SERVER + - GATEWAY HUE: - HUE_LOAD_BALANCER - HUE_SERVER @@ -462,24 +469,41 @@ clusters: - RANGER_USERSYNC ZEPPELIN: - ZEPPELIN_SERVER + # Gateway-only Roles: + HDFS: + - GATEWAY + #INFRA_SOLR: [GATEWAY] + #HIVE_ON_TEZ: [GATEWAY] + KAFKA: + - GATEWAY + #HIVE: [GATEWAY] + TEZ: + - GATEWAY + YARN: + - GATEWAY + SPARK_ON_YARN: + - GATEWAY + Edge2: HDFS: - NFSGATEWAY - HTTPFS + - GATEWAY HBASE: #- HBASERESTSERVER # HA? - HBASETHRIFTSERVER HIVE: - HIVEMETASTORE + - GATEWAY #- WEBHCAT #TODO fix bug (or test on 7.1.6 ?!) HIVE_ON_TEZ: - HIVESERVER2 - TEZ: - GATEWAY ATLAS: - ATLAS_SERVER INFRA_SOLR: - SOLR_SERVER + - GATEWAY HUE: - HUE_LOAD_BALANCER - HUE_SERVER @@ -497,7 +521,17 @@ clusters: QUEUEMANAGER: - QUEUEMANAGER_STORE - QUEUEMANAGER_WEBAPP - + # Gateway-only Roles: + KAFKA: + - GATEWAY + TEZ: + - GATEWAY + YARN: + - GATEWAY + SPARK_ON_YARN: + - GATEWAY + + # TODO Do we need GATEWAY roles on workers? 
Workers: HDFS: [DATANODE] YARN: [NODEMANAGER] From 613fc55940275dbd66839d188cbccd1140066eda Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 15:22:09 +0200 Subject: [PATCH 20/23] the Spark history server requires HIVE/GATEWAY --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 4f30427..59bd0b7 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -425,6 +425,8 @@ clusters: - SPARK_YARN_HISTORY_SERVER - GATEWAY # required by Spark history server + HIVE: + - GATEWAY HIVE_ON_TEZ: - GATEWAY KAFKA: From 6f7d112da5ef1f4ca2b90f2492e3337fde0a8f3b Mon Sep 17 00:00:00 2001 From: lhoss Date: Mon, 21 Jun 2021 15:32:18 +0200 Subject: [PATCH 21/23] fix NFSGATEWAY tmp dir cfg --- inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml index 59bd0b7..7ae0e48 100644 --- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml +++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml @@ -3,7 +3,10 @@ # gateway_master_secret # idbroker_master_secret -# +## Other custom vars used in the "clusters" config: +tmp_base: "/tmp" # TODO on DSS: /app/tmp + +## Start of config expected by cloudera playbooks/roles: cloudera_manager_version: 7.1.4 clusters: @@ -112,7 +115,7 @@ clusters: role_health_suppression_name_node_safe_mode: "true" NFSGATEWAY: - dfs_nfs3_dump_dir: "/app/tmp/.hdfs-nfs" + dfs_nfs3_dump_dir: "{{tmp_base}}/.hdfs-nfs" heap_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' log_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' nfsgateway_dump_directory_free_space_absolute_thresholds: '{"warning":2147483648,"critical":1073741824}' From 89d644ce6a6775ec0e51d7fe2652e3d0932ba312 Mon Sep 17 00:00:00 2001 From: lhoss Date: Tue, 29 Jun 2021 18:40:54 +0200 Subject: [PATCH 22/23] inventory vars used by the playbook deploy-tenant-on-cdp.yml --- .../group_vars/all/hdfs_ranger.yml | 53 +++++++++++++++++++ inventory/cdpAz3M2E3W/inventory_static.ini | 7 ++- 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 inventory/cdpAz3M2E3W/group_vars/all/hdfs_ranger.yml diff --git a/inventory/cdpAz3M2E3W/group_vars/all/hdfs_ranger.yml b/inventory/cdpAz3M2E3W/group_vars/all/hdfs_ranger.yml new file mode 100644 index 0000000..495e53c --- /dev/null +++ b/inventory/cdpAz3M2E3W/group_vars/all/hdfs_ranger.yml @@ -0,0 +1,53 @@ +### Vars used by the playbook deploy-tenant-on-cdp.yml + +### Generic configs +_tenants: + - tenant: tenant1 +# - tenant: tenant2 + + +### HDFS Folder related configs + +hdfs_folder_tenants: "{{ _tenants }}" +hdfs_folders: +- name: "/{{ tenant }}" + owner: "{{ tenant }}" + mode: "0700" +- name: "/{{ tenant }}/rds" + owner: "{{ tenant }}" + mode: "0700" +- name: "/{{ tenant }}/ios" + owner: "{{ tenant }}" + mode: "0700" +# Do we need an "apps" dir? 
#- name: "{{ tenant }}/apps"
+
+
+### Ranger (module) related configs
+
+# used in playbook "deploy-tenant-on-cdp.yml"
+ranger_tenants: "{{ _tenants }}"
+# TODO define ranger host
+#ranger_host: "{{ inventory_hostname }}"
+ranger_host: "{{ groups.edge[0] }}"
+ranger_admin_url: "http://{{ ranger_host }}:6080"
+ranger_admin_username: admin
+# Ranger Admin password as defined in the cloudera-playbook:
+ranger_admin_password: "{{ ranger_rangeradmin_user_password | default('password123') }}"
+
+# TODO re-use the tenants list
+ranger_hdfs_policies:
+  - name: "HDFS Base policies for {{ tenant }}"
+    # TODO generate this list from the hdfs_folders var (e.g. via a Jinja filter such as json_query)
+    paths:
+      - "/{{ tenant }}/rds"
+      - "/{{ tenant }}/ios"
+    permissions:
+      - users:
+          - "{{ tenant }}"
+        groups:
+          - "{{ tenant }}"
+        accesses:
+          - write
+          - read
+          - execute
diff --git a/inventory/cdpAz3M2E3W/inventory_static.ini b/inventory/cdpAz3M2E3W/inventory_static.ini
index fb15583..5de1de4 100644
--- a/inventory/cdpAz3M2E3W/inventory_static.ini
+++ b/inventory/cdpAz3M2E3W/inventory_static.ini
@@ -15,10 +15,13 @@ eval-cdp-public1.internal.cloudapp.net host_template=Edge2 ansible_host=52.170.1
 [cloudera_manager:children]
 edge1
 
-[cloudera_edge_nodes:children]
+[edge:children]
 edge1
 edge2
 
+[cluster_edge_nodes:children]
+edge
+
 [cluster_master_nodes]
 eval-cdp-public2.internal.cloudapp.net host_template=Master1 ansible_host=52.170.192.205
 eval-cdp-public3.internal.cloudapp.net host_template=Master2 ansible_host=52.170.192.236
@@ -33,7 +36,7 @@ eval-cdp-public7.internal.cloudapp.net host_template=Workers ansible_host=52.170
 host_template=Workers
 
 [cluster:children]
-cloudera_edge_nodes
+cluster_edge_nodes
 cluster_master_nodes
 cluster_worker_nodes
 

From 384f4a993e1262d84d950869afd2e08d90abe5c8 Mon Sep 17 00:00:00 2001
From: lhoss
Date: Thu, 1 Jul 2021 10:53:53 +0200
Subject: [PATCH 23/23] fix hbase regionserver heapsize

---
 inventory/cdpAz3M2E3W/cluster_3M2E3W.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml
index 7ae0e48..a0dda9e 100644
--- a/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml
+++ b/inventory/cdpAz3M2E3W/cluster_3M2E3W.yml
@@ -356,7 +356,7 @@ clusters:
       REGIONSERVER:
         hbase_bucketcache_size: "1"
-        hbase_regionserver_java_heapsize: "52428800"
+        hbase_regionserver_java_heapsize: "4294967296" #4GB
       SERVICEWIDE:
         rm_dirty: "true"
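
Note on the hdfs_ranger.yml TODO above about generating the paths list from the
hdfs_folders var: a hedged Jinja sketch (untested, filter chain illustrative) could
derive the Ranger policy paths instead of repeating them:

    ranger_hdfs_policies:
      - name: "HDFS Base policies for {{ tenant }}"
        paths: "{{ hdfs_folders | map(attribute='name') | select('search', '/(rds|ios)$') | list }}"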