diff --git a/2025-HPDC/docker/Dockerfile.spawn b/2025-HPDC/docker/Dockerfile.spawn index 93c4d6d..d0cae07 100644 --- a/2025-HPDC/docker/Dockerfile.spawn +++ b/2025-HPDC/docker/Dockerfile.spawn @@ -83,6 +83,7 @@ RUN cmake \ # rm -rf /tmp/build-xsbench COPY ./tutorial-code/caliper-tutorial/tutorial ${HOME}/caliper-tutorial/ +COPY ./tutorial-code/caliper-tutorial/apps ${HOME}/caliper-tutorial/apps COPY ./tutorial-code/thicket-tutorial/data/lassen ${HOME}/thicket-tutorial/data/lassen COPY ./tutorial-code/thicket-tutorial/data/quartz ${HOME}/thicket-tutorial/data/quartz COPY ./tutorial-code/thicket-tutorial/notebooks/01_thicket_tutorial.ipynb ${HOME}/thicket-tutorial/notebooks/01_thicket_tutorial.ipynb diff --git a/2025-HPDC/docker/spawn-entrypoint.sh b/2025-HPDC/docker/spawn-entrypoint.sh index 39f3dcb..de4c271 100755 --- a/2025-HPDC/docker/spawn-entrypoint.sh +++ b/2025-HPDC/docker/spawn-entrypoint.sh @@ -11,4 +11,19 @@ # /usr/bin/mpiexec.hydra -n $num_brokers -bind-to core:$num_cores_per_node /usr/bin/flux start /opt/global_py_venv/bin/jupyterhub-singleuser # NOTE: use this if we only want a single "node" -/usr/bin/flux start /opt/global_py_venv/bin/jupyterhub-singleuser \ No newline at end of file +if [[ $# -ne 1 ]]; then + /usr/bin/flux start /opt/global_py_venv/bin/jupyterhub-singleuser +else + last_core_id=$(( $1 - 1 )) + mkdir -p ${HOME}/.flux + cat > ${HOME}/.flux/resource.toml < ${HOME}/.flux/resource.toml </dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +hub_pod_id=$(kubectl get pods -n default --no-headers=true | awk '/hub/{print $1}') +kubectl logs $hub_pod_id \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/check_init_container_log.sh b/2025-HPDC/infrastructure/https_test/check_init_container_log.sh new file mode 100755 index 0000000..f4fd398 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/check_init_container_log.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if [ $# -ne 1 ]; then + echo "Usage: ./check_init_container_log.sh " + exit 1 +fi + +kubectl logs $1 -c init-tutorial-service \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/check_jupyterhub_status.sh b/2025-HPDC/infrastructure/https_test/check_jupyterhub_status.sh new file mode 100755 index 0000000..10b4261 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/check_jupyterhub_status.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" 
+ echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl --namespace=default get pods + +echo "If there are issues with any pods, you can get more details with:" +echo " $ kubectl --namespace=default describe pod " \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/cleanup.sh b/2025-HPDC/infrastructure/https_test/cleanup.sh new file mode 100755 index 0000000..0f8222f --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/cleanup.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +# Temporarily allow errors in the script so that the script won't fail +# if the JupyterHub deployment failed or was previously torn down +set +e +echo "Tearing down JupyterHub and uninstalling everything related to Helm:" +helm uninstall hpdc-2025-pave-dry-run-jupyter +set -e + +echo "" +echo "Deleting all pods from the EKS cluster:" +kubectl delete pod --all-namespaces --all --force + +echo "" +echo "Deleting the EKS cluster:" +eksctl delete cluster --config-file ./eksctl-config.yaml --wait + +echo "" +echo "Everything is now cleaned up!" \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/cluster-autoscaler.yaml b/2025-HPDC/infrastructure/https_test/cluster-autoscaler.yaml new file mode 100644 index 0000000..f525796 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/cluster-autoscaler.yaml @@ -0,0 +1,272 @@ +# The roles defined in this config file set permissions on several Kubernetes resources. +# +# Resources referred to: +# * events: resource representing information/responses generated from actions or changes taken against the cluster +# * endpoints: resource representing REST API endpoints within the cluster +# * pods/eviction: resource that terminates and removes pods when created +# * pods/status: resource used to query or edit the status of pods +# * nodes: resource representing the physical or virtual nodes of the cluster +# * namespaces: resource representing a group of isolated resources within the cluster +# * pods: resource representing a unit of computation that is deployed to a node +# * services: resource representing a networked application running in a pod and exposed over the network (either internal to the cluster or external to the broader internet) +# * replicationcontrollers: legacy resource for managing horizontal scaling (i.e., scale-out). 
Used for broader support across clouds + # * persistentvolumeclaims: resource representing a request for storage by a user + # * persistentvolumes: resource representing actual storage + # * replicasets: resource that creates replica pods that are used to ensure some minimum number of identical pods in the cluster + # * daemonsets: resource that ensures copies of pods are deployed to new nodes and removed from removed nodes + # * poddisruptionbudgets: resource that represents the cluster policy regarding the minimum number of pods that must remain available + # during voluntary disruptions (i.e., pod/node eviction not caused by something like hardware failure) + # * statefulsets: resource that maintains pod state + # * storageclasses: resource that describes different types of storage. Often used for things like QoS levels + # * csinodes: resource that describes a node's ability to interact with one or more storage providers. Mainly used by Kubernetes's scheduler + # * csidrivers: resource that provides information on the drivers for a single storage provider installed on a node + # * csistoragecapacities: resource that describes the available storage from different providers + # * jobs: resource that represents one-off tasks spread across one or more pods that must run to completion. Useful for certain types of setup and elasticity work + # * leases: resource that allows different pods, nodes, or kubelets (kubernetes daemon on a node) to lock shared resources. Think of it like a mutex + # * configmaps: resource representing non-confidential key-value pair info. Often used to decouple environment-specific configuration from container images +--- +# Create a Service Account that will act as the internal user during the creation +# of the autoscaling infrastructure and have all the appropriate roles and permissions assigned +# to do its work +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler + name: cluster-autoscaler + namespace: kube-system +--- +# Create a ClusterRole to set permissions for associated +# users across the entire cluster +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create or partially update events and endpoints + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + # Allow associated users to evict pods + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + # Allow associated users to update pod statuses + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + # Allow associated users to get and update the state of the autoscaler + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + # Allow associated users to be notified of changes to, list, get the state of, + # and fully update information related to nodes + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + # Allow associated users to be notified of changes to, list, and get the state of + # namespaces, pods, services, replication controllers, persistent volume claims, and + # persistent volumes + - apiGroups: [""] + resources: + - "namespaces" + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + # Allow associated users to be
notified of changes to, list, and get the state of + # replica sets, and daemon sets + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to and list pod disruption budgets + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + # Allow associated users to be notified of changes to, list, and get the state of + # stateful sets, replica sets, and daemon sets + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to, list, and get the state of + # all resources related to available storage + - apiGroups: ["storage.k8s.io"] + resources: + ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"] + verbs: ["watch", "list", "get"] + # Allow associated users to get the state of, list, be notified of changes to, and partially update + # jobs launched in the cluster + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + # Allow associated users to create leases + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + # Allow associated users to get the state of and fully update leases in the autoscaler + - apiGroups: ["coordination.k8s.io"] + resourceNames: ["cluster-autoscaler"] + resources: ["leases"] + verbs: ["get", "update"] +--- +# Create a Role to set permissions within the 'kube-system' namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: cluster-autoscaler + # The permissions in this Role apply to the 'kube-system' namespace + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create, list, and be notified of changes to config maps + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "list", "watch"] + # Allow associated users to delete, get the state of, fully update, and be notified of + # changes to config maps in the autoscaler's status and priority-expander subresources + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - "cluster-autoscaler-status" + - "cluster-autoscaler-priority-expander" + verbs: ["delete", "get", "update", "watch"] +--- +# Grant permissions defined by the ClusterRole +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the ClusterRole named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" namespace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Grant permissions defined by the Role +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the Role named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" namespace in
the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Define deployment rules for pods and ReplicaSets +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + app: cluster-autoscaler +spec: + replicas: 1 # Number of pods to run + # Apply to pods where the app has a label called 'app' + # with value 'cluster-autoscaler' + selector: + matchLabels: + app: cluster-autoscaler + # Definition of created pods + template: + metadata: + labels: + app: cluster-autoscaler + # Allow Prometheus to collect monitoring data over port 8085 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8085" + spec: + priorityClassName: system-cluster-critical + securityContext: + # The Kubelet must be run as a non-root user + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + # Use the default seccomp profile as specified by the + # container runtime + seccompProfile: + type: RuntimeDefault + serviceAccountName: cluster-autoscaler + # The container(s) to run within the pod. + # Since we're running an autoscaler, we'll run the autoscaler + # as the pod's only container, and then we'll deploy other + # containers within the autoscaler to actually do work + containers: + # The main container for the pod will be the + # Kubernetes autoscaling container + - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.2 + name: cluster-autoscaler + resources: + # Maximum amount of compute resources allowed + limits: + cpu: 100m + memory: 600Mi + # Minimum amount of compute resources required + # Defaults to 'limits' if not specified + requests: + cpu: 100m + memory: 600Mi + command: + - ./cluster-autoscaler + - --v=4 + - --stderrthreshold=info + - --cloud-provider=aws + - --skip-nodes-with-local-storage=false + - --expander=least-waste + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/hpdc-2025-pave-dry-run + volumeMounts: + # Mount the CA SSL/TLS certificates into the container + - name: ssl-certs + mountPath: /etc/ssl/certs/ca-certificates.crt + readOnly: true + # Always pull the digest of the image from the + # container registry. If the locally cached digest is + # the same as the pulled digest, use the cached container image. 
+ # Otherwise, pull the container from the registry + imagePullPolicy: "Always" + securityContext: + # Don't let the pod have more privileges than the + # parent process + allowPrivilegeEscalation: false + capabilities: + # Remove all capabilities + drop: + - ALL + # Root filesystem (i.e., '/') is read-only + readOnlyRootFilesystem: true + volumes: + - name: ssl-certs + hostPath: + path: "/etc/ssl/certs/ca-bundle.crt" \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/config.toml b/2025-HPDC/infrastructure/https_test/config.toml new file mode 100644 index 0000000..e390070 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/config.toml @@ -0,0 +1,55 @@ +tutorial_name = "hpdc-2025-pave-dry-run" + +[aws.eksctl] +cluster_name = "hpdc-2025-pave-dry-run" +cluster_deployment_region = "us-west-1" +cluster_availability_zones = [ + "us-west-1a", + "us-west-1c", +] + +[[aws.eksctl.cluster_node_groups]] +zone = "us-west-1a" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 2 +min_size = 2 +max_size = 8 + +[[aws.eksctl.cluster_node_groups]] +zone = "us-west-1c" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 2 +min_size = 2 +max_size = 8 + +[aws."Kubernetes autoscaler"] +cpu_max = "100m" +memory_max = "600Mi" +cpu_min = "100m" +memory_min = "600Mi" + +[aws.Helm] +max_concurrent_users = 14 +hub_password = "butter" +hub_db_capacity = "32Gi" +ebs_storage_type = "gp3" +hub_container_image = "jupyterhub/k8s-hub" +hub_container_tag = "4.2.0" +spawner_container_image = "ghcr.io/llnl/reproducible-benchmarking-spawn" +spawner_container_tag = "hpdc-2025" +spawner_image_entrypoint = "/entrypoint.sh" +cpu_min = "32" +cpu_max = "32" +mem_min = "64G" +mem_max = "64G" +provide_extra_shmem = true +init_container_image = "ghcr.io/llnl/reproducible-benchmarking-init" +init_container_tag = "hpdc-2025" +init_image_entrypoint = "/entrypoint.sh" +custom_host = "https://software.llnl.gov/benchpark/hpdc-25-tutorial" + +[aws."utility scripts"] +jupyterhub_helm_version = "4.2.0" +ebs_csidriver_version = "v1.45.0" diff --git a/2025-HPDC/infrastructure/https_test/configure_kubernetes.sh b/2025-HPDC/infrastructure/https_test/configure_kubernetes.sh new file mode 100755 index 0000000..5c4bee6 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/configure_kubernetes.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +echo "Configuring the Cluster Autoscaler:" +kubectl apply -k "github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=v1.45.0" +kubectl apply -f ./cluster-autoscaler.yaml +echo "" +echo "Configuring the Storage Class:" +kubectl apply -f ./storage-class.yaml + +echo "" +echo "Patching the cluster to make the configured storage class the default:" +kubectl patch storageclass gp3 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' + +echo "" +echo "Done configuring Kubernetes!" +echo "" +echo "Next, you should run deploy_jupyterhub.sh to actually deploy JupyterHub and the tutorial." 
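+# Optional sanity check: after the patches above, 'gp3' should carry the
+# 'storageclass.kubernetes.io/is-default-class: true' annotation and 'gp2' should not.
+# One way to confirm (standard kubectl commands; adjust as needed):
+#
+#   kubectl get storageclass
+#   kubectl describe storageclass gp3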
\ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/create_cluster.sh b/2025-HPDC/infrastructure/https_test/create_cluster.sh new file mode 100755 index 0000000..f631168 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/create_cluster.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +echo "Creating EKS cluster with eksctl:" +eksctl create cluster --config-file ./eksctl-config.yaml + +echo "Done creating the EKS cluster!" +echo "" +echo "Next, you should run configure_kubernetes.sh to configure Kubernetes on the cluster." \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/deploy_jupyterhub.sh b/2025-HPDC/infrastructure/https_test/deploy_jupyterhub.sh new file mode 100755 index 0000000..dcfd1d5 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/deploy_jupyterhub.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +echo "Adding JupyterHub to EKS cluster using Helm:" +helm repo add jupyterhub https://hub.jupyter.org/helm-chart/ +helm repo update +echo "" +echo "Installing the Helm chart and deploying JupyterHub to EKS:" +helm install hpdc-2025-pave-dry-run-jupyter jupyterhub/jupyterhub --version 4.2.0 --values ./helm-config.yaml + +echo "" +echo "Done deploying JupyterHub!" +echo "" +echo "Next, you should ensure all the pods spawned correctly with check_jupyterhub_status.sh," +echo "and you should get the cluster URL with get_jupyterhub_url.sh." +echo "" +echo "If something went wrong, you can edit the helm-config.yaml file to try to fix the issue." +echo "After editing helm-config.yaml, you can normally reconfigure and relaunch JupyterHub using" +echo "the update_jupyterhub_deployment.sh script. If that doesn't work or if you need to edit" +echo "storage-class.yaml or cluster-autoscaler.yaml, you should first tear down JupyterHub with" +echo "tear_down_jupyterhub.sh, and then you should bring Jupyter back up by rerunning deploy_jupyterhub.sh." +echo "" +echo "If everything went smoothly, the cluster URL is what you should share with attendees." +echo "" +echo "Attendees can get a Jupyter environment to work in by going to that URL and logging in" +echo "with a username of their choice and the password specified in helm-config.yaml." +echo "" +echo "Note: users should have unique usernames. If two users have the same username, they will" +echo " share the same pod." +echo "" +echo "After you are done with your tutorial, you should finally run cleanup.sh to bring down" +echo "the EKS cluster and all associated resources." 
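+# For reference, the post-deployment flow described above is roughly
+# (a sketch using the helper scripts in this directory):
+#
+#   ./check_jupyterhub_status.sh   # confirm the hub, proxy, and user pods are healthy
+#   ./get_jupyterhub_url.sh        # print the public URL to share with attendees
+#   ./tear_down_jupyterhub.sh      # remove the JupyterHub Helm release after the tutorial
+#   ./cleanup.sh                   # delete the EKS cluster and all associated AWS resources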
\ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/eksctl-config.yaml b/2025-HPDC/infrastructure/https_test/eksctl-config.yaml new file mode 100644 index 0000000..27a235e --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/eksctl-config.yaml @@ -0,0 +1,110 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +# Define the name of the cluster and the deployment region +metadata: + name: hpdc-2025-pave-dry-run + region: us-west-1 + +# Create the IAM policies needed to enable the autoscaler and storage +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: cluster-autoscaler + + # https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "autoscaling:DescribeAutoScalingGroups" + - "autoscaling:DescribeAutoScalingInstances" + - "autoscaling:DescribeLaunchConfigurations" + - "autoscaling:DescribeTags" + - "autoscaling:SetDesiredCapacity" + - "autoscaling:TerminateInstanceInAutoScalingGroup" + - "ec2:DescribeLaunchTemplateVersions" + Resource: "*" + + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: aws-ebs-csi-driver + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ec2:AttachVolume" + - "ec2:CreateSnapshot" + - "ec2:CreateTags" + - "ec2:CreateVolume" + - "ec2:DeleteSnapshot" + - "ec2:DeleteTags" + - "ec2:DeleteVolume" + - "ec2:DescribeInstances" + - "ec2:DescribeSnapshots" + - "ec2:DescribeTags" + - "ec2:DescribeVolumes" + - "ec2:DetachVolume" + Resource: "*" + +# Specify the availability zones from which nodes will be obtained +availabilityZones: +- "us-west-1a" +- "us-west-1c" + + +# Define rules for nodegroups for each availability zone +managedNodeGroups: + + - name: node-group-us-west-1a + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of each node's storage volume in this node group, in gigabytes + volumeSize: 30 + # Number of nodes to start with in this availability zone + desiredCapacity: 2 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 2 + # Maximum number of nodes that will ever be allocated in this availability zone + maxSize: 8 + privateNetworking: true + availabilityZones: + - us-west-1a + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" + + - name: node-group-us-west-1c + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of each node's storage volume in this node group, in gigabytes + volumeSize: 30 + # Number of nodes to start with in this availability zone + desiredCapacity: 2 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 2 + # Maximum number of nodes that will ever be allocated in this availability zone + maxSize: 8 + privateNetworking: true + availabilityZones: + - us-west-1c + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" diff --git a/2025-HPDC/infrastructure/https_test/get_jupyterhub_url.sh b/2025-HPDC/infrastructure/https_test/get_jupyterhub_url.sh new file mode 100755 index 0000000..ddfd250
--- /dev/null +++ b/2025-HPDC/infrastructure/https_test/get_jupyterhub_url.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl get -o json service proxy-public | jq '.status.loadBalancer.ingress[0].hostname' \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/helm-config.yaml b/2025-HPDC/infrastructure/https_test/helm-config.yaml new file mode 100644 index 0000000..b9e92f5 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/helm-config.yaml @@ -0,0 +1,127 @@ +# Uncomment if you need to debug your deployment of Jupyter. +# For more information on debugging, see: +# https://z2jh.jupyter.org/en/stable/administrator/debug.html +# debug: +# enabled: true + +hub: + # Maximum number of users with spawned JupyterLab environments (i.e., pods) at a time + concurrentSpawnLimit: 14 + config: + # Define a password for login + DummyAuthenticator: + password: butter + JupyterHub: + admin_access: true + authenticator_class: dummy + + # Define storage quantity for JupyterHub's persistent database + # We could explicitly set storage class name here, + # but we won't because we've marked the storage class defined + # in storage-class.yaml as default + db: + pvc: + storage: 32Gi + storageClassName: gp3 + + # Specify the hub image for the tutorial. + # The hub image should be based on the jupyterhub/k8s-hub image. + # Its job is twofold: + # 1) If desired, replace the login page (at /usr/local/share/jupyterhub/templates/login.html) with a custom HTML login page + # 2) Set the user + image: + name: jupyterhub/k8s-hub + tag: "4.2.0" + pullPolicy: Always + + # Define resource usage for JupyterHub + # For large tutorials, it is recommended to set these higher + + # We are just using default resource usage + + # Define custom hostname for JupyterHub + +proxy: + https: + enabled: true + type: letsencrypt + letsencrypt: + contactEmail: you@email.com + service: + type: ClusterIP +ingress: + enabled: true + +# Based on optimization recommendations from: +# https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +# scheduling: +# podPriority: +# enabled: true +# userPlaceholder: +# replicas: 3 + +# Define the spawner and init containers for each attendee's pod +singleuser: + # Specify the spawner image for the tutorial. + # The spawner image should do the following: + # 1) Install any necessary software + # 2) Define the user for the tutorial (we usually default to jovyan) + # 3) If custom Python packages are needed, it's often recommended to install a custom Jupyter kernel with `ipython kernel install` + # 4) If you want a custom Jupyter launcher UI, install the appropriate packages and update JUPYTER_APP_LAUNCHER_PATH + # 5) Copy any necessary local scripts or files and ensure proper permissions + image: + name: ghcr.io/llnl/reproducible-benchmarking-spawn + tag: "hpdc-2025" + pullPolicy: Always + # Specify the minimum (i.e., guarantee) and maximum (i.e., limit) amount of resources per user + cpu: + limit: 32 + guarantee: 32 + memory: + limit: "64G" + guarantee: "64G" + # If needed, specify a custom entrypoint into the spawner image.
+ # For more information, look at the documentation for Docker ENTRYPOINT and CMD directives: + # https://www.docker.com/blog/docker-best-practices-choosing-between-run-cmd-and-entrypoint/ + cmd: /entrypoint.sh + # Specify the init image for the tutorial. + # This image is optional, but it can be used to do last-second configuration or installation of files + # before the user gains control of the pod. + # + # A good use case for the init image is to set permissions and ensure the tutorial user will be able to + # access the files for your tutorial. An example Dockerfile for the init image may look like: + # + # Dockerfile: + # FROM alpine/git + # ENV NB_USER=jovyan \ + # NB_UID=1000 \ + # HOME=/home/jovyan + # + # RUN adduser \ + # -D \ + # -g "Default user" \ + # -u ${NB_UID} \ + # -h ${HOME} \ + # ${NB_USER} + # + # COPY ./init-entrypoint.sh /entrypoint.sh + # + # The 'command' field for the init container specifies the entrypoint for the container. For the Dockerfile + # above, the entrypoint should be "/entrypoint.sh". This script could look something like this: + # + # entrypoint.sh (would be ./init-entrypoint.sh on your local computer) + # chown -R 1000 /home/jovyan + initContainers: + - name: init-tutorial-service + image: ghcr.io/llnl/reproducible-benchmarking-init:hpdc-2025 + command: ["/entrypoint.sh"] + imagePullPolicy: Always + storage: + type: none + extraVolumes: + - name: shm-volume + emptyDir: + medium: Memory + extraVolumeMounts: + - name: shm-volume + mountPath: /dev/shm diff --git a/2025-HPDC/infrastructure/https_test/storage-class.yaml b/2025-HPDC/infrastructure/https_test/storage-class.yaml new file mode 100644 index 0000000..b83a030 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/storage-class.yaml @@ -0,0 +1,7 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: gp3 +provisioner: kubernetes.io/aws-ebs +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete \ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/tear_down_jupyterhub.sh b/2025-HPDC/infrastructure/https_test/tear_down_jupyterhub.sh new file mode 100755 index 0000000..b306b9c --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/tear_down_jupyterhub.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm uninstall hpdc-2025-pave-dry-run-jupyter + +echo "Helm's JupyterHub deployment is torn down." +echo "If any attendee pods are remaining, you can delete them with 'kubectl delete pod <pod name>'" +echo "" +echo "To recreate the JupyterHub deployment, just run deploy_jupyterhub.sh again."
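+# Example of removing leftover attendee pods (pod names depend on the usernames
+# attendees chose; 'jupyter-<username>' is the usual JupyterHub-on-Kubernetes naming,
+# so treat the exact name as an assumption):
+#
+#   kubectl --namespace=default get pods
+#   kubectl --namespace=default delete pod jupyter-<username>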
\ No newline at end of file diff --git a/2025-HPDC/infrastructure/https_test/test.yaml b/2025-HPDC/infrastructure/https_test/test.yaml new file mode 100644 index 0000000..f7d5e52 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/test.yaml @@ -0,0 +1,2980 @@ +--- +# Source: jupyterhub/templates/hub/netpol.yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + podSelector: + matchLabels: + component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + policyTypes: + - Ingress + - Egress + + # IMPORTANT: + # NetworkPolicy's ingress "from" and egress "to" rule specifications require + # great attention to detail. A quick summary is: + # + # 1. You can provide "from"/"to" rules that provide access either ports or a + # subset of ports. + # 2. You can for each "from"/"to" rule provide any number of + # "sources"/"destinations" of four different kinds. + # - podSelector - targets pods with a certain label in the same namespace as the NetworkPolicy + # - namespaceSelector - targets all pods running in namespaces with a certain label + # - namespaceSelector and podSelector - targets pods with a certain label running in namespaces with a certain label + # - ipBlock - targets network traffic from/to a set of IP address ranges + # + # Read more at: https://kubernetes.io/docs/concepts/services-networking/network-policies/#behavior-of-to-and-from-selectors + # + ingress: + + # allowed pods (hub.jupyter.org/network-access-hub) --> hub + - ports: + - port: http + from: + # source 1 - labeled pods + - podSelector: + matchLabels: + hub.jupyter.org/network-access-hub: "true" + + egress: + # hub --> proxy + - to: + - podSelector: + matchLabels: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8001 + + # hub --> singleuser-server + - to: + - podSelector: + matchLabels: + component: singleuser-server + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8888 + + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + to: + # Allow outbound connections to DNS ports on the cloud metadata server + - ipBlock: + cidr: 169.254.169.254/32 + # Allow outbound connections to DNS ports on pods in the kube-system + # namespace + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + # Allow outbound connections to DNS ports on destinations in the private IP + # ranges + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to non-private IP ranges + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + # As part of this rule: + # - don't allow outbound connections to private IPs + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + # - don't allow outbound connections to the cloud metadata server + - 169.254.169.254/32 + # Allow outbound connections to private IP ranges + - to: + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to the cloud metadata server + - to: + - ipBlock: + cidr: 169.254.169.254/32 +--- +# Source: 
jupyterhub/templates/proxy/autohttps/netpol.yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: autohttps + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + podSelector: + matchLabels: + component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + policyTypes: + - Ingress + - Egress + + # IMPORTANT: + # NetworkPolicy's ingress "from" and egress "to" rule specifications require + # great attention to detail. A quick summary is: + # + # 1. You can provide "from"/"to" rules that provide access either ports or a + # subset of ports. + # 2. You can for each "from"/"to" rule provide any number of + # "sources"/"destinations" of four different kinds. + # - podSelector - targets pods with a certain label in the same namespace as the NetworkPolicy + # - namespaceSelector - targets all pods running in namespaces with a certain label + # - namespaceSelector and podSelector - targets pods with a certain label running in namespaces with a certain label + # - ipBlock - targets network traffic from/to a set of IP address ranges + # + # Read more at: https://kubernetes.io/docs/concepts/services-networking/network-policies/#behavior-of-to-and-from-selectors + # + ingress: + # allow incoming traffic to these ports independent of source + - ports: + - port: http + - port: https + + # allowed pods (hub.jupyter.org/network-access-proxy-http) --> proxy (http/https port) + - ports: + - port: http + - port: https + from: + # source 1 - labeled pods + - podSelector: + matchLabels: + hub.jupyter.org/network-access-proxy-http: "true" + + egress: + # autohttps --> proxy (http port) + - to: + - podSelector: + matchLabels: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8000 + + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + to: + # Allow outbound connections to DNS ports on the cloud metadata server + - ipBlock: + cidr: 169.254.169.254/32 + # Allow outbound connections to DNS ports on pods in the kube-system + # namespace + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + # Allow outbound connections to DNS ports on destinations in the private IP + # ranges + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to non-private IP ranges + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + # As part of this rule: + # - don't allow outbound connections to private IPs + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + # - don't allow outbound connections to the cloud metadata server + - 169.254.169.254/32 + # Allow outbound connections to private IP ranges + - to: + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to the cloud metadata server + - to: + - ipBlock: + cidr: 169.254.169.254/32 +--- +# Source: jupyterhub/templates/proxy/netpol.yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: proxy + labels: + component: proxy + app.kubernetes.io/component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + 
app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + podSelector: + matchLabels: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + policyTypes: + - Ingress + - Egress + + # IMPORTANT: + # NetworkPolicy's ingress "from" and egress "to" rule specifications require + # great attention to detail. A quick summary is: + # + # 1. You can provide "from"/"to" rules that provide access either ports or a + # subset of ports. + # 2. You can for each "from"/"to" rule provide any number of + # "sources"/"destinations" of four different kinds. + # - podSelector - targets pods with a certain label in the same namespace as the NetworkPolicy + # - namespaceSelector - targets all pods running in namespaces with a certain label + # - namespaceSelector and podSelector - targets pods with a certain label running in namespaces with a certain label + # - ipBlock - targets network traffic from/to a set of IP address ranges + # + # Read more at: https://kubernetes.io/docs/concepts/services-networking/network-policies/#behavior-of-to-and-from-selectors + # + ingress: + # allow incoming traffic to these ports independent of source + - ports: + - port: http + - port: https + + # allowed pods (hub.jupyter.org/network-access-proxy-http) --> proxy (http/https port) + - ports: + - port: http + from: + # source 1 - labeled pods + - podSelector: + matchLabels: + hub.jupyter.org/network-access-proxy-http: "true" + + # allowed pods (hub.jupyter.org/network-access-proxy-api) --> proxy (api port) + - ports: + - port: api + from: + # source 1 - labeled pods + - podSelector: + matchLabels: + hub.jupyter.org/network-access-proxy-api: "true" + + egress: + # proxy --> hub + - to: + - podSelector: + matchLabels: + component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8081 + + # proxy --> singleuser-server + - to: + - podSelector: + matchLabels: + component: singleuser-server + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8888 + + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + to: + # Allow outbound connections to DNS ports on the cloud metadata server + - ipBlock: + cidr: 169.254.169.254/32 + # Allow outbound connections to DNS ports on pods in the kube-system + # namespace + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + # Allow outbound connections to DNS ports on destinations in the private IP + # ranges + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to non-private IP ranges + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + # As part of this rule: + # - don't allow outbound connections to private IPs + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + # - don't allow outbound connections to the cloud metadata server + - 169.254.169.254/32 + # Allow outbound connections to private IP ranges + - to: + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to the cloud metadata server + - to: + - ipBlock: + cidr: 169.254.169.254/32 +--- +# Source: jupyterhub/templates/singleuser/netpol.yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: singleuser + labels: + component: singleuser + app.kubernetes.io/component: singleuser + app: "jupyterhub" + 
release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + podSelector: + matchLabels: + component: singleuser-server + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + policyTypes: + - Ingress + - Egress + + # IMPORTANT: + # NetworkPolicy's ingress "from" and egress "to" rule specifications require + # great attention to detail. A quick summary is: + # + # 1. You can provide "from"/"to" rules that provide access either ports or a + # subset of ports. + # 2. You can for each "from"/"to" rule provide any number of + # "sources"/"destinations" of four different kinds. + # - podSelector - targets pods with a certain label in the same namespace as the NetworkPolicy + # - namespaceSelector - targets all pods running in namespaces with a certain label + # - namespaceSelector and podSelector - targets pods with a certain label running in namespaces with a certain label + # - ipBlock - targets network traffic from/to a set of IP address ranges + # + # Read more at: https://kubernetes.io/docs/concepts/services-networking/network-policies/#behavior-of-to-and-from-selectors + # + ingress: + + # allowed pods (hub.jupyter.org/network-access-singleuser) --> singleuser-server + - ports: + - port: notebook-port + from: + # source 1 - labeled pods + - podSelector: + matchLabels: + hub.jupyter.org/network-access-singleuser: "true" + + egress: + # singleuser-server --> hub + - to: + - podSelector: + matchLabels: + component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8081 + + # singleuser-server --> proxy + # singleuser-server --> autohttps + # + # While not critical for core functionality, a user or library code may rely + # on communicating with the proxy or autohttps pods via a k8s Service it can + # detected from well known environment variables. 
+ # + - to: + - podSelector: + matchLabels: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8000 + - to: + - podSelector: + matchLabels: + component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8080 + - port: 8443 + + - ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + to: + # Allow outbound connections to DNS ports on the cloud metadata server + - ipBlock: + cidr: 169.254.169.254/32 + # Allow outbound connections to DNS ports on pods in the kube-system + # namespace + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + # Allow outbound connections to DNS ports on destinations in the private IP + # ranges + - ipBlock: + cidr: 10.0.0.0/8 + - ipBlock: + cidr: 172.16.0.0/12 + - ipBlock: + cidr: 192.168.0.0/16 + # Allow outbound connections to non-private IP ranges + - to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + # As part of this rule: + # - don't allow outbound connections to private IPs + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + # - don't allow outbound connections to the cloud metadata server + - 169.254.169.254/32 +--- +# Source: jupyterhub/templates/scheduling/user-placeholder/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: user-placeholder + labels: + component: user-placeholder + app.kubernetes.io/component: user-placeholder + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + minAvailable: 0 + selector: + matchLabels: + component: user-placeholder + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" +--- +# Source: jupyterhub/templates/scheduling/user-scheduler/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: user-scheduler + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + maxUnavailable: 1 + selector: + matchLabels: + component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" +--- +# Source: jupyterhub/templates/hub/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +--- +# Source: jupyterhub/templates/image-puller/serviceaccount-continuous.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: continuous-image-puller + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + annotations: +--- +# Source: 
jupyterhub/templates/proxy/autohttps/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: autohttps + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +--- +# Source: jupyterhub/templates/scheduling/user-scheduler/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: user-scheduler + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +--- +# Source: jupyterhub/templates/hub/secret.yaml +kind: Secret +apiVersion: v1 +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +type: Opaque +data: + values.yaml: "Chart:
  AppVersion: 5.3.0
  Name: jupyterhub
  Version: 4.2.0
Release:
  Name: hpdc-2025-pave-dry-run-jupyter
  Namespace: default
  Service: Helm
cull:
  adminUsers: true
  concurrency: 10
  enabled: true
  every: 600
  maxAge: 0
  removeNamedServers: false
  timeout: 3600
  users: false
custom: {}
debug:
  enabled: false
fullnameOverride: ""
global:
  safeToShowValues: false
hub:
  activeServerLimit: null
  allowNamedServers: false
  annotations: {}
  args: []
  authenticatePrometheus: null
  baseUrl: /benchpark/hpdc-25-tutorial/
  command: []
  concurrentSpawnLimit: 14
  config:
    DummyAuthenticator:
      password: butter
    JupyterHub:
      admin_access: true
      authenticator_class: dummy
  consecutiveFailureLimit: 5
  containerSecurityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop:
      - ALL
    runAsGroup: 1000
    runAsUser: 1000
  cookieSecret: null
  db:
    password: null
    pvc:
      accessModes:
      - ReadWriteOnce
      annotations: {}
      selector: {}
      storage: 32Gi
      storageClassName: gp3
      subPath: null
    type: sqlite-pvc
    upgrade: null
    url: null
  deploymentStrategy:
    type: Recreate
  existingSecret: null
  extraConfig: {}
  extraContainers: []
  extraEnv: {}
  extraFiles: {}
  extraPodSpec: {}
  extraVolumeMounts: []
  extraVolumes: []
  image:
    name: jupyterhub/k8s-hub
    pullPolicy: Always
    pullSecrets: []
    tag: 4.2.0
  initContainers: []
  labels: {}
  lifecycle: {}
  livenessProbe:
    enabled: true
    failureThreshold: 30
    initialDelaySeconds: 300
    periodSeconds: 10
    timeoutSeconds: 3
  loadRoles: {}
  namedServerLimitPerUser: null
  networkPolicy:
    allowedIngressPorts: []
    egress: []
    egressAllowRules:
      cloudMetadataServer: true
      dnsPortsCloudMetadataServer: true
      dnsPortsKubeSystemNamespace: true
      dnsPortsPrivateIPs: true
      nonPrivateIPs: true
      privateIPs: true
    enabled: true
    ingress: []
    interNamespaceAccessLabels: ignore
  nodeSelector: {}
  pdb:
    enabled: false
    maxUnavailable: null
    minAvailable: 1
  podSecurityContext:
    fsGroup: 1000
    runAsNonRoot: true
    seccompProfile:
      type: RuntimeDefault
  readinessProbe:
    enabled: true
    failureThreshold: 1000
    initialDelaySeconds: 0
    periodSeconds: 2
    timeoutSeconds: 1
  redirectToServer: null
  resources: {}
  revisionHistoryLimit: null
  service:
    annotations: {}
    extraPorts: []
    loadBalancerIP: null
    ports:
      appProtocol: null
      nodePort: null
    type: ClusterIP
  serviceAccount:
    annotations: {}
    create: true
    name: null
  services: {}
  shutdownOnLogout: null
  templatePaths: []
  templateVars: {}
  tolerations: []
imagePullSecret:
  automaticReferenceInjection: true
  create: false
  email: null
  password: null
  registry: null
  username: null
imagePullSecrets: []
ingress:
  annotations: {}
  enabled: true
  extraPaths: []
  hosts:
  - software.llnl.gov
  ingressClassName: null
  pathSuffix: null
  pathType: Prefix
  tls: []
prePuller:
  annotations: {}
  containerSecurityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop:
      - ALL
    runAsGroup: 65534
    runAsNonRoot: true
    runAsUser: 65534
    seccompProfile:
      type: RuntimeDefault
  continuous:
    enabled: true
    serviceAccount:
      annotations: {}
      create: true
      name: null
  extraImages: {}
  extraTolerations: []
  hook:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    enabled: true
    image:
      name: quay.io/jupyterhub/k8s-image-awaiter
      pullPolicy: null
      pullSecrets: []
      tag: 4.2.0
    nodeSelector: {}
    podSchedulingWaitDuration: 10
    pullOnlyOnChanges: true
    resources: {}
    serviceAccount:
      annotations: {}
      create: true
      name: null
    serviceAccountImagePuller:
      annotations: {}
      create: true
      name: null
    tolerations: []
  labels: {}
  pause:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    image:
      name: registry.k8s.io/pause
      pullPolicy: null
      pullSecrets: []
      tag: "3.10"
  pullProfileListImages: true
  resources: {}
  revisionHistoryLimit: null
proxy:
  annotations: {}
  chp:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    defaultTarget: null
    errorTarget: null
    extraCommandLineFlags: []
    extraEnv: {}
    extraPodSpec: {}
    image:
      name: quay.io/jupyterhub/configurable-http-proxy
      pullPolicy: null
      pullSecrets: []
      tag: 4.6.3
    livenessProbe:
      enabled: true
      failureThreshold: 30
      initialDelaySeconds: 60
      periodSeconds: 10
      timeoutSeconds: 3
    networkPolicy:
      allowedIngressPorts:
      - http
      - https
      egress: []
      egressAllowRules:
        cloudMetadataServer: true
        dnsPortsCloudMetadataServer: true
        dnsPortsKubeSystemNamespace: true
        dnsPortsPrivateIPs: true
        nonPrivateIPs: true
        privateIPs: true
      enabled: true
      ingress: []
      interNamespaceAccessLabels: ignore
    nodeSelector: {}
    pdb:
      enabled: false
      maxUnavailable: null
      minAvailable: 1
    readinessProbe:
      enabled: true
      failureThreshold: 1000
      initialDelaySeconds: 0
      periodSeconds: 2
      timeoutSeconds: 1
    resources: {}
    revisionHistoryLimit: null
    tolerations: []
  deploymentStrategy:
    rollingUpdate: null
    type: Recreate
  https:
    enabled: true
    hosts:
    - software.llnl.gov
    letsencrypt:
      acmeServer: https://acme-v02.api.letsencrypt.org/directory
      contactEmail: you@email.com
    manual:
      cert: null
      key: null
    secret:
      crt: tls.crt
      key: tls.key
      name: null
    type: letsencrypt
  labels: {}
  secretSync:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    image:
      name: quay.io/jupyterhub/k8s-secret-sync
      pullPolicy: null
      pullSecrets: []
      tag: 4.2.0
    resources: {}
  secretToken: null
  service:
    annotations: {}
    disableHttpPort: false
    extraPorts: []
    labels: {}
    loadBalancerIP: null
    loadBalancerSourceRanges: []
    nodePorts:
      http: null
      https: null
    type: LoadBalancer
  traefik:
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    extraDynamicConfig:
      http:
        middlewares:
          hsts:
            headers:
              stsIncludeSubdomains: false
              stsPreload: false
              stsSeconds: 15724800
          redirect:
            redirectScheme:
              permanent: true
              scheme: https
          scheme:
            headers:
              customRequestHeaders:
                X-Scheme: https
        routers:
          default:
            entrypoints:
            - https
            middlewares:
            - hsts
            - scheme
            rule: PathPrefix(`/`)
            service: default
            tls:
              certResolver: default
              domains:
              - main: software.llnl.gov
              options: default
          insecure:
            entrypoints:
            - http
            middlewares:
            - redirect
            rule: PathPrefix(`/`)
            service: default
        services:
          default:
            loadBalancer:
              servers:
              - url: http://proxy-http:8000/
      tls:
        options:
          default:
            cipherSuites:
            - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384
            - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
            - TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256
            - TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
            - TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
            - TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305
            minVersion: VersionTLS12
            sniStrict: true
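    # The dynamic Traefik configuration above defines two routers: "default"
    # terminates TLS on the https entrypoint (using the "default" ACME cert
    # resolver and the cipher suites / TLS 1.2 floor under tls.options) and
    # forwards everything to the internal proxy-http service on port 8000 with
    # the HSTS and X-Scheme headers applied, while "insecure" catches plain
    # HTTP and redirects it to HTTPS. An illustrative extension covering an
    # extra hostname on the same certificate (hub.example.org is a
    # placeholder):
    #
    # proxy:
    #   traefik:
    #     extraDynamicConfig:
    #       http:
    #         routers:
    #           default:
    #             tls:
    #               domains:
    #                 - main: software.llnl.gov
    #                   sans:
    #                     - hub.example.org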
    extraEnv: {}
    extraInitContainers: []
    extraPodSpec: {}
    extraPorts: []
    extraStaticConfig:
      accessLog:
        fields:
          headers:
            names:
              Authorization: redacted
              Cookie: redacted
              Set-Cookie: redacted
              X-Xsrftoken: redacted
        filters:
          statusCodes:
          - 500-599
      certificatesResolvers:
        default:
          acme:
            caServer: https://acme-v02.api.letsencrypt.org/directory
            email: you@email.com
            httpChallenge:
              entryPoint: http
            storage: /etc/acme/acme.json
      entryPoints:
        http:
          address: :8080
        https:
          address: :8443
          transport:
            respondingTimeouts:
              idleTimeout: 10m0s
      log:
        level: WARN
      providers:
        file:
          filename: /etc/traefik/dynamic.yaml
    extraVolumeMounts: []
    extraVolumes: []
    hsts:
      includeSubdomains: false
      maxAge: 15724800
      preload: false
    image:
      name: traefik
      pullPolicy: null
      pullSecrets: []
      tag: v3.3.5
    labels: {}
    networkPolicy:
      allowedIngressPorts:
      - http
      - https
      egress: []
      egressAllowRules:
        cloudMetadataServer: true
        dnsPortsCloudMetadataServer: true
        dnsPortsKubeSystemNamespace: true
        dnsPortsPrivateIPs: true
        nonPrivateIPs: true
        privateIPs: true
      enabled: true
      ingress: []
      interNamespaceAccessLabels: ignore
    nodeSelector: {}
    pdb:
      enabled: false
      maxUnavailable: null
      minAvailable: 1
    resources: {}
    revisionHistoryLimit: null
    serviceAccount:
      annotations: {}
      create: true
      name: null
    tolerations: []
rbac:
  create: true
scheduling:
  corePods:
    nodeAffinity:
      matchNodePurpose: prefer
    tolerations:
    - effect: NoSchedule
      key: hub.jupyter.org/dedicated
      operator: Equal
      value: core
    - effect: NoSchedule
      key: hub.jupyter.org_dedicated
      operator: Equal
      value: core
  podPriority:
    defaultPriority: 0
    enabled: false
    globalDefault: false
    imagePullerPriority: -5
    userPlaceholderPriority: -10
  userPlaceholder:
    annotations: {}
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    enabled: true
    extraPodSpec: {}
    image:
      name: registry.k8s.io/pause
      pullPolicy: null
      pullSecrets: []
      tag: "3.10"
    labels: {}
    replicas: 0
    resources: {}
    revisionHistoryLimit: null
  userPods:
    nodeAffinity:
      matchNodePurpose: prefer
    tolerations:
    - effect: NoSchedule
      key: hub.jupyter.org/dedicated
      operator: Equal
      value: user
    - effect: NoSchedule
      key: hub.jupyter.org_dedicated
      operator: Equal
      value: user
  userScheduler:
    annotations: {}
    containerSecurityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop:
        - ALL
      runAsGroup: 65534
      runAsNonRoot: true
      runAsUser: 65534
      seccompProfile:
        type: RuntimeDefault
    enabled: true
    extraPodSpec: {}
    image:
      name: registry.k8s.io/kube-scheduler
      pullPolicy: null
      pullSecrets: []
      tag: v1.30.11
    labels: {}
    logLevel: 4
    nodeSelector: {}
    pdb:
      enabled: true
      maxUnavailable: 1
      minAvailable: null
    pluginConfig:
    - args:
        scoringStrategy:
          resources:
          - name: cpu
            weight: 1
          - name: memory
            weight: 1
          type: MostAllocated
      name: NodeResourcesFit
    plugins:
      score:
        disabled:
        - name: NodeResourcesBalancedAllocation
        - name: NodeAffinity
        - name: InterPodAffinity
        - name: NodeResourcesFit
        - name: ImageLocality
        enabled:
        - name: NodeAffinity
          weight: 14631
        - name: InterPodAffinity
          weight: 1331
        - name: NodeResourcesFit
          weight: 121
        - name: ImageLocality
          weight: 11
    replicas: 2
    resources: {}
    revisionHistoryLimit: null
    serviceAccount:
      annotations: {}
      create: true
      name: null
    tolerations: []
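# The user scheduler above runs a stock kube-scheduler (v1.30.11) with a
# MostAllocated scoring strategy, so user pods are packed onto the fullest
# nodes; that keeps lightly used nodes empty so a cluster autoscaler can
# remove them. userPlaceholder pods (replicas: 0 here, so effectively off)
# can reserve headroom that real user pods preempt when podPriority is
# enabled. An illustrative override keeping two users' worth of capacity warm:
#
# scheduling:
#   podPriority:
#     enabled: true
#   userPlaceholder:
#     enabled: true
#     replicas: 2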
singleuser:
  allowPrivilegeEscalation: false
  cloudMetadata:
    blockWithIptables: true
    ip: 169.254.169.254
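  # 169.254.169.254 is the cloud provider's instance metadata endpoint (on AWS
  # it serves the node's IAM credentials). With blockWithIptables: true, the
  # chart injects a privileged init container into every user pod (visible in
  # the rendered jupyterhub_config.py further down) that adds, in effect,
  # "iptables --append OUTPUT --protocol tcp --destination 169.254.169.254
  # --destination-port 80 --jump DROP", so notebook users cannot read those
  # credentials from inside their containers.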
  cmd: /entrypoint.sh
  cpu:
    guarantee: 32
    limit: 32
  defaultUrl: null
  events: true
  extraAnnotations: {}
  extraContainers: []
  extraEnv: {}
  extraFiles: {}
  extraLabels:
    hub.jupyter.org/network-access-hub: "true"
  extraNodeAffinity:
    preferred: []
    required: []
  extraPodAffinity:
    preferred: []
    required: []
  extraPodAntiAffinity:
    preferred: []
    required: []
  extraPodConfig: {}
  extraResource:
    guarantees: {}
    limits: {}
  extraTolerations: []
  fsGid: 100
  image:
    name: ghcr.io/llnl/reproducible-benchmarking-spawn
    pullPolicy: Always
    pullSecrets: []
    tag: hpdc-2025
  initContainers:
  - command:
    - /entrypoint.sh
    image: ghcr.io/llnl/reproducible-benchmarking-init:hpdc-2025
    imagePullPolicy: Always
    name: init-tutorial-service
  lifecycleHooks: {}
  memory:
    guarantee: 64G
    limit: 64G
  networkPolicy:
    allowedIngressPorts: []
    egress: []
    egressAllowRules:
      cloudMetadataServer: false
      dnsPortsCloudMetadataServer: true
      dnsPortsKubeSystemNamespace: true
      dnsPortsPrivateIPs: true
      nonPrivateIPs: true
      privateIPs: false
    enabled: true
    ingress: []
    interNamespaceAccessLabels: ignore
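  # The egressAllowRules above let user pods resolve DNS and reach the public
  # internet (nonPrivateIPs: true) while blocking the cloud metadata server
  # and private/VPC addresses (cloudMetadataServer: false, privateIPs: false),
  # which is defense in depth on top of the iptables block configured earlier.
  # An illustrative override if user pods ever needed to reach in-cluster
  # services directly:
  #
  # singleuser:
  #   networkPolicy:
  #     egressAllowRules:
  #       privateIPs: true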
  networkTools:
    image:
      name: quay.io/jupyterhub/k8s-network-tools
      pullPolicy: null
      pullSecrets: []
      tag: 4.2.0
    resources: {}
  nodeSelector: {}
  podNameTemplate: null
  profileList: []
  serviceAccountName: null
  startTimeout: 300
  storage:
    capacity: 10Gi
    dynamic:
      pvcNameTemplate: null
      storageAccessModes:
      - ReadWriteOnce
      storageClass: null
      subPath: null
      volumeNameTemplate: volume-{user_server}
    extraLabels: {}
    extraVolumeMounts:
    - mountPath: /dev/shm
      name: shm-volume
    extraVolumes:
    - emptyDir:
        medium: Memory
      name: shm-volume
    homeMountPath: /home/jovyan
    static:
      pvcName: null
      subPath: '{username}'
    type: none
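  # storage.type: none means no per-user PersistentVolumeClaim is created, so
  # anything written under /home/jovyan is lost when a user's server stops;
  # the only extra mount is a memory-backed emptyDir at /dev/shm, which
  # shared-memory workloads (most likely the MPI-style runs in the tutorial
  # image) need more of than the small container default. An illustrative
  # sketch of switching to persistent per-user storage instead (the
  # storageClass must exist in the cluster; gp3 matches the hub's PVC below):
  #
  # singleuser:
  #   storage:
  #     type: dynamic
  #     capacity: 10Gi
  #     dynamic:
  #       storageClass: gp3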
  uid: 1000" + + # Any JupyterHub Services api_tokens are exposed in this k8s Secret as a + # convinience for external services running in the k8s cluster that could + # mount them directly from this k8s Secret. + + # During Helm template rendering, these values that can be autogenerated for + # users are set using the following logic: + # + # 1. Use chart configuration's value + # 2. Use k8s Secret's value + # 3. Use a new autogenerated value + # + # hub.config.ConfigurableHTTPProxy.auth_token: for hub to proxy-api authorization (JupyterHub.proxy_auth_token is deprecated) + # hub.config.JupyterHub.cookie_secret: for cookie encryption + # hub.config.CryptKeeper.keys: for auth state encryption + # + hub.config.ConfigurableHTTPProxy.auth_token: "OUc3NG9ZMFd1cUJsRDZpcWl4dDFkeHNraTFWUTFZVGVqN3ZGbjFvV3dBcUJoQkp2YkI3dzhueVV6WmoySmcwNw==" + hub.config.JupyterHub.cookie_secret: "Y2NhZjQzOTlkMjQ0MGNhNGU2MDUwNTc5NzMwYWJkNWZjY2RlMjQ0MGIxM2U1Nzc3NjUwZGI3YTg2MzQ4M2U1OA==" + hub.config.CryptKeeper.keys: "NmQ3MWVlMTIzMGMwOWI5ZDM3Yzc2ZjJhZGIyOGFhMTk1ZjZkOWEyMWUzMDYzNTY0ZWRjZWY1NTI5ODQ5MDEyMA==" +--- +# Source: jupyterhub/templates/hub/configmap.yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +data: + fullname: "" + fullname-dash: "" + hub: "hub" + hub-serviceaccount: "hub" + hub-existing-secret: "" + hub-existing-secret-or-default: "hub" + hub-pvc: "hub-db-dir" + proxy: "proxy" + proxy-api: "proxy-api" + proxy-http: "proxy-http" + proxy-public: "proxy-public" + proxy-public-tls: "proxy-public-tls-acme" + proxy-public-manual-tls: "proxy-public-manual-tls" + autohttps: "autohttps" + autohttps-serviceaccount: "autohttps" + user-scheduler-deploy: "user-scheduler" + user-scheduler-serviceaccount: "user-scheduler" + user-scheduler-lock: "user-scheduler-lock" + user-placeholder: "user-placeholder" + image-puller-priority: "hpdc-2025-pave-dry-run-jupyter-image-puller-priority" + hook-image-awaiter: "hook-image-awaiter" + hook-image-awaiter-serviceaccount: "hook-image-awaiter" + hook-image-puller: "hook-image-puller" + hook-image-puller-serviceaccount: "hook-image-puller" + continuous-image-puller: "continuous-image-puller" + continuous-image-puller-serviceaccount: "continuous-image-puller" + singleuser: "singleuser" + image-pull-secret: "image-pull-secret" + ingress: "jupyterhub" + priority: "hpdc-2025-pave-dry-run-jupyter-default-priority" + user-placeholder-priority: "hpdc-2025-pave-dry-run-jupyter-user-placeholder-priority" + user-scheduler: "hpdc-2025-pave-dry-run-jupyter-user-scheduler" + jupyterhub_config.py: | + # load the config object (satisfies linters) + c = get_config() # noqa + + import glob + import os + import re + import sys + + from jupyterhub.utils import url_path_join + from kubernetes_asyncio import client + from tornado.httpclient import AsyncHTTPClient + + # Make sure that modules placed in the same directory as the jupyterhub config are added to the pythonpath + configuration_directory = os.path.dirname(os.path.realpath(__file__)) + sys.path.insert(0, configuration_directory) + + from z2jh import ( + get_config, + get_name, + get_name_env, + get_secret_value, + set_config_if_not_none, + ) + + + def camelCaseify(s): + """convert snake_case to 
camelCase + + For the common case where some_value is set from someValue + so we don't have to specify the name twice. + """ + return re.sub(r"_([a-z])", lambda m: m.group(1).upper(), s) + + + # Configure JupyterHub to use the curl backend for making HTTP requests, + # rather than the pure-python implementations. The default one starts + # being too slow to make a large number of requests to the proxy API + # at the rate required. + AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") + + c.JupyterHub.spawner_class = "kubespawner.KubeSpawner" + + # Connect to a proxy running in a different pod. Note that *_SERVICE_* + # environment variables are set by Kubernetes for Services + c.ConfigurableHTTPProxy.api_url = ( + f'http://{get_name("proxy-api")}:{get_name_env("proxy-api", "_SERVICE_PORT")}' + ) + c.ConfigurableHTTPProxy.should_start = False + + # Do not shut down user pods when hub is restarted + c.JupyterHub.cleanup_servers = False + + # Check that the proxy has routes appropriately setup + c.JupyterHub.last_activity_interval = 60 + + # Don't wait at all before redirecting a spawning user to the progress page + c.JupyterHub.tornado_settings = { + "slow_spawn_timeout": 0, + } + + + # configure the hub db connection + db_type = get_config("hub.db.type") + if db_type == "sqlite-pvc": + c.JupyterHub.db_url = "sqlite:///jupyterhub.sqlite" + elif db_type == "sqlite-memory": + c.JupyterHub.db_url = "sqlite://" + else: + set_config_if_not_none(c.JupyterHub, "db_url", "hub.db.url") + db_password = get_secret_value("hub.db.password", None) + if db_password is not None: + if db_type == "mysql": + os.environ["MYSQL_PWD"] = db_password + elif db_type == "postgres": + os.environ["PGPASSWORD"] = db_password + else: + print(f"Warning: hub.db.password is ignored for hub.db.type={db_type}") + + + # c.JupyterHub configuration from Helm chart's configmap + for trait, cfg_key in ( + ("concurrent_spawn_limit", None), + ("active_server_limit", None), + ("base_url", None), + ("allow_named_servers", None), + ("named_server_limit_per_user", None), + ("authenticate_prometheus", None), + ("redirect_to_server", None), + ("shutdown_on_logout", None), + ("template_paths", None), + ("template_vars", None), + ): + if cfg_key is None: + cfg_key = camelCaseify(trait) + set_config_if_not_none(c.JupyterHub, trait, "hub." + cfg_key) + + # hub_bind_url configures what the JupyterHub process within the hub pod's + # container should listen to. + hub_container_port = 8081 + c.JupyterHub.hub_bind_url = f"http://:{hub_container_port}" + + # hub_connect_url is the URL for connecting to the hub for use by external + # JupyterHub services such as the proxy. Note that *_SERVICE_* environment + # variables are set by Kubernetes for Services. + c.JupyterHub.hub_connect_url = ( + f'http://{get_name("hub")}:{get_name_env("hub", "_SERVICE_PORT")}' + ) + + # implement common labels + # This mimics the jupyterhub.commonLabels helper, but declares managed-by to + # kubespawner instead of helm. + # + # The labels app and release are old labels enabled to be deleted in z2jh 5, but + # for now retained to avoid a breaking change in z2jh 4 that would force user + # server restarts. Restarts would be required because NetworkPolicy resources + # must select old/new pods with labels that then needs to be seen on both + # old/new pods, and we want these resources to keep functioning for old/new user + # server pods during an upgrade. 
+ # + common_labels = c.KubeSpawner.common_labels = {} + common_labels["app.kubernetes.io/name"] = common_labels["app"] = get_config( + "nameOverride", + default=get_config("Chart.Name", "jupyterhub"), + ) + release = get_config("Release.Name") + if release: + common_labels["app.kubernetes.io/instance"] = common_labels["release"] = release + chart_name = get_config("Chart.Name") + chart_version = get_config("Chart.Version") + if chart_name and chart_version: + common_labels["helm.sh/chart"] = common_labels["chart"] = ( + f"{chart_name}-{chart_version.replace('+', '_')}" + ) + common_labels["app.kubernetes.io/managed-by"] = "kubespawner" + + c.KubeSpawner.namespace = os.environ.get("POD_NAMESPACE", "default") + + # Max number of consecutive failures before the Hub restarts itself + set_config_if_not_none( + c.Spawner, + "consecutive_failure_limit", + "hub.consecutiveFailureLimit", + ) + + for trait, cfg_key in ( + ("pod_name_template", None), + ("start_timeout", None), + ("image_pull_policy", "image.pullPolicy"), + # ('image_pull_secrets', 'image.pullSecrets'), # Managed manually below + ("events_enabled", "events"), + ("extra_labels", None), + ("extra_annotations", None), + # ("allow_privilege_escalation", None), # Managed manually below + ("uid", None), + ("fs_gid", None), + ("service_account", "serviceAccountName"), + ("storage_extra_labels", "storage.extraLabels"), + # ("tolerations", "extraTolerations"), # Managed manually below + ("node_selector", None), + ("node_affinity_required", "extraNodeAffinity.required"), + ("node_affinity_preferred", "extraNodeAffinity.preferred"), + ("pod_affinity_required", "extraPodAffinity.required"), + ("pod_affinity_preferred", "extraPodAffinity.preferred"), + ("pod_anti_affinity_required", "extraPodAntiAffinity.required"), + ("pod_anti_affinity_preferred", "extraPodAntiAffinity.preferred"), + ("lifecycle_hooks", None), + ("init_containers", None), + ("extra_containers", None), + ("mem_limit", "memory.limit"), + ("mem_guarantee", "memory.guarantee"), + ("cpu_limit", "cpu.limit"), + ("cpu_guarantee", "cpu.guarantee"), + ("extra_resource_limits", "extraResource.limits"), + ("extra_resource_guarantees", "extraResource.guarantees"), + ("environment", "extraEnv"), + ("profile_list", None), + ("extra_pod_config", None), + ): + if cfg_key is None: + cfg_key = camelCaseify(trait) + set_config_if_not_none(c.KubeSpawner, trait, "singleuser." + cfg_key) + + image = get_config("singleuser.image.name") + if image: + tag = get_config("singleuser.image.tag") + if tag: + image = f"{image}:{tag}" + + c.KubeSpawner.image = image + + # allow_privilege_escalation defaults to False in KubeSpawner 2+. Since its a + # property where None, False, and True all are valid values that users of the + # Helm chart may want to set, we can't use the set_config_if_not_none helper + # function as someone may want to override the default False value to None. + # + c.KubeSpawner.allow_privilege_escalation = get_config( + "singleuser.allowPrivilegeEscalation" + ) + + # Combine imagePullSecret.create (single), imagePullSecrets (list), and + # singleuser.image.pullSecrets (list). 
+ image_pull_secrets = [] + if get_config("imagePullSecret.automaticReferenceInjection") and get_config( + "imagePullSecret.create" + ): + image_pull_secrets.append(get_name("image-pull-secret")) + if get_config("imagePullSecrets"): + image_pull_secrets.extend(get_config("imagePullSecrets")) + if get_config("singleuser.image.pullSecrets"): + image_pull_secrets.extend(get_config("singleuser.image.pullSecrets")) + if image_pull_secrets: + c.KubeSpawner.image_pull_secrets = image_pull_secrets + + # scheduling: + if get_config("scheduling.userScheduler.enabled"): + c.KubeSpawner.scheduler_name = get_name("user-scheduler") + if get_config("scheduling.podPriority.enabled"): + c.KubeSpawner.priority_class_name = get_name("priority") + + # add node-purpose affinity + match_node_purpose = get_config("scheduling.userPods.nodeAffinity.matchNodePurpose") + if match_node_purpose: + node_selector = dict( + matchExpressions=[ + dict( + key="hub.jupyter.org/node-purpose", + operator="In", + values=["user"], + ) + ], + ) + if match_node_purpose == "prefer": + c.KubeSpawner.node_affinity_preferred.append( + dict( + weight=100, + preference=node_selector, + ), + ) + elif match_node_purpose == "require": + c.KubeSpawner.node_affinity_required.append(node_selector) + elif match_node_purpose == "ignore": + pass + else: + raise ValueError( + f"Unrecognized value for matchNodePurpose: {match_node_purpose}" + ) + + # Combine the common tolerations for user pods with singleuser tolerations + scheduling_user_pods_tolerations = get_config("scheduling.userPods.tolerations", []) + singleuser_extra_tolerations = get_config("singleuser.extraTolerations", []) + tolerations = scheduling_user_pods_tolerations + singleuser_extra_tolerations + if tolerations: + c.KubeSpawner.tolerations = tolerations + + # Configure dynamically provisioning pvc + storage_type = get_config("singleuser.storage.type") + if storage_type == "dynamic": + pvc_name_template = get_config("singleuser.storage.dynamic.pvcNameTemplate") + if pvc_name_template: + c.KubeSpawner.pvc_name_template = pvc_name_template + volume_name_template = get_config("singleuser.storage.dynamic.volumeNameTemplate") + c.KubeSpawner.storage_pvc_ensure = True + set_config_if_not_none( + c.KubeSpawner, "storage_class", "singleuser.storage.dynamic.storageClass" + ) + set_config_if_not_none( + c.KubeSpawner, + "storage_access_modes", + "singleuser.storage.dynamic.storageAccessModes", + ) + set_config_if_not_none( + c.KubeSpawner, "storage_capacity", "singleuser.storage.capacity" + ) + + # Add volumes to singleuser pods + c.KubeSpawner.volumes = [ + { + "name": volume_name_template, + "persistentVolumeClaim": {"claimName": "{pvc_name}"}, + } + ] + c.KubeSpawner.volume_mounts = [ + { + "mountPath": get_config("singleuser.storage.homeMountPath"), + "name": volume_name_template, + "subPath": get_config("singleuser.storage.dynamic.subPath"), + } + ] + elif storage_type == "static": + pvc_claim_name = get_config("singleuser.storage.static.pvcName") + c.KubeSpawner.volumes = [ + {"name": "home", "persistentVolumeClaim": {"claimName": pvc_claim_name}} + ] + + c.KubeSpawner.volume_mounts = [ + { + "mountPath": get_config("singleuser.storage.homeMountPath"), + "name": "home", + "subPath": get_config("singleuser.storage.static.subPath"), + } + ] + + # Inject singleuser.extraFiles as volumes and volumeMounts with data loaded from + # the dedicated k8s Secret prepared to hold the extraFiles actual content. 
+ extra_files = get_config("singleuser.extraFiles", {}) + if extra_files: + volume = { + "name": "files", + } + items = [] + for file_key, file_details in extra_files.items(): + # Each item is a mapping of a key in the k8s Secret to a path in this + # abstract volume, the goal is to enable us to set the mode / + # permissions only though so we don't change the mapping. + item = { + "key": file_key, + "path": file_key, + } + if "mode" in file_details: + item["mode"] = file_details["mode"] + items.append(item) + volume["secret"] = { + "secretName": get_name("singleuser"), + "items": items, + } + c.KubeSpawner.volumes.append(volume) + + volume_mounts = [] + for file_key, file_details in extra_files.items(): + volume_mounts.append( + { + "mountPath": file_details["mountPath"], + "subPath": file_key, + "name": "files", + } + ) + c.KubeSpawner.volume_mounts.extend(volume_mounts) + + # Inject extraVolumes / extraVolumeMounts + c.KubeSpawner.volumes.extend(get_config("singleuser.storage.extraVolumes", [])) + c.KubeSpawner.volume_mounts.extend( + get_config("singleuser.storage.extraVolumeMounts", []) + ) + + c.JupyterHub.services = [] + c.JupyterHub.load_roles = [] + + # jupyterhub-idle-culler's permissions are scoped to what it needs only, see + # https://github.com/jupyterhub/jupyterhub-idle-culler#permissions. + # + if get_config("cull.enabled", False): + jupyterhub_idle_culler_role = { + "name": "jupyterhub-idle-culler", + "scopes": [ + "list:users", + "read:users:activity", + "read:servers", + "delete:servers", + # "admin:users", # dynamically added if --cull-users is passed + ], + # assign the role to a jupyterhub service, so it gains these permissions + "services": ["jupyterhub-idle-culler"], + } + + cull_cmd = ["python3", "-m", "jupyterhub_idle_culler"] + base_url = c.JupyterHub.get("base_url", "/") + cull_cmd.append("--url=http://localhost:8081" + url_path_join(base_url, "hub/api")) + + cull_timeout = get_config("cull.timeout") + if cull_timeout: + cull_cmd.append(f"--timeout={cull_timeout}") + + cull_every = get_config("cull.every") + if cull_every: + cull_cmd.append(f"--cull-every={cull_every}") + + cull_concurrency = get_config("cull.concurrency") + if cull_concurrency: + cull_cmd.append(f"--concurrency={cull_concurrency}") + + if get_config("cull.users"): + cull_cmd.append("--cull-users") + jupyterhub_idle_culler_role["scopes"].append("admin:users") + + if not get_config("cull.adminUsers"): + cull_cmd.append("--cull-admin-users=false") + + if get_config("cull.removeNamedServers"): + cull_cmd.append("--remove-named-servers") + + cull_max_age = get_config("cull.maxAge") + if cull_max_age: + cull_cmd.append(f"--max-age={cull_max_age}") + + c.JupyterHub.services.append( + { + "name": "jupyterhub-idle-culler", + "command": cull_cmd, + } + ) + c.JupyterHub.load_roles.append(jupyterhub_idle_culler_role) + + for key, service in get_config("hub.services", {}).items(): + # c.JupyterHub.services is a list of dicts, but + # hub.services is a dict of dicts to make the config mergable + service.setdefault("name", key) + + # As the api_token could be exposed in hub.existingSecret, we need to read + # it it from there or fall back to the chart managed k8s Secret's value. 
+ service.pop("apiToken", None) + service["api_token"] = get_secret_value(f"hub.services.{key}.apiToken") + + c.JupyterHub.services.append(service) + + for key, role in get_config("hub.loadRoles", {}).items(): + # c.JupyterHub.load_roles is a list of dicts, but + # hub.loadRoles is a dict of dicts to make the config mergable + role.setdefault("name", key) + + c.JupyterHub.load_roles.append(role) + + # respect explicit null command (distinct from unspecified) + # this avoids relying on KubeSpawner.cmd's default being None + _unspecified = object() + specified_cmd = get_config("singleuser.cmd", _unspecified) + if specified_cmd is not _unspecified: + c.Spawner.cmd = specified_cmd + + set_config_if_not_none(c.Spawner, "default_url", "singleuser.defaultUrl") + + cloud_metadata = get_config("singleuser.cloudMetadata") + + if cloud_metadata.get("blockWithIptables") == True: + # Use iptables to block access to cloud metadata by default + network_tools_image_name = get_config("singleuser.networkTools.image.name") + network_tools_image_tag = get_config("singleuser.networkTools.image.tag") + network_tools_resources = get_config("singleuser.networkTools.resources") + ip = cloud_metadata["ip"] + ip_block_container = client.V1Container( + name="block-cloud-metadata", + image=f"{network_tools_image_name}:{network_tools_image_tag}", + command=[ + "iptables", + "--append", + "OUTPUT", + "--protocol", + "tcp", + "--destination", + ip, + "--destination-port", + "80", + "--jump", + "DROP", + ], + security_context=client.V1SecurityContext( + privileged=True, + run_as_user=0, + capabilities=client.V1Capabilities(add=["NET_ADMIN"]), + ), + resources=network_tools_resources, + ) + + c.KubeSpawner.init_containers.append(ip_block_container) + + + if get_config("debug.enabled", False): + c.JupyterHub.log_level = "DEBUG" + c.Spawner.debug = True + + # load potentially seeded secrets + # + # NOTE: ConfigurableHTTPProxy.auth_token is set through an environment variable + # that is set using the chart managed secret. + c.JupyterHub.cookie_secret = get_secret_value("hub.config.JupyterHub.cookie_secret") + # NOTE: CryptKeeper.keys should be a list of strings, but we have encoded as a + # single string joined with ; in the k8s Secret. + # + c.CryptKeeper.keys = get_secret_value("hub.config.CryptKeeper.keys").split(";") + + # load hub.config values, except potentially seeded secrets already loaded + for app, cfg in get_config("hub.config", {}).items(): + if app == "JupyterHub": + cfg.pop("proxy_auth_token", None) + cfg.pop("cookie_secret", None) + cfg.pop("services", None) + elif app == "ConfigurableHTTPProxy": + cfg.pop("auth_token", None) + elif app == "CryptKeeper": + cfg.pop("keys", None) + c[app].update(cfg) + + # load /usr/local/etc/jupyterhub/jupyterhub_config.d config files + config_dir = "/usr/local/etc/jupyterhub/jupyterhub_config.d" + if os.path.isdir(config_dir): + for file_path in sorted(glob.glob(f"{config_dir}/*.py")): + file_name = os.path.basename(file_path) + print(f"Loading {config_dir} config: {file_name}") + with open(file_path) as f: + file_content = f.read() + # compiling makes debugging easier: https://stackoverflow.com/a/437857 + exec(compile(source=file_content, filename=file_name, mode="exec")) + + # execute hub.extraConfig entries + for key, config_py in sorted(get_config("hub.extraConfig", {}).items()): + print(f"Loading extra config: {key}") + exec(config_py) + z2jh.py: | + """ + Utility methods for use in jupyterhub_config.py and dynamic subconfigs. 
+ + Methods here can be imported by extraConfig in values.yaml + """ + + import os + from collections.abc import Mapping + from functools import lru_cache + + import yaml + + + # memoize so we only load config once + @lru_cache + def _load_config(): + """Load the Helm chart configuration used to render the Helm templates of + the chart from a mounted k8s Secret, and merge in values from an optionally + mounted secret (hub.existingSecret).""" + + cfg = {} + for source in ("secret/values.yaml", "existing-secret/values.yaml"): + path = f"/usr/local/etc/jupyterhub/{source}" + if os.path.exists(path): + print(f"Loading {path}") + with open(path) as f: + values = yaml.safe_load(f) + cfg = _merge_dictionaries(cfg, values) + else: + print(f"No config at {path}") + return cfg + + + @lru_cache + def _get_config_value(key): + """Load value from the k8s ConfigMap given a key.""" + + path = f"/usr/local/etc/jupyterhub/config/{key}" + if os.path.exists(path): + with open(path) as f: + return f.read() + else: + raise Exception(f"{path} not found!") + + + @lru_cache + def get_secret_value(key, default="never-explicitly-set"): + """Load value from the user managed k8s Secret or the default k8s Secret + given a key.""" + + for source in ("existing-secret", "secret"): + path = f"/usr/local/etc/jupyterhub/{source}/{key}" + if os.path.exists(path): + with open(path) as f: + return f.read() + if default != "never-explicitly-set": + return default + raise Exception(f"{key} not found in either k8s Secret!") + + + def get_name(name): + """Returns the fullname of a resource given its short name""" + return _get_config_value(name) + + + def get_name_env(name, suffix=""): + """Returns the fullname of a resource given its short name along with a + suffix, converted to uppercase with dashes replaced with underscores. This + is useful to reference named services associated environment variables, such + as PROXY_PUBLIC_SERVICE_PORT.""" + env_key = _get_config_value(name) + suffix + env_key = env_key.upper().replace("-", "_") + return os.environ[env_key] + + + def _merge_dictionaries(a, b): + """Merge two dictionaries recursively. + + Simplified From https://stackoverflow.com/a/7205107 + """ + merged = a.copy() + for key in b: + if key in a: + if isinstance(a[key], Mapping) and isinstance(b[key], Mapping): + merged[key] = _merge_dictionaries(a[key], b[key]) + else: + merged[key] = b[key] + else: + merged[key] = b[key] + return merged + + + def get_config(key, default=None): + """ + Find a config item of a given name & return it + + Parses everything as YAML, so lists and dicts are available too + + get_config("a.b.c") returns config['a']['b']['c'] + """ + value = _load_config() + # resolve path in yaml + for level in key.split("."): + if not isinstance(value, dict): + # a parent is a scalar or null, + # can't resolve full path + return default + if level not in value: + return default + else: + value = value[level] + return value + + + def set_config_if_not_none(cparent, name, key): + """ + Find a config item of a given name, set the corresponding Jupyter + configuration item if not None + """ + data = get_config(key) + if data is not None: + setattr(cparent, name, data) + checksum_hook-image-puller: "f42af7083fb07c4f701f205f9f7b02510d6ffd8db625a75bd2cc75ccfd72028e" +--- +# Source: jupyterhub/templates/proxy/autohttps/configmap.yaml +# This configmap contains Traefik configuration files to be mounted. 
+# - traefik.yaml will only be read during startup (static configuration) +# - dynamic.yaml will be read on change (dynamic configuration) +# +# ref: https://docs.traefik.io/getting-started/configuration-overview/ +# +# The configuration files are first rendered with Helm templating to large YAML +# strings. Then we use the fromYAML function on these strings to get an object, +# that we in turn merge with user provided extra configuration. +# +kind: ConfigMap +apiVersion: v1 +metadata: + name: autohttps + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +data: + traefik.yaml: | + accessLog: + fields: + headers: + names: + Authorization: redacted + Cookie: redacted + Set-Cookie: redacted + X-Xsrftoken: redacted + filters: + statusCodes: + - 500-599 + certificatesResolvers: + default: + acme: + caServer: https://acme-v02.api.letsencrypt.org/directory + email: you@email.com + httpChallenge: + entryPoint: http + storage: /etc/acme/acme.json + entryPoints: + http: + address: :8080 + https: + address: :8443 + transport: + respondingTimeouts: + idleTimeout: 10m0s + log: + level: WARN + providers: + file: + filename: /etc/traefik/dynamic.yaml + dynamic.yaml: | + http: + middlewares: + hsts: + headers: + stsIncludeSubdomains: false + stsPreload: false + stsSeconds: 15724800 + redirect: + redirectScheme: + permanent: true + scheme: https + scheme: + headers: + customRequestHeaders: + X-Scheme: https + routers: + default: + entrypoints: + - https + middlewares: + - hsts + - scheme + rule: PathPrefix(`/`) + service: default + tls: + certResolver: default + domains: + - main: software.llnl.gov + options: default + insecure: + entrypoints: + - http + middlewares: + - redirect + rule: PathPrefix(`/`) + service: default + services: + default: + loadBalancer: + servers: + - url: http://proxy-http:8000/ + tls: + options: + default: + cipherSuites: + - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 + - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 + - TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 + - TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 + - TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305 + minVersion: VersionTLS12 + sniStrict: true +--- +# Source: jupyterhub/templates/scheduling/user-scheduler/configmap.yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: user-scheduler + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +data: + config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + resourceLock: leases + resourceName: user-scheduler-lock + resourceNamespace: "default" + profiles: + - schedulerName: hpdc-2025-pave-dry-run-jupyter-user-scheduler + plugins: + score: + disabled: + - name: NodeResourcesBalancedAllocation + - name: NodeAffinity + - name: InterPodAffinity + - name: NodeResourcesFit + - name: ImageLocality + enabled: + - name: NodeAffinity + weight: 14631 + - name: InterPodAffinity + weight: 1331 + - name: 
NodeResourcesFit + weight: 121 + - name: ImageLocality + weight: 11 + pluginConfig: + - args: + scoringStrategy: + resources: + - name: cpu + weight: 1 + - name: memory + weight: 1 + type: MostAllocated + name: NodeResourcesFit +--- +# Source: jupyterhub/templates/hub/pvc.yaml +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: hub-db-dir + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + storageClassName: "gp3" + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "32Gi" +--- +# Source: jupyterhub/templates/scheduling/user-scheduler/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: hpdc-2025-pave-dry-run-jupyter-user-scheduler + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +rules: + # Copied from the system:kube-scheduler ClusterRole of the k8s version + # matching the kube-scheduler binary we use. A modification has been made to + # resourceName fields to remain relevant for how we have named our resources + # in this Helm chart. + # + # NOTE: These rules have been: + # - unchanged between 1.12 and 1.15 + # - changed in 1.16 + # - changed in 1.17 + # - unchanged between 1.18 and 1.20 + # - changed in 1.21: get/list/watch permission for namespace, + # csidrivers, csistoragecapacities was added. 
+ # - unchanged between 1.22 and 1.27 + # - changed in 1.28: permissions to get/update lock endpoint resource + # removed + # - unchanged between 1.28 and 1.30 + # - (1.31 is known to bring some changes below) + # + # ref: https://github.com/kubernetes/kubernetes/blob/v1.30.0/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml#L721-L862 + - apiGroups: + - "" + - events.k8s.io + resources: + - events + verbs: + - create + - patch + - update + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - apiGroups: + - coordination.k8s.io + resourceNames: + - user-scheduler-lock + resources: + - leases + verbs: + - get + - update + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - watch + - apiGroups: + - "" + resources: + - bindings + - pods/binding + verbs: + - create + - apiGroups: + - "" + resources: + - pods/status + verbs: + - patch + - update + - apiGroups: + - "" + resources: + - replicationcontrollers + - services + verbs: + - get + - list + - watch + - apiGroups: + - apps + - extensions + resources: + - replicasets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + verbs: + - get + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - persistentvolumeclaims + - persistentvolumes + verbs: + - get + - list + - watch + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + - apiGroups: + - storage.k8s.io + resources: + - csinodes + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - csidrivers + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - csistoragecapacities + verbs: + - get + - list + - watch + + # Copied from the system:volume-scheduler ClusterRole of the k8s version + # matching the kube-scheduler binary we use. + # + # NOTE: These rules have not changed between 1.12 and 1.29. 
+ # + # ref: https://github.com/kubernetes/kubernetes/blob/v1.29.0/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml#L1283-L1310 + - apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - get + - list + - patch + - update + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - get + - list + - patch + - update + - watch +--- +# Source: jupyterhub/templates/scheduling/user-scheduler/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: hpdc-2025-pave-dry-run-jupyter-user-scheduler + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +subjects: + - kind: ServiceAccount + name: user-scheduler + namespace: "default" +roleRef: + kind: ClusterRole + name: hpdc-2025-pave-dry-run-jupyter-user-scheduler + apiGroup: rbac.authorization.k8s.io +--- +# Source: jupyterhub/templates/hub/rbac.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods", "persistentvolumeclaims", "secrets", "services"] + verbs: ["get", "watch", "list", "create", "delete"] + - apiGroups: [""] # "" indicates the core API group + resources: ["events"] + verbs: ["get", "watch", "list"] +--- +# Source: jupyterhub/templates/proxy/autohttps/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: autohttps + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "patch", "list", "create"] +--- +# Source: jupyterhub/templates/hub/rbac.yaml +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +subjects: + - kind: ServiceAccount + name: hub + namespace: "default" +roleRef: + kind: Role + name: hub + apiGroup: rbac.authorization.k8s.io +--- +# Source: jupyterhub/templates/proxy/autohttps/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: autohttps + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: 
jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +subjects: +- kind: ServiceAccount + name: autohttps + apiGroup: +roleRef: + kind: Role + name: autohttps + apiGroup: rbac.authorization.k8s.io +--- +# Source: jupyterhub/templates/hub/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: /benchpark/hpdc-25-tutorial/hub/metrics + prometheus.io/port: "8081" +spec: + type: ClusterIP + selector: + component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - name: hub + port: 8081 + targetPort: http +--- +# Source: jupyterhub/templates/proxy/autohttps/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: proxy-http + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + selector: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8000 + targetPort: http +--- +# Source: jupyterhub/templates/proxy/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: proxy-api + labels: + component: proxy-api + app.kubernetes.io/component: proxy-api + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + selector: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - port: 8001 + targetPort: api +--- +# Source: jupyterhub/templates/proxy/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: proxy-public + labels: + component: proxy-public + app.kubernetes.io/component: proxy-public + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + selector: + # This service will target the autohttps pod if autohttps is configured, and + # the proxy pod if not. When autohttps is configured, the service proxy-http + # will be around to target the proxy pod directly. + component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + ports: + - name: https + port: 443 + # When HTTPS termination is handled outside our helm chart, pass traffic + # coming in via this Service's port 443 to targeted pod's port meant for + # HTTP traffic. 
+ targetPort: https + - name: http + port: 80 + targetPort: http + type: LoadBalancer +--- +# Source: jupyterhub/templates/image-puller/daemonset-continuous.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: continuous-image-puller + labels: + component: continuous-image-puller + app.kubernetes.io/component: continuous-image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + component: continuous-image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 100% + template: + metadata: + labels: + component: continuous-image-puller + app.kubernetes.io/component: continuous-image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + spec: + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + terminationGracePeriodSeconds: 0 + serviceAccountName: continuous-image-puller + automountServiceAccountToken: false + initContainers: + - name: image-pull-metadata-block + image: quay.io/jupyterhub/k8s-network-tools:4.2.0 + command: + - /bin/sh + - -c + - echo "Pulling complete" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + - name: image-pull-singleuser + image: ghcr.io/llnl/reproducible-benchmarking-spawn:hpdc-2025 + command: + - /bin/sh + - -c + - echo "Pulling complete" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + - name: image-pull-singleuser-init-and-extra-containers-0 + image: ghcr.io/llnl/reproducible-benchmarking-init:hpdc-2025 + command: + - /bin/sh + - -c + - echo "Pulling complete" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: pause + image: registry.k8s.io/pause:3.10 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault +--- +# Source: jupyterhub/templates/hub/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: hub + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + strategy: + type: Recreate + template: + metadata: + labels: + component: hub + app.kubernetes.io/component: hub + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: 
"jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + hub.jupyter.org/network-access-proxy-api: "true" + hub.jupyter.org/network-access-proxy-http: "true" + hub.jupyter.org/network-access-singleuser: "true" + annotations: + checksum/config-map: 8c5c098976319e4bd0037e3665086e3c0974a958c111a33527a4a152a85db745 + checksum/secret: 4899d6a61388434958df3de285b5cba270ff90de21421a067d1f3ee9392eabd3 + spec: + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: core + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: core + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: [core] + volumes: + - name: config + configMap: + name: hub + - name: secret + secret: + secretName: hub + - name: pvc + persistentVolumeClaim: + claimName: hub-db-dir + serviceAccountName: hub + securityContext: + fsGroup: 1000 + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: hub + image: jupyterhub/k8s-hub:4.2.0 + args: + - jupyterhub + - --config + - /usr/local/etc/jupyterhub/jupyterhub_config.py + - --upgrade-db + volumeMounts: + - mountPath: /usr/local/etc/jupyterhub/jupyterhub_config.py + subPath: jupyterhub_config.py + name: config + - mountPath: /usr/local/etc/jupyterhub/z2jh.py + subPath: z2jh.py + name: config + - mountPath: /usr/local/etc/jupyterhub/config/ + name: config + - mountPath: /usr/local/etc/jupyterhub/secret/ + name: secret + - mountPath: /srv/jupyterhub + name: pvc + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 1000 + runAsUser: 1000 + env: + - name: PYTHONUNBUFFERED + value: "1" + - name: HELM_RELEASE_NAME + value: "hpdc-2025-pave-dry-run-jupyter" + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIGPROXY_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: hub + key: hub.config.ConfigurableHTTPProxy.auth_token + ports: + - name: http + containerPort: 8081 + livenessProbe: + initialDelaySeconds: 300 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 30 + httpGet: + path: /benchpark/hpdc-25-tutorial/hub/health + port: http + readinessProbe: + initialDelaySeconds: 0 + periodSeconds: 2 + timeoutSeconds: 1 + failureThreshold: 1000 + httpGet: + path: /benchpark/hpdc-25-tutorial/hub/health + port: http +--- +# Source: jupyterhub/templates/proxy/autohttps/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: autohttps + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + template: + metadata: + labels: + component: autohttps + app.kubernetes.io/component: autohttps + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + hub.jupyter.org/network-access-proxy-http: "true" + annotations: + # Only force a restart through a change to this checksum when the static + # 
configuration is changed, as the dynamic can be updated after start. + # Any disruptions to this deployment impacts everything, it is the + # entrypoint of all network traffic. + checksum/static-config: c706b8ce4f21b269498a41a636e284516df84b61952e6bb1f1366fc84188f5e7 + spec: + serviceAccountName: autohttps + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: core + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: core + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: [core] + volumes: + - name: certificates + emptyDir: {} + - name: traefik-config + configMap: + name: autohttps + initContainers: + - name: load-acme + image: "quay.io/jupyterhub/k8s-secret-sync:4.2.0" + args: + - load + - proxy-public-tls-acme + - acme.json + - /etc/acme/acme.json + env: + # We need this to get logs immediately + - name: PYTHONUNBUFFERED + value: "True" + volumeMounts: + - name: certificates + mountPath: /etc/acme + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: traefik + image: "traefik:v3.3.5" + ports: + - name: http + containerPort: 8080 + - name: https + containerPort: 8443 + volumeMounts: + - name: traefik-config + mountPath: /etc/traefik + - name: certificates + mountPath: /etc/acme + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + - name: secret-sync + image: "quay.io/jupyterhub/k8s-secret-sync:4.2.0" + args: + - watch-save + - --label=app.kubernetes.io/name=jupyterhub + - --label=app.kubernetes.io/instance=hpdc-2025-pave-dry-run-jupyter + - --label=helm.sh/chart=jupyterhub-4.2.0 + - --label=app.kubernetes.io/managed-by=secret-sync + - proxy-public-tls-acme + - acme.json + - /etc/acme/acme.json + env: + # We need this to get logs immediately + - name: PYTHONUNBUFFERED + value: "True" + volumeMounts: + - name: certificates + mountPath: /etc/acme + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault +--- +# Source: jupyterhub/templates/proxy/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: proxy + labels: + component: proxy + app.kubernetes.io/component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + strategy: + rollingUpdate: null + type: Recreate + template: + metadata: + labels: + component: proxy + app.kubernetes.io/component: proxy + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + hub.jupyter.org/network-access-hub: "true" + hub.jupyter.org/network-access-singleuser: "true" + annotations: + # We want to restart proxy only if the auth token changes + # Other 
changes to the hub config should not restart. + # We truncate to 4 chars to avoid leaking auth token info, + # since someone could brute force the hash to obtain the token + # + # Note that if auth_token has to be generated at random, it will be + # generated at random here separately from being generated at random in + # the k8s Secret template. This will cause this annotation to change to + # match the k8s Secret during the first upgrade following an auth_token + # was generated. + checksum/auth-token: "6c4e" + checksum/proxy-secret: "01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b" + spec: + terminationGracePeriodSeconds: 60 + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: core + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: core + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: [core] + containers: + - name: chp + image: quay.io/jupyterhub/configurable-http-proxy:4.6.3 + command: + - configurable-http-proxy + - "--ip=" + - "--api-ip=" + - --api-port=8001 + - --default-target=http://hub:$(HUB_SERVICE_PORT) + - --error-target=http://hub:$(HUB_SERVICE_PORT)/hub/error + - --port=8000 + env: + - name: CONFIGPROXY_AUTH_TOKEN + valueFrom: + secretKeyRef: + # NOTE: References the chart managed k8s Secret even if + # hub.existingSecret is specified to avoid using the + # lookup function on the user managed k8s Secret. + name: hub + key: hub.config.ConfigurableHTTPProxy.auth_token + ports: + - name: http + containerPort: 8000 + - name: api + containerPort: 8001 + livenessProbe: + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 30 + httpGet: + path: /_chp_healthz + port: http + scheme: HTTP + readinessProbe: + initialDelaySeconds: 0 + periodSeconds: 2 + timeoutSeconds: 1 + failureThreshold: 1000 + httpGet: + path: /_chp_healthz + port: http + scheme: HTTP + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault +--- +# Source: jupyterhub/templates/scheduling/user-scheduler/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: user-scheduler + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + template: + metadata: + labels: + component: user-scheduler + app.kubernetes.io/component: user-scheduler + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + annotations: + checksum/config-map: c2c27729f1dec95364acaeb8b801d26eb1488c087348dd2c317a3354315415f2 + spec: + + serviceAccountName: user-scheduler + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: core + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: core + affinity: + nodeAffinity: + 
preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: [core] + volumes: + - name: config + configMap: + name: user-scheduler + containers: + - name: kube-scheduler + image: registry.k8s.io/kube-scheduler:v1.30.11 + command: + - /usr/local/bin/kube-scheduler + # NOTE: --authentication-skip-lookup=true is used to avoid a + # seemingly harmless error, if we need to not skip + # "authentication lookup" in the future, see the linked issue. + # + # ref: https://github.com/jupyterhub/zero-to-jupyterhub-k8s/issues/1894 + - --config=/etc/user-scheduler/config.yaml + - --authentication-skip-lookup=true + - --v=4 + volumeMounts: + - mountPath: /etc/user-scheduler + name: config + livenessProbe: + httpGet: + path: /healthz + scheme: HTTPS + port: 10259 + initialDelaySeconds: 15 + readinessProbe: + httpGet: + path: /healthz + scheme: HTTPS + port: 10259 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault +--- +# Source: jupyterhub/templates/scheduling/user-placeholder/statefulset.yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: user-placeholder + labels: + component: user-placeholder + app.kubernetes.io/component: user-placeholder + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + podManagementPolicy: Parallel + replicas: 0 + selector: + matchLabels: + component: user-placeholder + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + serviceName: user-placeholder + template: + metadata: + labels: + component: user-placeholder + app.kubernetes.io/component: user-placeholder + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + spec: + schedulerName: hpdc-2025-pave-dry-run-jupyter-user-scheduler + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: [user] + terminationGracePeriodSeconds: 0 + automountServiceAccountToken: false + containers: + - name: pause + image: registry.k8s.io/pause:3.10 + resources: + requests: + cpu: 32 + memory: 64G + limits: + cpu: 32 + memory: 64G + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault +--- +# Source: jupyterhub/templates/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: jupyterhub + labels: + component: ingress + app.kubernetes.io/component: ingress + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm +spec: + rules: + - http: + paths: + - path: 
/benchpark/hpdc-25-tutorial/ + pathType: Prefix + backend: + service: + name: proxy-public + port: + name: http + host: "software.llnl.gov" +--- +# Source: jupyterhub/templates/image-puller/serviceaccount-hook.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: hook-image-puller + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + hub.jupyter.org/deletable: "true" + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-weight": "-10" +--- +# Source: jupyterhub/templates/image-puller/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: hook-image-awaiter + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + hub.jupyter.org/deletable: "true" + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-weight": "0" +--- +# Source: jupyterhub/templates/image-puller/rbac.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: hook-image-awaiter + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + hub.jupyter.org/deletable: "true" + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-weight": "0" +rules: + - apiGroups: ["apps"] # "" indicates the core API group + resources: ["daemonsets"] + verbs: ["get"] +--- +# Source: jupyterhub/templates/image-puller/rbac.yaml +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: hook-image-awaiter + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + hub.jupyter.org/deletable: "true" + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-weight": "0" +subjects: + - kind: ServiceAccount + name: hook-image-awaiter + namespace: "default" +roleRef: + kind: Role + name: hook-image-awaiter + apiGroup: rbac.authorization.k8s.io +--- +# Source: jupyterhub/templates/image-puller/daemonset-hook.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: hook-image-puller + labels: + component: hook-image-puller + app.kubernetes.io/component: hook-image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: 
jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + hub.jupyter.org/deletable: "true" + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-weight": "-10" +spec: + selector: + matchLabels: + component: hook-image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 100% + template: + metadata: + labels: + component: hook-image-puller + app.kubernetes.io/component: hook-image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + spec: + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + terminationGracePeriodSeconds: 0 + serviceAccountName: hook-image-puller + automountServiceAccountToken: false + initContainers: + - name: image-pull-metadata-block + image: quay.io/jupyterhub/k8s-network-tools:4.2.0 + command: + - /bin/sh + - -c + - echo "Pulling complete" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + - name: image-pull-singleuser + image: ghcr.io/llnl/reproducible-benchmarking-spawn:hpdc-2025 + command: + - /bin/sh + - -c + - echo "Pulling complete" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + - name: image-pull-singleuser-init-and-extra-containers-0 + image: ghcr.io/llnl/reproducible-benchmarking-init:hpdc-2025 + command: + - /bin/sh + - -c + - echo "Pulling complete" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: pause + image: registry.k8s.io/pause:3.10 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault +--- +# Source: jupyterhub/templates/image-puller/job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: hook-image-awaiter + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + chart: jupyterhub-4.2.0 + heritage: Helm + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + helm.sh/chart: jupyterhub-4.2.0 + app.kubernetes.io/managed-by: Helm + hub.jupyter.org/deletable: "true" + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-weight": "10" +spec: + template: + # The hook-image-awaiter Job and hook-image-puller DaemonSet was + # conditionally created based on this state: + # + # prePuller.hook.enabled=true + # prePuller.hook.pullOnlyOnChanges=true + # post-upgrade checksum != pre-upgrade checksum (of the hook-image-puller DaemonSet) + # 
"f42af7083fb07c4f701f205f9f7b02510d6ffd8db625a75bd2cc75ccfd72028e" != "" + # + metadata: + labels: + component: image-puller + app.kubernetes.io/component: image-puller + app: "jupyterhub" + release: "hpdc-2025-pave-dry-run-jupyter" + app.kubernetes.io/name: "jupyterhub" + app.kubernetes.io/instance: "hpdc-2025-pave-dry-run-jupyter" + spec: + restartPolicy: Never + serviceAccountName: hook-image-awaiter + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: core + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: core + containers: + - image: quay.io/jupyterhub/k8s-image-awaiter:4.2.0 + name: hook-image-awaiter + command: + - /image-awaiter + - -ca-path=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - -auth-token-path=/var/run/secrets/kubernetes.io/serviceaccount/token + - -api-server-address=https://kubernetes.default.svc:$(KUBERNETES_SERVICE_PORT) + - -namespace=default + - -daemonset=hook-image-puller + - -pod-scheduling-wait-duration=10 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault diff --git a/2025-HPDC/infrastructure/https_test/update_jupyterhub_deployment.sh b/2025-HPDC/infrastructure/https_test/update_jupyterhub_deployment.sh new file mode 100755 index 0000000..68c3306 --- /dev/null +++ b/2025-HPDC/infrastructure/https_test/update_jupyterhub_deployment.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm upgrade hpdc-2025-pave-dry-run-jupyter jupyterhub/jupyterhub --values ./helm-config.yaml + +echo "The JupyterHub deployment is updated!" \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/README.md b/2025-HPDC/infrastructure/pave-dry-run/README.md new file mode 100644 index 0000000..c7fd67b --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/README.md @@ -0,0 +1,114 @@ +# Deploy hpdc-2025-pave-dry-run to AWS Elastic Kubernetes Service (EKS) + +These config files and scripts can be used to deploy the hpdc-2025-pave-dry-run tutorial to EKS. + +The sections below walk you through the steps to deploying your cluster. All commands in these +sections should be run from the same directory as this README. + +## Step 1: Create EKS cluster + +To create an EKS cluster with your configured settings, run the following: + +```bash +$ ./create_cluster.sh +``` + +Be aware that this step can take upwards of 15-30 minutes to complete. + +## Step 2: Configure Kubernetes within the EKS cluster + +After creating the cluster, we need to configure Kubernetes and its addons. In particular, +we need to setup the Kubernetes autoscaler, which will allow our tutorial to scale to as +many users as our cluster's resources can possibly handle. + +To configure Kubernetes and the autoscaler, run the following: + +```bash +$ ./configure_kubernetes.sh +``` + +## Step 3: Deploy JupyterHub to the EKS cluster + +With the cluster properly created and configured, we now can deploy JupyterHub to the cluster +to manage everything else about our tutorial. 
+ +To deploy JupyterHub, run the following: + +```bash +$ ./deploy_jupyterhub.sh +``` + +## Step 4: Verify that everything is working + +After deploying JupyterHub, we need to make sure that all the necessary components +are working properly. + +To check this, run the following: + +```bash +$ ./check_jupyterhub_status.sh +``` + +If everything worked properly, you should see an output like this: + +``` +NAME READY STATUS RESTARTS AGE +continuous-image-puller-2gqrw 1/1 Running 0 30s +continuous-image-puller-gb7mj 1/1 Running 0 30s +hub-8446c9d589-vgjlw 1/1 Running 0 30s +proxy-7d98df9f7-s5gft 1/1 Running 0 30s +user-scheduler-668ff95ccf-fw6wv 1/1 Running 0 30s +user-scheduler-668ff95ccf-wq5xp 1/1 Running 0 30s +``` + +Be aware that the hub pod (i.e., hub-8446c9d589-vgjlw above) may take a minute or so to start. + +If something went wrong, you will have to edit the config YAML files to get things working. Before +trying to work things out yourself, check the FAQ to see if your issue has already been addressed. + +Depending on which file you edit, you may have to run different commands to update the EKS cluster and +deployment of JupyterHub. Follow the steps below to update: +1. If you only edited `helm-config.yaml`, try to just update the deployment of JupyterHub by running `./update_jupyterhub_deployment.sh` +2. If step 1 failed, fully tear down the JupyterHub deployment with `./tear_down_jupyterhub.sh` and then re-deploy it with `./deploy_jupyterhub.sh` +3. If you edited `cluster-autoscaler.yaml` or `storage-class.yaml`, tear down the JupyterHub deployment with `./tear_down_jupyterhub.sh`. Then, reconfigure Kubernetes with `./configure_kubernetes.sh`, and re-deploy JupyterHub with `./deploy_jupyterhub.sh` +4. If you edited `eksctl-config.yaml`, fully tear down the cluster with `cleanup.sh`, and then restart from the top of this README + +## Step 5: Get the public cluster URL + +Now that everything's ready to go, we need to get the public URL to the cluster. + +To do this, run the following: + +```bash +$ ./get_jupyterhub_url.sh +``` + +Note that it can take several minutes after the URL is available for it to actually redirect +to JupyterHub. + +## Step 6: Distribute URL and password to attendees + +Now that we have our public URL, we can give the attendees everything they need to join the tutorial. + +For attendees to access JupyterHub, they simply need to enter the public URL (from step 5) in their browser of choice. +This will take them to a login page. The login credentials are as follows: +* Username: anything the attendee wants (note: this should be unique for every user. Otherwise, users will share pods.) +* Password: the password specified towards the top of `helm-config.yaml` + +Once the attendees log in with these credentials, the Kubernetes autoscaler will spin up a pod for them (and grab new +resources, if needed). This pod will contain a JupyterLab instance with the tutorial materials and environment already +prepared for them. + +At this point, you can start presenting your interactive tutorial! + +## Step 7: Clean up everything + +Once you are done with your tutorial, you should clean up everything so that there are no ongoing, unnecessary expenses +to your AWS account. To do this, simply run the following: + +```bash +$ ./cleanup.sh +``` + +After cleaning everything up, you can verify that everything has been removed by going to the AWS web console +and ensuring nothing from your tutorial still exists in CloudFormation and EKS.
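+ +As an optional extra check (a minimal sketch, assuming the AWS CLI is installed and configured for the same account and for the `us-west-1` region used in `eksctl-config.yaml`), you can do the same verification from the terminal: + +```bash +# List any EKS clusters still present in the tutorial's region +$ aws eks list-clusters --region us-west-1 + +# List CloudFormation stacks that have not been deleted yet +$ aws cloudformation list-stacks --region us-west-1 --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE DELETE_FAILED +``` + +If both commands come back empty (or show nothing named after the tutorial), the teardown is complete.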
\ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/check_hub_log.sh b/2025-HPDC/infrastructure/pave-dry-run/check_hub_log.sh new file mode 100755 index 0000000..1c13e91 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/check_hub_log.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +hub_pod_id=$(kubectl get pods -n default --no-headers=true | awk '/hub/{print $1}') +kubectl logs $hub_pod_id \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/check_init_container_log.sh b/2025-HPDC/infrastructure/pave-dry-run/check_init_container_log.sh new file mode 100755 index 0000000..f4fd398 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/check_init_container_log.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if [ $# -ne 1 ]; then + echo "Usage: ./check_init_container_log.sh " + exit 1 +fi + +kubectl logs $1 -c init-tutorial-service \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/check_jupyterhub_status.sh b/2025-HPDC/infrastructure/pave-dry-run/check_jupyterhub_status.sh new file mode 100755 index 0000000..10b4261 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/check_jupyterhub_status.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl --namespace=default get pods + +echo "If there are issues with any pods, you can get more details with:" +echo " $ kubectl --namespace=default describe pod " \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/cleanup.sh b/2025-HPDC/infrastructure/pave-dry-run/cleanup.sh new file mode 100755 index 0000000..0f8222f --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/cleanup.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" 
+ echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +# Temporarily allow errors in the script so that the script won't fail +# if the JupyterHub deployment failed or was previously torn down +set +e +echo "Tearing down JupyterHub and uninstalling everything related to Helm:" +helm uninstall hpdc-2025-pave-dry-run-jupyter +set -e + +echo "" +echo "Deleting all pods from the EKS cluster:" +kubectl delete pod --all-namespaces --all --force + +echo "" +echo "Deleting the EKS cluster:" +eksctl delete cluster --config-file ./eksctl-config.yaml --wait + +echo "" +echo "Everything is now cleaned up!" \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/cluster-autoscaler.yaml b/2025-HPDC/infrastructure/pave-dry-run/cluster-autoscaler.yaml new file mode 100644 index 0000000..f525796 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/cluster-autoscaler.yaml @@ -0,0 +1,272 @@ +# The roles defined in this config file set permissions on several Kubernetes resources. +# +# Resources referred to: +# * events: resource representing information/responses generated from actions or changes taken against the cluster +# * endpoints: resource representing REST API endpoints within the cluster +# * pods/eviction: resource that terminates and removes pods when created +# * pods/status: resource used to query or edit the status of pods +# * nodes: resource representing the physical or virtual nodes of the cluster +# * namespaces: resource representing a group of isolated resources within the cluster +# * pods: resource representing a unit of computation that is deployed to a node +# * services: resource representing a networked application running in a pod and exposed over the network (either internal to the cluster or external to the broader internet) +# * replicationcontrollers: legacy resource for managing horizontal scaling (i.e., scale-out). Used for broader support across clouds +# * persistentvolumeclaims: resource representing a request for storage by a user +# * persistentvolumes: resource representing actual storage +# * replicasets: resource that creates replica pods that are used to ensure some minimum number of identical pods in the cluster +# * daemonsets: resource that ensures copies of pods are deployed to new nodes and removed from removed nodes +# * poddisruptionbudgets: resource that represents the cluster policy regarding the minimum number of pods that must remain available +# during voluntary disruptions (i.e., pod/node eviction not caused by something like hardware failure) +# * statefulsets: resource that maintains pod state +# * storageclasses: resource that describes different types of storage. Often used for things like QoS levels +# * csinodes: resource that describes a node's ability to interact with one or more storage providers. Mainly used by Kubernetes's scheduler +# * csidrivers: resource that provides information on the drivers for a single storage provider installed on a node +# * csistoragecapacities: resource that describes the available storage from different providers +# * jobs: resource that represents one-off tasks spread across one or more pods that must run to completion. Useful for certain types of setup and elasticity work +# * leases: resource that allows different pods, nodes, or kubelets (the Kubernetes daemon on a node) to lock shared resources. Think of it like a mutex +# * configmaps: resource representing non-confidential key-value pair info.
Often used to decouple environment-specific configuration from container images +--- +# Create a Service Account that will act as the internal user during the creation +# of the autoscaling infrastructure and have all the appropriate roles and permissions assigned +# to do its work +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler + name: cluster-autoscaler + namespace: kube-system +--- +# Create a ClusterRole to set permissions for associated +# users across the entire cluster +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create or partially update events and endpoints + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + # Allow associated users to evict pods + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + # Allow associated users to update pod statuses + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + # Allow associated users to get and update the state of the autoscaler + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + # Allow associated users to be notified of changes to, list, get the state of, + # and fully update information related to nodes + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + # Allow associated users to be notified of changes to, list, and get the state of + # namespaces, pods, services, replication controllers, persistent volume claims, and + # persistent volumes + - apiGroups: [""] + resources: + - "namespaces" + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to, list, and get the state of + # replica sets and daemon sets + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to and list pod disruption budgets + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + # Allow associated users to be notified of changes to, list, and get the state of + # stateful sets, replica sets, and daemon sets + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to, list, and get the state of + # all resources related to available storage + - apiGroups: ["storage.k8s.io"] + resources: + ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"] + verbs: ["watch", "list", "get"] + # Allow associated users to get the state of, list, be notified of changes to, and partially update + # jobs launched in the cluster + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + # Allow associated users to create leases + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + # Allow associated users to get the state of and fully update leases in the autoscaler + - apiGroups: ["coordination.k8s.io"] + resourceNames: ["cluster-autoscaler"] + resources: ["leases"] + verbs: ["get", "update"] +--- +# Create a Role to set permissions within the 'kube-system' namespace
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: cluster-autoscaler + # The permissions in this Role apply to the 'kube-system' namespace + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create, list, and be notified of changes to config maps + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "list", "watch"] + # Allow associated users to delete, get the state of, fully update, and be notified of + # changes to config maps in the autoscaler's status and priority-expander subresources + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - "cluster-autoscaler-status" + - "cluster-autoscaler-priority-expander" + verbs: ["delete", "get", "update", "watch"] +--- +# Grant permissions defined by the ClusterRole +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the ClusterRole named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" workspace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Grant permissions defined by the Role +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the Role named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" workspace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Define deployment rules for pods and ReplicaSets +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + app: cluster-autoscaler +spec: + replicas: 1 # Number of pods to run + # Apply to pods where the app has a label called 'app' + # with value 'cluster-autoscaler' + selector: + matchLabels: + app: cluster-autoscaler + # Definition of created pods + template: + metadata: + labels: + app: cluster-autoscaler + # Allow Prometheus to collect monitoring data over port 8085 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8085" + spec: + priorityClassName: system-cluster-critical + securityContext: + # The Kubelet must be run as a non-root user + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + # Use the default seccomp profile as specified by the + # container runtime + seccompProfile: + type: RuntimeDefault + serviceAccountName: cluster-autoscaler + # The container(s) to run within the pod. 
+ # Since we're running an autoscaler, we'll run the autoscaler + # as the pod's only container, and then we'll deploy other + # containers within the autoscaler to actually do work + containers: + # The main container for the pod will be the + # Kubernetes autoscaling container + - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.2 + name: cluster-autoscaler + resources: + # Maximum amount of compute resources allowed + limits: + cpu: 100m + memory: 600Mi + # Minimum amount of compute resources required + # Defaults to 'limits' if not specified + requests: + cpu: 100m + memory: 600Mi + command: + - ./cluster-autoscaler + - --v=4 + - --stderrthreshold=info + - --cloud-provider=aws + - --skip-nodes-with-local-storage=false + - --expander=least-waste + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/hpdc-2025-pave-dry-run + volumeMounts: + # Mount the CA SSL/TLS certificates into the container + - name: ssl-certs + mountPath: /etc/ssl/certs/ca-certificates.crt + readOnly: true + # Always pull the digest of the image from the + # container registry. If the locally cached digest is + # the same as the pulled digest, use the cached container image. + # Otherwise, pull the container from the registry + imagePullPolicy: "Always" + securityContext: + # Don't let the pod have more privileges than the + # parent process + allowPrivilegeEscalation: false + capabilities: + # Remove all capabilities + drop: + - ALL + # Root filesystem (i.e., '/') is read-only + readOnlyRootFilesystem: true + volumes: + - name: ssl-certs + hostPath: + path: "/etc/ssl/certs/ca-bundle.crt" \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/config.toml b/2025-HPDC/infrastructure/pave-dry-run/config.toml new file mode 100644 index 0000000..840d02a --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/config.toml @@ -0,0 +1,54 @@ +tutorial_name = "hpdc-2025-pave-dry-run" + +[aws.eksctl] +cluster_name = "hpdc-2025-pave-dry-run" +cluster_deployment_region = "us-west-1" +cluster_availability_zones = [ + "us-west-1a", + "us-west-1c", +] + +[[aws.eksctl.cluster_node_groups]] +zone = "us-west-1a" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 2 +min_size = 2 +max_size = 8 + +[[aws.eksctl.cluster_node_groups]] +zone = "us-west-1c" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 2 +min_size = 2 +max_size = 8 + +[aws."Kubernetes autoscaler"] +cpu_max = "100m" +memory_max = "600Mi" +cpu_min = "100m" +memory_min = "600Mi" + +[aws.Helm] +max_concurrent_users = 14 +hub_password = "butter" +hub_db_capacity = "32Gi" +ebs_storage_type = "gp3" +hub_container_image = "jupyterhub/k8s-hub" +hub_container_tag = "4.2.0" +spawner_container_image = "ghcr.io/llnl/reproducible-benchmarking-spawn" +spawner_container_tag = "hpdc-2025" +spawner_image_entrypoint = "/entrypoint.sh" +cpu_min = "32" +cpu_max = "32" +mem_min = "64G" +mem_max = "64G" +provide_extra_shmem = true +init_container_image = "ghcr.io/llnl/reproducible-benchmarking-init" +init_container_tag = "hpdc-2025" +init_image_entrypoint = "/entrypoint.sh" + +[aws."utility scripts"] +jupyterhub_helm_version = "4.2.0" +ebs_csidriver_version = "v1.45.0" diff --git a/2025-HPDC/infrastructure/pave-dry-run/configure_kubernetes.sh b/2025-HPDC/infrastructure/pave-dry-run/configure_kubernetes.sh new file mode 100755 index 0000000..5c4bee6 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/configure_kubernetes.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +if ! 
command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +echo "Configuring the Cluster Autoscaler:" +kubectl apply -k "github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=v1.45.0" +kubectl apply -f ./cluster-autoscaler.yaml +echo "" +echo "Configuring the Storage Class:" +kubectl apply -f ./storage-class.yaml + +echo "" +echo "Patching the cluster to make the configured storage class the default:" +kubectl patch storageclass gp3 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' + +echo "" +echo "Done configuring Kubernetes!" +echo "" +echo "Next, you should run deploy_jupyterhub.sh to actually deploy JupyterHub and the tutorial." \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/create_cluster.sh b/2025-HPDC/infrastructure/pave-dry-run/create_cluster.sh new file mode 100755 index 0000000..f631168 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/create_cluster.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +echo "Creating EKS cluster with eksctl:" +eksctl create cluster --config-file ./eksctl-config.yaml + +echo "Done creating the EKS cluster!" +echo "" +echo "Next, you should run configure_kubernetes.sh to configure Kubernetes on the cluster." \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/deploy_jupyterhub.sh b/2025-HPDC/infrastructure/pave-dry-run/deploy_jupyterhub.sh new file mode 100755 index 0000000..dcfd1d5 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/deploy_jupyterhub.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +echo "Adding JupyterHub to EKS cluster using Helm:" +helm repo add jupyterhub https://hub.jupyter.org/helm-chart/ +helm repo update +echo "" +echo "Installing the Helm chart and deploying JupyterHub to EKS:" +helm install hpdc-2025-pave-dry-run-jupyter jupyterhub/jupyterhub --version 4.2.0 --values ./helm-config.yaml + +echo "" +echo "Done deploying JupyterHub!" +echo "" +echo "Next, you should ensure all the pods spawned correctly with check_jupyterhub_status.sh," +echo "and you should get the cluster URL with get_jupyterhub_url.sh." +echo "" +echo "If something went wrong, you can edit the helm-config.yaml file to try to fix the issue." +echo "After editing helm-config.yaml, you can normally reconfigure and relaunch JupyterHub using" +echo "the update_jupyterhub_deployment.sh script. If that doesn't work or if you need to edit" +echo "storage-class.yaml or cluster-autoscaler.yaml, you should first tear down JupyterHub with" +echo "tear_down_jupyterhub.sh, and then you should bring Jupyter back up by rerunning deploy_jupyterhub.sh." 
+echo "" +echo "If everything went smoothly, the cluster URL is what you should share with attendees." +echo "" +echo "Attendees can get a Jupyter environment to work in by going to that URL and logging in" +echo "with a username of their choice and the password specified in helm-config.yaml." +echo "" +echo "Note: users should have unique usernames. If two users have the same username, they will" +echo " share the same pod." +echo "" +echo "After you are done with your tutorial, you should finally run cleanup.sh to bring down" +echo "the EKS cluster and all associated resources." \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/eksctl-config.yaml b/2025-HPDC/infrastructure/pave-dry-run/eksctl-config.yaml new file mode 100644 index 0000000..27a235e --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/eksctl-config.yaml @@ -0,0 +1,110 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +# Define the name of the cluster and the deployment region +metadata: + name: hpdc-2025-pave-dry-run + region: us-west-1 + +# Create the IAM policies needed to enable the autoscaler and storage +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: cluster-autoscaler + + # https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "autoscaling:DescribeAutoScalingGroups" + - "autoscaling:DescribeAutoScalingInstances" + - "autoscaling:DescribeLaunchConfigurations" + - "autoscaling:DescribeTags" + - "autoscaling:SetDesiredCapacity" + - "autoscaling:TerminateInstanceInAutoScalingGroup" + - "ec2:DescribeLaunchTemplateVersions" + Resource: "*" + + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: aws-ebs-csi-driver + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ec2:AttachVolume" + - "ec2:CreateSnapshot" + - "ec2:CreateTags" + - "ec2:CreateVolume" + - "ec2:DeleteSnapshot" + - "ec2:DeleteTags" + - "ec2:DeleteVolume" + - "ec2:DescribeInstances" + - "ec2:DescribeSnapshots" + - "ec2:DescribeTags" + - "ec2:DescribeVolumes" + - "ec2:DetachVolume" + Resource: "*" + +# Specify the availability zone from which nodes will be obtained +availabilityZones: +- "us-west-1a" +- "us-west-1c" + + +# Define rules for nodegroups for each availability zone +managedNodeGroups: + + - name: node-group-us-west-1a + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of storage volume for the availability zone, in gigabytes + volumeSize: 30 + # Number of nodes to start with in this availability zone + desiredCapacity: 2 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 2 + # Maximum number of nodes that will every be allocated in this availability zone + maxSize: 8 + privateNetworking: true + availabilityZones: + - us-west-1a + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" + + - name: node-group-us-west-1c + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of storage volume for the availability zone, in gigabytes + volumeSize: 30 + # Number of 
nodes to start with in this availability zone + desiredCapacity: 2 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 2 + # Maximum number of nodes that will ever be allocated in this availability zone + maxSize: 8 + privateNetworking: true + availabilityZones: + - us-west-1c + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" diff --git a/2025-HPDC/infrastructure/pave-dry-run/get_jupyterhub_url.sh b/2025-HPDC/infrastructure/pave-dry-run/get_jupyterhub_url.sh new file mode 100755 index 0000000..ddfd250 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/get_jupyterhub_url.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl get -o json service proxy-public | jq '.status.loadBalancer.ingress[0].hostname' \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/helm-config.yaml b/2025-HPDC/infrastructure/pave-dry-run/helm-config.yaml new file mode 100644 index 0000000..4f98dba --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/helm-config.yaml @@ -0,0 +1,121 @@ +# Uncomment if you need to debug your deployment of Jupyter. +# For more information on debugging, see: +# https://z2jh.jupyter.org/en/stable/administrator/debug.html +# debug: +# enabled: true + +hub: + # Maximum number of users with spawned JupyterLab environments (i.e., pods) at a time + concurrentSpawnLimit: 14 + config: + # Define a password for login + DummyAuthenticator: + password: butter + JupyterHub: + admin_access: true + authenticator_class: dummy + + # Define storage quantity for JupyterHub's persistent database + # We could explicitly set storage class name here, + # but we won't because we've marked the storage class defined + # in storage-class.yaml as default + db: + pvc: + storage: 32Gi + storageClassName: gp3 + + # Specify the hub image for the tutorial. + # The hub image should be based on the jupyterhub/k8s-hub image. + # Its job is twofold: + # 1) If desired, replace the login page (at /usr/local/share/jupyterhub/templates/login.html) with a custom HTML login page + # 2) Set the user + image: + name: jupyterhub/k8s-hub + tag: "4.2.0" + pullPolicy: Always + + # Define resource usage for JupyterHub + # For large tutorials, it is recommended to set these higher + + # We are just using default resource usage + + + # Define custom hostname for JupyterHub + + # We are not using a custom hostname + + +# Based on optimization recommendations from: +# https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +# scheduling: +# podPriority: +# enabled: true +# userPlaceholder: +# replicas: 3 + +# Define the spawner and init containers for each attendee's pod +singleuser: + # Specify the spawner image for the tutorial.
+ # The spawner image should do the following: + # 1) Install any necessary software + # 2) Define the user for the tutorial (we usually default to jovyan) + # 3) If custom Python packages are needed, it's often recommended to install a custom Jupyter kernel with `IPython kernel install` + # 4) If you want a custom Jupyter launcher UI, install the appropriate packages and update JUPYTER_APP_LAUNCHER_PATH + # 5) Copy any necessary local scripts or files and ensure proper permissions + image: + name: ghcr.io/llnl/reproducible-benchmarking-spawn + tag: "hpdc-2025" + pullPolicy: Always + # Specify the minimum (i.e., guarantee) and maximum (i.e., limit) amount of resources per user + cpu: + limit: 32 + guarantee: 32 + memory: + limit: "64G" + guarantee: "64G" + # If needed, specify a custom entrypoint into the spawner image. + # For more information, look at the documentation for Docker ENTRYPOINT and CMD directives: + # https://www.docker.com/blog/docker-best-practices-choosing-between-run-cmd-and-entrypoint/ + cmd: /entrypoint.sh + # Specify the init image for the tutorial. + # This image is optional, but it can be used to do last second configuration or installation of files + # before the user gains control of the pod. + # + # A good usecase for the init image is to set permissions and ensure the tutorial user will be able to + # access the files for your tutorial. An example Dockerfile for the init image may look like: + # + # Dockerfile: + # FROM alpine/git + # ENV NB_USER=jovyan \ + # NB_UID=1000 \ + # HOME=/home/jovyan + # + # RUN adduser \ + # -D \ + # -g "Default user" \ + # -u ${NB_UID} \ + # -h ${HOME} \ + # ${NB_USER} + # + # COPY ./init-entrypoint.sh /entrypoint.sh + # + # The 'command' field for the init container specifies the entrypoint for the container. For the Dockerfile + # above, the entrypoint should be "/entrypoint.sh". This script could look something like this: + # + # entrypoint.sh (would be ./init-entrypoint.sh on your local computer) + # chown -R 1000 /home/jovyan + initContainers: + - name: init-tutorial-service + image: ghcr.io/llnl/reproducible-benchmarking-init:hpdc-2025 + command: ["/entrypoint.sh"] + imagePullPolicy: Always + storage: + type: none + extraVolumes: + - name: shm-volume + emptyDir: + medium: Memory + extraVolumeMounts: + - name: shm-volume + mountPath: /dev/shm + \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/storage-class.yaml b/2025-HPDC/infrastructure/pave-dry-run/storage-class.yaml new file mode 100644 index 0000000..b83a030 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/storage-class.yaml @@ -0,0 +1,7 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: gp3 +provisioner: kubernetes.io/aws-ebs +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/tear_down_jupyterhub.sh b/2025-HPDC/infrastructure/pave-dry-run/tear_down_jupyterhub.sh new file mode 100755 index 0000000..b306b9c --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/tear_down_jupyterhub.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm uninstall hpdc-2025-pave-dry-run-jupyter + +echo "Helm's JupyterHub deployment is torn down." 
+echo "If any attendee pods are remaining, you can delete them with 'kubectl delete pod '" +echo "" +echo "To recreate the JupyterHub deployment, just run deploy_jupyterhub.sh again." \ No newline at end of file diff --git a/2025-HPDC/infrastructure/pave-dry-run/update_jupyterhub_deployment.sh b/2025-HPDC/infrastructure/pave-dry-run/update_jupyterhub_deployment.sh new file mode 100755 index 0000000..68c3306 --- /dev/null +++ b/2025-HPDC/infrastructure/pave-dry-run/update_jupyterhub_deployment.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm upgrade hpdc-2025-pave-dry-run-jupyter jupyterhub/jupyterhub --values ./helm-config.yaml + +echo "The JupyterHub deployment is updated!" \ No newline at end of file diff --git a/2025-HPDC/infrastructure/test-c7i-24xlarge/cluster-autoscaler.yaml b/2025-HPDC/infrastructure/test-c7i-24xlarge/cluster-autoscaler.yaml index 1b63d28..e89ed75 100644 --- a/2025-HPDC/infrastructure/test-c7i-24xlarge/cluster-autoscaler.yaml +++ b/2025-HPDC/infrastructure/test-c7i-24xlarge/cluster-autoscaler.yaml @@ -245,7 +245,7 @@ spec: - --cloud-provider=aws - --skip-nodes-with-local-storage=false - --expander=least-waste - - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/jupyterhub + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/hpdc-2025-c7i-24xlarge volumeMounts: # Mount the CA SSL/TLS certificates into the container - name: ssl-certs diff --git a/2025-HPDC/infrastructure/test-c7i-24xlarge/config.toml b/2025-HPDC/infrastructure/test-c7i-24xlarge/config.toml index 948ba83..87410ce 100644 --- a/2025-HPDC/infrastructure/test-c7i-24xlarge/config.toml +++ b/2025-HPDC/infrastructure/test-c7i-24xlarge/config.toml @@ -1,3 +1,5 @@ +tutorial_name = "hpdc-2025-c7i-24xlarge" + [aws.eksctl] cluster_name = "hpdc-2025-c7i-24xlarge" cluster_deployment_region = "us-west-1" @@ -29,7 +31,6 @@ cpu_min = "100m" memory_min = "600Mi" [aws.Helm] -tutorial_name = "hpdc-2025-c7i-24xlarge" max_concurrent_users = 2 hub_password = "butter" hub_db_capacity = "32Gi" @@ -49,7 +50,5 @@ init_container_tag = "hpdc-2025" init_image_entrypoint = "/entrypoint.sh" [aws."utility scripts"] -tutorial_name = "hpdc-2025-c7i-24xlarge" jupyterhub_helm_version = "4.2.0" ebs_csidriver_version = "v1.45.0" -ebs_storage_type = "gp3" diff --git a/2025-HPDC/infrastructure/test-c7i-48xlarge/cluster-autoscaler.yaml b/2025-HPDC/infrastructure/test-c7i-48xlarge/cluster-autoscaler.yaml index 1b63d28..35bdd6b 100644 --- a/2025-HPDC/infrastructure/test-c7i-48xlarge/cluster-autoscaler.yaml +++ b/2025-HPDC/infrastructure/test-c7i-48xlarge/cluster-autoscaler.yaml @@ -245,7 +245,7 @@ spec: - --cloud-provider=aws - --skip-nodes-with-local-storage=false - --expander=least-waste - - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/jupyterhub + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/hpdc-2025-c7i-48xlarge volumeMounts: # Mount the CA SSL/TLS certificates into the container - name: ssl-certs diff --git a/2025-HPDC/infrastructure/test-c7i-48xlarge/config.toml b/2025-HPDC/infrastructure/test-c7i-48xlarge/config.toml index 503e845..3afd8c7 100644 --- 
a/2025-HPDC/infrastructure/test-c7i-48xlarge/config.toml +++ b/2025-HPDC/infrastructure/test-c7i-48xlarge/config.toml @@ -1,3 +1,5 @@ +tutorial_name = "hpdc-2025-c7i-48xlarge" + [aws.eksctl] cluster_name = "hpdc-2025-c7i-48xlarge" cluster_deployment_region = "us-west-1" @@ -29,7 +31,6 @@ cpu_min = "100m" memory_min = "600Mi" [aws.Helm] -tutorial_name = "hpdc-2025-c7i-48xlarge" max_concurrent_users = 2 hub_password = "butter" hub_db_capacity = "32Gi" @@ -49,7 +50,5 @@ init_container_tag = "hpdc-2025" init_image_entrypoint = "/entrypoint.sh" [aws."utility scripts"] -tutorial_name = "hpdc-2025-c7i-48xlarge" jupyterhub_helm_version = "4.2.0" ebs_csidriver_version = "v1.45.0" -ebs_storage_type = "gp3" diff --git a/2025-HPDC/infrastructure/test-c7i-metal-24xl/cluster-autoscaler.yaml b/2025-HPDC/infrastructure/test-c7i-metal-24xl/cluster-autoscaler.yaml index 1b63d28..3262b5f 100644 --- a/2025-HPDC/infrastructure/test-c7i-metal-24xl/cluster-autoscaler.yaml +++ b/2025-HPDC/infrastructure/test-c7i-metal-24xl/cluster-autoscaler.yaml @@ -245,7 +245,7 @@ spec: - --cloud-provider=aws - --skip-nodes-with-local-storage=false - --expander=least-waste - - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/jupyterhub + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/hpdc-2025-c7i-metal-24xl volumeMounts: # Mount the CA SSL/TLS certificates into the container - name: ssl-certs diff --git a/2025-HPDC/infrastructure/test-c7i-metal-24xl/config.toml b/2025-HPDC/infrastructure/test-c7i-metal-24xl/config.toml index c8d30f8..1982520 100644 --- a/2025-HPDC/infrastructure/test-c7i-metal-24xl/config.toml +++ b/2025-HPDC/infrastructure/test-c7i-metal-24xl/config.toml @@ -1,3 +1,5 @@ +tutorial_name = "hpdc-2025-c7i-metal-24xl" + [aws.eksctl] cluster_name = "hpdc-2025-c7i-metal-24xl" cluster_deployment_region = "us-west-1" @@ -29,7 +31,6 @@ cpu_min = "100m" memory_min = "600Mi" [aws.Helm] -tutorial_name = "hpdc-2025-c7i-metal-24xl" max_concurrent_users = 2 hub_password = "butter" hub_db_capacity = "32Gi" @@ -49,7 +50,5 @@ init_container_tag = "hpdc-2025" init_image_entrypoint = "/entrypoint.sh" [aws."utility scripts"] -tutorial_name = "hpdc-2025-c7i-metal-24xl" jupyterhub_helm_version = "4.2.0" ebs_csidriver_version = "v1.45.0" -ebs_storage_type = "gp3" diff --git a/2025-HPDC/infrastructure/test-c7i-metal-48xl/cluster-autoscaler.yaml b/2025-HPDC/infrastructure/test-c7i-metal-48xl/cluster-autoscaler.yaml index 1b63d28..7528be5 100644 --- a/2025-HPDC/infrastructure/test-c7i-metal-48xl/cluster-autoscaler.yaml +++ b/2025-HPDC/infrastructure/test-c7i-metal-48xl/cluster-autoscaler.yaml @@ -245,7 +245,7 @@ spec: - --cloud-provider=aws - --skip-nodes-with-local-storage=false - --expander=least-waste - - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/jupyterhub + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/hpdc-2025-c7i-metal-48xl volumeMounts: # Mount the CA SSL/TLS certificates into the container - name: ssl-certs diff --git a/2025-HPDC/infrastructure/test-c7i-metal-48xl/config.toml b/2025-HPDC/infrastructure/test-c7i-metal-48xl/config.toml index 84c202c..86ed1b2 100644 --- a/2025-HPDC/infrastructure/test-c7i-metal-48xl/config.toml +++ b/2025-HPDC/infrastructure/test-c7i-metal-48xl/config.toml @@ -1,3 +1,5 @@ +tutorial_name = "hpdc-2025-c7i-metal-48xl" + [aws.eksctl] cluster_name = "hpdc-2025-c7i-metal-48xl" cluster_deployment_region 
= "us-west-1" @@ -29,7 +31,6 @@ cpu_min = "100m" memory_min = "600Mi" [aws.Helm] -tutorial_name = "hpdc-2025-c7i-metal-48xl" max_concurrent_users = 2 hub_password = "butter" hub_db_capacity = "32Gi" @@ -49,7 +50,5 @@ init_container_tag = "hpdc-2025" init_image_entrypoint = "/entrypoint.sh" [aws."utility scripts"] -tutorial_name = "hpdc-2025-c7i-metal-48xl" jupyterhub_helm_version = "4.2.0" ebs_csidriver_version = "v1.45.0" -ebs_storage_type = "gp3" diff --git a/2025-HPDC/tutorial-code/caliper-tutorial b/2025-HPDC/tutorial-code/caliper-tutorial index a109ff8..942c1a0 160000 --- a/2025-HPDC/tutorial-code/caliper-tutorial +++ b/2025-HPDC/tutorial-code/caliper-tutorial @@ -1 +1 @@ -Subproject commit a109ff832093c2ad543b176313616e93c043f049 +Subproject commit 942c1a0da4e9ef5f809d03f15d97c938ea4c6531 diff --git a/2025-HPDC/tutorial-code/system-description/aws-tutorial/system.py b/2025-HPDC/tutorial-code/system-description/aws-tutorial/system.py index 0083ee9..2116fba 100644 --- a/2025-HPDC/tutorial-code/system-description/aws-tutorial/system.py +++ b/2025-HPDC/tutorial-code/system-description/aws-tutorial/system.py @@ -39,11 +39,16 @@ class AwsTutorial(System): "hardware_key": str(hardware_descriptions) + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", }, + "c7i.12xlarge": { + "system_site": "aws", + "hardware_key": str(hardware_descriptions) + + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", + }, } variant( "instance_type", - values=("c7i.48xlarge", "c7i.metal-48xl", "c7i.24xlarge", "c7i.metal-24xl"), + values=("c7i.48xlarge", "c7i.metal-48xl", "c7i.24xlarge", "c7i.metal-24xl", "c7i.12xlarge"), default="c7i.24xlarge", description="AWS instance type", )