diff --git a/Makefile b/Makefile index 300b5c01..5e7ea7c6 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,9 @@ else SED ?= gsed endif +MAKEFLAGS += --no-print-directory ARCH=$(shell go env GOARCH) + # Define CONTAINER_FLAGS and include ARCH as an argument CONTAINER_FLAGS ?= --build-arg TARGETARCH=$(ARCH) @@ -339,14 +341,51 @@ undeploy: kustomize delete-webhook-secret-file ## Undeploy operator and agent fr undeploy-force: ## Same as "make undeploy" but also delete any dependencies. $(MAKE) undeploy FORCE=--force -.PHONY: deploy-examples -deploy-examples: ## Deploy the examples to the K8s cluster specified in ~/.kube/config. - @echo "Create Namespace based GKMCache" - $(KUBECTL) apply -f examples/namespace/RWO/ - $(KUBECTL) apply -f examples/namespace/ROX/ - @echo "Create Cluster based ClusterGKMCache" - $(KUBECTL) apply -f examples/cluster/RWO/ - $(KUBECTL) apply -f examples/cluster/ROX/ +.PHONY: gen-apply-example +gen-apply-example: kustomize + @cd examples; \ + if ! EXAMPLE_YAML=$$(DEBUG=false ./generate-files.sh $(EXAMPLE_ACCESS) $(EXAMPLE_SCOPE) $(EXAMPLE_GPU) $(EXAMPLE_VER) $(EXAMPLE_ENV)); then \ + echo; \ + echo "FAILED: ./generate-files.sh $(EXAMPLE_ACCESS) $(EXAMPLE_SCOPE) $(EXAMPLE_GPU) $(EXAMPLE_VER) $(EXAMPLE_ENV)"; \ + echo; \ + exit 1; \ + fi; \ + $(KUBECTL) $$EXAMPLE_CMD -f $$EXAMPLE_YAML + +.PHONY: deploy-examples-kind +deploy-examples-kind: ## Deploy the examples to a KIND K8s cluster + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=namespace EXAMPLE_GPU=rocm EXAMPLE_VER=v2 EXAMPLE_ENV=kind EXAMPLE_CMD=apply + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=cluster EXAMPLE_GPU=rocm EXAMPLE_VER=v3 EXAMPLE_ENV=kind EXAMPLE_CMD=apply + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rox EXAMPLE_SCOPE=namespace EXAMPLE_GPU=rocm EXAMPLE_VER=v3 EXAMPLE_ENV=kind EXAMPLE_CMD=apply + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rox EXAMPLE_SCOPE=cluster EXAMPLE_GPU=rocm EXAMPLE_VER=v2 EXAMPLE_ENV=kind EXAMPLE_CMD=apply + +.PHONY: 
deploy-examples-nfd-cuda +deploy-examples-nfd-cuda: ## Deploy the examples to a K8s cluster running NFD and CUDA, AccessMode=ReadWriteOnce + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=namespace EXAMPLE_GPU=cuda EXAMPLE_VER=v2 EXAMPLE_ENV=nfd EXAMPLE_CMD=apply + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=cluster EXAMPLE_GPU=cuda EXAMPLE_VER=v3 EXAMPLE_ENV=nfd EXAMPLE_CMD=apply + +.PHONY: deploy-examples-nfd-rocm +deploy-examples-nfd-rocm: ## Deploy the examples to a K8s cluster running NFD and ROCm, AccessMode=ReadWriteOnce + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=namespace EXAMPLE_GPU=rocm EXAMPLE_VER=v2 EXAMPLE_ENV=nfd EXAMPLE_CMD=apply + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=cluster EXAMPLE_GPU=rocm EXAMPLE_VER=v3 EXAMPLE_ENV=nfd EXAMPLE_CMD=apply + +.PHONY: undeploy-examples-kind +undeploy-examples-kind: ## Undeploy the examples from a KIND K8s cluster + @echo "Remove Namespace based GKMCache" + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=namespace EXAMPLE_GPU=rocm EXAMPLE_VER=v2 EXAMPLE_ENV=kind EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=cluster EXAMPLE_GPU=rocm EXAMPLE_VER=v3 EXAMPLE_ENV=kind EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rox EXAMPLE_SCOPE=namespace EXAMPLE_GPU=rocm EXAMPLE_VER=v3 EXAMPLE_ENV=kind EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rox EXAMPLE_SCOPE=cluster EXAMPLE_GPU=rocm EXAMPLE_VER=v2 EXAMPLE_ENV=kind EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + +.PHONY: undeploy-examples-nfd-cuda +undeploy-examples-nfd-cuda: ## Undeploy the examples from a K8s cluster running NFD and CUDA, AccessMode=ReadWriteOnce + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=namespace EXAMPLE_GPU=cuda EXAMPLE_VER=v2 EXAMPLE_ENV=nfd 
EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=cluster EXAMPLE_GPU=cuda EXAMPLE_VER=v3 EXAMPLE_ENV=nfd EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + +.PHONY: undeploy-examples-nfd-rocm +undeploy-examples-nfd-rocm: ## Undeploy the examples from a K8s cluster running NFD and ROCm, AccessMode=ReadWriteOnce + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=namespace EXAMPLE_GPU=rocm EXAMPLE_VER=v2 EXAMPLE_ENV=nfd EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" + @$(MAKE) gen-apply-example EXAMPLE_ACCESS=rwo EXAMPLE_SCOPE=cluster EXAMPLE_GPU=rocm EXAMPLE_VER=v3 EXAMPLE_ENV=nfd EXAMPLE_CMD="delete --ignore-not-found=$(ignore-not-found)" .PHONY: undeploy-examples undeploy-examples: ## Undeploy the examples from the K8s cluster specified in ~/.kube/config. diff --git a/README.md b/README.md index 9cc135c4..b5a017d2 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,9 @@ topics: - [Deployment Options](docs/DeploymentOptions.md): Details on the GKM Custom Resource Definitions, and how to tailor the different optional fields for different environments. +- [Examples Directory](docs/Examples.md): + Details on the layout of the examples directory and how to apply the examples + to a Kubernetes cluster. - [MCV Overview](mcv/README.md): Overview of Model Cache Vault (MCV). List of prerequisites to build, instructions on building MCV and usage guide. diff --git a/docs/DeploymentOptions.md b/docs/DeploymentOptions.md index dbd10803..7a81fe40 100644 --- a/docs/DeploymentOptions.md +++ b/docs/DeploymentOptions.md @@ -43,7 +43,7 @@ So the storage backing the PVC dictates a lot of the options. The most straight forward deployment is the GKM namespace scoped CRD, GKMCache, and a Kubernetes StorageClass that supports an AccessMode of `ReadOnlyMany`. 
-![GKM Flowchart](images/GKM_Namespace_ReadOnlyMany.png) +![GKM Namespace ReadOnlyMany](images/GKM_Namespace_ReadOnlyMany.png) The user creates a GKMCache with the OCI Image. There is no way to query KubeAPI Server to determine if an AccessMode of @@ -105,7 +105,7 @@ The extracted GPU Kernel Cache is then mounted in the application Pod. Many Kubernetes Clusters are not deployed by default with a Kubernetes StorageClass that supports an AccessMode of `ReadOnlyMany`. -In AWS this is a special request. +For example, in AWS this is a special request. A default KIND Cluster does not support it. If a given cluster doesn't support `ReadOnlyMany`, either to save money or using a test cluster without the support, GKM will handle the PVC distribution @@ -113,7 +113,7 @@ to Nodes. However, GKM will be fighting the Kubernetes Scheduler, so some concessions need to made to allow the Operator to distribute the extracted GPU Kernel Cache. -![GKM Flowchart](images/GKM_Namespace_ReadWriteOnce.png) +![GKM Namespace ReadWriteOnce](images/GKM_Namespace_ReadWriteOnce.png) Still namespace scoped, the user creates a GKMCache with the OCI Image. Unlike the previous example, there is no need to provide the optional AccessMode @@ -149,7 +149,8 @@ Scheduler is run. So the Node has not been selected at the time the Mutating Webhook runs, which is needed to set the correct PVC in the Volume. To get around this, GKM requires the application to be launched in a Kubernetes -DaemonSet when `ReadOnlyMany` is not supported. +DaemonSet when `ReadOnlyMany` is not supported, because Pods associated with a +DaemonSet do provide the selected Node at the time the Mutating Webhook runs. 
So the user then creates a Kubernetes DaemonSet with a `volume:` of type `persistentVolumeClaim:` and a `claimName:` set to the GKMCache name with the @@ -197,14 +198,14 @@ As with GKMCache, a Kubernetes StorageClass that supports an AccessMode of `ReadOnlyMany` is simpler in deployment because the storage backend is managing distribution to each Node. -![GKM Flowchart](images/GKM_Cluster_ReadOnlyMany.png) +![GKM Cluster ReadOnlyMany](images/GKM_Cluster_ReadOnlyMany.png) The user creates a ClusterGKMCache with the OCI Image. As before, there is no way to query KubeAPI Server to determine if an AccessMode of `ReadOnlyMany` is supported, so that must be passed in via the CRD. -PVCs are namespace scoped so must be created in the same namespace and the Pod +PVCs are namespace scoped so must be created in the same namespace as the Pod they are being mounted. Since ClusterGKMCache is cluster scoped, the User must specify which Namespaces the PVCs need to be created in. @@ -286,7 +287,7 @@ This scenario covers when a given cluster doesn't support `ReadOnlyMany`, and the GPU Kernel Cache needs to be loaded in multiple Pods that run in different Namespaces. -![GKM Flowchart](images/GKM_Cluster_ReadWriteOnce.png) +![GKM Cluster ReadWriteOnce](images/GKM_Cluster_ReadWriteOnce.png) The user creates a ClusterGKMCache with the OCI Image. As with the previous ClusterGKMCache, the User must specify which Namespaces the @@ -380,7 +381,7 @@ The extracted GPU Kernel Cache is then mounted in the application Pods. ## KIND Clusters -Running GKM in KIND Cluster needs some special consideration. +Running GKM in KIND Cluster needs some special consideration. For Kubernetes Operator functionality testing, a GPU is not needed, so GPUs are simulated. See [Getting Started Guide](docs/GettingStartedGuide.md) for more details on @@ -397,7 +398,7 @@ mounted directory are not setup in a way that allows the Pod to access the mounted directory. 
A workaround is to include a init container in each Pod or DaemonSet that is volume mounting the PVC which adjusts the directory permissions properly. -The following init container is included in all the examples in +The following init container is patched in all the examples in [./examples/](https://github.com/redhat-et/GKM/tree/main/examples): ```yaml diff --git a/docs/Examples.md b/docs/Examples.md new file mode 100644 index 00000000..11b47ee7 --- /dev/null +++ b/docs/Examples.md @@ -0,0 +1,387 @@ +# Examples Directory + +How GKM is used will depend on the GPUs in the Kubernetes cluster, which storage +backends are supported in the cluster, and the namespaces of the workloads +consuming the GPU Kernel Cache. +[Deployment Options](DeploymentOptions.md) describes in detail many of these +options. +Quick summary is that the two major factors that dictate deployment are: + +- **Namespace of the GPU Kernel Cache:** + If a given GPU Kernel Cache will only be deployed in a single Kubernetes + Namespace, then the `GKMCache` should be used. + If a given GPU Kernel Cache will be deployed in multiple Kubernetes Namespaces, + then the `ClusterGKMCache` should be used. +- **Cluster Storage Backend:** + If the Kubernetes StorageClass backend supports an Access Mode of `ReadOnlyMany` + then the storage backend can distribute extracted GPU Kernel Cache to each + node. + If the Kubernetes StorageClass backend does not support an Access Mode of + `ReadOnlyMany`, GKM needs to handle the distribution of the extracted GPU Kernel + Cache to each node. + If this is the case, certain concessions need to be made. + +To handle these different deployment options, the Examples directory is using a +tool called `kustomize` along with a shell script to tailor a set of base yaml +files to work in multiple environments. + +Here is the set of options the examples support: + +- **rox** vs **rwo**: The access mode of `ReadOnlyMany` or `ReadWriteOnce`. + - `rox` implies Pods will be used. 
+ - `rwo` implies DaemonSets will be used. +- **namespace** vs **cluster**: The scope. + - **namespace** or **ns** implies GKMCache will be used. + - **cluster** or **cl** implies ClusterGKMCache will be used. Also implies two + namespaces will be created. +- **rocm** vs **cuda**: The GPU type. +- **v2** vs **v3**: The Cosign version used to sign the OCI Image. +- **kind** vs **nfd**: The environment the example is being deployed in. + - **kind** has some special restrictions that are being managed. + - **nfd** implies Node Feature Discovery is being used in real hardware (not + KIND) and nodes are labeled with detected GPU hardware. + +The object names, namespaces and generated output filenames are appended with +a suffix generated from these options. +For example, the GKMCache instance may be named something like: +`gkm-test-obj-rwo-ns-rocm-v2` + +## Directory Layout + +A set of base yaml files are created, one for each object that will be created. +For a GKM use case, the following objects are needed: + +- **Namespace** (two Namespaces if cluster scoped) +- **GKMCache** (namespace scoped) or **ClusterGKMCache** (cluster scoped) +- **Pod** (for ReadOnlyMany (rox)) or **DaemonSet** (for ReadWriteOnce (rwo)) + +So the yaml files for these basic objects are laid out as follows. +The `kustomization.yaml` file is a `kustomize` file that lists the set of files +the tool should include. + +```sh +$ tree examples/base/ +examples/base/ +├── access +│   ├── rox +│   │   ├── kustomization.yaml +│   │   ├── pod-1.yaml +│   │   ├── pod-2.yaml +│   │   └── pod-3.yaml +│   └── rwo +│   ├── ds-1.yaml +│   ├── ds-2.yaml +│   ├── ds-3.yaml +│   └── kustomization.yaml +├── common +│   ├── kustomization.yaml +│   └── namespace-1.env +└── scope + ├── cluster + │   ├── clustergkmcache.yaml + │   ├── kustomization.yaml + │   └── namespace-2.env + └── namespace + ├── gkmcache.yaml + └── kustomization.yaml +``` + +The base objects are just the bare bones yaml for the object. 
+Different deployments require additional fields in the object to be set. +For example, a deployment in a KIND Cluster requires an Init-Container be added +to the GKMCache/ClusterGKMCache and Pod/DaemonSet that sets the permissions of +the PVC VolumeMount so the workload can access the contents. +If using the Node Feature Discovery (NFD), the GKMCache/ClusterGKMCache and +Pod/DaemonSet objects need Affinity set so they are deployed on the proper node +based on the labels set by NFD. + +The variants directory contains `kustomize` patches, that mutate base yaml files +with the desired field updates. +A basic `kustomize` patch looks something like: + +```yaml +- target: + kind: Pod + name: gkm-test-pod-1 + patch: |- + - op: replace + path: /metadata/namespace + value: gkm-test-ns-1-rox-cl-rocm-v2 +``` + +This says for the Pod object with the name "gkm-test-pod-1", replace the value +at "metadata.namespace" with the value of "gkm-test-ns-1-rox-cl-rocm-v2". +To make the examples more useful, the goal is to deploy more than one instance +at a given time. +So the object names and the namespaces need to be dynamic, based on the input +deployment settings. +`kustomize` does not manage dynamic naming, so the examples use a script +(`examples/generate-files.sh`) with multiple `sed` commands to adjust the updated +fields as necessary. +So before the `sed` command runs, the above patch, which is stored in a +`kustomization.env` file, looks like: + +```yaml +- target: + kind: Pod + name: gkm-test-pod-1 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_1 +``` + +`kustomize` uses the `kustomization.yaml` files as mentioned above. +So `examples/generate-files.sh` runs `sed` commands on the `kustomization.env` +files and pipes the output to `kustomization.yaml` files for `kustomize` to +consume. 
+The patches are stored as follows: + +```sh +$ tree examples/variants/ +examples/variants/ +├── access +│   ├── rox +│   │   └── kustomization.env +│   └── rwo +│   └── kustomization.env +└── scope + ├── cluster + │   └── kustomization.env + └── namespace + └── kustomization.env +``` + +Finally, not all the files are used in every deployment. +So `kustomize` uses the `kustomization.yaml` in the `examples/overlays` directory +which includes the set of files to include. +To control the order the objects are generated, the `kustomization.yaml` file in +the `examples/overlays` is broken into two files. +These files are generated by the `examples/generate-files.sh` script, so neither +of these files is checked into the repo. + +```sh +$ tree examples/overlays/ +examples/overlays/ +├── access +└── scope +``` + +Once the `examples/generate-files.sh` script is run, the output looks something +like the following: + +```sh +$ cat examples/overlays/scope/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ../../base/common +- ../../base/scope/namespace + +components: +- ../../variants/scope/namespace + +nameSuffix: -rwo-namespace-rocm-v3 +``` + +```sh +$ cat examples/overlays/access/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ../../base/access/rwo + +components: +- ../../variants/access/rwo + +nameSuffix: -rwo-namespace-rocm-v3 +``` + +None of the files that are generated by the `examples/generate-files.sh` script +are checked into the repo. +The `examples/.gitignore` keeps the generated files from being flagged as changed. +There is also an `examples/cleanup-files.sh` script that will delete all the +generated yaml files if needed. + +## Deploy Examples from Makefile + +The Makefile has a few pre-canned deployment options. 
+If these don't fit a given deployment, visit the next section, +[Custom Example Deployments](#custom-example-deployments), for ways to customize +the deployment. + +### Makefile Deploy on KIND + +The KIND Cluster deployment is using a simulated ROCm GPU. +To deploy the examples in a KIND Cluster, run: + +```sh +make deploy-examples-kind +``` + +This runs the `examples/generate-files.sh` script four times, with the +following parameters for each run: + +- `rwo` - `namespace` - `rocm` - `v2` - `kind` +- `rwo` - `cluster` - `rocm` - `v3` - `kind` +- `rox` - `namespace` - `rocm` - `v3` - `kind` +- `rox` - `cluster` - `rocm` - `v2` - `kind` + +The KIND cluster is unique in that even though the backend storage does not +support `ReadOnlyMany`, because KIND is running each node in its own +container on the same server, each Node can see the extracted cache, so it's +like `ReadOnlyMany`. +So both `rwo` and `rox` are supported. + +To unwind the deployment, run: + +```sh +make undeploy-examples-kind +``` + +### Makefile Deploy on NFD Cluster + +GKM works on clusters with Node Feature Discovery (NFD) deployed. +NFD is a Kubernetes Operator that automatically detects GPU hardware and adds +labels to nodes with details about which GPUs were detected. +GKM works in conjunction with this to only deploy GKM Agents built with drivers +for the detected GPU hardware. +This allows GKM Agent image sizes to be much smaller by not carrying around +unused drivers. + +When creating examples, these labels also allow Affinity and Tolerations to be +set on GKMCache/ClusterGKMCache instances and Pod/DaemonSet instances. 
+To this end, the following make commands deploy the examples for given GPU +hardware when running with NFD: + +```sh +make deploy-examples-nfd-cuda +``` + +This runs the `examples/generate-files.sh` script twice, with the following +parameters: + +- `rwo` - `namespace` - `cuda` - `v2` - `nfd` +- `rwo` - `cluster` - `cuda` - `v3` - `nfd` + +And: + +```sh +make deploy-examples-nfd-rocm +``` + +This runs the `examples/generate-files.sh` script twice, with the following +parameters: + +- `rwo` - `namespace` - `rocm` - `v2` - `nfd` +- `rwo` - `cluster` - `rocm` - `v3` - `nfd` + +To unwind the deployments, run either: + +```sh +make undeploy-examples-nfd-cuda +``` + +Or: + +```sh +make undeploy-examples-nfd-rocm +``` + +## Custom Example Deployments + +There are too many deployment scenarios to have Makefile cover all of them. +The `examples/generate-files.sh` script can be called directly. +The input parameters are in fixed locations and all are required except +ENVIRONMENT, which is optional. + +The help text associated with the script describes how it should be used: + +```sh +$ ./examples/generate-files.sh --help + +./generate-files.sh will generate a yaml file from the base files + and the input which can then be applied to a Kubernetes cluster. + Generated filename is printed from script and files can be found + in the "output/" directory. +Syntax: + ./generate-files.sh [] +Where: + is "rox" or "rwo" and required. + is "namespace", "ns", "cluster" or "cl" and required. + is "cuda" or "rocm" and required. + is "v2" or "v3" and required. + is "kind" or "nfd" and optional. 
+Samples: + ./generate-files.sh rwo namespace rocm v3 kind + ./generate-files.sh rox cluster cuda v2 nfd + ./generate-files.sh rox ns rocm v3 +``` + +Then run the script with the parameters as needed: + +```sh +$ ./generate-files.sh rwo namespace rocm v3 kind +output/rwo-ns-rocm-v3-kind.yaml +``` + +Then apply the output file to Kubernetes cluster when ready: + +```sh +kubectl apply -f output/rwo-ns-rocm-v3-kind.yaml +``` + +The `examples/generate-files.sh` script can also be controlled with some Environment +Variables. + +- `DEBUG`: Script will also print the generated output file before exiting. + Helpful for examining the yaml before applying to Kubernetes cluster. +- `CUSTOM_AFFINITY`: The location of a file containing the JSON for custom + Affinity that will be applied to GKMCache/ClusterGKMCache and Pod/DaemonSet. + This is used in a `kustomize` patch. + Example is provided in `examples/patch/affinity-nfd-cuda.txt`. +- `CUSTOM_TOLERATION`: The location of a file containing the JSON for custom + Toleration that will be applied to GKMCache/ClusterGKMCache and Pod/DaemonSet. + This is used in a `kustomize` patch. + Example is provided in `examples/patch/toleration-nfd-cuda.txt`. +- `CUSTOM_NODE_SELECTOR_1`-`CUSTOM_NODE_SELECTOR_3`: The location of a file + containing the JSON for custom NodeSelector that will be applied to + Pod/DaemonSet. + CUSTOM_NODE_SELECTOR_1 applies to Pod-1/DaemonSet-1, CUSTOM_NODE_SELECTOR_2 + applies to Pod-2/DaemonSet-2, and CUSTOM_NODE_SELECTOR_3 applies to + Pod-3/DaemonSet-3. + These are used in `kustomize` patches. + Example is provided in `examples/patch/node-selector-kind-true.txt`. + +**NOTE:** The spacing in the custom files is important. +The content of the files is being piped directly into the generated +`kustomization.yaml` files that contain the patches applied to `kustomize`. +If an error occurs while running `examples/generate-files.sh`, like the +following, it is probably a spacing problem. 
+ + + +```sh +$ ./generate-files.sh rox cluster rocm v3 kind +Error: accumulating components: accumulateDirectory: "recursed accumulation of path '/home/bmcfall/src/GKM/examples/variants/scope/cluster': trouble configuring builtin PatchTransformer with config: `\npatch: |-\n # Overwrite the OCI Image in the ClusterGKMCache with the CUDA/ROCm and V2/V3 tag. Whole image, not just tag overwritten\n - op: replace\n path: /spec/image\n value: quay.io/gkm/cache-examples:vector-add-cache-rocm\n\n # Add Cosign Version Label to ClusterGKMCache\n - op: add\n path: /metadata/labels\n value: {}\n - op: add\n path: /metadata/labels/gkm.io~1signature-format\n value: cosign-v3\n\n # Overwrite the namespaces to the `spec.workloadNamespaces` slice in the ClusterGKMCache\n - op: replace\n path: /spec/workloadNamespaces/0\n value: gkm-test-ns-1-rox-cluster-rocm-v3\n - op: replace\n path: /spec/workloadNamespaces/1\n value: gkm-test-ns-2-rox-cluster-rocm-v3- op: add path: /spec/accessModes/- value: ReadOnlyMany\ntarget:\n kind: ClusterGKMCache\n name: gkm-test-obj\n`: unable to parse SM or JSON patch from [patch: \"# Overwrite the OCI Image in the ClusterGKMCache with the CUDA/ROCm and V2/V3 tag. Whole image, not just tag overwritten\\n- op: replace\\n path: /spec/image\\n value: quay.io/gkm/cache-examples:vector-add-cache-rocm\\n\\n# Add Cosign Version Label to ClusterGKMCache\\n- op: add\\n path: /metadata/labels\\n value: {}\\n- op: add\\n path: /metadata/labels/gkm.io~1signature-format\\n value: cosign-v3\\n\\n# Overwrite the namespaces to the `spec.workloadNamespaces` slice in the ClusterGKMCache\\n- op: replace\\n path: /spec/workloadNamespaces/0\\n value: gkm-test-ns-1-rox-cluster-rocm-v3\\n- op: replace\\n path: /spec/workloadNamespaces/1\\n value: gkm-test-ns-2-rox-cluster-rocm-v3- op: add path: /spec/accessModes/- value: ReadOnlyMany\"]" +``` + + +Try to use the files already in the `examples/patch/` directory as examples. 
+If error occurs, the script probably generated an invalid `kustomization.yaml` +and the error was when `kustomize` tried to process it. +Examine the generated `kustomization.yaml` files in `examples/variants/`. + +- `variants/access/rox/kustomization.yaml` +- `variants/access/rwo/kustomization.yaml` +- `variants/scope/cluster/kustomization.yaml` +- `variants/scope/namespace/kustomization.yaml` + +Below is an example of running the script with some of the control variables: + + + +```sh +CUSTOM_AFFINITY=patch/affinity-nfd-rocm.txt DEBUG=true ./generate-files.sh rwo namespace rocm v3 +``` + diff --git a/docs/GettingStartedGuide.md b/docs/GettingStartedGuide.md index b29dcffb..95ec3b03 100644 --- a/docs/GettingStartedGuide.md +++ b/docs/GettingStartedGuide.md @@ -73,9 +73,9 @@ Check the GKM installed pods: ```sh $ kubectl get pods -n gkm-system NAME READY STATUS RESTARTS AGE -gkm-agent-85lqg 1/1 Running 0 5m7s -gkm-agent-kzx6j 1/1 Running 0 5m7s -gkm-operator-7dc756c84b-2w74z 3/3 Running 0 5m7s +gkm-agent-7hvdw 1/1 Running 0 3m28s +gkm-agent-jk2l9 1/1 Running 0 3m28s +gkm-operator-6f4b9df6f6-p648s 1/1 Running 0 3m28s ``` To delete a `kind` cluster with a simulated GPU: @@ -84,114 +84,134 @@ To delete a `kind` cluster with a simulated GPU: make destroy-kind ``` -## Install Test Pod Using GKM +## Install Test Pod Using GKM on KIND Cluster -There are example yamls that creates `GKMCache` and `ClusterGKMCache` custom -resource (CR) instances, each of which points to an OCI Image with GPU Kernel -Cache. -See [./examples/](https://github.com/redhat-et/GKM/tree/main/examples). -Sample: +There are example yaml files that create `GKMCache` and `ClusterGKMCache` +custom resource (CR) instances, each of which points to an OCI Image with GPU +Kernel Cache. +([Examples Directory](./Examples.md)) explains in detail the layout of the +[./examples/](https://github.com/redhat-et/GKM/tree/main/examples) files and how +to properly deploy them in different environments. 
+Example (`cat examples/base/scope/namespace/gkmcache.yaml`): ```yaml apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v2 - namespace: gkm-test-ns-scoped-1 - labels: - gkm.io/signature-format: cosign-v2 + name: gkm-test-obj + namespace: gkm-test-ns-1 spec: image: quay.io/gkm/cache-examples:vector-add-cache-rocm-v2 - storageClassName: standard + accessModes: + - ReadWriteOnce ``` -The example yaml also includes a test pod that references the `PVC` that is -create via GKM, which is just the same name as the `GKMCache` CR instance. -Example: +The example yaml also includes several test pods that reference the `PVC` that +is created via GKM, which is just the same name as the `GKMCache` CR instance. + +Example (`cat examples/base/access/rox/pod-1.yaml`): ```yaml kind: Pod apiVersion: v1 metadata: name: gkm-test-pod-1 - namespace: gkm-test-ns-scoped-1 + namespace: gkm-test-ns-1 spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" + securityContext: + fsGroup: 1000 containers: - - name: test - : - volumeMounts: - - name: kernel-volume - mountPath: "/cache" - volumes: - volumes: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2 + mountPath: /cache + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj ``` Pod Spec Highlights: - The `volumes:` named `kernel-volume` references a PVC via `persistentVolumeClaim:` and references the GKM Cache CR via - `claimName: vector-add-cache-rocm-v2`. + `claimName: gkm-test-obj`. - The `volumeMounts:` named `kernel-volume` maps the GPU Kernel Cache to the directory `/cache` within the pod. -Because of the Node Selector, the test pod will be launched on node -`kind-gpu-sim-worker`. 
Determine the GKM Agent instant running on this node: +Now the example yamls can be applied: - - ```sh -$ kubectl get pods -n gkm-system -o wide -NAME READY STATUS RESTARTS AGE IP NODE -gkm-agent-85lqg 1/1 Running 0 5m7s 10.244.2.4 kind-gpu-sim-worker <-- HERE -gkm-agent-kzx6j 1/1 Running 0 5m7s 10.244.1.5 kind-gpu-sim-worker2 -gkm-operator-7dc756c84b-2w74z 3/3 Running 0 5m7s 10.244.0.5 kind-gpu-sim-control-plane +make deploy-examples-kind ``` - -Now the example yaml can be applied: - -```sh -make deploy-examples -``` - -The test pods `gkm-test-pod-*` should be running and the cache should be volume -mounted in the pods: +The test pods `gkm-test-ns-*` should be running and the cache should be volume +mounted in the pods. +Note: The `Completed` pods are Kubernetes Jobs that GKM created to download and +extract the OCI Image into a PVC. ```sh $ kubectl get pods -A -NAMESPACE NAME READY STATUS RESTARTS AGE +NAMESPACE NAME READY STATUS RESTARTS AGE +cert-manager cert-manager-7d75c44448-dsz84 1/1 Running 0 5m44s +cert-manager cert-manager-cainjector-798687f777-phhld 1/1 Running 0 5m43s +cert-manager cert-manager-webhook-6b7cdfdf8b-5lr5q 1/1 Running 0 5m43s +gkm-system gkm-agent-7hvdw 1/1 Running 0 5m31s +gkm-system gkm-agent-jk2l9 1/1 Running 0 5m31s +gkm-system gkm-operator-6f4b9df6f6-p648s 1/1 Running 0 5m31s +gkm-test-ns-1-rox-cluster-rocm-v2 gkm-test-obj-rox-cluster-rocm-v29x4md-lvntz 0/1 Completed 0 96s +gkm-test-ns-1-rox-cluster-rocm-v2 gkm-test-pod-1-rox-cluster-rocm-v2 1/1 Running 0 98s +gkm-test-ns-1-rox-cluster-rocm-v2 gkm-test-pod-2-rox-cluster-rocm-v2 1/1 Running 0 98s +gkm-test-ns-1-rox-namespace-rocm-v3 gkm-test-obj-rox-namespace-rocm-v37pbpx-szc2q 0/1 Completed 0 100s +gkm-test-ns-1-rox-namespace-rocm-v3 gkm-test-pod-1-rox-namespace-rocm-v3 1/1 Running 0 100s +gkm-test-ns-1-rox-namespace-rocm-v3 gkm-test-pod-2-rox-namespace-rocm-v3 1/1 Running 0 100s +gkm-test-ns-1-rox-namespace-rocm-v3 gkm-test-pod-3-rox-namespace-rocm-v3 1/1 Running 0 100s 
+gkm-test-ns-1-rwo-cluster-rocm-v3 gkm-test-ds-1-rwo-cluster-rocm-v3-gtk5x 1/1 Running 0 101s +gkm-test-ns-1-rwo-cluster-rocm-v3 gkm-test-ds-1-rwo-cluster-rocm-v3-j8lmk 1/1 Running 0 101s +gkm-test-ns-1-rwo-cluster-rocm-v3 gkm-test-ds-2-rwo-cluster-rocm-v3-bf8w9 1/1 Running 0 102s +gkm-test-ns-1-rwo-cluster-rocm-v3 gkm-test-obj-rwo-cluster-rocm-v3-286ba108chcrl-dnwzb 0/1 Completed 0 102s +gkm-test-ns-1-rwo-cluster-rocm-v3 gkm-test-obj-rwo-cluster-rocm-v3-c6f37497qjl5g-l9cbl 0/1 Completed 0 102s +gkm-test-ns-1-rwo-namespace-rocm-v2 gkm-test-ds-1-rwo-namespace-rocm-v2-bnn8r 1/1 Running 0 104s +gkm-test-ns-1-rwo-namespace-rocm-v2 gkm-test-ds-1-rwo-namespace-rocm-v2-srd2f 1/1 Running 0 104s +gkm-test-ns-1-rwo-namespace-rocm-v2 gkm-test-ds-2-rwo-namespace-rocm-v2-dm7jb 1/1 Running 0 104s +gkm-test-ns-1-rwo-namespace-rocm-v2 gkm-test-ds-3-rwo-namespace-rocm-v2-r54df 1/1 Running 0 104s +gkm-test-ns-1-rwo-namespace-rocm-v2 gkm-test-obj-rwo-namespace-rocm-v2-7529441aghdw5-pdblk 0/1 Completed 0 104s +gkm-test-ns-1-rwo-namespace-rocm-v2 gkm-test-obj-rwo-namespace-rocm-v2-b6c984234brgh-tk6q7 0/1 Completed 0 104s +gkm-test-ns-2-rox-cluster-rocm-v2 gkm-test-obj-rox-cluster-rocm-v2wdgn4-rsrm6 0/1 Completed 0 96s +gkm-test-ns-2-rox-cluster-rocm-v2 gkm-test-pod-3-rox-cluster-rocm-v2 1/1 Running 0 98s +gkm-test-ns-2-rwo-cluster-rocm-v3 gkm-test-ds-3-rwo-cluster-rocm-v3-7j82t 1/1 Running 0 101s +gkm-test-ns-2-rwo-cluster-rocm-v3 gkm-test-obj-rwo-cluster-rocm-v3-8724a7b7vjg8h-plxfk 0/1 Completed 0 101s +gkm-test-ns-2-rwo-cluster-rocm-v3 gkm-test-obj-rwo-cluster-rocm-v3-9dde6ea5lhtwj-hxh5h 0/1 Completed 0 102s : -gkm-system gkm-agent-85lqg 1/1 Running 0 41s -gkm-system gkm-agent-kzx6j 1/1 Running 0 41s -gkm-system gkm-operator-7dc756c84b-2w74z 3/3 Running 0 41s -gkm-test-cl-scoped gkm-test-pod-1 1/1 Running 0 19s -gkm-test-cl-scoped gkm-test-pod-2 1/1 Running 0 19s -gkm-test-cl-scoped gkm-test-pod-3 1/1 Running 0 19s -gkm-test-ns-scoped-1 gkm-test-pod-1 1/1 Running 0 22s 
-gkm-test-ns-scoped-1 gkm-test-pod-2 1/1 Running 0 22s -gkm-test-ns-scoped-1 gkm-test-pod-3 1/1 Running 0 22s -gkm-test-ns-scoped-2 gkm-test-pod-1 1/1 Running 0 21s -gkm-test-ns-scoped-2 gkm-test-pod-2 1/1 Running 0 21s -gkm-test-ns-scoped-2 gkm-test-pod-3 1/1 Running 0 21s +kyverno kyverno-admission-controller-578c64df84-gm9x9 1/1 Running 0 4m50s +kyverno kyverno-background-controller-66cb87dd88-p852k 1/1 Running 0 5m18s +kyverno kyverno-cleanup-controller-65b4494b5f-6rjlx 1/1 Running 0 5m18s +kyverno kyverno-reports-controller-db4986dc-2dq6w 1/1 Running 0 5m18s : -$ kubectl exec -it -n gkm-test-ns-scoped-1 gkm-test-pod-1 -- sh -sh-5.2# ls /cache +$ kubectl exec -it -n gkm-test-ns-1-rox-cluster-rocm-v2 gkm-test-pod-1-rox-cluster-rocm-v2 -c test -- sh +sh-5.3$ ls /cache CETLGDE7YAKGU4FRJ26IM6S47TFSIUU7KWBWDR3H2K3QRNRABUCA MCELTMXFCSPAMZYLZ3C3WPPYYVTVR4QOYNE52X3X6FIH7Z6N6X5A CHN6BLIJ7AJJRKY2IETERW2O7JXTFBUD3PH2WE3USNVKZEKXG64Q c4d45c651d6ac181a78d8d2f3ead424b8b8f07dd23dc3de0a99f425d8a633fc6 ``` +To remove the example yamls: + +```sh +make undeploy-examples-kind +``` + ## Build and Run Private GKM Build By default, `Makefile` defaults to `quay.io/gkm/*` for pushing and pulling.
diff --git a/examples/.gitignore b/examples/.gitignore new file mode 100644 index 00000000..8950abb6 --- /dev/null +++ b/examples/.gitignore @@ -0,0 +1,11 @@ +# Ignore generated yaml files +base/common/namespace-1.yaml +base/scope/cluster/namespace-2.yaml +overlays/access/*.yaml +overlays/scope/*.yaml +output/*.yaml +variants/access/rox/*.yaml +variants/access/rwo/*.yaml +variants/scope/cluster/*.yaml +variants/scope/namespace/*.yaml +.gkm-generate-files.exclusivelock diff --git a/examples/base/access/rox/kustomization.yaml b/examples/base/access/rox/kustomization.yaml new file mode 100644 index 00000000..78e4289e --- /dev/null +++ b/examples/base/access/rox/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - pod-1.yaml + - pod-2.yaml + - pod-3.yaml diff --git a/examples/base/access/rox/pod-1.yaml b/examples/base/access/rox/pod-1.yaml new file mode 100644 index 00000000..09928da7 --- /dev/null +++ b/examples/base/access/rox/pod-1.yaml @@ -0,0 +1,25 @@ +--- +kind: Pod +apiVersion: v1 +metadata: + name: gkm-test-pod-1 + namespace: gkm-test-ns-1 +spec: + securityContext: + fsGroup: 1000 + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: + - name: kernel-volume + mountPath: /cache + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj diff --git a/examples/base/access/rox/pod-2.yaml b/examples/base/access/rox/pod-2.yaml new file mode 100644 index 00000000..e7629638 --- /dev/null +++ b/examples/base/access/rox/pod-2.yaml @@ -0,0 +1,25 @@ +--- +kind: Pod +apiVersion: v1 +metadata: + name: gkm-test-pod-2 + namespace: gkm-test-ns-1 +spec: + securityContext: + fsGroup: 1000 + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + 
securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: + - name: kernel-volume + mountPath: /cache + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj diff --git a/examples/base/access/rox/pod-3.yaml b/examples/base/access/rox/pod-3.yaml new file mode 100644 index 00000000..37e2c04c --- /dev/null +++ b/examples/base/access/rox/pod-3.yaml @@ -0,0 +1,25 @@ +--- +kind: Pod +apiVersion: v1 +metadata: + name: gkm-test-pod-3 + namespace: gkm-test-ns-2 +spec: + securityContext: + fsGroup: 1000 + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: + - name: kernel-volume + mountPath: /cache + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj diff --git a/examples/base/access/rwo/ds-1.yaml b/examples/base/access/rwo/ds-1.yaml new file mode 100644 index 00000000..091bbd25 --- /dev/null +++ b/examples/base/access/rwo/ds-1.yaml @@ -0,0 +1,37 @@ +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: gkm-test-ds-1 + namespace: gkm-test-ns-1 + labels: + gkm.io/pvc-mutation: "true" +spec: + selector: + matchLabels: + name: gkm-test-ds-1 + template: + metadata: + labels: + name: gkm-test-ds-1 + gkm.io/pvc-mutation: "true" + spec: + securityContext: + fsGroup: 1000 + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj diff --git a/examples/base/access/rwo/ds-2.yaml b/examples/base/access/rwo/ds-2.yaml new file mode 100644 index 00000000..414b8b88 --- /dev/null +++ 
b/examples/base/access/rwo/ds-2.yaml @@ -0,0 +1,36 @@ +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: gkm-test-ds-2 + namespace: gkm-test-ns-1 + labels: + gkm.io/pvc-mutation: "true" +spec: + selector: + matchLabels: + name: gkm-test-ds-2 + template: + metadata: + labels: + name: gkm-test-ds-2 + gkm.io/pvc-mutation: "true" + spec: + securityContext: + fsGroup: 1000 + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: + - name: kernel-volume + mountPath: /cache + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj diff --git a/examples/base/access/rwo/ds-3.yaml b/examples/base/access/rwo/ds-3.yaml new file mode 100644 index 00000000..71689b15 --- /dev/null +++ b/examples/base/access/rwo/ds-3.yaml @@ -0,0 +1,36 @@ +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: gkm-test-ds-3 + namespace: gkm-test-ns-1 + labels: + gkm.io/pvc-mutation: "true" +spec: + selector: + matchLabels: + name: gkm-test-ds-3 + template: + metadata: + labels: + name: gkm-test-ds-3 + gkm.io/pvc-mutation: "true" + spec: + securityContext: + fsGroup: 1000 + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + volumeMounts: + - name: kernel-volume + mountPath: /cache + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: gkm-test-obj diff --git a/examples/base/access/rwo/kustomization.yaml b/examples/base/access/rwo/kustomization.yaml new file mode 100644 index 00000000..d2b79dcc --- /dev/null +++ b/examples/base/access/rwo/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ds-1.yaml + - ds-2.yaml + - ds-3.yaml diff --git 
a/examples/base/common/kustomization.yaml b/examples/base/common/kustomization.yaml new file mode 100644 index 00000000..3a108ab9 --- /dev/null +++ b/examples/base/common/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace-1.yaml diff --git a/examples/namespace/ROX/10-namespace.yaml b/examples/base/common/namespace-1.env similarity index 63% rename from examples/namespace/ROX/10-namespace.yaml rename to examples/base/common/namespace-1.env index 34dd4955..004353d4 100644 --- a/examples/namespace/ROX/10-namespace.yaml +++ b/examples/base/common/namespace-1.env @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: gkm-test-ns-rox-1 + name: NAMESPACE_1 diff --git a/examples/base/scope/cluster/clustergkmcache.yaml b/examples/base/scope/cluster/clustergkmcache.yaml new file mode 100644 index 00000000..a9243f64 --- /dev/null +++ b/examples/base/scope/cluster/clustergkmcache.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: gkm.io/v1alpha1 +kind: ClusterGKMCache +metadata: + name: gkm-test-obj +spec: + image: quay.io/gkm/cache-examples:replace + workloadNamespaces: + - gkm-test-ns-1 + - gkm-test-ns-2 + accessModes: + - ReadWriteOnce diff --git a/examples/base/scope/cluster/kustomization.yaml b/examples/base/scope/cluster/kustomization.yaml new file mode 100644 index 00000000..0627adfb --- /dev/null +++ b/examples/base/scope/cluster/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace-2.yaml + - clustergkmcache.yaml diff --git a/examples/namespace/RWO/10-namespace.yaml b/examples/base/scope/cluster/namespace-2.env similarity index 63% rename from examples/namespace/RWO/10-namespace.yaml rename to examples/base/scope/cluster/namespace-2.env index bc47b15b..0d019ea8 100644 --- a/examples/namespace/RWO/10-namespace.yaml +++ b/examples/base/scope/cluster/namespace-2.env @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - 
name: gkm-test-ns-rwo-1 + name: NAMESPACE_2 diff --git a/examples/base/scope/namespace/gkmcache.yaml b/examples/base/scope/namespace/gkmcache.yaml new file mode 100644 index 00000000..fa778c5c --- /dev/null +++ b/examples/base/scope/namespace/gkmcache.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: gkm.io/v1alpha1 +kind: GKMCache +metadata: + name: gkm-test-obj + namespace: gkm-test-ns-1 +spec: + image: quay.io/gkm/cache-examples:vector-add-cache-rocm-v2 + accessModes: + - ReadWriteOnce diff --git a/examples/base/scope/namespace/kustomization.yaml b/examples/base/scope/namespace/kustomization.yaml new file mode 100644 index 00000000..166f089b --- /dev/null +++ b/examples/base/scope/namespace/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - gkmcache.yaml diff --git a/examples/cleanup-files.sh b/examples/cleanup-files.sh new file mode 100755 index 00000000..f05693ef --- /dev/null +++ b/examples/cleanup-files.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +CALL_POPD=false +if [[ "$PWD" != */examples ]]; then + pushd examples &>/dev/null + if [[ $? 
-ne 0 ]]; then + echo "ERROR: Must run from \"./GKM\" or \"./GKM/examples\"" + exit 1 + fi + CALL_POPD=true +fi + +rm -f base/common/namespace-1.yaml +rm -f base/scope/cluster/namespace-2.yaml +rm -f overlays/access/*.yaml +rm -f overlays/scope/*.yaml +rm -f output/*.yaml +rm -f variants/access/rox/*.yaml +rm -f variants/access/rwo/*.yaml +rm -f variants/scope/cluster/*.yaml +rm -f variants/scope/namespace/*.yaml + +rmdir --ignore-fail-on-non-empty .gkm-generate-files.exclusivelock &>/dev/null + +if [[ "$CALL_POPD" == true ]]; then + popd &>/dev/null || exit +fi diff --git a/examples/cluster/ROX/10-namespace.yaml b/examples/cluster/ROX/10-namespace.yaml deleted file mode 100644 index bf1b9f53..00000000 --- a/examples/cluster/ROX/10-namespace.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: gkm-test-cl-rox-1 ---- -apiVersion: v1 -kind: Namespace -metadata: - name: gkm-test-cl-rox-2 diff --git a/examples/cluster/ROX/11-clustergkmcache.yaml b/examples/cluster/ROX/11-clustergkmcache.yaml deleted file mode 100644 index 1a57447b..00000000 --- a/examples/cluster/ROX/11-clustergkmcache.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -apiVersion: gkm.io/v1alpha1 -kind: ClusterGKMCache -metadata: - name: vector-add-cache-rocm-v2-rox - labels: - gkm.io/signature-format: cosign-v2 -spec: - image: quay.io/gkm/cache-examples:vector-add-cache-rocm-v2 - workloadNamespaces: - - gkm-test-cl-rox-1 - - gkm-test-cl-rox-2 - accessModes: - - ReadWriteOnce - - ReadOnlyMany - - # Toleration is specific to the KIND cluster being deployed. 
- podTemplate: - spec: - tolerations: - - key: gpu - operator: Exists - effect: NoSchedule diff --git a/examples/cluster/ROX/12-pod.yaml b/examples/cluster/ROX/12-pod.yaml deleted file mode 100644 index 124b3343..00000000 --- a/examples/cluster/ROX/12-pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -kind: Pod -apiVersion: v1 -metadata: - name: gkm-test-cl-rox-pod-1 - namespace: gkm-test-cl-rox-1 -spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. - initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rox diff --git a/examples/cluster/ROX/13-pod.yaml b/examples/cluster/ROX/13-pod.yaml deleted file mode 100644 index 2e35df21..00000000 --- a/examples/cluster/ROX/13-pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -kind: Pod -apiVersion: v1 -metadata: - name: gkm-test-cl-rox-pod-2 - namespace: gkm-test-cl-rox-1 -spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rox diff --git a/examples/cluster/ROX/14-pod.yaml b/examples/cluster/ROX/14-pod.yaml deleted file mode 100644 index 6085704f..00000000 --- a/examples/cluster/ROX/14-pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -kind: Pod -apiVersion: v1 -metadata: - name: gkm-test-cl-rox-pod-1 - namespace: gkm-test-cl-rox-2 -spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. - initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rox diff --git a/examples/cluster/RWO/10-namespace.yaml b/examples/cluster/RWO/10-namespace.yaml deleted file mode 100644 index 799ef127..00000000 --- a/examples/cluster/RWO/10-namespace.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: gkm-test-cl-rwo-1 ---- -apiVersion: v1 -kind: Namespace -metadata: - name: gkm-test-cl-rwo-2 diff --git a/examples/cluster/RWO/11-clustergkmcache.yaml 
b/examples/cluster/RWO/11-clustergkmcache.yaml deleted file mode 100644 index 6d9e0c1f..00000000 --- a/examples/cluster/RWO/11-clustergkmcache.yaml +++ /dev/null @@ -1,20 +0,0 @@ ---- -apiVersion: gkm.io/v1alpha1 -kind: ClusterGKMCache -metadata: - name: vector-add-cache-rocm-v2-rwo -spec: - image: quay.io/gkm/cache-examples:vector-add-cache-rocm-v2 - workloadNamespaces: - - gkm-test-cl-rwo-1 - - gkm-test-cl-rwo-2 - accessModes: - - ReadWriteOnce - - # Toleration is specific to the KIND cluster being deployed. - podTemplate: - spec: - tolerations: - - key: gpu - operator: Exists - effect: NoSchedule diff --git a/examples/cluster/RWO/12-ds.yaml b/examples/cluster/RWO/12-ds.yaml deleted file mode 100644 index e2042222..00000000 --- a/examples/cluster/RWO/12-ds.yaml +++ /dev/null @@ -1,52 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-cl-rwo-ds-1 - namespace: gkm-test-cl-rwo-1 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-cl-rwo-ds-1 - template: - metadata: - labels: - name: gkm-test-cl-rwo-ds-1 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo diff --git a/examples/cluster/RWO/13-ds.yaml b/examples/cluster/RWO/13-ds.yaml deleted file mode 100644 index cac786ec..00000000 --- a/examples/cluster/RWO/13-ds.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-cl-rwo-ds-2 - namespace: gkm-test-cl-rwo-1 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-cl-rwo-ds-1 - template: - metadata: - labels: - name: gkm-test-cl-rwo-ds-1 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - nodeSelector: - gkm-test-node: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo diff --git a/examples/cluster/RWO/14-ds.yaml b/examples/cluster/RWO/14-ds.yaml deleted file mode 100644 index e3ff1cf9..00000000 --- a/examples/cluster/RWO/14-ds.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-cl-rwo-ds-1 - namespace: gkm-test-cl-rwo-2 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-cl-rwo-ds-2 - template: - metadata: - labels: - name: gkm-test-cl-rwo-ds-2 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - nodeSelector: - gkm-test-node: "false" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo diff --git a/examples/cluster/RWO/21-clustergkmcache-cosign-v3.yaml b/examples/cluster/RWO/21-clustergkmcache-cosign-v3.yaml deleted file mode 100644 index d04642b9..00000000 --- a/examples/cluster/RWO/21-clustergkmcache-cosign-v3.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -apiVersion: gkm.io/v1alpha1 -kind: ClusterGKMCache -metadata: - name: vector-add-cache-rocm-v3-rwo -spec: - image: quay.io/gkm/cache-examples:vector-add-cache-rocm - workloadNamespaces: - - gkm-test-cl-rwo-2 - accessModes: - - ReadWriteOnce - - # Toleration is specific to the KIND cluster being deployed. - podTemplate: - spec: - tolerations: - - key: gpu - operator: Exists - effect: NoSchedule diff --git a/examples/cluster/RWO/22-ds.yaml b/examples/cluster/RWO/22-ds.yaml deleted file mode 100644 index 2285e6c5..00000000 --- a/examples/cluster/RWO/22-ds.yaml +++ /dev/null @@ -1,52 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-cl-rwo-v3-ds-1 - namespace: gkm-test-cl-rwo-2 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-cl-rwo-v3-ds-1 - template: - metadata: - labels: - name: gkm-test-cl-rwo-v3-ds-1 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v3-rwo diff --git a/examples/generate-files.sh b/examples/generate-files.sh new file mode 100755 index 00000000..189b0a3b --- /dev/null +++ b/examples/generate-files.sh @@ -0,0 +1,565 @@ +#!/bin/bash + +# Filter out help if it was entered +if [[ "$1" == "help" || "$1" == "-help" || "$1" == "--help" ]]; then + echo "" + echo "./generate-files.sh will generate a yaml file from the base files" + echo " and the input which can then be applied to a Kubernetes cluster." + echo " Generated filename is printed from script and files can be found" + echo " in the \"output/\" directory." + echo "Syntax:" + echo " ./generate-files.sh <ACCESS> <SCOPE> <GPU_ARCH> <COSIGN_VERSION> [<ENVIRONMENT>]" + echo "Where:" + echo " <ACCESS> is \"rox\" or \"rwo\" and required." + echo " <SCOPE> is \"namespace\", \"ns\", \"cluster\" or \"cl\" and required." + echo " <GPU_ARCH> is \"cuda\" or \"rocm\" and required." + echo " <COSIGN_VERSION> is \"v2\" or \"v3\" and required." + echo " <ENVIRONMENT> is \"kind\" or \"nfd\" and optional." + echo "Samples:" + echo " ./generate-files.sh rwo namespace rocm v3 kind" + echo " ./generate-files.sh rox cluster cuda v2 nfd" + echo " ./generate-files.sh rox ns rocm v3" + echo "" + exit 0 +fi + + +CALL_POPD=false +if [[ "$PWD" != */examples ]]; then + pushd examples &>/dev/null + if [[ $? -ne 0 ]]; then + echo "ERROR: Must run from \"./GKM\" or \"./GKM/examples\"" + exit 1 + fi + CALL_POPD=true +fi + + +# Lock so only one instance of script is run at a time +LOCK_DIR=.gkm-generate-files.exclusivelock +if ! 
mkdir ${LOCK_DIR} &>/dev/null; then + echo "Could not get lock, retry" + + if [[ "$CALL_POPD" == true ]]; then + popd &>/dev/null || exit + fi + + exit 1 +fi +trap 'rmdir ${LOCK_DIR} &>/dev/null; exit' INT TERM EXIT + + +# +# Setup tools +# + +# On macOS, check if gsed (GNU sed) is installed +if command -v gsed >/dev/null 2>&1; then + SED="gsed" +else + # Fallback to macOS default sed (BSD) + SED="sed" +fi + +KUSTOMIZE=../bin/kustomize +if ! command -v ${KUSTOMIZE} >/dev/null 2>&1; then + echo "Error: ${KUSTOMIZE} not installed. Run 'make kustomize' to install." + exit 1 +fi + + +# +# Process Input Variables +# +ACCESS=$1 +SCOPE=$2 +GPU_ARCH=$3 +COSIGN_VERSION=$4 +ENVIRONMENT=$5 + +# Overridable Input Variables +CUSTOM_AFFINITY=${CUSTOM_AFFINITY:-""} +CUSTOM_TOLERATION=${CUSTOM_TOLERATION:-""} +DEBUG=${DEBUG:-false} +# Node Selector: +# CUSTOM_NODE_SELECTOR_1 is for Pod 1 or DaemonSet 1 +# CUSTOM_NODE_SELECTOR_2 is for Pod 2 or DaemonSet 2 +# CUSTOM_NODE_SELECTOR_3 is for Pod 3 or DaemonSet 3 +CUSTOM_NODE_SELECTOR_1=${CUSTOM_NODE_SELECTOR_1:-""} +CUSTOM_NODE_SELECTOR_2=${CUSTOM_NODE_SELECTOR_2:-""} +CUSTOM_NODE_SELECTOR_3=${CUSTOM_NODE_SELECTOR_3:-""} + +# Constants +BASE_DIR_COMMON="base/common" +OUTPUT_DIR="output" +OVERLAY_DIR_ACCESS="overlays/access" +OVERLAY_DIR_SCOPE="overlays/scope" +AFFINITY_NFD_CUDA_FILE="patch/affinity-nfd-cuda.txt" +AFFINITY_NFD_ROCM_FILE="patch/affinity-nfd-rocm.txt" +NODE_SELECTOR_KIND_TRUE_FILE="patch/node-selector-kind-true.txt" +NODE_SELECTOR_KIND_FALSE_FILE="patch/node-selector-kind-false.txt" +TOLERATION_KIND_FILE="patch/toleration-kind.txt" +TOLERATION_NFD_CUDA_FILE="patch/toleration-nfd-cuda.txt" + +# AccessMode of the PVC, valid values: rox (ReadOnlyMany) or rwo (ReadWriteOnce) +if [[ "$ACCESS" == "rox" ]]; then + BASE_DIR_ACCESS="base/access/rox" + VARIANTS_DIR_ACCESS="variants/access/rox" +elif [[ "$ACCESS" == "rwo" ]]; then + BASE_DIR_ACCESS="base/access/rwo" + VARIANTS_DIR_ACCESS="variants/access/rwo" +else + echo "ERROR: 
Parameter 1 (ACCESS) must be \"rox\" or \"rwo\"." + exit 1 +fi + +# Scope of the GKM Cache (GKMCache or ClusterGKMCache), valid values: cluster or namespace +if [[ "$SCOPE" == "cluster" || "$SCOPE" == "cl" ]]; then + BASE_DIR_SCOPE="base/scope/cluster" + VARIANTS_DIR_SCOPE="variants/scope/cluster" + SCOPE="cl" +elif [[ "$SCOPE" == "namespace" || "$SCOPE" == "ns" ]]; then + BASE_DIR_SCOPE="base/scope/namespace" + VARIANTS_DIR_SCOPE="variants/scope/namespace" + SCOPE="ns" +else + echo "ERROR: Parameter 2 (SCOPE) must be \"cluster\", \"cl\", \"namespace\" or \"ns\"." + exit 1 +fi + +# GPU Architecture, valid values: cuda or rocm +if [[ "$GPU_ARCH" != "cuda" && "$GPU_ARCH" != "rocm" ]]; then + echo "ERROR: Parameter 3 (GPU_ARCH) must be \"cuda\" or \"rocm\"." + exit 1 +fi + +# CoSign Version used to sign OCI Image, valid values: v2 or v3 +if [[ "$COSIGN_VERSION" != "v2" && "$COSIGN_VERSION" != "v3" ]]; then + echo "ERROR: Parameter 4 (COSIGN_VERSION) must be \"v2\" or \"v3\"." + exit 1 +fi + +# Environment is to indicate KIND Cluster, valid values: kind +if [ -n "${ENVIRONMENT+x}" ]; then + if [[ "$ENVIRONMENT" == "kind" ]]; then + ENV_FILENAME_SUFFIX="-kind" + if [[ "$GPU_ARCH" != "rocm" ]]; then + echo "ERROR: KIND Cluster is currently only deployed with simulated ROCm GPUs." + exit 1 + fi + elif [[ "$ENVIRONMENT" == "nfd" ]]; then + ENV_FILENAME_SUFFIX="-nfd" + elif [[ "$ENVIRONMENT" != "" ]]; then + echo "ERROR: Parameter 5 (ENVIRONMENT) must be \"kind\", \"nfd\" or not specified." 
+ exit 1 + fi +fi + +# Generic Variables based on input +NAME_SUFFIX="${ACCESS}-${SCOPE}-${GPU_ARCH}-${COSIGN_VERSION}" +OBJECT_NAME="gkm-test-obj-${NAME_SUFFIX}" +if [[ "$COSIGN_VERSION" == "v2" ]]; then + OCI_IMAGE="quay.io/gkm/cache-examples:vector-add-cache-${GPU_ARCH}-${COSIGN_VERSION}" +else + OCI_IMAGE="quay.io/gkm/cache-examples:vector-add-cache-${GPU_ARCH}" +fi +COSIGN_VERSION_LABEL="cosign-${COSIGN_VERSION}" +NAMESPACE_1="gkm-test-ns-1-${NAME_SUFFIX}" +if [[ "$SCOPE" == "ns" ]]; then + NAMESPACE_2=${NAMESPACE_1} +else + NAMESPACE_2="gkm-test-ns-2-${NAME_SUFFIX}" +fi + + +# +# Build overlays/scope/kustomization.yaml file with Namespace and GKMCache or ClusterGKMCache +# Build overlays/access/kustomization.yaml file with Pods or DaemonSets +# Broken into two files to control ordering of objects. +# +mkdir -p "${OVERLAY_DIR_SCOPE}" +cat <<EOF > ${OVERLAY_DIR_SCOPE}/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ../../${BASE_DIR_COMMON} +- ../../${BASE_DIR_SCOPE} + +components: +- ../../${VARIANTS_DIR_SCOPE} + +nameSuffix: -${NAME_SUFFIX} +EOF + +mkdir -p "${OVERLAY_DIR_ACCESS}" +cat <<EOF > ${OVERLAY_DIR_ACCESS}/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ../../${BASE_DIR_ACCESS} + +components: +- ../../${VARIANTS_DIR_ACCESS} + +nameSuffix: -${NAME_SUFFIX} +EOF + + +# +# Set Dynamic Patching. These patches are added based off input and inject a patch (via the +# Modify the Variants section below) to the Variant kustomization.env files by replacing a TAG +# with the generated patch or replacing the TAG with nothing if the patch is not needed. 
+# + +if [[ "$ENVIRONMENT" == "kind" ]]; then + # Where in a POD or DaemonSet Spec an initContainer is inserted + KIND_INIT_CONTAINER_PATH_POD="/spec/initContainers" + KIND_INIT_CONTAINER_PATH_DAEMON_SET="/spec/template/spec/initContainers" + + if [[ "$ACCESS" == "rox" ]]; then + # ReadOnlyMany (rox) implies a Pod is being deployed, so set the path properly + KIND_INIT_CONTAINER_PATH=${KIND_INIT_CONTAINER_PATH_POD} + elif [[ "$ACCESS" == "rwo" ]]; then + # ReadWriteOnce (rwo) implies a DaemonSet is being deployed, so set the path properly + KIND_INIT_CONTAINER_PATH=${KIND_INIT_CONTAINER_PATH_DAEMON_SET} + fi + + # KIND_INIT_CONTAINER holds a patch that is used to add an initContainer to Pods or DaemonSets + # when KIND Cluster is being used. It sets the permissions of the PVC VolumeMount so that the + # Pod can access it. Only needed in KIND. + KIND_INIT_CONTAINER=" # For KIND Cluster, add initContainer that sets the permissions on the PVC VolumeMount + - op: add + path: ${KIND_INIT_CONTAINER_PATH} + value: [] + - op: add + path: ${KIND_INIT_CONTAINER_PATH}/- + value: + name: fix-permissions + image: quay.io/fedora/fedora-minimal + securityContext: + runAsUser: 0 + command: + - sh + - -c + - | + chown -R 1000:1000 /cache + chmod -R 775 /cache + volumeMounts: + - name: kernel-volume + mountPath: /cache" + + # KIND_INIT_CONTAINER contains a multiline string, so special characters need + # to be stripped for sed to process properly. + ESCAPED_KIND_INIT_CONTAINER=$(printf '%s\n' "$KIND_INIT_CONTAINER" \ + | sed -e 's/[\/&]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') +fi + + +if [[ "$ACCESS" == "rox" ]]; then + # ACCESS_ROX_ACCESS_MODE holds a patch that is used to add ReadOnlyMany to the AccessMode field + # in a GKMCache or ClusterGKMCache. Kubernetes does not have a way to be queried to determine if + # ReadOnlyMany is supported by a StorageClass so GKM Operator/Agent need to be told. 
+ ACCESS_ROX_ACCESS_MODE=" # Append ReadOnlyMany to the spec.accessModes slice in the GKMCache or ClusterGKMCache + - op: add + path: /spec/accessModes/- + value: ReadOnlyMany" + + # ACCESS_ROX_ACCESS_MODE contains a multiline string, so special characters need + # to be stripped for sed to process properly. + ESCAPED_ACCESS_ROX_ACCESS_MODE=$(printf '%s\n' "$ACCESS_ROX_ACCESS_MODE" \ + | sed -e 's/[\/&]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') +fi + + +# Affinity: +if [[ "$CUSTOM_AFFINITY" != "" ]]; then + if [[ -r "${CUSTOM_AFFINITY}" ]]; then + AFFINITY_INSTANCE=$(cat "${CUSTOM_AFFINITY}") || { echo "Error: Failed to read file - CUSTOM_AFFINITY=${CUSTOM_AFFINITY}" >&2; exit 1; } + else + echo "Error: File does not exist - CUSTOM_AFFINITY=${CUSTOM_AFFINITY}" + exit 1 + fi +elif [[ "$ENVIRONMENT" == "nfd" ]]; then + if [[ "$GPU_ARCH" == "cuda" ]]; then + AFFINITY_INSTANCE=$(cat ${AFFINITY_NFD_CUDA_FILE}) + elif [[ "$GPU_ARCH" == "rocm" ]]; then + AFFINITY_INSTANCE=$(cat ${AFFINITY_NFD_ROCM_FILE}) + fi +fi + +# Tolerations: +if [[ "$CUSTOM_TOLERATION" != "" ]]; then + if [[ -r "${CUSTOM_TOLERATION}" ]]; then + TOLERATION_INSTANCE=$(cat "${CUSTOM_TOLERATION}") || { echo "Error: Failed to read file - CUSTOM_TOLERATION=${CUSTOM_TOLERATION}" >&2; exit 1; } + else + echo "Error: File does not exist - CUSTOM_TOLERATION=${CUSTOM_TOLERATION}" + exit 1 + fi +elif [[ "$ENVIRONMENT" == "kind" ]]; then + TOLERATION_INSTANCE=$(cat ${TOLERATION_KIND_FILE}) +elif [[ "$ENVIRONMENT" == "nfd" ]]; then + if [[ "$GPU_ARCH" == "cuda" ]]; then + TOLERATION_INSTANCE=$(cat ${TOLERATION_NFD_CUDA_FILE}) + fi +fi + +if [[ "$AFFINITY_INSTANCE" != "" || "$TOLERATION_INSTANCE" != "" ]]; then + POD_TEMPLATE_ADD_GKMCACHE=" # Add a Affinity/Toleration to GKMCache or ClusterGKMCache + - op: add + path: /spec/podTemplate + value: {} + - op: add + path: /spec/podTemplate/spec + value: {}" + + if [[ "$AFFINITY_INSTANCE" != "" ]]; then + POD_TEMPLATE_ADD_GKMCACHE+=" + - op: add + path: 
/spec/podTemplate/spec/affinity + value: +${AFFINITY_INSTANCE}" + fi + + if [[ "$TOLERATION_INSTANCE" != "" ]]; then + POD_TEMPLATE_ADD_GKMCACHE+=" + - op: add + path: /spec/podTemplate/spec/tolerations + value: [] + - op: add + path: /spec/podTemplate/spec/tolerations/- + value: +${TOLERATION_INSTANCE}" + fi + + # POD_TEMPLATE_ADD_GKMCACHE contains a multiline string, so it is escaped for use as a sed + # replacement: the delimiters (/ and @) and & are backslash-escaped, newlines become \n. + # ${SED} is preferred because the newline-joining one-liner is written in GNU sed syntax. + ESCAPED_POD_TEMPLATE_ADD_GKMCACHE=$(printf '%s\n' "$POD_TEMPLATE_ADD_GKMCACHE" \ + | ${SED:-sed} -e 's/[\/&@]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') + + + # Where in a POD or DaemonSet Spec the affinity/toleration is inserted + AFFINITY_PATH_POD="/spec/affinity" + AFFINITY_PATH_DAEMON_SET="/spec/template/spec/affinity" + TOLERATION_PATH_POD="/spec/tolerations" + TOLERATION_PATH_DAEMON_SET="/spec/template/spec/tolerations" + + if [[ "$ACCESS" == "rox" ]]; then + # ReadOnlyMany (rox) implies a Pod is being deployed, so set the path properly + AFFINITY_PATH=${AFFINITY_PATH_POD} + TOLERATION_PATH=${TOLERATION_PATH_POD} + elif [[ "$ACCESS" == "rwo" ]]; then + # ReadWriteOnce (rwo) implies a DaemonSet is being deployed, so set the path properly + AFFINITY_PATH=${AFFINITY_PATH_DAEMON_SET} + TOLERATION_PATH=${TOLERATION_PATH_DAEMON_SET} + fi + + if [[ "$AFFINITY_INSTANCE" != "" ]]; then + AFFINITY_ADD_POD_DS=" # Add a Affinity to Pod or DaemonSet + - op: add + path: ${AFFINITY_PATH} + value: +${AFFINITY_INSTANCE}" + + # AFFINITY_ADD_POD_DS contains a multiline string, so special characters need + # to be escaped for sed to process properly.
+ ESCAPED_AFFINITY_ADD_POD_DS=$(printf '%s\n' "$AFFINITY_ADD_POD_DS" \ + | sed -e 's/[\/&]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') + fi + + if [[ "$TOLERATION_INSTANCE" != "" ]]; then + TOLERATION_ADD_POD_DS=" # Add a Toleration to Pod or DaemonSet + - op: add + path: ${TOLERATION_PATH} + value: [] + - op: add + path: ${TOLERATION_PATH}/- + value: +${TOLERATION_INSTANCE}" + + # TOLERATION_ADD_POD_DS contains a multiline string, so special characters need + # to be stripped for sed to process properly. + ESCAPED_TOLERATION_ADD_POD_DS=$(printf '%s\n' "$TOLERATION_ADD_POD_DS" \ + | sed -e 's/[\/&]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') + fi +fi + + +# Node Selector: +# .._SELECTOR_1 is for Pod 1 or DaemonSet 1 +# .._SELECTOR_2 is for Pod 2 or DaemonSet 2 +# .._SELECTOR_3 is for Pod 3 or DaemonSet 3 + +# Where in a POD or DaemonSet Spec a Node Selector is inserted +NODE_SELECTOR_PATH_POD="/spec/nodeSelector" +NODE_SELECTOR_PATH_DAEMON_SET="/spec/template/spec/nodeSelector" + +if [[ "$ACCESS" == "rox" ]]; then + # ReadOnlyMany (rox) implies a Pod is being deployed, so set the path properly + NODE_SELECTOR_PATH=${NODE_SELECTOR_PATH_POD} +elif [[ "$ACCESS" == "rwo" ]]; then + # ReadWriteOnce (rwo) implies a DaemonSet is being deployed, so set the path properly + NODE_SELECTOR_PATH=${NODE_SELECTOR_PATH_DAEMON_SET} +fi + +if [[ "$CUSTOM_NODE_SELECTOR_1" != "" ]]; then + if [[ -r "${CUSTOM_NODE_SELECTOR_1}" ]]; then + NODE_SELECTOR_INSTANCE_1=$(cat "${CUSTOM_NODE_SELECTOR_1}") || { echo "Error: Failed to read file - CUSTOM_NODE_SELECTOR_1=${CUSTOM_NODE_SELECTOR_1}" >&2; exit 1; } + else + echo "Error: File does not exist - CUSTOM_NODE_SELECTOR_1=${CUSTOM_NODE_SELECTOR_1}" + exit 1 + fi +fi + +if [[ "$NODE_SELECTOR_INSTANCE_1" != "" ]]; then + NODE_SELECTOR_1=" # Add NodeSelector to Pod/DaemonSet 1 + - op: add + path: ${NODE_SELECTOR_PATH} + value: +${NODE_SELECTOR_INSTANCE_1}" + + # NODE_SELECTOR_1 contains a multiline string, so special characters need + # to be stripped for sed to 
process properly. + ESCAPED_NODE_SELECTOR_1=$(printf '%s\n' "$NODE_SELECTOR_1" \ + | sed -e 's/[\/&]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') +fi + + +if [[ "$CUSTOM_NODE_SELECTOR_2" != "" ]]; then + if [[ -r "${CUSTOM_NODE_SELECTOR_2}" ]]; then + NODE_SELECTOR_INSTANCE_2=$(cat "${CUSTOM_NODE_SELECTOR_2}") || { echo "Error: Failed to read file - CUSTOM_NODE_SELECTOR_2=${CUSTOM_NODE_SELECTOR_2}" >&2; exit 1; } + else + echo "Error: File does not exist - CUSTOM_NODE_SELECTOR_2=${CUSTOM_NODE_SELECTOR_2}" + exit 1 + fi +elif [[ "$ENVIRONMENT" == "kind" && "$ACCESS" == "rwo" ]]; then + NODE_SELECTOR_INSTANCE_2=$(cat ${NODE_SELECTOR_KIND_TRUE_FILE}) +fi + +if [[ "$NODE_SELECTOR_INSTANCE_2" != "" ]]; then + NODE_SELECTOR_2=" # Add NodeSelector to Pod/DaemonSet 2 + - op: add + path: ${NODE_SELECTOR_PATH} + value: +${NODE_SELECTOR_INSTANCE_2}" + + # NODE_SELECTOR_2 contains a multiline string, so special characters need + # to be stripped for sed to process properly. + ESCAPED_NODE_SELECTOR_2=$(printf '%s\n' "$NODE_SELECTOR_2" \ + | sed -e 's/[\/&]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') +fi + + +if [[ "$CUSTOM_NODE_SELECTOR_3" != "" ]]; then + if [[ -r "${CUSTOM_NODE_SELECTOR_3}" ]]; then + NODE_SELECTOR_INSTANCE_3=$(cat "${CUSTOM_NODE_SELECTOR_3}") || { echo "Error: Failed to read file - CUSTOM_NODE_SELECTOR_3=${CUSTOM_NODE_SELECTOR_3}" >&2; exit 1; } + else + echo "Error: File does not exist - CUSTOM_NODE_SELECTOR_3=${CUSTOM_NODE_SELECTOR_3}" + exit 1 + fi +elif [[ "$ENVIRONMENT" == "kind" && "$ACCESS" == "rwo" ]]; then + NODE_SELECTOR_INSTANCE_3=$(cat ${NODE_SELECTOR_KIND_FALSE_FILE}) +fi + +if [[ "$NODE_SELECTOR_INSTANCE_3" != "" ]]; then + NODE_SELECTOR_3=" # Add NodeSelector to Pod/DaemonSet 3 + - op: add + path: ${NODE_SELECTOR_PATH} + value: +${NODE_SELECTOR_INSTANCE_3}" + + # NODE_SELECTOR_3 contains a multiline string, so special characters need + # to be stripped for sed to process properly. 
+ ESCAPED_NODE_SELECTOR_3=$(printf '%s\n' "$NODE_SELECTOR_3" \ + | ${SED:-sed} -e 's/[\/&@]/\\&/g' -e ':a;N;$!ba;s/\n/\\n/g') +fi + + +# +# Modify the Variants Using sed to replace variables +# +# Every pushd below is checked: if the directory change failed, the redirections would +# create the generated .yaml files in whatever directory the script happened to be in. + +# Set the Namespace name in Namespace 1 object +pushd "${BASE_DIR_COMMON}" > /dev/null || exit 1 +${SED} \ + -e "s/NAMESPACE_1/${NAMESPACE_1}/g" \ + namespace-1.env > namespace-1.yaml +popd > /dev/null + +# Set the Namespace name in Namespace 2 object only if Cluster scoped +if [[ "$SCOPE" == "cl" ]]; then + pushd "${BASE_DIR_SCOPE}" > /dev/null || exit 1 + ${SED} \ + -e "s/NAMESPACE_2/${NAMESPACE_2}/g" \ + namespace-2.env > namespace-2.yaml + popd > /dev/null +fi + +# UPDATE Pod or DaemonSet +# For both rox and rwo, for each Pod or DaemonSet: +# - set the Namespace +# - set the PVC Claim in the Volume to the generated GKMCache or ClusterGKMCache name +# - insert the KIND Init Container if KIND Cluster, otherwise remove the placeholder +# - insert the Affinity, Toleration and Node Selector patches, or remove their placeholders +pushd "${VARIANTS_DIR_ACCESS}" > /dev/null || exit 1 +${SED} \ + -e "s/NAMESPACE_1/${NAMESPACE_1}/g" \ + -e "s/NAMESPACE_2/${NAMESPACE_2}/g" \ + -e "s/OBJECT_NAME/${OBJECT_NAME}/g" \ + -e "s@KIND_INIT_CONTAINER@${ESCAPED_KIND_INIT_CONTAINER}@g" \ + -e "s@TOLERATION_ADD_POD_DS@${ESCAPED_TOLERATION_ADD_POD_DS}@g" \ + -e "s@AFFINITY_ADD_POD_DS@${ESCAPED_AFFINITY_ADD_POD_DS}@g" \ + -e "s@NODE_SELECTOR_1@${ESCAPED_NODE_SELECTOR_1}@g" \ + -e "s@NODE_SELECTOR_2@${ESCAPED_NODE_SELECTOR_2}@g" \ + -e "s@NODE_SELECTOR_3@${ESCAPED_NODE_SELECTOR_3}@g" \ + kustomization.env > kustomization.yaml +popd > /dev/null + +# UPDATE GKMCache or ClusterGKMCache +# For both cluster and namespace, for each GKMCache or ClusterGKMCache: +# - set the Namespace for GKMCache object (not ClusterGKMCache) +# - set OCI Image +# - add the Cosign Version label +# - set the workload namespace list for the ClusterGKMCache (not for GKMCache) +# - add "readOnlyMany" to the AccessModes field, or remove the placeholder +pushd "${VARIANTS_DIR_SCOPE}" > /dev/null || exit 1 +${SED} \ + -e "s/OBJECT_NAME/${OBJECT_NAME}/g" \ + -e
"s|OCI_IMAGE|${OCI_IMAGE}|g" \ + -e "s/COSIGN_VERSION_LABEL/${COSIGN_VERSION_LABEL}/g" \ + -e "s/NAMESPACE_1/${NAMESPACE_1}/g" \ + -e "s/NAMESPACE_2/${NAMESPACE_2}/g" \ + -e "s@ACCESS_ROX_ACCESS_MODE@${ESCAPED_ACCESS_ROX_ACCESS_MODE}@g" \ + -e "s@POD_TEMPLATE_ADD_GKMCACHE@${ESCAPED_POD_TEMPLATE_ADD_GKMCACHE}@g" \ + kustomization.env > kustomization.yaml + +# OCI_IMAGE is substituted with | as the sed delimiter because an image reference may contain +# /, : and @ (digest form). ReadOnlyMany is injected through the ACCESS_ROX_ACCESS_MODE +# placeholder substituted above, so nothing is appended to kustomization.yaml here. +popd > /dev/null + + +# +# Generate the Yaml with all the objects +# +OUTPUT_FILENAME=${OUTPUT_DIR}/${NAME_SUFFIX}${ENV_FILENAME_SUFFIX}.yaml +mkdir -p "${OUTPUT_DIR}" + +# On a kustomize failure, remove LOCK_DIR before exiting, as the success path below does. +${KUSTOMIZE} build overlays/scope > "${OUTPUT_FILENAME}" || { rmdir "${LOCK_DIR}" &>/dev/null; exit 1; } +echo "---" >> "${OUTPUT_FILENAME}" +${KUSTOMIZE} build overlays/access >> "${OUTPUT_FILENAME}" || { rmdir "${LOCK_DIR}" &>/dev/null; exit 1; } + +if [[ "${DEBUG}" == true ]]; then + cat "${OUTPUT_FILENAME}" + echo +fi + + +rmdir "${LOCK_DIR}" &>/dev/null + +# The last line written to stdout is the path of the generated file. +# NOTE(review): the prefix below is "example/" while this repository's directory is "examples/" - confirm. +if [[ "$CALL_POPD" == true ]]; then + echo "example/${OUTPUT_FILENAME}" + popd &>/dev/null || exit +else + echo "${OUTPUT_FILENAME}" +fi diff --git a/examples/namespace/ROX/11-gkmcache.yaml b/examples/namespace/ROX/11-gkmcache.yaml deleted file mode 100644 index c8d7bf0a..00000000 --- a/examples/namespace/ROX/11-gkmcache.yaml +++ /dev/null @@ -1,21 +0,0 @@ ---- -apiVersion: gkm.io/v1alpha1 -kind: GKMCache -metadata: - name: vector-add-cache-rocm-v2-rox - namespace: gkm-test-ns-rox-1 - labels: - gkm.io/signature-format: cosign-v2 -spec: - image: quay.io/gkm/cache-examples:vector-add-cache-rocm-v2 - accessModes: - - ReadWriteOnce - - ReadOnlyMany - - # Toleration is specific to the KIND cluster being deployed.
- podTemplate: - spec: - tolerations: - - key: gpu - operator: Exists - effect: NoSchedule diff --git a/examples/namespace/ROX/12-pod.yaml b/examples/namespace/ROX/12-pod.yaml deleted file mode 100644 index e1ff4358..00000000 --- a/examples/namespace/ROX/12-pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -kind: Pod -apiVersion: v1 -metadata: - name: gkm-test-ns-rox-pod-1 - namespace: gkm-test-ns-rox-1 -spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. - initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rox diff --git a/examples/namespace/ROX/13-pod.yaml b/examples/namespace/ROX/13-pod.yaml deleted file mode 100644 index a80f6e90..00000000 --- a/examples/namespace/ROX/13-pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -kind: Pod -apiVersion: v1 -metadata: - name: gkm-test-ns-rox-pod-2 - namespace: gkm-test-ns-rox-1 -spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rox diff --git a/examples/namespace/ROX/14-pod.yaml b/examples/namespace/ROX/14-pod.yaml deleted file mode 100644 index 39010941..00000000 --- a/examples/namespace/ROX/14-pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -kind: Pod -apiVersion: v1 -metadata: - name: gkm-test-ns-rox-pod-3 - namespace: gkm-test-ns-rox-1 -spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rox diff --git a/examples/namespace/RWO/11-gkmcache.yaml b/examples/namespace/RWO/11-gkmcache.yaml deleted file mode 100644 index eb81bd8a..00000000 --- a/examples/namespace/RWO/11-gkmcache.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: gkm.io/v1alpha1 -kind: GKMCache -metadata: - name: vector-add-cache-rocm-v2-rwo - namespace: gkm-test-ns-rwo-1 - labels: - gkm.io/signature-format: cosign-v2 -spec: - image: quay.io/gkm/cache-examples:vector-add-cache-rocm-v2 - - # Toleration is specific to the KIND cluster being deployed. - podTemplate: - spec: - tolerations: - - key: gpu - operator: Exists - effect: NoSchedule diff --git a/examples/namespace/RWO/12-ds.yaml b/examples/namespace/RWO/12-ds.yaml deleted file mode 100644 index 738c8bd6..00000000 --- a/examples/namespace/RWO/12-ds.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-ns-rwo-ds-1 - namespace: gkm-test-ns-rwo-1 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-ns-rwo-ds-1 - template: - metadata: - labels: - name: gkm-test-ns-rwo-ds-1 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - readOnly: true - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo diff --git a/examples/namespace/RWO/13-ds.yaml b/examples/namespace/RWO/13-ds.yaml deleted file mode 100644 index 937e745e..00000000 --- a/examples/namespace/RWO/13-ds.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-ns-rwo-ds-2 - namespace: gkm-test-ns-rwo-1 - labels: - gkm.io/pvc-mutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-ns-rwo-ds-2 - template: - metadata: - labels: - name: gkm-test-ns-rwo-ds-2 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - nodeSelector: - gkm-test-node: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo diff --git a/examples/namespace/RWO/14-ds.yaml b/examples/namespace/RWO/14-ds.yaml deleted file mode 100644 index c6bf5021..00000000 --- a/examples/namespace/RWO/14-ds.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-ns-rwo-ds-3 - namespace: gkm-test-ns-rwo-1 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-ns-rwo-ds-3 - template: - metadata: - labels: - name: gkm-test-ns-rwo-ds-3 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - nodeSelector: - gkm-test-node: "false" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo diff --git a/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml deleted file mode 100644 index 9a091eaf..00000000 --- a/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: gkm.io/v1alpha1 -kind: GKMCache -metadata: - name: vector-add-cache-rocm-v3-rwo - namespace: gkm-test-ns-rwo-1 - labels: - gkm.io/signature-format: cosign-v3 -spec: - image: quay.io/gkm/cache-examples:vector-add-cache-rocm - - # Toleration is specific to the KIND cluster being deployed. - podTemplate: - spec: - tolerations: - - key: gpu - operator: Exists - effect: NoSchedule diff --git a/examples/namespace/RWO/22-ds.yaml b/examples/namespace/RWO/22-ds.yaml deleted file mode 100644 index c682f8a2..00000000 --- a/examples/namespace/RWO/22-ds.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: gkm-test-ns-rwo-v3-ds-1 - namespace: gkm-test-ns-rwo-1 - labels: - gkm.io/pvcMutation: "true" -spec: - selector: - matchLabels: - name: gkm-test-ns-rwo-v3-ds-1 - template: - metadata: - labels: - name: gkm-test-ns-rwo-v3-ds-1 - gkm.io/pvc-mutation: "true" - spec: - tolerations: - - key: gpu - operator: Equal - effect: NoSchedule - value: "true" - - # Init-Container is only needed for KIND clusters. 
- initContainers: - - name: fix-permissions - image: quay.io/fedora/fedora-minimal - securityContext: - runAsUser: 0 - command: - - sh - - -c - - | - chown -R 1000:1000 /cache - chmod -R 775 /cache - volumeMounts: - - name: kernel-volume - mountPath: /cache - - containers: - - name: test - image: quay.io/fedora/fedora-minimal - imagePullPolicy: IfNotPresent - command: [sleep, 365d] - volumeMounts: - - name: kernel-volume - mountPath: /cache - readOnly: true - volumes: - - name: kernel-volume - persistentVolumeClaim: - claimName: vector-add-cache-rocm-v3-rwo diff --git a/examples/patch/affinity-nfd-cuda.txt b/examples/patch/affinity-nfd-cuda.txt new file mode 100644 index 00000000..080ee42b --- /dev/null +++ b/examples/patch/affinity-nfd-cuda.txt @@ -0,0 +1,9 @@ + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists diff --git a/examples/patch/affinity-nfd-rocm.txt b/examples/patch/affinity-nfd-rocm.txt new file mode 100644 index 00000000..0c53465f --- /dev/null +++ b/examples/patch/affinity-nfd-rocm.txt @@ -0,0 +1,12 @@ + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: Exists diff --git a/examples/patch/node-selector-kind-false.txt b/examples/patch/node-selector-kind-false.txt new file mode 100644 index 00000000..1d33b3ed --- /dev/null +++ b/examples/patch/node-selector-kind-false.txt @@ -0,0 +1 @@ + gkm-test-node: "false" diff --git a/examples/patch/node-selector-kind-true.txt 
b/examples/patch/node-selector-kind-true.txt new file mode 100644 index 00000000..99031b39 --- /dev/null +++ b/examples/patch/node-selector-kind-true.txt @@ -0,0 +1 @@ + gkm-test-node: "true" diff --git a/examples/patch/toleration-kind.txt b/examples/patch/toleration-kind.txt new file mode 100644 index 00000000..2c22a340 --- /dev/null +++ b/examples/patch/toleration-kind.txt @@ -0,0 +1,4 @@ + key: gpu + operator: Equal + effect: NoSchedule + value: "true" diff --git a/examples/patch/toleration-nfd-cuda.txt b/examples/patch/toleration-nfd-cuda.txt new file mode 100644 index 00000000..39abe560 --- /dev/null +++ b/examples/patch/toleration-nfd-cuda.txt @@ -0,0 +1,3 @@ + key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/examples/variants/access/rox/kustomization.env b/examples/variants/access/rox/kustomization.env new file mode 100644 index 00000000..340e9827 --- /dev/null +++ b/examples/variants/access/rox/kustomization.env @@ -0,0 +1,49 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: +# For each Pod: +# - Set the Namespace of the Pod +# - Overwrite the PVC Name in the Pod Volume +- target: + kind: Pod + name: gkm-test-pod-1 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_1 + - op: replace + path: /spec/volumes/0/persistentVolumeClaim/claimName + value: OBJECT_NAME +KIND_INIT_CONTAINER +AFFINITY_ADD_POD_DS +TOLERATION_ADD_POD_DS +NODE_SELECTOR_1 +- target: + kind: Pod + name: gkm-test-pod-2 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_1 + - op: replace + path: /spec/volumes/0/persistentVolumeClaim/claimName + value: OBJECT_NAME +KIND_INIT_CONTAINER +AFFINITY_ADD_POD_DS +TOLERATION_ADD_POD_DS +NODE_SELECTOR_2 +- target: + kind: Pod + name: gkm-test-pod-3 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_2 + - op: replace + path: /spec/volumes/0/persistentVolumeClaim/claimName + value: OBJECT_NAME +KIND_INIT_CONTAINER 
+AFFINITY_ADD_POD_DS +TOLERATION_ADD_POD_DS +NODE_SELECTOR_3 diff --git a/examples/variants/access/rwo/kustomization.env b/examples/variants/access/rwo/kustomization.env new file mode 100644 index 00000000..8ca51c0a --- /dev/null +++ b/examples/variants/access/rwo/kustomization.env @@ -0,0 +1,49 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: +# For each DaemonSet: +# - Set the Namespace of the DaemonSet +# - Overwrite the PVC Name in the DaemonSet Volume +- target: + kind: DaemonSet + name: gkm-test-ds-1 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_1 + - op: replace + path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName + value: OBJECT_NAME +KIND_INIT_CONTAINER +AFFINITY_ADD_POD_DS +TOLERATION_ADD_POD_DS +NODE_SELECTOR_1 +- target: + kind: DaemonSet + name: gkm-test-ds-2 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_1 + - op: replace + path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName + value: OBJECT_NAME +KIND_INIT_CONTAINER +AFFINITY_ADD_POD_DS +TOLERATION_ADD_POD_DS +NODE_SELECTOR_2 +- target: + kind: DaemonSet + name: gkm-test-ds-3 + patch: |- + - op: replace + path: /metadata/namespace + value: NAMESPACE_2 + - op: replace + path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName + value: OBJECT_NAME +KIND_INIT_CONTAINER +AFFINITY_ADD_POD_DS +TOLERATION_ADD_POD_DS +NODE_SELECTOR_3 diff --git a/examples/variants/scope/cluster/kustomization.env b/examples/variants/scope/cluster/kustomization.env new file mode 100644 index 00000000..983f19c9 --- /dev/null +++ b/examples/variants/scope/cluster/kustomization.env @@ -0,0 +1,32 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: +- target: + kind: ClusterGKMCache + name: gkm-test-obj + patch: |- + # Overwrite the OCI Image in the ClusterGKMCache with the CUDA/ROCm and V2/V3 tag. 
Whole image, not just tag overwritten + - op: replace + path: /spec/image + value: OCI_IMAGE + + # Add Cosign Version Label to ClusterGKMCache + - op: add + path: /metadata/labels + value: {} + - op: add + path: /metadata/labels/gkm.io~1signature-format + value: COSIGN_VERSION_LABEL + + # Overwrite the namespaces to the `spec.workloadNamespaces` slice in the ClusterGKMCache + - op: replace + path: /spec/workloadNamespaces/0 + value: NAMESPACE_1 + - op: replace + path: /spec/workloadNamespaces/1 + value: NAMESPACE_2 + +POD_TEMPLATE_ADD_GKMCACHE + +ACCESS_ROX_ACCESS_MODE diff --git a/examples/variants/scope/namespace/kustomization.env b/examples/variants/scope/namespace/kustomization.env new file mode 100644 index 00000000..ddf539ac --- /dev/null +++ b/examples/variants/scope/namespace/kustomization.env @@ -0,0 +1,29 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +patches: +- target: + kind: GKMCache + name: gkm-test-obj + patch: |- + # Overwrite the Namespace in the GKMCache + - op: replace + path: /metadata/namespace + value: NAMESPACE_1 + + # Overwrite the OCI Image in the GKMCache with the CUDA/ROCm and V2/V3 tag. Whole image, not just tag overwritten + - op: replace + path: /spec/image + value: OCI_IMAGE + + # Add Cosign Version Label to GKMCache + - op: add + path: /metadata/labels + value: {} + - op: add + path: /metadata/labels/gkm.io~1signature-format + value: COSIGN_VERSION_LABEL + +POD_TEMPLATE_ADD_GKMCACHE + +ACCESS_ROX_ACCESS_MODE