Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Containerfile.gkm-operator
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ COPY cmd/main.go cmd/main.go
COPY api/ api/
COPY pkg/ pkg/
COPY internal/controller/ internal/controller/
COPY internal/webhook/ internal/webhook/
COPY vendor/ vendor/
COPY Makefile Makefile

Expand Down
15 changes: 12 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,19 @@ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified
prepare-deploy:
cd config/operator && $(KUSTOMIZE) edit set image quay.io/gkm/operator=${OPERATOR_IMG}
cd config/agent && $(KUSTOMIZE) edit set image quay.io/gkm/agent=${AGENT_IMG}
ifdef NO_GPU
ifeq ($(KIND_CLUSTER),true)
cd config/configMap && \
$(SED) \
-e '/literals:/a\ - gkm.nogpu=true' \
-e '/literals:/a\ - gkm.kindcluster=true' \
-e 's@gkm\.agent\.image=.*@gkm.agent.image=$(AGENT_IMG)@' \
-e 's@gkm\.extract\.image=.*@gkm.extract.image=$(EXTRACT_IMG)@' \
kustomization.yaml.env > kustomization.yaml
else ifeq ($(NO_GPU),true)
cd config/configMap && \
$(SED) \
-e '/literals:/a\ - gkm.nogpu=true' \
-e '/literals:/a\ - gkm.kindcluster=false' \
-e 's@gkm\.agent\.image=.*@gkm.agent.image=$(AGENT_IMG)@' \
-e 's@gkm\.extract\.image=.*@gkm.extract.image=$(EXTRACT_IMG)@' \
kustomization.yaml.env > kustomization.yaml
Expand Down Expand Up @@ -591,11 +600,11 @@ deploy-on-kind: kind-load-images tmp-cleanup
@echo "Add label gkm-test-node=false to node kind-gpu-sim-worker2."
$(KUBECTL) label node kind-gpu-sim-worker2 gkm-test-node=false --overwrite
## NOTE: config/kind-gpu is an overlay of config/default
$(MAKE) deploy DEPLOY_PATH=config/kind-gpu NO_GPU=true
$(MAKE) deploy DEPLOY_PATH=config/kind-gpu NO_GPU=true KIND_CLUSTER=true

.PHONY: redeploy-on-kind
redeploy-on-kind: ## Redeploy controller and agent to Kind GPU cluster after run-on-kind and undeploy-on-kind have been called. Skips some onetime steps in deploy.
$(MAKE) redeploy DEPLOY_PATH=config/kind-gpu NO_GPU=true
$(MAKE) redeploy DEPLOY_PATH=config/kind-gpu NO_GPU=true KIND_CLUSTER=true
@echo "Deployment to $(DEPLOY_PATH) completed."

.PHONY: undeploy-on-kind
Expand Down
27 changes: 21 additions & 6 deletions agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,24 @@ func main() {
setupLog.Info("No-GPU set to true")
}

extractLogLevel := os.Getenv("EXTRACT_GO_LOG")
if extractLogLevel == "" {
extractLogLevel = "info"
setupLog.Info("Extract Job Log Level set to info")
}

kindCluster := false
if os.Getenv("KIND_CLUSTER") == "true" {
kindCluster = true
setupLog.Info("KIND Cluster set to true")
}

nodeName := os.Getenv("KUBE_NODE_NAME")
if nodeName == "" {
setupLog.Error(fmt.Errorf("KUBE_NODE_NAME env var not set"), "Couldn't determine current node")
os.Exit(1)
}
setupLog.Info("KUBE_NODE_NAME processing", "Node", nodeName)

extractImage := utils.JobExtractImage
tmpExtractImage := os.Getenv("EXTRACT_IMAGE")
Expand Down Expand Up @@ -175,12 +188,13 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GKM-Agent-NS"),
CacheDir: utils.DefaultCacheDir,
NodeName: nodeName,
NoGpu: noGpu,
KindCluster: kindCluster,
ExtractLogLevel: extractLogLevel,
ExtractImage: extractImage,
CrdCacheStr: "GKMCache",
CrdCacheNodeStr: "GKMCacheNode",
CrdCacheStr: utils.CrdGKMCache,
CrdCacheNodeStr: utils.CrdGKMCacheNode,
}
if err = (&gkmAgent.GKMCacheAgentReconciler{
ReconcilerCommonAgent: commonNs,
Expand All @@ -198,12 +212,13 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("GKM-Agent-CL"),
CacheDir: utils.DefaultCacheDir,
NodeName: nodeName,
NoGpu: noGpu,
KindCluster: kindCluster,
ExtractLogLevel: extractLogLevel,
ExtractImage: extractImage,
CrdCacheStr: "ClusterGKMCache",
CrdCacheNodeStr: "ClusterGKMCacheNode",
CrdCacheStr: utils.CrdClusterGKMCache,
CrdCacheNodeStr: utils.CrdClusterGKMCacheNode,
}
if err = (&gkmAgent.ClusterGKMCacheAgentReconciler{
ReconcilerCommonAgent: commonCl,
Expand Down
1 change: 0 additions & 1 deletion api/v1alpha1/clustergkmcachenode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ import (
// created, GKM ensures that one ClusterGKMCacheNode instance is created per
// Kubernetes Node. ClusterGKMCacheNode cannot be edited by an application or
// user, only by GKM.
// +kubebuilder:printcolumn:name="Node",type=string,JSONPath=".status.nodeName"
// +kubebuilder:printcolumn:name="Node-In-Use",type=string,JSONPath=`.status.counts.nodeInUseCnt`
// +kubebuilder:printcolumn:name="Node-Not-In-Use",type=string,JSONPath=`.status.counts.nodeNotInUseCnt`
// +kubebuilder:printcolumn:name="Node-Error",type=string,JSONPath=`.status.counts.nodeErrorCnt`
Expand Down
1 change: 0 additions & 1 deletion api/v1alpha1/gkmcachenode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (
// instance are created in a namespace, GKM ensures that one GKMCacheNode
// instance is created per Kubernetes Node. GKMCacheNode cannot be edited by an
// application or user, only by GKM.
// +kubebuilder:printcolumn:name="Node",type=string,JSONPath=".status.nodeName"
// +kubebuilder:printcolumn:name="Node-In-Use",type=string,JSONPath=`.status.counts.nodeInUseCnt`
// +kubebuilder:printcolumn:name="Node-Not-In-Use",type=string,JSONPath=`.status.counts.nodeNotInUseCnt`
// +kubebuilder:printcolumn:name="Node-Error",type=string,JSONPath=`.status.counts.nodeErrorCnt`
Expand Down
7 changes: 3 additions & 4 deletions api/v1alpha1/shared_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,9 @@ type GKMCacheSpec struct {

// storageClassName contains the name of the Kubernetes Storage Class, which
// will be used for the PersistentVolume and PersistentVolumeClaim the GKM will
// create in order to store the extract GPU Kernel Cache.
// +required
// +kubebuilder:validation:Required
StorageClassName string `json:"storageClassName"`
// create in order to store the extracted GPU Kernel Cache. If not provided, the
// default Storage Class will be used.
StorageClassName string `json:"storageClassName,omitempty"`
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// accessMode is the set of capabilities being requested by the generated PVC.
// This field is optional. If not provided, it will default to "ReadWriteOnce".
Expand Down
35 changes: 19 additions & 16 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,10 @@ import (
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"

gkmv1alpha1 "github.com/redhat-et/GKM/api/v1alpha1"
gkmOperator "github.com/redhat-et/GKM/internal/controller/gkm-operator"

gkmWebhook "github.com/redhat-et/GKM/internal/webhook"
"github.com/redhat-et/GKM/pkg/utils"
// +kubebuilder:scaffold:imports
)
Expand Down Expand Up @@ -71,6 +69,18 @@ func main() {
setupLog.Info("No-GPU set to true", "noGpu", noGpu)
}

extractLogLevel := os.Getenv("EXTRACT_GO_LOG")
if extractLogLevel == "" {
extractLogLevel = "info"
setupLog.Info("Extract Job Log Level set to info")
}

kindCluster := false
if os.Getenv("KIND_CLUSTER") == "true" {
kindCluster = true
setupLog.Info("KIND Cluster set to true")
}

extractImage := utils.JobExtractImage
tmpExtractImage := os.Getenv("EXTRACT_IMAGE")
if tmpExtractImage != "" {
Expand Down Expand Up @@ -184,16 +194,6 @@ func main() {
os.Exit(1)
}

mutator := &gkmWebhook.PodMutator{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Decoder: admission.NewDecoder(mgr.GetScheme()),
}
mgr.GetWebhookServer().Register(
"/mutate-v1-pod",
&admission.Webhook{Handler: mutator},
)

if err = (&gkmOperator.GKMConfigMapReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Expand All @@ -211,9 +211,11 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
NoGpu: noGpu,
KindCluster: kindCluster,
ExtractLogLevel: extractLogLevel,
ExtractImage: extractImage,
CrdCacheStr: "GKMCache",
CrdCacheNodeStr: "GKMCacheNode",
CrdCacheStr: utils.CrdGKMCache,
CrdCacheNodeStr: utils.CrdGKMCacheNode,
}
if err = (&gkmOperator.GKMCacheOperatorReconciler{
ReconcilerCommonOperator: commonNs,
Expand All @@ -231,9 +233,10 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
NoGpu: noGpu,
KindCluster: kindCluster,
ExtractImage: extractImage,
CrdCacheStr: "ClusterGKMCache",
CrdCacheNodeStr: "ClusterGKMCacheNode",
CrdCacheStr: utils.CrdClusterGKMCache,
CrdCacheNodeStr: utils.CrdClusterGKMCacheNode,
}
if err = (&gkmOperator.ClusterGKMCacheOperatorReconciler{
ReconcilerCommonOperator: commonCl,
Expand Down
41 changes: 11 additions & 30 deletions config/agent/gkm-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ spec:
configMapKeyRef:
name: gkm-config
key: gkm.nogpu
- name: KIND_CLUSTER
valueFrom:
configMapKeyRef:
name: gkm-config
key: gkm.kindcluster
- name: GO_LOG
valueFrom:
configMapKeyRef:
Expand All @@ -51,36 +56,12 @@ spec:
memory: "128Mi"
cpu: "100m"
volumeMounts:
- name: gkm-state
mountPath: /var/lib/gkm
mountPropagation: Bidirectional
- name: gkm-runtime
mountPath: /run/gkm
mountPropagation: Bidirectional
- name: sys
mountPath: /sys
readOnly: true
- name: dev
mountPath: /dev
- mountPath: /mnt/kernel-caches
name: kernel-caches
readOnly: false

volumes:
# This volume is the GKM State directory. This is where GPU Kernel Cache
# will be extracted.
- name: gkm-state
hostPath:
path: /var/lib/gkm
type: DirectoryOrCreate
# This volume is the GKM Runtime directory. This is where the Usage data
# will tracked which pods are using which cache.
- name: gkm-runtime
hostPath:
path: /run/gkm
type: DirectoryOrCreate
- name: sys
hostPath:
path: /sys
type: Directory
- name: dev
- name: kernel-caches
hostPath:
path: /dev
type: Directory
path: /kernel-caches
type: DirectoryOrCreate
2 changes: 2 additions & 0 deletions config/configMap/configMap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ data:
## Can be set to "info", "debug", or "trace". Not processed at runtime.
gkm.operator.log.level: info
gkm.agent.log.level: info
gkm.extract.log.level: info
## Can be configured at runtime
gkm.agent.image: quay.io/gkm/agent:latest
gkm.extract.image: quay.io/gkm/gkm-extract:latest
gkm.nogpu: false
gkm.kindcluster: false
Comment thread
coderabbitai[bot] marked this conversation as resolved.
## Enable/disable Kyverno image signature verification (defaults to true/enabled)
gkm.kyverno.enabled: "true"
3 changes: 2 additions & 1 deletion config/configMap/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ kind: Kustomization
configMapGenerator:
- behavior: merge
literals:
- gkm.nogpu=true
- gkm.nogpu=false
- gkm.kindcluster=false
- gkm.agent.image=quay.io/gkm/agent:latest
- gkm.extract.image=quay.io/gkm/gkm-extract:latest
name: config
Expand Down
3 changes: 0 additions & 3 deletions config/crd/bases/gkm.io_clustergkmcachenodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ spec:
scope: Cluster
versions:
- additionalPrinterColumns:
- jsonPath: .status.nodeName
name: Node
type: string
- jsonPath: .status.counts.nodeInUseCnt
name: Node-In-Use
type: string
Expand Down
4 changes: 2 additions & 2 deletions config/crd/bases/gkm.io_clustergkmcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1147,7 +1147,8 @@ spec:
description: |-
storageClassName contains the name of the Kubernetes Storage Class, which
will be used for the PersistentVolume and PersistentVolumeClaim the GKM will
create in order to store the extract GPU Kernel Cache.
create in order to store the extracted GPU Kernel Cache. If not provided, the
default Storage Class will be used.
type: string
workloadNamespaces:
description: |-
Expand All @@ -1161,7 +1162,6 @@ spec:
type: array
required:
- image
- storageClassName
type: object
status:
description: |-
Expand Down
3 changes: 0 additions & 3 deletions config/crd/bases/gkm.io_gkmcachenodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ spec:
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .status.nodeName
name: Node
type: string
- jsonPath: .status.counts.nodeInUseCnt
name: Node-In-Use
type: string
Expand Down
4 changes: 2 additions & 2 deletions config/crd/bases/gkm.io_gkmcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1145,7 +1145,8 @@ spec:
description: |-
storageClassName contains the name of the Kubernetes Storage Class, which
will be used for the PersistentVolume and PersistentVolumeClaim the GKM will
create in order to store the extract GPU Kernel Cache.
create in order to store the extracted GPU Kernel Cache. If not provided, the
default Storage Class will be used.
type: string
workloadNamespaces:
description: |-
Expand All @@ -1159,7 +1160,6 @@ spec:
type: array
required:
- image
- storageClassName
type: object
status:
description: |-
Expand Down
5 changes: 5 additions & 0 deletions config/operator/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ spec:
configMapKeyRef:
name: gkm-config
key: gkm.nogpu
- name: KIND_CLUSTER
valueFrom:
configMapKeyRef:
name: gkm-config
key: gkm.kindcluster
- name: GO_LOG
valueFrom:
configMapKeyRef:
Expand Down
19 changes: 0 additions & 19 deletions config/webhook/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,6 @@ kind: MutatingWebhookConfiguration
metadata:
name: mutating-webhook-configuration
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /mutate-v1-pod
failurePolicy: Fail
name: mpod.kb.io
rules:
- apiGroups:
- ""
apiVersions:
- v1
operations:
- CREATE
resources:
- pods
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
Expand Down
5 changes: 2 additions & 3 deletions examples/.gitignore
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# Ignore generated yaml files
base/common/namespace-1.yaml
base/scope/cluster/namespace-2.yaml
overlays/access/*.yaml
overlays/pods/*.yaml
overlays/scope/*.yaml
output/*.yaml
variants/access/rox/*.yaml
variants/access/rwo/*.yaml
variants/pods/*.yaml
variants/scope/cluster/*.yaml
variants/scope/namespace/*.yaml
.gkm-generate-files.exclusivelock
Loading
Loading