Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9a3e012
feat: add GPU-specific agents for NVIDIA and AMD with NFD-based deplo…
maryamtahhan Mar 11, 2026
0789af4
feat: enhance deployment automation with NFD and Kyverno integration
maryamtahhan Mar 12, 2026
021d348
fix: conditionally build agents based on NO_GPU_BUILD flag
maryamtahhan Mar 12, 2026
5667835
feat: add individual agent image variables for flexible deployment
maryamtahhan Mar 12, 2026
6de83ac
fix: GPU agent scheduling with NFD PCI class code labels
maryamtahhan Mar 12, 2026
7c3cd40
fix: exclude control-plane nodes from nogpu agent deployment
maryamtahhan Mar 12, 2026
50720ea
fix: mount GPU libraries to enable device access without GPU resource…
maryamtahhan Mar 12, 2026
a11cc6a
feat: add automated dependency installation for RHEL 10
maryamtahhan Mar 12, 2026
9dc5a63
gkm: add nvidia example
maryamtahhan Mar 12, 2026
2444c21
fix: address PR #107 review comments and failing workflows
maryamtahhan Mar 16, 2026
2a1c12e
fix: resolve yamllint errors in NVIDIA example YAMLs
maryamtahhan Mar 16, 2026
7572144
refactor: restructure RWO examples into organized subdirectories
maryamtahhan Mar 16, 2026
8a73250
fix: load actual agent images instead of non-existent AGENT_IMG in ki…
maryamtahhan Mar 16, 2026
9a47931
kind: fix kyverno deployment
maryamtahhan Mar 16, 2026
a3a6546
makefile: cleanup kyverno targets
maryamtahhan Mar 16, 2026
3273a18
fix: resolve Kind deployment failures on GPU-tainted nodes
maryamtahhan Mar 16, 2026
e4545a0
fix: skip NFD deployment for Kind clusters and use device plugin labels
maryamtahhan Mar 16, 2026
db7a78b
fix: separate SKIP_NFD and NO_GPU flags, simulate NFD labels in Kind
maryamtahhan Mar 16, 2026
47a443b
fix: remove node affinity from nogpu agent for Kind clusters
maryamtahhan Mar 16, 2026
977df63
fix: standardize namespace and cache naming in ROCM and CUDA examples
maryamtahhan Mar 16, 2026
3103523
fix: update ROCM daemonset names to match namespace pattern
maryamtahhan Mar 16, 2026
c942cd2
images: add gkm prefix to image names
maryamtahhan Mar 16, 2026
e0089e8
refactor: use base image for builder stage in GPU agent Containerfiles
maryamtahhan Mar 16, 2026
5264a8f
refactor: consolidate agent Containerfiles into single multi-target file
maryamtahhan Mar 16, 2026
ccfbfe6
fix: update legacy agent image reference to nogpu variant
maryamtahhan Mar 16, 2026
2c03781
fix: add video group access to ROCm agent for GPU detection
maryamtahhan Apr 15, 2026
c53c04e
fix: support AMD integrated GPUs with N/A field values
maryamtahhan Apr 15, 2026
58f3b0b
fix: correct RDNA architecture matching order in TranslateGPUToArch
maryamtahhan Apr 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 31 additions & 4 deletions .github/workflows/image-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
image:
- registry: quay.io
repository: gkm
image: operator
image: gkm-operator
dockerfile: ./Containerfile.gkm-operator
context: .
tags: |
Expand All @@ -45,17 +45,43 @@ jobs:

- registry: quay.io
repository: gkm
image: agent
dockerfile: ./Containerfile.gkm-agent
image: gkm-agent-nogpu
dockerfile: ./Containerfile.gkm-agents
context: .
target: nogpu
tags: |
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
type=sha,format=long
# set latest tag for default branch
type=raw,value=latest,enable={{is_default_branch}}
- registry: quay.io
repository: gkm
image: gkm-agent-nvidia
dockerfile: ./Containerfile.gkm-agents
context: .
target: nvidia
tags: |
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
type=sha,format=long
# set latest tag for default branch
type=raw,value=latest,enable={{is_default_branch}}
- registry: quay.io
repository: gkm
image: gkm-agent-amd
dockerfile: ./Containerfile.gkm-agents
context: .
target: amd
tags: |
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
type=sha,format=long
# set latest tag for default branch
type=raw,value=latest,enable={{is_default_branch}}

- registry: quay.io
repository: gkm
image: gkm-extract
Expand Down Expand Up @@ -130,6 +156,7 @@ jobs:
file: ${{ matrix.image.dockerfile }}
build-args: BUILDPLATFORM=linux/amd64
context: ${{ matrix.image.context }}
target: ${{ matrix.image.target || '' }}

- name: Sign the images with GitHub OIDC Token
if: ${{ fromJSON(steps.set-push.outputs.push_flag) }}
Expand Down
84 changes: 0 additions & 84 deletions Containerfile.gkm-agent

This file was deleted.

139 changes: 139 additions & 0 deletions Containerfile.gkm-agents
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# ============================================================================
# Multi-target Containerfile for GKM Agents
# Build a specific target with:
#   podman build -f Containerfile.gkm-agents --target <nogpu|amd|nvidia> .
# ============================================================================

# ============================================================================
# Stage 1: Builder (shared by all agent variants)
# ============================================================================
FROM public.ecr.aws/docker/library/golang:1.25 AS builder

# Every relative path below resolves against /workspace.
WORKDIR /workspace

# Build-time dependencies: C toolchain, pkg-config and the gpgme / btrfs /
# seccomp development packages. The later stages start from other base images
# and only copy the compiled binary, so none of this reaches the final images.
# NOTE(review): libgpgme-dev and libgpgme11-dev are both requested, exactly as
# before; one may be an alias of the other - confirm before dropping either.
RUN apt-get update && \
    apt-get install -y \
        btrfs-progs \
        build-essential \
        libbtrfs-dev \
        libgpgme-dev \
        libgpgme11-dev \
        libseccomp-dev \
        pkg-config && \
    apt-get clean

# Go module manifests
COPY go.mod go.sum ./

# Go sources, vendored dependencies and the Makefile that drives the build
COPY agent/main.go agent/main.go
COPY api/ api/
COPY pkg/ pkg/
COPY internal/controller/ internal/controller/
COPY vendor/ vendor/
COPY Makefile Makefile

# Compile the agent; the later stages copy it from /workspace/bin/gkm-agent
RUN make build-gkm-agent

# ============================================================================
# Target: nogpu (complete no-GPU agent)
# ============================================================================
FROM public.ecr.aws/docker/library/ubuntu:24.04 AS nogpu

# Install the common runtime libraries (the nvidia target installs the same
# set). This layer comes BEFORE the binary copy so that rebuilding the agent
# does not invalidate the cached package layer.
# DEBIAN_FRONTEND is set inline only, so package configuration cannot prompt
# during the build and the variable does not leak into the runtime environment.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        ca-certificates \
        curl \
        dialog \
        gnupg2 \
        hwdata \
        libbtrfs0 \
        libc6 \
        libffi8 \
        libgpgme11 \
        libseccomp2 \
        lsb-release \
        pciutils \
        python3-setuptools \
        python3-wheel \
        rsync \
        software-properties-common \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the agent binary from the builder stage (the layer that changes most often)
COPY --from=builder /workspace/bin/gkm-agent /agent

# Run as non-root user
USER 65532:65532

ENTRYPOINT ["/agent"]

# ============================================================================
# Target: amd (extends nogpu, adds ROCm support)
# ============================================================================
FROM nogpu AS amd

# Package installation needs root; the nogpu stage ends as UID 65532.
USER root

# AMD ROCm version configuration (overridable with --build-arg).
#   ROCM_VERSION     - release directory on repo.radeon.com
#   AMDGPU_VERSION   - version embedded in the amdgpu-install .deb file name
#   OPT_ROCM_VERSION - suffix of the /opt/rocm-<version> directory used for the symlinks
ARG ROCM_VERSION=6.3.1
ARG AMDGPU_VERSION=6.3.60301
ARG OPT_ROCM_VERSION=6.3.1

# Install the AMD SMI tooling in a single layer:
#   1. download the amdgpu-install package to a fixed path under /tmp rather
#      than the working directory, so no glob is needed to find it;
#   2. refresh the apt lists first (the nogpu stage deleted them), so any
#      dependency of the local .deb can be resolved;
#   3. install the .deb, then delete it in the same layer so it is not shipped;
#   4. refresh the lists again and install the SMI libraries;
#   5. clean apt caches and expose amd-smi / rocm-smi on the default PATH.
RUN wget -O /tmp/amdgpu-install.deb \
        "https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb" && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y /tmp/amdgpu-install.deb && \
    rm -f /tmp/amdgpu-install.deb && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y amd-smi-lib rocm-smi-lib && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    ln -s "/opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi" /usr/bin/amd-smi && \
    ln -s "/opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi" /usr/bin/rocm-smi

# Switch back to non-root user
USER 65532:65532

# Binary and entrypoint are inherited from nogpu

# ============================================================================
# Target: nvidia (CUDA runtime with NVML support)
# ============================================================================
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 AS nvidia

# Install the common runtime libraries (same set as the nogpu target). This
# layer comes BEFORE the binary copy so that rebuilding the agent does not
# invalidate the cached package layer.
# DEBIAN_FRONTEND is set inline only, so package configuration cannot prompt
# during the build and the variable does not leak into the runtime environment.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        ca-certificates \
        curl \
        dialog \
        gnupg2 \
        hwdata \
        libbtrfs0 \
        libc6 \
        libffi8 \
        libgpgme11 \
        libseccomp2 \
        lsb-release \
        pciutils \
        python3-setuptools \
        python3-wheel \
        rsync \
        software-properties-common \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# No additional GPU-specific packages are installed in this target.
# NOTE(review): the previous comment stated that the CUDA base image already
# ships libnvidia-ml.so (NVML). NVML is normally provided by the host driver
# and injected by the NVIDIA container runtime - confirm which applies here.

# Copy the agent binary from the builder stage (the layer that changes most often)
COPY --from=builder /workspace/bin/gkm-agent /agent

# Run as non-root user
USER 65532:65532

ENTRYPOINT ["/agent"]
Loading
Loading