Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/e2e-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
# Create the KinD cluster used by the e2e tests, pinning both the KinD CLI
# version and the node image, and using the repository's kind config.
- name: Create k8s Kind Cluster
  uses: helm/kind-action@v1.10.0
  with:
    # A single `version` key: duplicate mapping keys are invalid YAML and most
    # parsers silently keep only the last one.
    version: v0.32.0  # required for gang
    node_image: kindest/node:v1.36.1
    config: ./deploy/kind-config.yaml

Expand Down Expand Up @@ -78,6 +78,8 @@ jobs:
done
[ -n "$POD" ] || { echo "ERROR: no Running fluence pod found"; exit 1; }
echo "Using pod: $POD"
# Brief sleep to let the container runtime stabilize before exec
sleep 5
kubectl -n kube-system exec "$POD" -- ls /tmp/
kubectl -n kube-system logs "$POD"
kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"
Expand Down Expand Up @@ -107,6 +109,8 @@ jobs:
done
[ -n "$POD" ] || { echo "ERROR: no Running fluence pod found after restart"; exit 1; }
echo "Using pod: $POD"
# Brief sleep to let the container runtime stabilize before exec
sleep 5
kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json"

- name: Wait for webhook
Expand All @@ -129,6 +133,9 @@ jobs:
#- name: E2E - restart recovery (no double-book)
# run: bash test/e2e/03-restart-recovery.sh

# Run the sidecar ungate end-to-end scenario script with bash.
- name: E2E - sidecar ungate
run: bash test/e2e/04-sidecar-ungate.sh

- name: Dump diagnostics on failure
if: failure()
run: |
Expand Down
71 changes: 71 additions & 0 deletions .github/workflows/sidecar-build-deploy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Builds one container image per sidecar listed in the matrix and, for
# non-pull-request events, pushes it to GHCR. Pull requests only build.
name: sidecar-build-deploy

on:
  push:
    branches: [main]
    tags: ["v*"]
    paths:
      - "sidecars/**"
      - ".github/workflows/sidecar-build-deploy.yaml"
  pull_request:
    branches: [main]
    paths:
      - "sidecars/**"
      # Also exercise changes to this workflow before they land on main.
      - ".github/workflows/sidecar-build-deploy.yaml"

# One active run per workflow + ref; a newer run cancels the in-flight one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  REGISTRY: ghcr.io

jobs:
  build-deploy:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Needed to push images to GHCR with GITHUB_TOKEN.
      packages: write

    strategy:
      # Let every sidecar finish building even if another matrix entry fails.
      fail-fast: false
      matrix:
        sidecar:
          - braket
          # - qrmi  # uncomment when implemented

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Pull requests never push, so they do not need registry credentials.
      - name: Log in to GHCR
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Image name is <registry>/<owner>/<repo>-sidecar-<sidecar>; tags are
      # derived from the branch, PR number, semver git tag and commit SHA,
      # plus `latest` for pushes to main.
      - name: Image metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ github.repository }}-sidecar-${{ matrix.sidecar }}
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=semver,pattern={{version}}
            type=sha
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}

      - name: Build and push ${{ matrix.sidecar }} sidecar
        uses: docker/build-push-action@v6
        with:
          # Build context is the repository root; the Dockerfile is per sidecar.
          context: .
          file: ./sidecars/${{ matrix.sidecar }}/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # Scope the GitHub Actions cache per sidecar: with the default scope
          # every matrix entry would overwrite the previous entry's cache.
          cache-from: type=gha,scope=sidecar-${{ matrix.sidecar }}
          cache-to: type=gha,mode=max,scope=sidecar-${{ matrix.sidecar }}
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ build: ## Build all binaries (scheduler needs flux-sched; helpers are pure Go)
CGO_ENABLED=0 go build -o bin/fluence-deviceplugin ./cmd/deviceplugin
CGO_ENABLED=0 go build -o bin/fluence-webhook ./cmd/webhook

# Image reference for the braket sidecar. Override on the command line to
# target another registry or tag, e.g.
#   make sidecars SIDECAR_BRAKET_IMAGE=localhost:5000/fluence-sidecar-braket:dev
SIDECAR_BRAKET_IMAGE ?= ghcr.io/converged-computing/fluence-sidecar-braket:latest

# For a local kind cluster the image can be loaded instead of pushed:
#   kind load docker-image $(SIDECAR_BRAKET_IMAGE)
.PHONY: sidecars
sidecars: ## Build and push the braket sidecar image
	docker build -f sidecars/braket/Dockerfile -t $(SIDECAR_BRAKET_IMAGE) .
	docker push $(SIDECAR_BRAKET_IMAGE)

.PHONY: test
test:
CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" CGO_LDFLAGS="$(CGO_LDFLAGS)" \
Expand Down
6 changes: 5 additions & 1 deletion cmd/webhook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,11 @@ func main() {
log.Printf("no resources config at %s (%v); injecting FLUXION_BACKEND only", path, rerr)
}
}
mutator := &webhook.Mutator{AttributeKeys: attrKeys}
mutator := &webhook.Mutator{
AttributeKeys: attrKeys,
Client: client,
SidecarImage: env("FLUENCE_SIDECAR_IMAGE", ""),
}
log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames())

mux := http.NewServeMux()
Expand Down
95 changes: 94 additions & 1 deletion deploy/fluence-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ metadata:
rules:
- apiGroups: ["scheduling.k8s.io"]
resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"]
verbs: ["get", "list", "watch", "update", "patch"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create", "get", "update", "list", "watch"]
Expand All @@ -72,6 +72,17 @@ rules:
- apiGroups: ["admissionregistration.k8s.io"]
resources: ["mutatingwebhookconfigurations"]
verbs: ["get", "list", "watch", "patch"]
# The webhook creates per-namespace sidecar RBAC on demand when a leader
# pod is admitted, so users do not need to apply RBAC manually.
- apiGroups: [""]
resources: ["serviceaccounts"]
verbs: ["get", "create"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "create"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources: ["roles", "rolebindings"]
verbs: ["get", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down Expand Up @@ -177,6 +188,11 @@ spec:
# Allows for kind load
imagePullPolicy: Never
command: ["/bin/fluence-webhook"]
env:
# Use busybox as sidecar image in tests — avoids pulling the real
# sidecar image which is large and not cached in CI.
- name: FLUENCE_SIDECAR_IMAGE
value: "busybox:latest"
ports:
- containerPort: 8443
readinessProbe:
Expand Down Expand Up @@ -222,3 +238,80 @@ webhooks:
- key: kubernetes.io/metadata.name
operator: NotIn
values: ["kube-system"]
---
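# Fluence quantum sidecar: supporting resources
# (header carried over from fluence-sidecar.yaml)
#
# Supporting resources for the Fluence quantum sidecar. The sidecar's
# per-namespace RBAC is not defined here: the webhook creates it on demand
# when a leader pod is admitted (see the ClusterRole rules in this file).
#
# The sidecar runs inside a leader pod and needs:
#   - patch/annotate on pods in its own namespace (to ungate workers and
#     propagate the task ARN annotation)
#
# The sidecar ServiceAccount is namespace-scoped — it only has permissions
# in the namespace where the workflow runs. The webhook sets
# spec.serviceAccountName on the leader pod to fluence-sidecar.
#
# The SDK interceptor ConfigMap holds fluence_braket_intercept.py, which
# the webhook mounts into user containers as a Python startup hook,
# transparently tagging every device.run() call with the pod UID.
#
# The resources below are part of this manifest and are applied with it:
#   kubectl apply -f deploy/fluence-test.yaml
# NOTE(review): the original header said
#   kubectl apply -f deploy/fluence-sidecar.yaml
# — confirm whether that standalone file still exists.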


---
# High-priority class for the classical pods that accompany quantum work.
# The webhook assigns it to worker pods at the time they are gated. Once a
# worker is ungated, the high priority triggers preemption of lower-priority
# work, so the worker gets a node immediately as the QPU result arrives.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  labels:
    app: fluence
  name: fluence-quantum-classical
description: "High priority for classical pods paired with quantum work. Set by Fluence webhook."
# Never applied implicitly: pods must be given this class explicitly.
globalDefault: false
preemptionPolicy: PreemptLowerPriority
value: 1000000
---
# SDK interceptor ConfigMap — holds the Python hook that patches
# AwsDevice.run() to tag every quantum task with the pod UID.
#
# Mounted at: /etc/fluence/fluence_braket_intercept.py
# PYTHONSTARTUP is set to this path by the webhook.
#
# NOTE(review): CPython executes the PYTHONSTARTUP file only for interactive
# sessions. Confirm the webhook also arranges loading for non-interactive
# runs (e.g. `python script.py`); an earlier version of this comment
# described a sitecustomize hook in site-packages instead.
# NOTE(review): pods can only mount ConfigMaps from their own namespace;
# presumably the webhook copies this object into the workload namespace —
# confirm against the webhook code.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluence-braket-interceptor
  namespace: kube-system
  labels:
    app: fluence
data:
  fluence_braket_intercept.py: |
    # Injected by the Fluence webhook into every pod requesting a QPU resource.
    # Patches AwsDevice.run() to automatically tag every quantum task submission
    # with the pod UID, enabling the fluence-sidecar to find the task without
    # any user application changes.
    import functools
    import os


    def _install_interceptor():
        """Wrap AwsDevice.run so each task carries a fluence-pod-uid tag.

        Does nothing when the Braket SDK is not importable, and leaves calls
        untouched when FLUENCE_POD_UID is unset or empty.
        """
        try:
            from braket.aws import AwsDevice
        except ImportError:
            # Braket SDK is not installed in this image: nothing to patch.
            return

        _original_run = AwsDevice.run

        # Preserve the original method's name and docstring on the wrapper.
        @functools.wraps(_original_run)
        def _patched_run(self, task_specification, *args, **kwargs):
            pod_uid = os.environ.get("FLUENCE_POD_UID", "")
            if pod_uid:
                # Copy so the caller's dict is never mutated; `or {}` also
                # covers an explicit tags=None from the caller.
                tags = dict(kwargs.get("tags") or {})
                tags["fluence-pod-uid"] = pod_uid
                kwargs["tags"] = tags
            return _original_run(self, task_specification, *args, **kwargs)

        AwsDevice.run = _patched_run


    _install_interceptor()
89 changes: 88 additions & 1 deletion deploy/fluence.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ metadata:
rules:
- apiGroups: ["scheduling.k8s.io"]
resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"]
verbs: ["get", "list", "watch", "update", "patch"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create", "get", "update", "list", "watch"]
Expand All @@ -72,6 +72,17 @@ rules:
- apiGroups: ["admissionregistration.k8s.io"]
resources: ["mutatingwebhookconfigurations"]
verbs: ["get", "list", "watch", "patch"]
# The webhook creates per-namespace sidecar RBAC on demand when a leader
# pod is admitted, so users do not need to apply RBAC manually.
- apiGroups: [""]
resources: ["serviceaccounts"]
verbs: ["get", "create"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "create"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources: ["roles", "rolebindings"]
verbs: ["get", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down Expand Up @@ -218,3 +229,79 @@ webhooks:
- key: kubernetes.io/metadata.name
operator: NotIn
values: ["kube-system"]
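# Fluence quantum sidecar: supporting resources
# (header carried over from fluence-sidecar.yaml)
#
# Supporting resources for the Fluence quantum sidecar. The sidecar's
# per-namespace RBAC is not defined here: the webhook creates it on demand
# when a leader pod is admitted (see the ClusterRole rules in this file).
#
# The sidecar runs inside a leader pod and needs:
#   - patch/annotate on pods in its own namespace (to ungate workers and
#     propagate the task ARN annotation)
#
# The sidecar ServiceAccount is namespace-scoped — it only has permissions
# in the namespace where the workflow runs. The webhook sets
# spec.serviceAccountName on the leader pod to fluence-sidecar.
#
# The SDK interceptor ConfigMap holds fluence_braket_intercept.py, which
# the webhook mounts into user containers as a Python startup hook,
# transparently tagging every device.run() call with the pod UID.
#
# The resources below are part of this manifest and are applied with it:
#   kubectl apply -f deploy/fluence.yaml
# NOTE(review): the original header said
#   kubectl apply -f deploy/fluence-sidecar.yaml
# — confirm whether that standalone file still exists.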


---
# High-priority class for the classical pods that accompany quantum work.
# The webhook assigns it to worker pods at the time they are gated. Once a
# worker is ungated, the high priority triggers preemption of lower-priority
# work, so the worker gets a node immediately as the QPU result arrives.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  labels:
    app: fluence
  name: fluence-quantum-classical
description: "High priority for classical pods paired with quantum work. Set by Fluence webhook."
# Never applied implicitly: pods must be given this class explicitly.
globalDefault: false
preemptionPolicy: PreemptLowerPriority
value: 1000000
---
# SDK interceptor ConfigMap — holds the Python hook that patches
# AwsDevice.run() to tag every quantum task with the pod UID.
#
# Mounted at: /etc/fluence/fluence_braket_intercept.py
# PYTHONSTARTUP is set to this path by the webhook.
#
# NOTE(review): CPython executes the PYTHONSTARTUP file only for interactive
# sessions. Confirm the webhook also arranges loading for non-interactive
# runs (e.g. `python script.py`); an earlier version of this comment
# described a sitecustomize hook in site-packages instead.
# NOTE(review): pods can only mount ConfigMaps from their own namespace;
# presumably the webhook copies this object into the workload namespace —
# confirm against the webhook code.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluence-braket-interceptor
  namespace: kube-system
  labels:
    app: fluence
data:
  fluence_braket_intercept.py: |
    # Injected by the Fluence webhook into every pod requesting a QPU resource.
    # Patches AwsDevice.run() to automatically tag every quantum task submission
    # with the pod UID, enabling the fluence-sidecar to find the task without
    # any user application changes.
    import functools
    import os


    def _install_interceptor():
        """Wrap AwsDevice.run so each task carries a fluence-pod-uid tag.

        Does nothing when the Braket SDK is not importable, and leaves calls
        untouched when FLUENCE_POD_UID is unset or empty.
        """
        try:
            from braket.aws import AwsDevice
        except ImportError:
            # Braket SDK is not installed in this image: nothing to patch.
            return

        _original_run = AwsDevice.run

        # Preserve the original method's name and docstring on the wrapper.
        @functools.wraps(_original_run)
        def _patched_run(self, task_specification, *args, **kwargs):
            pod_uid = os.environ.get("FLUENCE_POD_UID", "")
            if pod_uid:
                # Copy so the caller's dict is never mutated; `or {}` also
                # covers an explicit tags=None from the caller.
                tags = dict(kwargs.get("tags") or {})
                tags["fluence-pod-uid"] = pod_uid
                kwargs["tags"] = tags
            return _original_run(self, task_specification, *args, **kwargs)

        AwsDevice.run = _patched_run


    _install_interceptor()
Loading
Loading