Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ env:
jobs:
build:
runs-on: ubuntu-latest
strategy:
# Don't cancel the other agent on a single failure — better to know
# which one broke and which one shipped.
fail-fast: false
matrix:
agent: [codex, claude]
permissions:
contents: read
packages: write
Expand All @@ -36,21 +42,25 @@ jobs:
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
flavor: |
latest=false
tags: |
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=sha,prefix=sha-,format=short
type=raw,value=latest,enable={{is_default_branch}}
type=raw,value=${{ matrix.agent }}-latest,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
type=ref,event=branch,suffix=-${{ matrix.agent }}
type=ref,event=pr,suffix=-${{ matrix.agent }}
type=sha,prefix=sha-,suffix=-${{ matrix.agent }},format=short
type=semver,pattern={{version}},suffix=-${{ matrix.agent }}
type=semver,pattern={{major}}.{{minor}},suffix=-${{ matrix.agent }}

- name: Build and push
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64
push: ${{ github.event_name != 'pull_request' }}
build-args: |
AGENT=${{ matrix.agent }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
cache-from: type=gha,scope=${{ matrix.agent }}
cache-to: type=gha,mode=max,scope=${{ matrix.agent }}
72 changes: 43 additions & 29 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,27 +1,37 @@
# Codex CLI agent image — single-instance codex-cli runtime for apk8s.
# Multi-agent browser shell image — single-instance ttyd-fronted
# terminal that auto-launches an LLM coding agent. Built once per agent
# via the AGENT build arg (codex|claude). Each agent variant gets its
# own image tag (codex-latest, claude-latest) and is deployed as a
# separate pod in apk8s under kubernetes/apps/agents/<agent>-cli.
#
# Runs ttyd → bash with the selected agent CLI (codex or claude) on PATH, served in the browser.
# Identity is provided at runtime via env vars sourced from a k8s Secret
# backed by 1Password (deploy vault, typically `Kubernetes`). gh + git
# are configured by the entrypoint script so commits/PRs from inside the
# pod attribute to codex-prodromou.
# are configured by the entrypoint so commits/PRs from inside the pod
# attribute to <agent>-prodromou.
#
# code-server (VS Code in browser) is intentionally NOT installed here —
# that is a separate concern tracked by WOVED-35.

FROM debian:bookworm-slim

# Major Node.js version; interpolated into the NodeSource setup script URL below.
ARG NODE_VERSION=22
# Agent variant to build. Only `codex` and `claude` pass the validation step
# below; the value also names the runtime user and home directory.
ARG AGENT=codex

ENV DEBIAN_FRONTEND=noninteractive \
# Reject any AGENT other than codex or claude before doing real work, so a
# bad build arg aborts the build with a clear message on stderr.
RUN if [ "$AGENT" != "codex" ] && [ "$AGENT" != "claude" ]; then \
        echo "Unsupported AGENT: $AGENT (expected codex|claude)" >&2; \
        exit 1; \
    fi

ENV AGENT=${AGENT} \
DEBIAN_FRONTEND=noninteractive \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
TZ=America/Los_Angeles \
HOME=/home/codex \
PATH=/home/codex/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
HOME=/home/${AGENT} \
PATH=/home/${AGENT}/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# System deps + gh + tmux + standard CLI utilities. ttyd is fetched
# separately below — Debian Bookworm doesn't carry it.
# System deps + gh + tmux + Node + bubblewrap + standard CLI utilities.
# ttyd is fetched separately below — Debian Bookworm doesn't carry it.
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
Expand All @@ -30,9 +40,11 @@ RUN set -eux; \
build-essential python3 python3-pip \
bubblewrap \
passwd; \
# Node.js from NodeSource (pinned major version). The previous
# node:*-bookworm-slim base shipped a phantom uid/gid 1000 user that
# collided with the codex user we add below.
# Node.js from NodeSource (pinned major version). NodeSource ships
# npm slightly behind upstream; we keep what they bundle since
# `npm install -g npm@latest` triggers a self-upgrade module-resolution
# bug at build time, and the bundled version works fine for installing
# the agent CLIs.
curl -fsSL "https://deb.nodesource.com/setup_${NODE_VERSION}.x" | bash -; \
apt-get install -y --no-install-recommends nodejs; \
# GitHub CLI from official apt repo.
Expand All @@ -45,10 +57,7 @@ RUN set -eux; \
apt-get install -y --no-install-recommends gh; \
rm -rf /var/lib/apt/lists/*

# ttyd — fetch the upstream static binary release. Debian Bookworm
# doesn't ship a ttyd package, and building from source pulls in
# libwebsockets + cmake + a long toolchain. The upstream releases
# publish per-arch static binaries that we drop into /usr/local/bin.
# ttyd — fetch the upstream static binary release.
ARG TTYD_VERSION=1.7.7
RUN set -eux; \
arch="$(dpkg --print-architecture)"; \
Expand All @@ -63,24 +72,29 @@ RUN set -eux; \
chmod +x /usr/local/bin/ttyd; \
ttyd --version

# Codex CLI (OpenAI's terminal coding agent).
RUN npm install -g @openai/codex && npm cache clean --force
# Per-agent CLI install. Both agents ship as npm packages; the global
# install puts `codex` or `claude` on PATH for the non-root user.
#
# CODEX_VERSION / CLAUDE_CODE_VERSION accept any npm version or dist-tag so
# a build can pin a known-good release (--build-arg CODEX_VERSION=x.y.z).
# The default `latest` is what an unversioned `npm install` resolves to, so
# builds that pass no extra args behave as before.
#
# The catch-all branch fails the build instead of silently producing an
# image with no agent CLI if AGENT ever reaches this step unvalidated.
ARG CODEX_VERSION=latest
ARG CLAUDE_CODE_VERSION=latest
RUN set -eu; \
    case "$AGENT" in \
        codex)  npm install -g "@openai/codex@${CODEX_VERSION}" ;; \
        claude) npm install -g "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}" ;; \
        *) echo "Unsupported AGENT: $AGENT (expected codex|claude)" >&2; exit 1 ;; \
    esac; \
    npm cache clean --force

# Non-root user. Same uid/gid as bjw-s defaults so PVCs work cleanly.
RUN groupadd -g 1000 codex \
&& useradd -m -u 1000 -g 1000 -s /bin/bash codex \
&& mkdir -p /home/codex/.config /home/codex/workspace \
&& chown -R codex:codex /home/codex
# Unprivileged runtime account: uid/gid 1000, named after AGENT. Sharing
# the name with the agent keeps PVC ownership easy to read and means the
# shell prompt always shows which agent is actually running.
RUN set -e; \
    groupadd --gid 1000 "${AGENT}"; \
    useradd --create-home --uid 1000 --gid 1000 --shell /bin/bash "${AGENT}"; \
    mkdir -p "/home/${AGENT}/.config" "/home/${AGENT}/workspace"; \
    chown -R "${AGENT}:${AGENT}" "/home/${AGENT}"

# Entrypoint + bash profile.
COPY --chmod=0755 bin/entrypoint.sh /usr/local/bin/entrypoint.sh
COPY --chown=codex:codex profile/.bashrc /home/codex/.bashrc
COPY --chown=codex:codex profile/.tmux.conf /home/codex/.tmux.conf
# Entrypoint script, installed with mode 0755 so it is executable.
COPY --chmod=0755 bin/entrypoint.sh /usr/local/bin/entrypoint.sh
# Shell and tmux profiles go into the agent's home, owned by the agent user.
# --chown is given names rather than numeric ids, so the user/group created
# earlier in this file must already exist when these layers are built.
COPY --chown=${AGENT}:${AGENT} profile/.bashrc /home/${AGENT}/.bashrc
COPY --chown=${AGENT}:${AGENT} profile/.tmux.conf /home/${AGENT}/.tmux.conf

USER codex
WORKDIR /home/codex/workspace
# Drop root: the container process and any later build steps run as the
# agent user.
USER ${AGENT}
# Start in the workspace directory created together with the home directory.
WORKDIR /home/${AGENT}/workspace

# Documents the web-terminal port. NOTE(review): presumably the port ttyd is
# started on by entrypoint.sh — confirm there.
EXPOSE 7681

# tini reaps zombies; entrypoint sets up identity then exec's ttyd.
ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/entrypoint.sh"]
135 changes: 86 additions & 49 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,57 +1,82 @@
# codex-shell

Container image for running [OpenAI codex-cli](https://github.com/openai/codex)
as a single, browser-accessible shell on Kubernetes — codex CLI exposed over
ttyd, identity locked to a dedicated GitHub user, persistent home volume.
Multi-agent browser shell image. One Dockerfile, two image variants —
`codex` (OpenAI's [codex-cli](https://github.com/openai/codex)) and
`claude` (Anthropic's [Claude Code](https://github.com/anthropics/claude-code))
— each exposed over ttyd as a single, browser-accessible terminal on
Kubernetes. Identity is locked to a dedicated GitHub user per agent;
home directory persists on a PVC; canonical Nate-org instructions
(from [`nprodromou/agent-config`](https://github.com/nprodromou/agent-config))
are pulled at boot and symlinked into the agent's expected path.

The repo name is a historical artifact — it started as codex-only and
gained the claude variant later. Image content is cluster-agnostic; the
canonical deploy lives in [`nprodromou/apk8s`](https://github.com/nprodromou/apk8s)
under `kubernetes/apps/agents/{codex,claude}-cli`.

`code-server` (VS Code in the browser) is intentionally **not** in this
image — that's a separate concern tracked by WOVED-35.

## Images

| Variant | Tag | Agent CLI |
| ------- | -------------------------------------------------- | ------------------------ |
| codex | `ghcr.io/nprodromou/codex-shell:codex-latest` | `@openai/codex` |
| claude | `ghcr.io/nprodromou/codex-shell:claude-latest` | `@anthropic-ai/claude-code` |

Both are built from the same `Dockerfile` via the `AGENT` build arg
(`codex` or `claude`). The build matrix in `.github/workflows/build.yml`
publishes both variants on every push to `main`. Per-commit tags follow
the pattern `sha-<short-sha>-{codex,claude}` (for example `sha-1a2b3c4-codex`) for pinning.

Cluster-agnostic: the image runs anywhere Kubernetes can pull from GHCR. The
canonical deploy lives in `nprodromou/apk8s`, but nothing in the image is
specific to that cluster.
## Runtime contract

## What it is
The entrypoint requires the following environment variables. They are
mounted into the pod by an `ExternalSecret` that pulls from the deploy's
1Password vault (typically `Kubernetes`) per the canonical
[Agent Secret Naming Convention](https://prodromou.atlassian.net/wiki/spaces/Operations/pages/63438850).

A long-running pod that exposes a `bash` shell with `codex` (and `gh`, `git`,
`tmux`, etc.) on `PATH` over [ttyd](https://github.com/tsl0922/ttyd). Hit it
from a browser and you get a terminal. Identity is locked to a dedicated
GitHub user (`codex-prodromou` in the canonical deploy) so commits, PRs, and
Plane tickets attribute deterministically — no more `gh auth` collisions with
whichever identity a developer machine logged in last.
### Common (both agents)

This image is the runtime; the cluster manifests for the canonical deploy
live in [`nprodromou/apk8s` → `kubernetes/apps/agents/codex-cli`](https://github.com/nprodromou/apk8s).
| Env var | 1Password reference | Purpose |
| ---------------- | ----------------------------------------------------- | -------------------------------------------------- |
| `GH_TOKEN` | `op://Kubernetes/${agent}-github-pat/pat` | GitHub PAT (`${agent}-prodromou`); used by `gh` |
| `GIT_USER_NAME` | `op://Kubernetes/${agent}-github-pat/git_user_name` | Defaults to `${Agent} CoWork` |
| `GIT_USER_EMAIL` | `op://Kubernetes/${agent}-github-pat/git_user_email` | Defaults to `${agent}@prodromou.com` |
| `PLANE_TOKEN` | `op://Kubernetes/${agent}-plane-token/token` | Plane API key for the agent's workspace user |

`code-server` (VS Code in the browser) is intentionally **not** in this image
— see WOVED-35 for that.
### Codex-specific

## Image
| Env var | 1Password reference | Purpose |
| ---------------- | ----------------------------------------- | -------------------------------------------------------------------------------- |
| `CODEX_SESSION` | `op://Kubernetes/codex-session/session` | OpenAI Codex CLI auth blob. Optional. Seeds `~/.codex/auth.json` on first boot. |

```
ghcr.io/nprodromou/codex-shell:latest
```
### Claude-specific

Built by `.github/workflows/build.yml` on push to `main` or version tag.

## Runtime contract
Claude Code uses interactive `/login` on first connect — no env-var
session seed. Credentials persist on the PVC at `~/.claude/`.

The entrypoint requires the following environment variables. They are mounted
into the pod by an `ExternalSecret` that pulls from the deploy's 1Password
vault (typically `Kubernetes`) per the canonical [Agent Secret Naming
Convention](https://prodromou.atlassian.net/wiki/spaces/Operations/pages/63438850).

| Env var | 1Password reference | Purpose |
| ---------------- | ------------------------------------------------ | -------------------------------------------------- |
| `GH_TOKEN` | `op://Kubernetes/codex-github-pat/pat` | GitHub PAT (`codex-prodromou`); used by `gh` |
| `CODEX_SESSION` | `op://Kubernetes/codex-session/session` | OpenAI Codex CLI auth blob |
| `PLANE_TOKEN` | `op://Kubernetes/codex-plane-token/token` | Plane API key for `codex-prodromou` workspace user |
| `GIT_USER_NAME` | `op://Kubernetes/codex-github-pat/git_user_name` | Defaults to `Codex CoWork` |
| `GIT_USER_EMAIL` | `op://Kubernetes/codex-github-pat/git_user_email` | Defaults to `codex@prodromou.com` |

Optional:
### Optional (both agents)

| Env var | Default |
| ------------------- | ------------------------------------------------------- |
| `PLANE_GATEWAY_URL` | `https://n8n.prodromou.com/webhook/plane-gateway-v21` |

## How a connect works

1. ttyd accepts the browser connection and runs the configured shell command.
2. Entrypoint has already wired `gh`, `git`, the agent's auth state, and
pulled the latest `nprodromou/agent-config` into `~/.agent-config`,
symlinking `instructions/CLAUDE.md` into:
- **codex:** `~/.codex/AGENTS.md`
- **claude:** `~/.claude/CLAUDE.md`
3. The shell command attempts to resume the most recent session:
- **codex:** `codex resume --last`
- **claude:** `claude --continue`
4. If no prior session exists, the command falls through to a fresh agent run.
5. If the agent exits or crashes, the command drops to an interactive bash login so
the pod isn't bricked.

## Ports

| Port | Purpose |
Expand All @@ -60,31 +85,43 @@ Optional:

## Persistence

The pod's `/home/codex` is backed by a Longhorn `ReadWriteOnce` PVC declared in
the apk8s manifests. That gives you durable shell history, codex-cli session
state, and any cloned repos under `~/workspace`.
The pod's `/home/${AGENT}` is backed by a Longhorn `ReadWriteOnce` PVC
declared in the apk8s manifests. That gives you durable shell history,
agent session state, persisted auth tokens, and any cloned repos under
`~/workspace`.

## Developing locally

```sh
# Build
docker build -t codex-shell:dev .
# Build the codex variant.
docker build -t codex-shell:codex --build-arg AGENT=codex .

# Or the claude variant.
docker build -t codex-shell:claude --build-arg AGENT=claude .

# Run with the env vars the entrypoint expects.
docker run --rm -it -p 7681:7681 \
-e GH_TOKEN="$(gh auth token)" \
-e GIT_USER_NAME="Local Test" \
-e GIT_USER_EMAIL="$(git config user.email)" \
codex-shell:dev
codex-shell:codex
```

Then open <http://localhost:7681>.

## Notes

- The image runs as non-root `codex` (uid 1000).
- Base is `debian:bookworm-slim`; Node 22 from NodeSource; `npm@latest`
installed on top because NodeSource lags upstream.
- The image runs as non-root with the user named after the agent
(`codex` or `claude`), uid/gid 1000. Matches the Longhorn PVC owner so
volume mounts work cleanly.
- `tini` is PID 1 so zombie reaping is handled.
- `tmux` is preinstalled — start a session with `tmux` and your shell survives
closing the browser tab; `tmux attach` to reconnect.
- `gh` uses `GH_TOKEN` automatically; no interactive `gh auth login` needed.
- HTTPS clones via `gh` are seamless because `gh auth setup-git` runs at boot.
- `tmux`, `bubblewrap` (codex sandbox prereq), `ripgrep`-equivalents,
`jq`, `vim`, etc. are preinstalled.
- `gh` uses `GH_TOKEN` automatically; no interactive `gh auth login`
needed. HTTPS clones via `gh` are seamless because `gh auth setup-git`
runs at boot.
- Updates to `nprodromou/agent-config` reach the pod on next restart
(entrypoint pulls and resets to `origin/main`); no image rebuild
needed for instruction changes.
Loading
Loading