-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
118 lines (117 loc) · 5.57 KB
/
docker-compose.yml
File metadata and controls
118 lines (117 loc) · 5.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
services:
  # llamaman: management service built from the local Dockerfile. It publishes
  # the management UI and the Ollama-compatible proxy, and uses the mounted
  # Docker socket to spawn sibling llama-server containers.
  llamaman:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # Management UI
      - "5000:5000"
      # Llamaman proxy (Ollama-compatible API for OpenWebUI)
      - "42069:42069"
      # llama-server instance ports / per-instance idle proxy ports.
      # Maps host:8000-8020 → llamaman container:9000-9020 to avoid conflicts
      # with sibling llama-server containers that bind host:8000-8020 directly
      # in non-proxy mode. Proxy-enabled instances use the 9000-9020 internal
      # range and are reached via the Docker network, not these host bindings.
      - "8000-8020:9000-9020"
    volumes:
      - ./models:/models
      - ./data:/data
      - ./logs:/tmp/llama-logs
      # Docker socket - allows llamaman to spawn llama-server containers
      - /var/run/docker.sock:/var/run/docker.sock
      # GPU monitoring (AMD + Intel Arc): exposes VRAM and utilization via sysfs.
      # Safe to leave on NVIDIA hosts - it is ignored when pynvml is available.
      - /sys/class/drm:/sys/class/drm:ro
    environment:
      - MODELS_DIR=/models
      - DATA_DIR=/data
      - LOGS_DIR=/tmp/llama-logs
      # HOST_MODELS_DIR / HOST_LOGS_DIR must be the real paths on the Docker HOST.
      # The Docker daemon uses these as bind-mount sources when spawning sibling
      # llama-server containers, so they must resolve on the host, not inside
      # the llamaman container. They therefore have to be ABSOLUTE paths that
      # point at the same host directories as the ./models and ./logs volume
      # sources above; a relative value such as ./models cannot be resolved by
      # the daemon.
      # Default: <directory compose is launched from>/models and .../logs, taken
      # from the shell's PWD. Set HOST_MODELS_DIR / HOST_LOGS_DIR in the
      # environment or in .env to override - required when PWD is not exported
      # or when compose is started from a directory other than this file's.
      - HOST_MODELS_DIR=${HOST_MODELS_DIR:-${PWD}/models}
      - HOST_LOGS_DIR=${HOST_LOGS_DIR:-${PWD}/logs}
      # Host-side port range; matches the host half of the "8000-8020:9000-9020"
      # mapping in `ports` above.
      - PORT_RANGE_START=8000
      - PORT_RANGE_END=8020
      # Container-side port range; matches the container half of that mapping.
      - INTERNAL_PORT_RANGE_START=9000
      - INTERNAL_PORT_RANGE_END=9020
      - LLAMAMAN_MAX_MODELS=1
      # Must match the proxy port published in `ports` above.
      - LLAMAMAN_PROXY_PORT=42069
      - LLAMAMAN_IDLE_TIMEOUT=0
      - HEALTH_CHECK_TIMEOUT=3
      - MODEL_LOAD_TIMEOUT=300
      - REQUEST_TIMEOUT=300
      # Docker-in-Docker settings
      #
      # LLAMA_IMAGE: which llama.cpp image to use for spawned containers.
      # Auto-selected from detected GPU vendor if not set.
      #   NVIDIA:    ghcr.io/ggml-org/llama.cpp:server-cuda (default)
      #   AMD ROCm:  ghcr.io/ggml-org/llama.cpp:server-rocm
      #   Intel Arc: ghcr.io/ggml-org/llama.cpp:server-sycl
      #   CPU only:  ghcr.io/ggml-org/llama.cpp:server
      - LLAMA_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
      # Must match the name of the top-level network this service joins.
      - LLAMA_NETWORK=llamaman-net
      - LLAMA_CONTAINER_PREFIX=llamaman-
      # REQUIRED. A unique string identifying THIS deployment. Any value works
      # (a hostname, a uuid, "srv1", ...). It is the partition key for every
      # instance/download/per-node setting in storage, so pick once and keep
      # forever - changing it later orphans this node's existing state. The app
      # refuses to start without it.
      - LLAMAMAN_NODE_NAME=srv1
      # How llamaman reaches spawned llama-server containers is auto-detected:
      # in Docker it uses the Docker network (by container name); bare-metal it
      # uses localhost on the published ports. Override only if detection is
      # wrong for your runtime: LLAMAMAN_IN_DOCKER=true|false.
      # - LLAMAMAN_IN_DOCKER=true
      # GPU_TYPE: override GPU auto-detection ("cuda", "rocm", "intel").
      # Leave unset to let llamaman probe the host automatically.
      # - GPU_TYPE=cuda
      # Restrict which GPUs are visible to all llama-server containers.
      # Comma-separated indices, e.g. "0,1,3". Leave unset to expose all GPUs.
      # - LLAMA_GPU_DEVICES=0,1,3
      # Uncomment to set a fixed Flask session secret (required for multi-replica)
      # - SECRET_KEY=change-me-to-something-random
      # Uncomment to use MariaDB/MySQL instead of JSON file storage
      # - DATABASE_URL=mysql+pymysql://llama:password@mariadb:3306/llamaman
      #
      # Clustering (optional) - run several llamaman deployments as one cluster.
      # Off by default; single-node installs ignore all of these. The node's
      # identity comes from LLAMAMAN_NODE_NAME above (required for all installs).
      # Requirements: every node must share the SAME storage backend (point them
      # all at the same DATABASE_URL above) and the SAME CLUSTER_SECRET. Nodes
      # discover each other automatically through the shared registry, keyed by
      # LLAMAMAN_NODE_NAME (which must be unique across the cluster).
      # - CLUSTER_ENABLED=true
      # Shared bearer token for all node-to-node calls. Use a long random value,
      # identical on every node. Send it over a trusted network or behind TLS.
      # - CLUSTER_SECRET=change-me-to-a-long-shared-random-secret
      # How peers reach THIS node's UI/API (a hostname/IP routable from the
      # OTHER hosts, not localhost), e.g. http://srv1:5000.
      # Needed only for cross-node ACTIONS - launching/pulling/downloading on
      # this node from another node's UI, and shared-queue inference forwarded
      # to this node. The shared-DB dashboard (seeing this node's stats and
      # instances) works without it; a node with no advertise URL just shows up
      # as view-only and is skipped as an inference target.
      # - CLUSTER_ADVERTISE_URL=http://srv1:5000
    # NVIDIA native GPU monitoring (pynvml) - uncomment on NVIDIA hosts.
    # Grants the llamaman container read-only NVML access (no compute).
    # Remove or leave commented on AMD/Intel hosts.
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           capabilities: [utility]
    networks:
      - llamaman-net
    # Uncomment together with a `mariadb` service when using DATABASE_URL above.
    # depends_on:
    #   mariadb:
    #     condition: service_healthy
    #     required: false
    restart: unless-stopped
# User-defined bridge network joined by the llamaman service. The explicit
# `name` keeps the network called exactly "llamaman-net" (no Compose project
# prefix), which is the value the service passes as LLAMA_NETWORK.
networks:
  llamaman-net:
    driver: bridge
    name: llamaman-net