# docker-compose.yml - llamaMan (github.com/nullata/llamaMan, branch: main)
services:
  # llamaman: management UI + Ollama-compatible proxy. It is given the host's
  # Docker socket so it can spawn sibling llama-server containers.
  llamaman:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # Management UI
      - "5000:5000"
      # Llamaman proxy (Ollama-compatible API for OpenWebUI)
      - "42069:42069"
      # llama-server instance ports / per-instance idle proxy ports.
      # Maps host:8000-8020 → llamaman container:9000-9020 to avoid conflicts
      # with sibling llama-server containers that bind host:8000-8020 directly
      # in non-proxy mode. Proxy-enabled instances use the 9000-9020 internal
      # range and are reached via the Docker network, not these host bindings.
      - "8000-8020:9000-9020"
    volumes:
      - ./models:/models
      - ./data:/data
      - ./logs:/tmp/llama-logs
      # Docker socket - allows llamaman to spawn llama-server containers.
      # SECURITY: access to this socket is effectively root-level control of
      # the host's Docker daemon; only expose the UI/API to trusted networks.
      - /var/run/docker.sock:/var/run/docker.sock
      # GPU monitoring (AMD + Intel Arc): exposes VRAM and utilization via sysfs.
      # Safe to leave on NVIDIA hosts - it is ignored when pynvml is available.
      - /sys/class/drm:/sys/class/drm:ro
    environment:
      - MODELS_DIR=/models
      - DATA_DIR=/data
      - LOGS_DIR=/tmp/llama-logs
      # HOST_MODELS_DIR / HOST_LOGS_DIR must be the real paths on the Docker HOST.
      # The Docker daemon uses these as bind-mount sources when spawning sibling
      # llama-server containers, so they must resolve on the host, not inside
      # the llamaman container, and must match the volume source paths above.
      # Compose does not expand relative paths inside `environment` (only in
      # `volumes`), so a literal "./models" would reach the app unresolved.
      # The defaults below therefore build absolute paths from $PWD, which is
      # correct when `docker compose` is run from this file's directory (and
      # the shell exports PWD). Otherwise set HOST_MODELS_DIR / HOST_LOGS_DIR
      # in the shell or in a .env file in the project directory.
      - "HOST_MODELS_DIR=${HOST_MODELS_DIR:-${PWD}/models}"
      - "HOST_LOGS_DIR=${HOST_LOGS_DIR:-${PWD}/logs}"
      - PORT_RANGE_START=8000
      - PORT_RANGE_END=8020
      - INTERNAL_PORT_RANGE_START=9000
      - INTERNAL_PORT_RANGE_END=9020
      - LLAMAMAN_MAX_MODELS=1
      - LLAMAMAN_PROXY_PORT=42069
      - LLAMAMAN_IDLE_TIMEOUT=0
      - HEALTH_CHECK_TIMEOUT=3
      - MODEL_LOAD_TIMEOUT=300
      - REQUEST_TIMEOUT=300
      # Docker-in-Docker settings
      #
      # LLAMA_IMAGE: which llama.cpp image to use for spawned containers.
      # Auto-selected from detected GPU vendor if not set.
      #   NVIDIA:     ghcr.io/ggml-org/llama.cpp:server-cuda   (default)
      #   AMD ROCm:   ghcr.io/ggml-org/llama.cpp:server-rocm
      #   Intel Arc:  ghcr.io/ggml-org/llama.cpp:server-sycl
      #   CPU only:   ghcr.io/ggml-org/llama.cpp:server
      # NOTE: the line below pins the CUDA image explicitly, so auto-selection
      # is not in effect as shipped. On AMD/Intel/CPU-only hosts, change it to
      # the matching image above or comment it out to use auto-selection.
      - LLAMA_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
      - LLAMA_NETWORK=llamaman-net
      - LLAMA_CONTAINER_PREFIX=llamaman-
      # REQUIRED. A unique string identifying THIS deployment. Any value works
      # (a hostname, a uuid, "srv1", ...). It is the partition key for every
      # instance/download/per-node setting in storage, so pick once and keep
      # forever - changing it later orphans this node's existing state. The app
      # refuses to start without it.
      - LLAMAMAN_NODE_NAME=srv1
      # How llamaman reaches spawned llama-server containers is auto-detected:
      # in Docker it uses the Docker network (by container name); bare-metal it
      # uses localhost on the published ports. Override only if detection is
      # wrong for your runtime: LLAMAMAN_IN_DOCKER=true|false.
      # - LLAMAMAN_IN_DOCKER=true
      # GPU_TYPE: override GPU auto-detection ("cuda", "rocm", "intel").
      # Leave unset to let llamaman probe the host automatically.
      # - GPU_TYPE=cuda
      # Restrict which GPUs are visible to all llama-server containers.
      # Comma-separated indices, e.g. "0,1,3". Leave unset to expose all GPUs.
      # - LLAMA_GPU_DEVICES=0,1,3
      # Uncomment to set a fixed Flask session secret (required for multi-replica)
      # - SECRET_KEY=change-me-to-something-random
      # Uncomment to use MariaDB/MySQL instead of JSON file storage
      # - DATABASE_URL=mysql+pymysql://llama:password@mariadb:3306/llamaman
      #
      # Clustering (optional) - run several llamaman deployments as one cluster.
      # Off by default; single-node installs ignore all of these. The node's
      # identity comes from LLAMAMAN_NODE_NAME above (required for all installs).
      # Requirements: every node must share the SAME storage backend (point them
      # all at the same DATABASE_URL above) and the SAME CLUSTER_SECRET. Nodes
      # discover each other automatically through the shared registry, keyed by
      # LLAMAMAN_NODE_NAME (which must be unique across the cluster).
      # - CLUSTER_ENABLED=true
      # Shared bearer token for all node-to-node calls. Use a long random value,
      # identical on every node. Send it over a trusted network or behind TLS.
      # - CLUSTER_SECRET=change-me-to-a-long-shared-random-secret
      # How peers reach THIS node's UI/API (a hostname/IP routable from the
      # OTHER hosts, not localhost), e.g. http://srv1:5000.
      # Needed only for cross-node ACTIONS - launching/pulling/downloading on
      # this node from another node's UI, and shared-queue inference forwarded
      # to this node. The shared-DB dashboard (seeing this node's stats and
      # instances) works without it; a node with no advertise URL just shows up
      # as view-only and is skipped as an inference target.
      # - CLUSTER_ADVERTISE_URL=http://srv1:5000
    # NVIDIA native GPU monitoring (pynvml) - uncomment on NVIDIA hosts.
    # Grants the llamaman container read-only NVML access (no compute).
    # Remove or leave commented on AMD/Intel hosts.
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           capabilities: [utility]
    networks:
      - llamaman-net
    # depends_on:
    #   mariadb:
    #     condition: service_healthy
    #     required: false
    restart: unless-stopped

networks:
  # Bridge network the llamaman service attaches to. The explicit `name` pins
  # the actual Docker network name to "llamaman-net" (no Compose project
  # prefix), the same value the service passes as LLAMA_NETWORK.
  llamaman-net:
    name: llamaman-net
    driver: bridge