Merged
89 changes: 88 additions & 1 deletion .dockerignore
@@ -5,4 +5,91 @@ cache/
.llm-cache/
**/.llm-cache/
node_modules/
**/node_modules/
**/node_modules/
logs/
openweights/dashboard/backend/backend.log
openweights/dashboard/backend/backend.pid
build-docker-in-runpod
.env
.env.dev
.env.prod
.env.ow-dev
.env.ow-migrations
openweights/dashboard/backend/.env.ow-dev
openweights/dashboard/backend/.env.ow-migrations
openweights/dashboard/frontend/.env.ow-dev
openweights/dashboard/frontend/.env.ow-migrations
artifacts/
debug/
example/.ipynb_checkpoints/
example/Untitled1.ipynb
dev.py
planb/
vulnerable-code/
openweights/client/.llm-cache
# yeaa
cache
# Bazel
/bazel-*
/bazel-bin
/bazel-genfiles
/bazel-out
/bazel-testlogs
/bazel-workspace

# Bazel symlinks
/bazel-*

# Bazel disk cache
.bazel-cache/

# Bazel IntelliJ plugin
.ijwb/

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

example/ft_job_artifacts/
example/mcq_dataset.jsonl
openweights/jobs/unsloth/logp.ipynb

openweights/dashboard/backend/static/
example/_*
.logs
openweights/jobs/unsloth/check.ipynb
.cache
.logs/
3 changes: 2 additions & 1 deletion .gitignore
@@ -83,4 +83,5 @@ openweights/dashboard/backend/static/
example/_*
.logs
openweights/jobs/unsloth/check.ipynb
.cache
.cache
admin/
82 changes: 82 additions & 0 deletions docs/ttl.md
@@ -0,0 +1,82 @@
# TTL (Time To Live) Feature

The TTL feature provides automatic pod termination to prevent runaway costs and ensure resource cleanup.

## Overview

- **Default TTL**: 24 hours for all pods
- **Automatic termination**: Pods self-terminate when TTL expires
- **Extensible**: TTL can be extended from within the pod
- **Dev mode support**: TTL monitoring runs for both dev and worker instances

## Usage

### Starting pods with custom TTL

```bash
# Start dev instance with default 24-hour TTL
python openweights/cluster/start_runpod.py A100 default --dev_mode=true

# Start dev instance with 2-hour TTL
python openweights/cluster/start_runpod.py A100 default --dev_mode=true --ttl_hours=2

# Start worker with 12-hour TTL
python openweights/cluster/start_runpod.py A100 finetuning --ttl_hours=12
```

### Managing TTL from within a pod

Once inside a pod, use the TTL manager utility:

```bash
# Check current TTL status
python openweights/worker/services/ttl_manager.py --check

# Extend TTL by 5 more hours
python openweights/worker/services/ttl_manager.py --extend 5

# Set TTL to 10 hours from now
python openweights/worker/services/ttl_manager.py --set 10
```

### Manual TTL management

You can also manually update the TTL by editing `~/shutdown.txt`:

```bash
python3 -c "
import datetime, os
# Note: open() does not expand '~', so expand the path explicitly.
path = os.path.expanduser('~/shutdown.txt')
new_time = datetime.datetime.now() + datetime.timedelta(hours=48)
with open(path, 'w') as f:
    f.write(new_time.isoformat())
print(f'TTL extended to {new_time}')
"
```

## How it works

1. **TTL Setup**: When a pod starts, the TTL monitor service calculates the shutdown time and writes it to `~/shutdown.txt`
2. **Monitoring**: A background service checks the shutdown time every minute
3. **Termination**: When the current time exceeds the shutdown time, the service terminates the pod using the RunPod API
4. **Extension**: Jobs or users can extend the TTL by updating the shutdown time in the file
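
Taken together, the monitoring loop might look roughly like the sketch below. This is a hypothetical illustration, not the actual `ttl_monitor.py`: it assumes the `runpod` SDK's `terminate_pod` call and that the pod ID is available via a `RUNPOD_POD_ID` environment variable rather than the metadata API.

```python
# Hypothetical sketch of the TTL monitor loop (the real service may differ).
import datetime
import os
import time

import runpod  # assumes the `runpod` SDK is installed

SHUTDOWN_FILE = os.path.expanduser("~/shutdown.txt")


def main():
    runpod.api_key = os.environ["RUNPOD_API_KEY"]
    pod_id = os.environ["RUNPOD_POD_ID"]  # assumption: pod ID from env var

    # Write the initial shutdown time based on TTL_HOURS (default: 24).
    if not os.path.exists(SHUTDOWN_FILE):
        ttl_hours = float(os.environ.get("TTL_HOURS", "24"))
        shutdown_at = datetime.datetime.now() + datetime.timedelta(hours=ttl_hours)
        with open(SHUTDOWN_FILE, "w") as f:
            f.write(shutdown_at.isoformat())

    while True:
        # Re-read the file every minute so extensions take effect immediately.
        with open(SHUTDOWN_FILE) as f:
            shutdown_at = datetime.datetime.fromisoformat(f.read().strip())
        if datetime.datetime.now() >= shutdown_at:
            try:
                runpod.terminate_pod(pod_id)
            except Exception as exc:
                # Failed termination attempts are retried on the next iteration.
                print(f"Termination failed, retrying in 60s: {exc}")
        time.sleep(60)


if __name__ == "__main__":
    main()
```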

## Architecture

- **TTL Monitor Service**: `openweights/worker/services/ttl_monitor.py`
- **TTL Manager Utility**: `openweights/worker/services/ttl_manager.py`
- **Configuration**: TTL passed via `TTL_HOURS` environment variable
- **Shutdown File**: `~/shutdown.txt` contains the shutdown time as an ISO-format datetime string
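
For example, the shutdown file simply holds a single ISO-formatted timestamp (a hypothetical value):

```
2025-06-01T18:30:00.123456
```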

## Environment Variables

- `TTL_HOURS`: Number of hours for TTL (default: 24)
- `RUNPOD_API_KEY`: RunPod API key for pod termination
- `OW_DEV`: Indicates if running in dev mode (affects other services, not TTL)

## Notes

- TTL monitoring runs for both dev and worker instances
- This provides an additional safety net, especially for dev instances
- The pod ID is automatically detected from the RunPod metadata API
- Failed termination attempts are retried every minute
- The TTL can be reset or extended any number of times before it expires
72 changes: 19 additions & 53 deletions entrypoint.sh
@@ -13,84 +13,50 @@ if [ -n "$PUBLIC_KEY" ]; then
else
echo "[$(date)] No PUBLIC_KEY provided, skipping SSH key setup"
fi
echo "Authorized keys added."

# if OW_COMMIT is set, checkout the commit
# Repository checkout if needed
echo "[$(date)] Checking for OW_COMMIT environment variable"
if [ -n "$OW_COMMIT" ]; then
echo "[$(date)] Starting repository checkout for commit: $OW_COMMIT"
rm -rf openweights
git clone https://github.com/longtermrisk/openweights.git openweights_dev
cd openweights_dev
git checkout $OW_COMMIT
mv openweights ../openweights
cd ..
rm -rf openweights_dev
echo "[$(date)] Repository checkout completed"
echo "[$(date)] Starting repository checkout"
python3 openweights/worker/services/checkout.py
if [ $? -ne 0 ]; then
echo "[$(date)] Repository checkout failed"
exit 1
fi
else
echo "[$(date)] No OW_COMMIT specified, skipping repository checkout"
fi

# Login to huggingface
echo "[$(date)] Attempting to login to Hugging Face"
python3 -c "from huggingface_hub.hf_api import HfFolder; import os; HfFolder.save_token(os.environ['HF_TOKEN'])"
python3 openweights/worker/services/hf_login.py
echo "[$(date)] Hugging Face login completed"

# Generate SSH host keys
echo "[$(date)] Generating SSH host keys"
# Generate SSH host keys and start SSH service
echo "[$(date)] Setting up SSH service"
ssh-keygen -A
echo "[$(date)] SSH host keys generated"

# Start SSH service
echo "[$(date)] Starting SSH service"
service ssh start
echo "[$(date)] SSH service started"

# Print sshd logs to stdout
tail -f /var/log/auth.log &

# Start a simple server that serves the content of main.log on port 10101
# Create main.log if it doesn't exist
touch main.log

# Start a simple Python HTTP server to serve files from logs/
# Start background services
echo "[$(date)] Starting HTTP log server on port 10101"
python3 -c '
import http.server
import socketserver
import os
mkdir logs
python3 openweights/worker/services/log_server.py &

class LogHandler(http.server.SimpleHTTPRequestHandler):
def do_GET(self):
# If path is /logs, serve logs/main
if self.path == "/logs":
file_path = "logs/main"
else:
# Remove leading slash and ensure path is within logs directory
path = self.path.lstrip("/")
file_path = os.path.join("logs", path)

# Check if file exists and is within logs directory
if os.path.exists(file_path) and os.path.commonprefix([os.path.abspath(file_path), os.path.abspath("logs")]) == os.path.abspath("logs"):
self.send_response(200)
self.send_header("Content-type", "text/plain")
self.end_headers()
with open(file_path, "rb") as f:
self.wfile.write(f.read())
else:
self.send_error(404, "File not found")
# Start TTL monitoring service
echo "[$(date)] Starting TTL monitoring service"
python3 openweights/worker/services/ttl_monitor.py &

with socketserver.TCPServer(("", 10101), LogHandler) as httpd:
httpd.serve_forever()
' &

echo "[$(date)] HTTP log server started"
echo "[$(date)] All services started"

# Execute the main application or run in dev mode
if [ "$OW_DEV" = "true" ]; then
echo "[$(date)] Starting in development mode"
exec tail -f /dev/null
else
echo "[$(date)] Starting main application"
exec python3 openweights/worker/main.py \ > >(tee logs/main) \ 2> >(tee -a logs/main >&2)
fi
exec python3 openweights/worker/main.py > >(tee logs/main) 2> >(tee -a logs/main >&2)
fi
29 changes: 29 additions & 0 deletions example/ui.py
@@ -0,0 +1,29 @@
import gradio as gr # type: ignore
from openai import OpenAI # type: ignore

def chat_with(model):
client = OpenAI(base_url="https://ag5a2je35kxz7y-8000.proxy.runpod.net/v1")
def predict(message, history):
messages = []
for human, assistant in history:
messages.append({"role": "user", "content": human})
messages.append({"role": "assistant", "content": assistant})
messages.append({"role": "user", "content": message})

stream = client.chat.completions.create(
model=model,
messages=messages,
stream=True
)

partial_message = ""
for chunk in stream:
if chunk.choices[0].delta.content is not None:
partial_message += chunk.choices[0].delta.content
yield partial_message

gr.ChatInterface(predict).queue().launch()


if __name__ == '__main__':
chat_with('Qwen/Qwen3-235B-A22B-Instruct-2507-FP8')
26 changes: 11 additions & 15 deletions openweights/client/__init__.py
@@ -1,16 +1,6 @@
import asyncio
import atexit
import json
from typing import Optional, BinaryIO, Dict, Any, List, Union
from typing import Optional, Dict, Any
import os
import sys
from postgrest.exceptions import APIError
import hashlib
from datetime import datetime
from openai import OpenAI, AsyncOpenAI
import backoff
import time
from supabase import create_client, Client
from supabase import create_client
from supabase.lib.client_options import ClientOptions

from openweights.client.files import Files, validate_messages, validate_preference_dataset
@@ -20,7 +10,13 @@
from openweights.client.temporary_api import TemporaryApi
from openweights.client.chat import ChatCompletions, AsyncChatCompletions
from openweights.client.utils import group_models_or_adapters_by_model, get_lora_rank
from openweights.client.decorators import supabase_retry

import logging

# Reduce noise to only warnings+errors
for name in ["httpx", "httpx._client", "postgrest", "gotrue", "supabase"]:
logging.getLogger(name).setLevel(logging.WARNING)

def create_authenticated_client(supabase_url: str, supabase_anon_key: str, auth_token: Optional[str] = None):
"""Create a Supabase client with authentication.
@@ -114,15 +110,15 @@ def __init__(self,
setattr(self, name, cls(self))
OpenWeights._INSTANCES.append(self)

@backoff.on_exception(backoff.constant, Exception, interval=1, max_time=60, max_tries=60, on_backoff=lambda details: print(f"Retrying... {details['exception']}"))
@supabase_retry()
def get_organization_id(self) -> str:
"""Get the organization ID associated with the current token"""
result = self._supabase.rpc('get_organization_from_token').execute()
if not result.data:
raise ValueError("Could not determine organization ID from token")
return result.data

@backoff.on_exception(backoff.constant, Exception, interval=1, max_time=60, max_tries=60, on_backoff=lambda details: print(f"Retrying... {details['exception']}"))
@supabase_retry()
def get_organization_name(self):
"""Get the organization ID associated with the current token"""
result = self._supabase.table('organizations')\
@@ -131,7 +127,7 @@ def get_organization_name(self):
.single().execute()
return result.data['name']

@backoff.on_exception(backoff.constant, Exception, interval=1, max_time=60, max_tries=60, on_backoff=lambda details: print(f"Retrying... {details['exception']}"))
@supabase_retry()
def get_hf_org(self):
"""Get organization secrets from the database."""
result = self._supabase.table('organization_secrets')\