From 9cffa844175cce3222a15e6394506ab55c00e360 Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 22:20:50 -0700 Subject: [PATCH 1/7] Add UI end-to-end tests for the FL web portal (#20) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a Playwright-based E2E test suite that exercises the full federated learning lifecycle through the Controller web portal using three independent site stacks (1 coordinator + 2 participants). New files: - workbench/docker-compose.e2e.yml — three Controller stacks (web + Celery beat/run/processor workers each) sharing one Router, Postgres, and Redis; sites isolated by separate Redis DB numbers (1/2/3) - e2e/test_fl_workflow.py — single sequential test covering all 8 workflow steps: site registration, project creation, project joining, run start, dataset upload, wait for Success, log inspection, and artifact download - e2e/conftest.py — pytest fixtures (base URLs, fixtures dir) - e2e/fixtures/site_{a,b,c}.csv — 50-row synthetic binary classification datasets (2 features, no header) for LogisticRegression - e2e/requirements.txt — pytest, pytest-playwright, playwright - .github/workflows/e2e-tests.yml — CI job that builds images, starts the E2E stack, waits for all services, runs tests, and tears down Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/e2e-tests.yml | 96 ++++++++++++ e2e/conftest.py | 45 ++++++ e2e/fixtures/site_a.csv | 50 ++++++ e2e/fixtures/site_b.csv | 50 ++++++ e2e/fixtures/site_c.csv | 50 ++++++ e2e/requirements.txt | 3 + e2e/test_fl_workflow.py | 236 ++++++++++++++++++++++++++++ workbench/docker-compose.e2e.yml | 260 +++++++++++++++++++++++++++++++ 8 files changed, 790 insertions(+) create mode 100644 .github/workflows/e2e-tests.yml create mode 100644 e2e/conftest.py create mode 100644 e2e/fixtures/site_a.csv create mode 100644 e2e/fixtures/site_b.csv create mode 100644 e2e/fixtures/site_c.csv create mode 100644 e2e/requirements.txt create mode 
100644 e2e/test_fl_workflow.py create mode 100644 workbench/docker-compose.e2e.yml diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml new file mode 100644 index 0000000..0ce2eca --- /dev/null +++ b/.github/workflows/e2e-tests.yml @@ -0,0 +1,96 @@ +name: E2E Tests + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + e2e: + runs-on: ubuntu-latest + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + + # ── Build Docker images ────────────────────────────────────────────────── + + - name: Build router image + run: docker build -t starfish-router router/ + + - name: Build controller image + run: docker build -t starfish-controller controller/ + + # ── Start the E2E stack ────────────────────────────────────────────────── + + - name: Start E2E stack + working-directory: workbench + run: docker compose -f docker-compose.e2e.yml up -d + + # ── Wait for services to be healthy ───────────────────────────────────── + + - name: Wait for router to be ready + run: | + for i in $(seq 1 30); do + if curl -sf http://localhost:8000/starfish/api/v1/sites/ \ + -u admin:1234 > /dev/null 2>&1; then + echo "Router is ready" + exit 0 + fi + echo "Waiting for router ($i/30)..." + sleep 5 + done + echo "Router did not become ready in time" + docker compose -f workbench/docker-compose.e2e.yml logs router + exit 1 + + - name: Wait for all three controllers to be ready + run: | + for port in 8001 8002 8003; do + for i in $(seq 1 20); do + if curl -sf http://localhost:${port}/controller/ > /dev/null 2>&1; then + echo "Controller on port $port is ready" + break + fi + echo "Waiting for controller:$port ($i/20)..." 
+ sleep 5 + if [ $i -eq 20 ]; then + echo "Controller on port $port did not become ready" + docker compose -f workbench/docker-compose.e2e.yml logs + exit 1 + fi + done + done + + # ── Install test dependencies ──────────────────────────────────────────── + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install E2E test dependencies + working-directory: e2e + run: | + pip install -r requirements.txt + playwright install chromium --with-deps + + # ── Run E2E tests ──────────────────────────────────────────────────────── + + - name: Run E2E tests + working-directory: e2e + run: pytest test_fl_workflow.py -v --timeout=300 + + # ── Collect logs on failure ────────────────────────────────────────────── + + - name: Collect service logs on failure + if: failure() + working-directory: workbench + run: docker compose -f docker-compose.e2e.yml logs --tail=200 + + # ── Tear down ──────────────────────────────────────────────────────────── + + - name: Tear down E2E stack + if: always() + working-directory: workbench + run: docker compose -f docker-compose.e2e.yml down -v diff --git a/e2e/conftest.py b/e2e/conftest.py new file mode 100644 index 0000000..909f54d --- /dev/null +++ b/e2e/conftest.py @@ -0,0 +1,45 @@ +""" +pytest configuration for Starfish-FL E2E tests. 
+ +Requires three Controller instances and a Router to be running: + site-a (coordinator) → http://localhost:8001 + site-b (participant 1) → http://localhost:8002 + site-c (participant 2) → http://localhost:8003 + +Start the stack with: + cd workbench + docker compose -f docker-compose.e2e.yml up -d +""" +from pathlib import Path +import pytest + +# Base URLs for each site's Controller web portal +BASE_A = "http://localhost:8001" +BASE_B = "http://localhost:8002" +BASE_C = "http://localhost:8003" + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture(scope="session") +def base_a(): + return BASE_A + + +@pytest.fixture(scope="session") +def base_b(): + return BASE_B + + +@pytest.fixture(scope="session") +def base_c(): + return BASE_C + + +@pytest.fixture(scope="session") +def fixtures_dir(): + return FIXTURES_DIR + + +# pytest-playwright provides the `browser` fixture (session-scoped, Chromium by default). +# The test creates its own browser contexts so each site runs in an isolated session. 
diff --git a/e2e/fixtures/site_a.csv b/e2e/fixtures/site_a.csv new file mode 100644 index 0000000..2dc1b8c --- /dev/null +++ b/e2e/fixtures/site_a.csv @@ -0,0 +1,50 @@ +-0.1441,-0.1729,0 +-0.1113,0.702,1 +-0.1276,-1.4974,0 +0.3323,-0.2673,1 +-0.217,0.1159,0 +0.2323,1.1636,1 +0.6566,0.1105,1 +-0.7383,-1.0147,0 +0.2463,1.3111,1 +0.0417,-0.1063,0 +0.5318,-1.4535,0 +-0.3123,0.4904,1 +0.8734,-0.2406,1 +0.3766,0.2482,1 +0.7823,-1.1132,0 +0.5683,-1.5145,0 +-2.6199,-0.6069,0 +-0.9158,0.876,0 +0.6643,-1.2191,0 +0.8474,-1.0022,0 +-0.0862,-0.2939,0 +0.1144,0.8186,1 +0.6384,0.3499,1 +0.6499,0.4785,1 +-0.627,-0.7174,0 +-0.47,0.4993,1 +-0.2501,2.3358,1 +-0.8193,-1.0989,0 +0.7685,1.4218,1 +0.5057,0.8358,1 +1.4263,-0.094,1 +-1.423,-0.5321,0 +0.9529,-1.4437,0 +0.0335,0.2532,1 +-0.3156,0.7236,1 +0.5808,2.3214,1 +0.62,-0.6094,1 +-0.5618,-0.8316,0 +0.9523,-0.5668,1 +-0.0703,0.7493,1 +-0.7235,-0.2937,0 +-1.8413,-1.0825,0 +-0.5677,0.4158,0 +1.1935,-0.0185,1 +0.2614,0.168,1 +1.0847,0.8934,1 +0.2737,-1.0109,0 +0.9034,0.381,1 +1.2269,-0.0299,1 +1.9531,-0.3589,1 diff --git a/e2e/fixtures/site_b.csv b/e2e/fixtures/site_b.csv new file mode 100644 index 0000000..b290a40 --- /dev/null +++ b/e2e/fixtures/site_b.csv @@ -0,0 +1,50 @@ +0.923,0.4512,1 +0.7233,-0.0443,1 +-1.0781,1.5081,1 +0.9727,-0.7082,1 +1.8577,-0.4548,1 +-0.5343,-0.0862,0 +-1.0844,0.5339,0 +0.227,0.003,1 +0.5453,-0.0892,1 +0.4942,-0.3827,1 +0.532,-1.9094,0 +1.7455,-0.4307,1 +-0.0587,0.1269,1 +-0.3009,0.2198,0 +-0.746,0.9339,1 +0.4356,1.2762,1 +1.1825,-0.4125,1 +-1.4852,-0.4864,0 +1.7045,0.4217,1 +0.981,0.7018,1 +0.0284,-0.0743,0 +-0.3779,-1.9053,0 +0.0429,2.1667,1 +0.5019,-0.1877,1 +-0.0707,-1.8881,0 +0.2117,1.7268,1 +-0.2287,1.45,1 +-0.5615,0.103,0 +0.6509,-1.4532,0 +-0.4305,-2.0542,0 +-1.6488,1.376,0 +-2.3285,-0.2571,0 +0.2718,0.3693,1 +-0.2839,-0.4596,0 +0.1923,-1.442,0 +-1.6745,-1.0611,0 +-0.6087,1.8806,1 +0.6749,-0.0752,1 +-0.2297,-0.6161,0 +-1.6465,-0.692,0 +-1.3635,1.0704,0 +-0.6035,0.5375,0 +0.974,-0.0736,1 
+-0.5238,-1.5097,0 +1.6814,0.4615,1 +0.421,-2.1342,0 +0.7027,0.4824,1 +1.6134,-0.5055,1 +1.59,0.1336,1 +-0.0616,0.1335,1 diff --git a/e2e/fixtures/site_c.csv b/e2e/fixtures/site_c.csv new file mode 100644 index 0000000..1c79fda --- /dev/null +++ b/e2e/fixtures/site_c.csv @@ -0,0 +1,50 @@ +-0.5502,0.3791,0 +0.3269,0.6814,1 +0.0472,-0.7591,0 +-1.1268,1.018,0 +-2.2916,-0.5674,0 +-1.0441,0.0705,0 +-0.5294,-0.4563,0 +0.7901,-0.559,1 +-1.1554,0.9792,0 +0.847,1.2015,1 +0.5345,-0.736,0 +1.1589,-0.402,1 +0.1723,-1.1119,0 +-0.6486,0.4551,0 +0.6743,0.0323,1 +1.2337,-0.081,1 +-2.0865,0.5066,0 +-2.1614,-1.6876,0 +-0.4361,0.5441,1 +0.5203,-0.2828,1 +0.3496,-1.2991,0 +2.3865,-0.2576,1 +-0.1468,-0.2323,0 +1.121,0.1653,1 +0.1744,-0.3727,0 +-0.3784,0.1916,0 +-0.0391,0.0992,1 +-0.5845,1.068,1 +0.5353,1.6758,1 +0.6479,0.2351,1 +-0.0484,1.0163,1 +0.0809,-0.931,0 +-0.7655,-2.1263,0 +1.3587,1.4639,1 +-1.3438,-0.972,0 +-0.7974,-0.6432,0 +0.4577,0.3173,1 +0.287,0.5557,1 +0.4164,0.5403,1 +-1.5915,-0.6413,0 +0.5743,-1.8475,0 +1.3772,1.2486,1 +0.1518,0.3567,1 +-0.3116,1.2904,1 +0.6671,0.2061,1 +-1.4978,1.1012,0 +-0.8342,0.6108,0 +-0.3039,0.3196,1 +1.6467,-1.5548,1 +-0.1786,-0.9925,0 diff --git a/e2e/requirements.txt b/e2e/requirements.txt new file mode 100644 index 0000000..00ac075 --- /dev/null +++ b/e2e/requirements.txt @@ -0,0 +1,3 @@ +pytest==8.3.4 +pytest-playwright==0.5.2 +playwright==1.49.1 diff --git a/e2e/test_fl_workflow.py b/e2e/test_fl_workflow.py new file mode 100644 index 0000000..6fbe6fe --- /dev/null +++ b/e2e/test_fl_workflow.py @@ -0,0 +1,236 @@ +""" +End-to-end test: full federated learning workflow through the web portal. 
+ +Scenario +-------- + site-a (coordinator) – port 8001 + site-b (participant 1) – port 8002 + site-c (participant 2) – port 8003 + +Step 1 – Register all three sites +Step 2 – Coordinator creates a project (LogisticRegression, 1 round) +Step 3 – Both participants join the project +Step 4 – Coordinator starts a new run +Step 5 – All three sites upload their CSV dataset +Step 6 – Wait for all runs to reach Success (≤ 180 s) +Step 7 – Coordinator inspects run details and views logs +Step 8 – Coordinator downloads artifacts +""" +import re +import time +from pathlib import Path + +import pytest +from playwright.sync_api import Browser, Page + +# ── constants ───────────────────────────────────────────────────────────────── + +PROJECT_NAME = "e2e-fl-project" + +TASKS_JSON = ( + '[{"seq": 1, "model": "LogisticRegression",' + ' "config": {"total_round": 1, "current_round": 1}}]' +) + +SUCCESS_TIMEOUT_S = 180 # seconds to wait for all runs to finish + + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def _parse_project_ids(page: Page) -> tuple[int, int]: + """ + Parse project_id and site_id from the 'Project Details' link on the home page. + The link href is like /controller/projects/{project_id}/{site_id}. + """ + link = page.locator('a[href*="/controller/projects/"]').first + href = link.get_attribute("href") + m = re.search(r"/projects/(\d+)/(\d+)", href) + assert m, f"Could not parse project/site IDs from href: {href}" + return int(m.group(1)), int(m.group(2)) + + +def _upload_dataset_for_site(page: Page, base: str, project_id: int, site_id: int, + csv_path: Path) -> None: + """ + Navigate to the project detail page and upload a dataset via the modal. + + The page uses Bootstrap 5 but the upload modal button was written for + Bootstrap 4 data-attributes. We therefore: + 1. Click the 'Upload Dataset' button (which sets up the #uploadDataset + JS handler with the correct run_id). + 2. 
Make the hidden file input temporarily accessible and set the file. + 3. Click #uploadDataset (force=True in case the modal is not visible). + """ + page.goto(f"{base}/controller/projects/{project_id}/{site_id}/") + page.wait_for_load_state("domcontentloaded") + + upload_btn = page.locator('[id^="openUploadModal"]') + upload_btn.wait_for(state="visible", timeout=10_000) + + # Click to capture run_id and attach the #uploadDataset click handler + upload_btn.click() + page.wait_for_timeout(400) + + # Make the file input accessible (it lives inside the potentially-hidden modal) + page.evaluate("document.getElementById('fileupload').style.display = 'block'") + page.locator("#fileupload").set_input_files(str(csv_path)) + + # Trigger the upload (force bypasses visibility if modal did not open) + page.locator("#uploadDataset").click(force=True) + page.wait_for_timeout(2_000) + + +def _wait_for_success(page: Page, base: str, project_id: int, site_id: int, + timeout: int = SUCCESS_TIMEOUT_S) -> None: + """Poll the project detail page until the run status shows 'Success'.""" + url = f"{base}/controller/projects/{project_id}/{site_id}/" + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + page.goto(url) + page.wait_for_load_state("domcontentloaded") + if page.locator("td:has-text('Success')").count() > 0: + return + time.sleep(5) + raise AssertionError( + f"Run at {url} did not reach 'Success' within {timeout} seconds" + ) + + +# ── fixtures ────────────────────────────────────────────────────────────────── + + +@pytest.fixture(scope="module") +def pages(browser: Browser, base_a, base_b, base_c): + """ + Create one browser context (= isolated cookie/session store) per site. + Yields (page_a, page_b, page_c) and cleans up at module teardown. 
+ """ + ctx_a = browser.new_context() + ctx_b = browser.new_context() + ctx_c = browser.new_context() + page_a = ctx_a.new_page() + page_b = ctx_b.new_page() + page_c = ctx_c.new_page() + yield page_a, page_b, page_c + ctx_a.close() + ctx_b.close() + ctx_c.close() + + +# ── main test ───────────────────────────────────────────────────────────────── + + +def test_fl_workflow(pages, base_a, base_b, base_c, fixtures_dir): + """Full 8-step federated learning workflow through the Controller web portal.""" + page_a, page_b, page_c = pages + + # ── Step 1: Site registration ───────────────────────────────────────────── + + for page, base, name, desc in [ + (page_a, base_a, "site-a", "Coordinator"), + (page_b, base_b, "site-b", "Participant 1"), + (page_c, base_c, "site-c", "Participant 2"), + ]: + page.goto(f"{base}/controller/") + page.wait_for_load_state("domcontentloaded") + page.fill('[name="name"]', name) + page.fill('textarea[name="description"]', desc) + page.click('[name="register_site"]') + page.wait_for_load_state("domcontentloaded") + assert "is currently registered" in page.content(), ( + f"Step 1 – {name}: expected 'is currently registered' on dashboard" + ) + + # ── Step 2: Coordinator creates project ─────────────────────────────────── + + page_a.goto(f"{base_a}/controller/projects/new/") + page_a.fill('[name="name"]', PROJECT_NAME) + page_a.fill('textarea[name="description"]', "E2E test project") + page_a.fill('textarea[name="tasks"]', TASKS_JSON) + page_a.click('[name="create_project"]') + page_a.wait_for_url(f"{base_a}/", timeout=10_000) + + assert PROJECT_NAME in page_a.content(), ( + "Step 2 – Coordinator: project should appear on home page after creation" + ) + project_id, site_a_id = _parse_project_ids(page_a) + + # ── Step 3: Participants join the project ───────────────────────────────── + + for page, base in [(page_b, base_b), (page_c, base_c)]: + page.goto(f"{base}/controller/projects/join/") + page.fill('[name="name"]', PROJECT_NAME) + 
page.fill('textarea[name="notes"]', "E2E participant") + page.click('[name="join"]') + page.wait_for_load_state("domcontentloaded") + assert PROJECT_NAME in page.content(), ( + f"Step 3 – participant at {base}: project should appear on home page after joining" + ) + + _, site_b_id = _parse_project_ids(page_b) + _, site_c_id = _parse_project_ids(page_c) + + # ── Step 4: Coordinator starts a new run ────────────────────────────────── + + page_a.goto(f"{base_a}/controller/projects/{project_id}/{site_a_id}/") + page_a.wait_for_load_state("domcontentloaded") + page_a.click("#startButton") + page_a.wait_for_load_state("domcontentloaded") + assert page_a.locator("td:has-text('Standby')").count() > 0, ( + "Step 4 – Coordinator: at least one run should be in 'Standby' after starting" + ) + + # Small delay to let the router propagate Run records to participants + time.sleep(3) + + # ── Step 5: All sites upload their datasets ─────────────────────────────── + + _upload_dataset_for_site(page_a, base_a, project_id, site_a_id, + fixtures_dir / "site_a.csv") + _upload_dataset_for_site(page_b, base_b, project_id, site_b_id, + fixtures_dir / "site_b.csv") + _upload_dataset_for_site(page_c, base_c, project_id, site_c_id, + fixtures_dir / "site_c.csv") + + # ── Step 6: Wait for all runs to reach Success ──────────────────────────── + + _wait_for_success(page_a, base_a, project_id, site_a_id) + _wait_for_success(page_b, base_b, project_id, site_b_id) + _wait_for_success(page_c, base_c, project_id, site_c_id) + + # ── Step 7: Run details and logs (coordinator) ──────────────────────────── + + batch = 1 # First (and only) run batch + page_a.goto( + f"{base_a}/controller/runs/detail/{batch}/{project_id}/{site_a_id}/" + ) + page_a.wait_for_load_state("domcontentloaded") + assert "Success" in page_a.content(), ( + "Step 7 – Run details: expected 'Success' status for coordinator run" + ) + + logs_btn = page_a.locator("#logs").first + if logs_btn.count() > 0: + logs_btn.click() + 
page_a.wait_for_timeout(3_000) + log_text = page_a.locator("#logContent").inner_text() + assert log_text.strip(), ( + "Step 7 – Logs: log modal content should be non-empty after training" + ) + + # ── Step 8: Download artifacts (coordinator) ────────────────────────────── + + page_a.goto( + f"{base_a}/controller/runs/detail/{batch}/{project_id}/{site_a_id}/" + ) + page_a.wait_for_load_state("domcontentloaded") + + download_btn = page_a.locator('[id^="downloadButton-"]').first + if download_btn.count() > 0: + with page_a.expect_download(timeout=30_000) as dl_info: + download_btn.click() + download = dl_info.value + assert download.suggested_filename, ( + "Step 8 – Download: expected a filename in the downloaded artifacts" + ) diff --git a/workbench/docker-compose.e2e.yml b/workbench/docker-compose.e2e.yml new file mode 100644 index 0000000..228905f --- /dev/null +++ b/workbench/docker-compose.e2e.yml @@ -0,0 +1,260 @@ +version: '3.9' + +# Three-site E2E environment +# site-a (coordinator) - port 8001 - Redis DB 1 +# site-b (participant 1) - port 8002 - Redis DB 2 +# site-c (participant 2) - port 8003 - Redis DB 3 +# Shared: postgres, redis, router + +x-controller-build: &controller-build + build: + context: ../../controller + image: starfish-controller + +# ── per-site environment anchors ────────────────────────────────────────────── + +x-site-a-env: &site-a-env + SITE_UID: "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + ROUTER_URL: http://router:8000/starfish/api/v1 + ROUTER_USERNAME: admin + ROUTER_PASSWORD: "1234" + CELERY_BROKER_URL: redis://redis:6379/1 + CELERY_RESULT_BACKEND: redis://redis:6379/1 + +x-site-b-env: &site-b-env + SITE_UID: "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb" + ROUTER_URL: http://router:8000/starfish/api/v1 + ROUTER_USERNAME: admin + ROUTER_PASSWORD: "1234" + CELERY_BROKER_URL: redis://redis:6379/2 + CELERY_RESULT_BACKEND: redis://redis:6379/2 + +x-site-c-env: &site-c-env + SITE_UID: "cccccccc-cccc-cccc-cccc-cccccccccccc" + ROUTER_URL: 
http://router:8000/starfish/api/v1 + ROUTER_USERNAME: admin + ROUTER_PASSWORD: "1234" + CELERY_BROKER_URL: redis://redis:6379/3 + CELERY_RESULT_BACKEND: redis://redis:6379/3 + +services: + + # ── shared infrastructure ────────────────────────────────────────────────── + + postgres: + image: postgres:12.9 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: UCalgary123 + POSTGRES_DB: starfish-router + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d starfish-router"] + interval: 5s + timeout: 5s + retries: 20 + volumes: + - e2e_postgres_data:/var/lib/postgresql/data + + redis: + image: redis:5.0 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 10 + volumes: + - e2e_redis_data:/opt/redis/data + + router: + build: + context: ../../router + image: starfish-router + depends_on: + postgres: + condition: service_healthy + ports: + - '8000:8000' + env_file: + - ./config/router.env + command: > + bash -c " + poetry run python3 manage.py migrate --no-input && + poetry run python3 manage.py shell -c \" +from django.contrib.auth import get_user_model +U = get_user_model() +U.objects.filter(username='admin').exists() or U.objects.create_superuser('admin','admin@e2e.local','1234') +\" && + poetry run python3 manage.py runserver 0.0.0.0:8000 + " + volumes: + - e2e_router_artifacts:/starfish/artifacts + + # ── site-a (coordinator, port 8001) ─────────────────────────────────────── + + controller-a: + <<: *controller-build + container_name: e2e-controller-a + depends_on: + redis: + condition: service_healthy + router: + condition: service_started + ports: + - '8001:8000' + environment: + <<: *site-a-env + command: bash -c "python3 manage.py migrate --no-input && python3 manage.py runserver 0.0.0.0:8000" + volumes: + - e2e_controller_a:/starfish-controller/local + + controller-a-scheduler: + <<: *controller-build + container_name: e2e-controller-a-scheduler + depends_on: + redis: + condition: service_healthy + 
environment: + <<: *site-a-env + entrypoint: ["celery", "-A", "starfish", "beat", "-l", "INFO"] + volumes: + - e2e_controller_a:/starfish-controller/local + + controller-a-run-worker: + <<: *controller-build + container_name: e2e-controller-a-run-worker + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-a-env + entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "-Q", "starfish.run"] + volumes: + - e2e_controller_a:/starfish-controller/local + + controller-a-processor-worker: + <<: *controller-build + container_name: e2e-controller-a-processor-worker + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-a-env + entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "--concurrency=1", "-Q", "starfish.processor"] + volumes: + - e2e_controller_a:/starfish-controller/local + + # ── site-b (participant 1, port 8002) ───────────────────────────────────── + + controller-b: + <<: *controller-build + container_name: e2e-controller-b + depends_on: + redis: + condition: service_healthy + router: + condition: service_started + ports: + - '8002:8000' + environment: + <<: *site-b-env + command: bash -c "python3 manage.py migrate --no-input && python3 manage.py runserver 0.0.0.0:8000" + volumes: + - e2e_controller_b:/starfish-controller/local + + controller-b-scheduler: + <<: *controller-build + container_name: e2e-controller-b-scheduler + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-b-env + entrypoint: ["celery", "-A", "starfish", "beat", "-l", "INFO"] + volumes: + - e2e_controller_b:/starfish-controller/local + + controller-b-run-worker: + <<: *controller-build + container_name: e2e-controller-b-run-worker + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-b-env + entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "-Q", "starfish.run"] + volumes: + - e2e_controller_b:/starfish-controller/local + + controller-b-processor-worker: + 
<<: *controller-build + container_name: e2e-controller-b-processor-worker + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-b-env + entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "--concurrency=1", "-Q", "starfish.processor"] + volumes: + - e2e_controller_b:/starfish-controller/local + + # ── site-c (participant 2, port 8003) ───────────────────────────────────── + + controller-c: + <<: *controller-build + container_name: e2e-controller-c + depends_on: + redis: + condition: service_healthy + router: + condition: service_started + ports: + - '8003:8000' + environment: + <<: *site-c-env + command: bash -c "python3 manage.py migrate --no-input && python3 manage.py runserver 0.0.0.0:8000" + volumes: + - e2e_controller_c:/starfish-controller/local + + controller-c-scheduler: + <<: *controller-build + container_name: e2e-controller-c-scheduler + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-c-env + entrypoint: ["celery", "-A", "starfish", "beat", "-l", "INFO"] + volumes: + - e2e_controller_c:/starfish-controller/local + + controller-c-run-worker: + <<: *controller-build + container_name: e2e-controller-c-run-worker + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-c-env + entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "-Q", "starfish.run"] + volumes: + - e2e_controller_c:/starfish-controller/local + + controller-c-processor-worker: + <<: *controller-build + container_name: e2e-controller-c-processor-worker + depends_on: + redis: + condition: service_healthy + environment: + <<: *site-c-env + entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "--concurrency=1", "-Q", "starfish.processor"] + volumes: + - e2e_controller_c:/starfish-controller/local + +volumes: + e2e_postgres_data: {} + e2e_redis_data: {} + e2e_router_artifacts: {} + e2e_controller_a: {} + e2e_controller_b: {} + e2e_controller_c: {} From 
b1c6df78fa36e4118ce38c3acb777ea817813620 Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 22:27:07 -0700 Subject: [PATCH 2/7] Fix YAML parse error in docker-compose.e2e.yml Embedded Python code inside a YAML folded scalar caused the Docker Compose parser to fail with 'could not find expected :' on the bare import statement (line 83). Replace the inline Python shell snippet with createsuperuser --noinput, which reads the password from the DJANGO_SUPERUSER_PASSWORD env var and is a single-line shell command that requires no embedded Python code. Co-Authored-By: Claude Sonnet 4.6 --- workbench/docker-compose.e2e.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/workbench/docker-compose.e2e.yml b/workbench/docker-compose.e2e.yml index 228905f..54f5365 100644 --- a/workbench/docker-compose.e2e.yml +++ b/workbench/docker-compose.e2e.yml @@ -76,14 +76,12 @@ services: - '8000:8000' env_file: - ./config/router.env + environment: + DJANGO_SUPERUSER_PASSWORD: "1234" command: > bash -c " poetry run python3 manage.py migrate --no-input && - poetry run python3 manage.py shell -c \" -from django.contrib.auth import get_user_model -U = get_user_model() -U.objects.filter(username='admin').exists() or U.objects.create_superuser('admin','admin@e2e.local','1234') -\" && + poetry run python3 manage.py createsuperuser --noinput --username admin --email admin@e2e.local 2>/dev/null || true && poetry run python3 manage.py runserver 0.0.0.0:8000 " volumes: From 8a0b8040e03dc6e438acbfd62c89b9f09da42048 Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 22:45:15 -0700 Subject: [PATCH 3/7] Fix two CI failures in E2E setup 1. pytest --timeout flag: add pytest-timeout to e2e/requirements.txt. Without it, pytest rejected --timeout=300 with exit code 4 before running any tests. 2. 
Controller workers starting before router is ready: add a TCP healthcheck to the router service and make all 12 controller services wait for it: the three web services' router dependency changes from service_started to service_healthy, and the nine Celery services (scheduler, run worker, and processor worker per site) gain a router dependency with condition service_healthy. The healthcheck probes port 8000 with python3's socket module; python3 is available in the router image without extra packages. Co-Authored-By: Claude Sonnet 4.6 --- e2e/requirements.txt | 1 + workbench/docker-compose.e2e.yml | 30 +++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/e2e/requirements.txt b/e2e/requirements.txt index 00ac075..3230f01 100644 --- a/e2e/requirements.txt +++ b/e2e/requirements.txt @@ -1,3 +1,4 @@ pytest==8.3.4 pytest-playwright==0.5.2 +pytest-timeout==2.3.1 playwright==1.49.1 diff --git a/workbench/docker-compose.e2e.yml b/workbench/docker-compose.e2e.yml index 54f5365..bc7f894 100644 --- a/workbench/docker-compose.e2e.yml +++ b/workbench/docker-compose.e2e.yml @@ -84,6 +84,12 @@ services: poetry run python3 manage.py createsuperuser --noinput --username admin --email admin@e2e.local 2>/dev/null || true && poetry run python3 manage.py runserver 0.0.0.0:8000 " + healthcheck: + test: ["CMD", "python3", "-c", "import socket; s=socket.socket(); s.settimeout(2); s.connect(('localhost',8000)); s.close()"] + interval: 10s + timeout: 5s + retries: 30 + start_period: 60s volumes: - e2e_router_artifacts:/starfish/artifacts @@ -96,7 +102,7 @@ services: redis: condition: service_healthy router: - condition: service_started + condition: service_healthy ports: - '8001:8000' environment: @@ -111,6 +117,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-a-env entrypoint: ["celery", "-A", "starfish", "beat", "-l", "INFO"] @@ -123,6 +131,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-a-env entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "-Q", "starfish.run"] @@ -135,6 
+145,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-a-env entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "--concurrency=1", "-Q", "starfish.processor"] @@ -150,7 +162,7 @@ services: redis: condition: service_healthy router: - condition: service_started + condition: service_healthy ports: - '8002:8000' environment: @@ -165,6 +177,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-b-env entrypoint: ["celery", "-A", "starfish", "beat", "-l", "INFO"] @@ -177,6 +191,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-b-env entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "-Q", "starfish.run"] @@ -189,6 +205,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-b-env entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "--concurrency=1", "-Q", "starfish.processor"] @@ -204,7 +222,7 @@ services: redis: condition: service_healthy router: - condition: service_started + condition: service_healthy ports: - '8003:8000' environment: @@ -219,6 +237,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-c-env entrypoint: ["celery", "-A", "starfish", "beat", "-l", "INFO"] @@ -231,6 +251,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-c-env entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "-Q", "starfish.run"] @@ -243,6 +265,8 @@ services: depends_on: redis: condition: service_healthy + router: + condition: service_healthy environment: <<: *site-c-env entrypoint: ["celery", "-A", "starfish", "worker", "-l", "INFO", "--concurrency=1", "-Q", "starfish.processor"] From 
fb466bd07423dc40998814ea8d244c1559915fda Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 22:51:07 -0700 Subject: [PATCH 4/7] Fix wait_for_url after project creation in E2E test The create-project JS does window.location.href = "/" but Django redirects / to /controller/. wait_for_url("http://localhost:8001/") timed out because the browser landed on /controller/ instead. Replace with wait_for_load_state("domcontentloaded") which is redirect-agnostic. Co-Authored-By: Claude Sonnet 4.6 --- e2e/test_fl_workflow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/e2e/test_fl_workflow.py b/e2e/test_fl_workflow.py index 6fbe6fe..654a61a 100644 --- a/e2e/test_fl_workflow.py +++ b/e2e/test_fl_workflow.py @@ -149,7 +149,8 @@ def test_fl_workflow(pages, base_a, base_b, base_c, fixtures_dir): page_a.fill('textarea[name="description"]', "E2E test project") page_a.fill('textarea[name="tasks"]', TASKS_JSON) page_a.click('[name="create_project"]') - page_a.wait_for_url(f"{base_a}/", timeout=10_000) + # The JS does window.location.href = "/" which Django redirects to /controller/ + page_a.wait_for_load_state("domcontentloaded") assert PROJECT_NAME in page_a.content(), ( "Step 2 – Coordinator: project should appear on home page after creation" From e1c4dd6134f81f3e95be8287e45df395dfd1bf60 Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 22:58:58 -0700 Subject: [PATCH 5/7] fix(e2e): wait for URL after AJAX project creation instead of DOM load state The create-project form submits via $.post(); on success the callback calls window.location.href = "/" which Django redirects to /controller/. wait_for_load_state("domcontentloaded") returned immediately (DOM already loaded) before the AJAX callback fired, so the assertion saw the Create New Project page. Switch to wait_for_url("/controller/") which blocks until the navigation actually completes. 
Co-Authored-By: Claude Sonnet 4.6 --- e2e/test_fl_workflow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/e2e/test_fl_workflow.py b/e2e/test_fl_workflow.py index 654a61a..8bfb3f9 100644 --- a/e2e/test_fl_workflow.py +++ b/e2e/test_fl_workflow.py @@ -149,8 +149,9 @@ def test_fl_workflow(pages, base_a, base_b, base_c, fixtures_dir): page_a.fill('textarea[name="description"]', "E2E test project") page_a.fill('textarea[name="tasks"]', TASKS_JSON) page_a.click('[name="create_project"]') - # The JS does window.location.href = "/" which Django redirects to /controller/ - page_a.wait_for_load_state("domcontentloaded") + # The form submits via AJAX; on success JS does window.location.href = "/" + # which Django redirects to /controller/ — wait for that final URL. + page_a.wait_for_url(f"{base_a}/controller/", timeout=15_000) assert PROJECT_NAME in page_a.content(), ( "Step 2 – Coordinator: project should appear on home page after creation" From 940185c428f295ce70fd1e8365f82b44a0cc8b1b Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 23:06:38 -0700 Subject: [PATCH 6/7] fix(e2e): fix _parse_project_ids to match relative project detail hrefs The navbar in base.html has absolute hrefs like /controller/projects/new which the previous selector (a[href*="/controller/projects/"]) matched first. The actual project detail links in index.html use relative hrefs like projects/{id}/{site_id} (no leading slash), so they never matched. Switch to iterating all a[href*="projects/"] links and returning the first one whose href contains two adjacent integer path segments, i.e. matches projects/{id}/{site_id}.
Co-Authored-By: Claude Sonnet 4.6 --- e2e/test_fl_workflow.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/e2e/test_fl_workflow.py b/e2e/test_fl_workflow.py index 8bfb3f9..feac613 100644 --- a/e2e/test_fl_workflow.py +++ b/e2e/test_fl_workflow.py @@ -40,14 +40,17 @@ def _parse_project_ids(page: Page) -> tuple[int, int]: """ - Parse project_id and site_id from the 'Project Details' link on the home page. - The link href is like /controller/projects/{project_id}/{site_id}. + Parse project_id and site_id from the project detail link on the home page. + The link href is a relative path like projects/{project_id}/{site_id} (no + leading slash), so we iterate all project-related links and pick the first + one whose href contains two consecutive integers. """ - link = page.locator('a[href*="/controller/projects/"]').first - href = link.get_attribute("href") - m = re.search(r"/projects/(\d+)/(\d+)", href) - assert m, f"Could not parse project/site IDs from href: {href}" - return int(m.group(1)), int(m.group(2)) + for link in page.locator('a[href*="projects/"]').all(): + href = link.get_attribute("href") or "" + m = re.search(r"projects/(\d+)/(\d+)", href) + if m: + return int(m.group(1)), int(m.group(2)) + raise AssertionError("Could not find a project detail link on the page") def _upload_dataset_for_site(page: Page, base: str, project_id: int, site_id: int, From 8bac44b9c9a29dfa20a9fdd4a446e26a0f854150 Mon Sep 17 00:00:00 2001 From: Steve Drew Date: Sun, 1 Mar 2026 23:34:14 -0700 Subject: [PATCH 7/7] fix(e2e): upload participants' datasets before coordinator to avoid race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataset upload view immediately sets the run status to PREPARING in the router DB, bypassing Celery. 
If the coordinator uploads first and all three sites are already in PREPARING, the coordinator's preparing() fires and transitions everyone to RUNNING before participants' fetch_run beats (every 5 s) have dispatched their own process_task('preparing'). Those tasks never run, so prepare_data() is never called, self.logisticRegr stays None, and training() raises AttributeError. Fix: upload participants (b, c) first, sleep 15 s (≥ 3 beat cycles) to let their process_task('preparing') fire and complete, then upload the coordinator (a) last so it triggers the PREPARING→RUNNING transition only after participants have initialised their ML model in ml_models. Co-Authored-By: Claude Sonnet 4.6 --- e2e/test_fl_workflow.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/e2e/test_fl_workflow.py b/e2e/test_fl_workflow.py index feac613..dcb98fd 100644 --- a/e2e/test_fl_workflow.py +++ b/e2e/test_fl_workflow.py @@ -191,13 +191,25 @@ def test_fl_workflow(pages, base_a, base_b, base_c, fixtures_dir): # ── Step 5: All sites upload their datasets ─────────────────────────────── - _upload_dataset_for_site(page_a, base_a, project_id, site_a_id, - fixtures_dir / "site_a.csv") + # Participants upload first so their router status reaches PREPARING before + # the coordinator's upload fires. _upload_dataset_for_site(page_b, base_b, project_id, site_b_id, fixtures_dir / "site_b.csv") _upload_dataset_for_site(page_c, base_c, project_id, site_c_id, fixtures_dir / "site_c.csv") + # Wait for participants' fetch_run beat (fires every 5 s) to trigger at + # least twice, dispatch process_task('preparing'), and complete + # prepare_data() — which initialises the ML model in the ml_models + # singleton — before the coordinator's upload fires the PREPARING→RUNNING + # transition for all runs. + time.sleep(15) + + # Coordinator uploads last: its upload causes preparing() to call + # notify(4, update_all=True) only after participants are already prepared. 
+ _upload_dataset_for_site(page_a, base_a, project_id, site_a_id, + fixtures_dir / "site_a.csv") + # ── Step 6: Wait for all runs to reach Success ──────────────────────────── _wait_for_success(page_a, base_a, project_id, site_a_id)