Changes from all commits
82 commits
080a53a
fix Rotation matri form of RoPE (#25)
Andy1314Chen Jun 13, 2025
cd54459
add back installation check script
skyzh Jun 14, 2025
55066c3
bump mlx to latest version (#33)
skyzh Jul 23, 2025
7fc05dc
test:add a test case to cover week_1_day_3_task3 (#31)
Phoenix500526 Jul 23, 2025
13295fc
fix tokp implementation
skyzh Jul 27, 2025
4e9101c
more precision tweaks
skyzh Jul 27, 2025
ae06a7f
fix bugs in continuous batching
skyzh Jul 27, 2025
cfbc43e
fix mask tests
skyzh Jul 27, 2025
5863d96
reshape in kvcache
skyzh Jul 27, 2025
55e7b0c
fix flash attention
skyzh Aug 2, 2025
d954fb5
add back causal mask to gqa
skyzh Aug 2, 2025
e21a583
flash attention works for the first token, maybe some mem init issue
skyzh Aug 2, 2025
9eacc3b
try debug flashattention on multi test run
skyzh Aug 3, 2025
657c0b3
small fixes of flash attention
skyzh Aug 3, 2025
8dfe61c
finally fully fix flash attention
skyzh Aug 3, 2025
0ca2bf1
feat(kv-cache): add KV cache imports and week 2 day 1 tests (#35)
magic3007 Aug 3, 2025
3cd7d84
refactor continuous batching
skyzh Aug 3, 2025
5930135
chunked prefill only in continuous batching
skyzh Aug 3, 2025
8b4d9a7
update readme and roadmap
skyzh Aug 3, 2025
024d528
update the vllm-RoPE code link in the reading (#39)
58191554 Aug 8, 2025
00ea990
bfloat16 support for matmul
skyzh Aug 9, 2025
850dd6c
model shortcut and dispatcher
skyzh Aug 9, 2025
ffbd15d
qwen3 support
skyzh Aug 9, 2025
cd87116
update readme
skyzh Aug 9, 2025
4e1cced
fix: resolve f-string syntax error in batch.py (#44)
minatoaquaMK2 Aug 12, 2025
1d7572f
remove offset in week 1, not used
skyzh Aug 17, 2025
0b82b7f
add week2day1 kv cache contents
skyzh Aug 17, 2025
45cff24
update benches
skyzh Aug 17, 2025
30b68a9
small fix about dim
skyzh Aug 18, 2025
042acf5
clearify variables
skyzh Aug 18, 2025
4cddec2
fix: Add flash attention option and fix token offset (#46)
touale Aug 19, 2025
25068d4
Fix: remove offset parameter to Qwen2MultiHeadAttention.__call__ meth…
58191554 Aug 19, 2025
4a4c752
Fix broken url links of MultiHeadAttention in week1-01-attention.md (…
jiengup Aug 19, 2025
4ae2ad1
Fix MLX Metal API usage and Primitive interface for Axpby; restore su…
58191554 Aug 21, 2025
efe008a
s/consequtive/consecutive (#52)
Plypy Aug 21, 2025
a55a92f
docs: fix some typos (#53)
KKKZOZ Aug 21, 2025
e051790
fix typo in week2-01 (#54)
58191554 Aug 21, 2025
b4c14ed
ci: add spell check workflow (#55)
KKKZOZ Aug 24, 2025
bf3383d
fix: Use non-traditional RoPE in Qwen2 test case. (#56)
jiengup Sep 7, 2025
1c9369a
fix: mlx-llm Qwen2 RMSNorm url link (#57)
jiengup Sep 7, 2025
04149a3
add test for week 1 day 5 test 1: Qwen2TransformerBlock (#59)
jiengup Sep 7, 2025
81b917d
Possible typo in week1-01-attention (#60)
ekzhang Sep 8, 2025
fa8b08e
Revert "fix: Use non-traditional RoPE in Qwen2 test case. (#56)" (#62)
jiengup Sep 10, 2025
919a3e5
format and warn on different test files
skyzh Sep 13, 2025
34fb3fe
mention that we have quantized weight now
skyzh Sep 13, 2025
1449816
add chunked prefill and continuous batching writeup (#64)
skyzh Sep 13, 2025
1fc0752
fix simple kv cache decoding (#65)
skyzh Sep 13, 2025
26aa2ff
update writeup progress
skyzh Sep 13, 2025
1f2ab12
Bump mlx to >=0.27 and fix build-ext from week 1, day 7 (#66)
ekzhang Sep 14, 2025
308388e
CI workflow for pdm setup, build and testing refsol (#67)
ekzhang Sep 14, 2025
136ad7f
Day 6, task 1 tests - RoPE with multiple offsets (#68)
ekzhang Sep 17, 2025
6635e4a
Add tests for week 2, day 6 - continuous batching (#69)
ekzhang Sep 19, 2025
b6a3b00
update dev-tools.py to fix --force in copy-test (#70)
linuxholic Sep 21, 2025
ad6d976
add speculative decoding (#71)
skyzh Sep 26, 2025
ff5d7d0
ensure user solution can run
skyzh Sep 26, 2025
cf6910a
add definition hint for model args
Connor1996 Oct 6, 2025
a30f9c2
add more info
Connor1996 Oct 10, 2025
83762c8
rename
Connor1996 Oct 10, 2025
8eebd4a
Merge pull request #73 from Connor1996/model-args
Connor1996 Oct 10, 2025
f1f4f98
fix: fix link to Qwen2.5 blog in week1 (#72)
YangchenYe323 Oct 11, 2025
cea8926
docs: add instruction to download Qwen2-1.5B model (#75)
jinhuix Oct 12, 2025
5dc71b8
perform pdm sync before running (#76)
Connor1996 Nov 2, 2025
ace6e45
Fix f-string syntax (#81)
chasingegg Dec 18, 2025
5b6fdc3
fix: draft-generate offset (#83)
KKKZOZ Dec 18, 2025
16f55c7
fix mx.logsumexp with the right dim (#80)
linuxholic Dec 18, 2025
685caf5
feat: implement quantized_matmul with typed CPU implementation (#77)
Elubrazione Dec 18, 2025
c9f05de
book: remove deprecated mdbook multilingual key (#86)
Connor1996 Feb 8, 2026
e34dc7e
ci: update mdbook preprocessors for 0.5 pipeline (#87)
Connor1996 Feb 8, 2026
0c95267
add AGENTS.md (#85)
Connor1996 Feb 10, 2026
b2393a2
docs: add Week 2 Day 2-3 Quantized Matmul chapter CPU part (#88)
Connor1996 Feb 15, 2026
0688e96
docs: add Week 2 Day 2-3 Quantized Matmul chapter GPU part (#89)
Connor1996 Feb 15, 2026
1cd513b
doc: add tokenizer definition reference (#90)
Connor1996 Feb 16, 2026
e9b90bd
docs: mark week 2.3 tiny_llm status as complete (#91)
Connor1996 Feb 16, 2026
f4dc967
Add bench-main command and week2 benchmark instructions (#93)
Connor1996 Feb 17, 2026
ed8ac9e
fix(ref): correct attention weight shape asserts (#92)
jhsong233 Feb 17, 2026
9b40133
bugfix: way to get_kernel from library
fuyufjh Feb 20, 2026
4373368
Merge pull request #94 from fuyufjh/fix_metal_get_kernel
Connor1996 Feb 22, 2026
5b2f184
book: replace huggingface-cli with hf
you06 Feb 22, 2026
ce64300
Merge pull request #95 from you06/doc/update-huggingface-cli
Connor1996 Feb 22, 2026
2ace66c
tests: parametrize flash attention mask coverage (#96)
Connor1996 Feb 22, 2026
bb1e902
docs: add week2 flash-attention CPU part (#97)
Connor1996 Feb 23, 2026
ddfcaed
docs: add week2 entries to glossary (#98)
Connor1996 Feb 23, 2026
38 changes: 38 additions & 0 deletions .cspell.json
@@ -0,0 +1,38 @@
{
  "version": "0.2",
  "language": "en",
  "words": [
    "skyzh",
    "numpy",
    "Connor",
    "CUDA",
    "matmul",
    "qwen",
    "huggingface",
    "dequantize",
    "freqs",
    "torchtune",
    "Jinyi",
    "logits",
    "argmax",
    "logprobs",
    "softmax",
    "feedforward",
    "Convolutional",
    "Roformer",
    "bfloat",
    "multihead",
    "vllm",
    "silu",
    "GFLOPS",
    "TFLOPS",
    "dequantized",
    "dequantization",
    "dequantizes",
    "dtype",
    "threadgroups",
  ],
  "ignoreRegExpList": [
    "`[^`]*`",
  ]
}
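For reference, the `ignoreRegExpList` entry above excludes backtick-delimited inline code spans from spell checking. A minimal Python sketch of the same filtering (illustrative only; cspell's actual engine works differently and this helper is not part of the repo):

```python
import re

# Mirrors the ignoreRegExpList pattern from .cspell.json:
# backtick-delimited inline code spans are not spell checked.
INLINE_CODE = re.compile(r"`[^`]*`")

def strip_inline_code(text: str) -> str:
    """Drop inline code spans before handing text to a spell checker."""
    return INLINE_CODE.sub("", text)

print(strip_inline_code("run `pdm instal` before testing"))
# → run  before testing
```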
45 changes: 45 additions & 0 deletions .github/workflows/macos.yml
@@ -0,0 +1,45 @@
# Build and test the reference solution automatically on M1 runners.
# This helps prevent breakage of the dev setup.
name: macOS

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  test-refsol:
    name: Test reference solution
    runs-on: macos-15 # ARM64
    steps:
      - uses: actions/checkout@v5

      - name: Install HuggingFace weights
        run: |
          brew install huggingface-cli
          hf download Qwen/Qwen2-0.5B-Instruct-MLX

      - uses: pdm-project/setup-pdm@v4
        with:
          python-version: 3.12
          cache: true

      - run: pdm install

      - run: pdm run check-installation

      # Without this, future build steps fail in CMake.
      - name: Add nanobind to CMake
        run: |
          nanobind_dir=$(pdm run python -c 'import nanobind, os; print(os.path.join(nanobind.__path__[0], "cmake"))')
          echo "nanobind_DIR=${nanobind_dir}" >> $GITHUB_ENV

      - name: Try building extensions
        run: |
          pdm run build-ext
          pdm run build-ext-test

      - run: pdm run build-ext-ref

      - run: pdm run test-refsol
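The "Add nanobind to CMake" step above locates nanobind's bundled CMake config, which ships inside the installed Python package, and exports it as `nanobind_DIR` so CMake can find it. A small sketch of the same lookup (the helper name and path are illustrative; the workflow inlines this as a `python -c` one-liner):

```python
import os

# nanobind ships its CMake config files in a "cmake" directory inside the
# installed package; CMake discovers it via the nanobind_DIR variable.
def nanobind_cmake_dir(package_path: str) -> str:
    return os.path.join(package_path, "cmake")

print(nanobind_cmake_dir("/site-packages/nanobind"))
# → /site-packages/nanobind/cmake
```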
3 changes: 2 additions & 1 deletion .github/workflows/main.yml
@@ -16,7 +16,8 @@ jobs:
       - name: setup rust toolchain
         run: rustup update && rustup toolchain install
       - uses: dtolnay/rust-toolchain@stable
-      - run: cargo install mdbook-katex
+      - run: cargo install mdbook-toc
+      - run: cargo install mdbook-katex --version 0.10.0-alpha
       - uses: taiki-e/install-action@mdbook
       - name: patch for gh-pages build
         run: mv book/theme/head.hbs._ book/theme/head.hbs
28 changes: 28 additions & 0 deletions .github/workflows/spell-check.yml
@@ -0,0 +1,28 @@
name: Spell Check

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  workflow_dispatch:

jobs:
  spell-check:
    name: Run cspell
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install cspell globally
        run: npm install -g cspell

      - name: Run spell check on Markdown files
        run: cspell "book/**/*.md"
81 changes: 81 additions & 0 deletions AGENTS.md
@@ -0,0 +1,81 @@
# AGENTS.md

## Scope

- This file applies to the entire repository.
- Use this as the default test-running policy for coding agents.

## Objective

- Run and verify tests in a way that matches the book workflow (`book/src/*.md`).
- Prefer `pdm` entrypoints defined in `pyproject.toml`.

## Environment Requirements

- macOS on Apple Silicon is expected by the project.
- Install dependencies first:

```bash
pdm install -v
pdm run check-installation
```

- Optional baseline check from the setup chapter (reference solution, Week 1):

```bash
pdm run test-refsol -- -- -k week_1
```

## Agent Test Workflow

1. Start with the smallest relevant scope (`--week` + `--day`).
2. Use pytest filters via `-- -k ...` to isolate failing tasks.
3. Run broader suites only after targeted tests pass.
4. If extension code changed, rebuild extensions before testing.

## Canonical Commands

Run all tests:

```bash
pdm run test
```

Run a specific chapter/day:

```bash
pdm run test --week <WEEK> --day <DAY>
```

Run with pytest filters:

```bash
pdm run test --week 1 --day 3 -- -k task_2
pdm run test --week 2 --day 2 -- -k cpu
pdm run test --week 2 --day 2 -- -k gpu
```

Run reference-solution tests:

```bash
pdm run test-refsol
pdm run test-refsol --week 2 --day 2 -- -k cpu
```

## Extension Rebuild Rule

Rebuild before tests if these changed:

- `src/extensions/src/*`

Commands:

```bash
pdm run build-ext
```

## Guardrails

- Use `--` before pytest args (`-k`, `-q`, `--collect-only`, etc.).
- `pdm run test --week X --day Y` auto-copies `tests_refsol/test_week_X_day_Y.py` into `tests/`.
- Model-dependent tests (0.5B/1.5B/7B) skip when models are not downloaded locally.
24 changes: 13 additions & 11 deletions README.md
@@ -8,6 +8,8 @@ can build the model serving infrastructure from scratch and dig into the optimiz
 
 The goal is to learn the techniques behind efficiently serving a large language model (e.g., Qwen2 models).
 
+In week 1, you will implement the necessary components in Python (only Python!) to use the Qwen2 model to generate responses (e.g., attention, RoPE, etc). In week 2, you will implement the inference system which is similar to but a much simpler version of vLLM (e.g., KV cache, continuous batching, flash attention, etc). In week 3, we will cover more advanced topics and how the model interacts with the outside world.
+
 Why MLX: nowadays it's easier to get a macOS-based local development environment than setting up an NVIDIA GPU.
 
 Why Qwen2: this was the first LLM I've interacted with -- it's the go-to example in the vllm documentation. I spent some time looking at the vllm source code and built some knowledge around it.
@@ -35,19 +37,19 @@ Week 1 is complete. Week 2 is in progress.
 | 1.5 | Load the Model | ✅ | ✅ | ✅ |
 | 1.6 | Generate Responses (aka Decoding) | ✅ | ✅ | ✅ |
 | 1.7 | Sampling | ✅ | ✅ | ✅ |
-| 2.1 | Key-Value Cache | ✅ | 🚧 | 🚧 |
-| 2.2 | Quantized Matmul and Linear - CPU | ✅ | 🚧 | 🚧 |
-| 2.3 | Quantized Matmul and Linear - GPU | ✅ | 🚧 | 🚧 |
-| 2.4 | Flash Attention 2 - CPU | ✅ | 🚧 | 🚧 |
-| 2.5 | Flash Attention 2 - GPU | ✅ | 🚧 | 🚧 |
-| 2.6 | Continuous Batching | ✅ | 🚧 | 🚧 |
-| 2.7 | Chunked Prefill | ✅ | 🚧 | 🚧 |
+| 2.1 | Key-Value Cache | ✅ | | |
+| 2.2 | Quantized Matmul and Linear - CPU | ✅ | | |
+| 2.3 | Quantized Matmul and Linear - GPU | ✅ | | |
+| 2.4 | Flash Attention 2 - CPU | ✅ | | |
+| 2.5 | Flash Attention 2 - GPU | ✅ | | 🚧 |
+| 2.6 | Continuous Batching | ✅ | | |
+| 2.7 | Chunked Prefill | ✅ | | |
 | 3.1 | Paged Attention - Part 1 | 🚧 | 🚧 | 🚧 |
 | 3.2 | Paged Attention - Part 2 | 🚧 | 🚧 | 🚧 |
 | 3.3 | MoE (Mixture of Experts) | 🚧 | 🚧 | 🚧 |
-| 3.4 | Speculative Decoding | 🚧 | 🚧 | 🚧 |
-| 3.5 | Prefill-Decode Separation (requires two Macintosh devices) | 🚧 | 🚧 | 🚧 |
-| 3.6 | Parallelism | 🚧 | 🚧 | 🚧 |
-| 3.7 | AI Agent / Tool Calling | 🚧 | 🚧 | 🚧 |
+| 3.4 | Speculative Decoding | 🚧 | | 🚧 |
+| 3.5 | RAG Pipeline | 🚧 | 🚧 | 🚧 |
+| 3.6 | AI Agent / Tool Calling | 🚧 | 🚧 | 🚧 |
+| 3.7 | Long Context | 🚧 | 🚧 | 🚧 |
 
 Other topics not covered: quantized/compressed kv cache, prefix/prompt cache; sampling, fine tuning; smaller kernels (softmax, silu, etc)
Other topics not covered: quantized/compressed kv cache, prefix/prompt cache; sampling, fine tuning; smaller kernels (softmax, silu, etc)
18 changes: 14 additions & 4 deletions batch-main.py
@@ -4,7 +4,7 @@
 import random
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--model", type=str, default="Qwen/Qwen2-7B-Instruct-MLX")
+parser.add_argument("--model", type=str, default="qwen2-0.5b")
 
 shanghai_wikipedia = """
 Shanghai[a] is a direct-administered municipality and the most populous urban area in China. The city is located on the Chinese shoreline on the southern estuary of the Yangtze River, with the Huangpu River flowing through it. The population of the city proper is the second largest in the world after Chongqing, with around 24.87 million inhabitants in 2023, while the urban area is the most populous in China, with 29.87 million residents. As of 2022, the Greater Shanghai metropolitan area was estimated to produce a gross metropolitan product (nominal) of nearly 13 trillion RMB ($1.9 trillion).[13] Shanghai is one of the world's major centers for finance, business and economics, research, science and technology, manufacturing, transportation, tourism, and culture. The Port of Shanghai is the world's busiest container port.
@@ -38,23 +38,31 @@
 parser.add_argument("--device", type=str, default="gpu")
 parser.add_argument("--batch-size", type=int, default=5)
 parser.add_argument("--prefill-step", type=int, default=128)
+parser.add_argument("--enable-flash-attn", action="store_true")
+parser.add_argument("--enable-thinking", action="store_true")
 args = parser.parse_args()
 
 if args.solution == "tiny_llm":
     print("Using your tiny_llm solution")
-    from tiny_llm import Qwen2ModelWeek2, batch_generate
+    from tiny_llm import models, batch_generate
 
 elif args.solution == "tiny_llm_ref" or args.solution == "ref":
     print("Using tiny_llm_ref solution")
-    from tiny_llm_ref import Qwen2ModelWeek2, batch_generate
+    from tiny_llm_ref import models, batch_generate
 
 else:
     raise ValueError(f"Solution {args.solution} not supported")
 
+args.model = models.shortcut_name_to_full_name(args.model)
 mlx_model, tokenizer = load(args.model)
 
 with mx.stream(mx.gpu if args.device == "gpu" else mx.cpu):
-    tiny_llm_model = Qwen2ModelWeek2(mlx_model)
+    print(
+        f"Using week2 loader with flash_attn={args.enable_flash_attn} thinking={args.enable_thinking} for {args.model}"
+    )
+    tiny_llm_model = models.dispatch_model(
+        args.model, mlx_model, week=2, enable_flash_attn=args.enable_flash_attn
+    )
     encoded_prompts = []
     for idx, prompt in enumerate(prompts):
         print(f"Prompt {idx}: {prompt}")
@@ -66,6 +74,7 @@
             messages,
             tokenize=False,
             add_generation_prompt=True,
+            enable_thinking=args.enable_thinking,
         )
         encoded_prompts.append(prompt)
     result = batch_generate(
@@ -76,5 +85,6 @@
         prefill_step=args.prefill_step,
     )
     for prompt_idx, text in result:
+        print(f"--- {prompt_idx} ---")
         print(f"Q: {prompts[prompt_idx]}")
         print(f"A: {text}")
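The new `models.shortcut_name_to_full_name` call above resolves short model names such as `qwen2-0.5b` (the new default) to full HuggingFace repository names before loading. A hypothetical sketch of that mapping; the real table lives in the tiny_llm package and likely covers more models:

```python
# Hypothetical shortcut table; the 0.5B and 7B entries are inferred from the
# names that appear elsewhere in this PR (CI download and the old default).
SHORTCUTS = {
    "qwen2-0.5b": "Qwen/Qwen2-0.5B-Instruct-MLX",
    "qwen2-7b": "Qwen/Qwen2-7B-Instruct-MLX",
}

def shortcut_name_to_full_name(name: str) -> str:
    # Unknown names pass through unchanged, so full HF repo names still work.
    return SHORTCUTS.get(name, name)

print(shortcut_name_to_full_name("qwen2-0.5b"))
# → Qwen/Qwen2-0.5B-Instruct-MLX
```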