greynewell · greynewell · Feb 13, 2026 · Feb 13, 2026 · coderabbitai · Feb 13, 2026
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
@@ -1,7 +1,7 @@
 {
   "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
   "name": "mcpbr",
-  "version": "0.13.4",
+  "version": "0.14.0",
   "description": "mcpbr - MCP Benchmark Runner plugin marketplace",
   "owner": {
     "name": "mcpbr Contributors",
@@ -11,7 +11,7 @@
     {
       "name": "mcpbr",
       "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
-      "version": "0.13.4",
+      "version": "0.14.0",
       "author": {
         "name": "mcpbr Contributors"
       },

diff --git a/.claude-plugin/package.json b/.claude-plugin/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@greynewell/mcpbr-claude-plugin",
-  "version": "0.13.4",
+  "version": "0.14.0",
   "description": "Claude Code plugin for mcpbr - Expert benchmark runner for MCP servers with specialized skills",
   "keywords": [
     "claude-code",

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "mcpbr",
-  "version": "0.13.4",
+  "version": "0.14.0",
   "description": "Expert benchmark runner for MCP servers using mcpbr. Handles Docker checks, config generation, and result parsing.",
   "schema_version": "1.0"
 }
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -57,7 +57,34 @@ jobs:
           pip install pre-commit
 
       - name: Run pre-commit hooks
-        run: pre-commit run --all-files --show-diff-on-failure
+        # Skip mypy in pre-commit; the dedicated type-check job runs it
+        # with full project dependencies installed.
+        run: SKIP=mypy pre-commit run --all-files --show-diff-on-failure
+
+  type-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      - name: Cache mypy
+        uses: actions/cache@v4
+        with:
+          path: .mypy_cache
+          key: mypy-${{ hashFiles('pyproject.toml') }}
+          restore-keys: mypy-
+
+      - name: Run mypy
+        run: mypy src/mcpbr/
 
   test:
     runs-on: ubuntu-latest

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,6 +16,15 @@ repos:
         args: [--fix]
       - id: ruff-format
 
+  - repo: local
+    hooks:
+      - id: mypy
+        name: mypy
+        entry: uv run --extra dev mypy src/mcpbr/
+        language: system
+        pass_filenames: false
+        types: [python]
+
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v5.0.0
     hooks:

diff --git a/AGENTS.md b/AGENTS.md
@@ -175,7 +175,18 @@ If any linting errors remain, they MUST be fixed manually before proceeding.
 uvx ruff check --fix src/ tests/ && uvx ruff format src/ tests/ && uvx ruff check src/ tests/
 ```
 
-### 2. Run Tests
+### 2. Run Type Checking
+
+```bash
+# Run mypy on source code
+uv run mypy src/mcpbr/
+```
+
+**Expected output:** `Success: no issues found`
+
+If any type errors remain, they MUST be fixed before proceeding.
+
+### 3. Run Tests
 
 ```bash
 # Run all non-integration tests
@@ -187,7 +198,7 @@ uv run pytest -m integration
 
 **Expected result:** All tests must pass with 0 failures.
 
-### 3. Update CHANGELOG
+### 4. Update CHANGELOG
 
 **MANDATORY:** If your changes are user-visible, update CHANGELOG.md:
 
@@ -201,7 +212,7 @@ uv run pytest -m integration
 cat CHANGELOG.md | head -30
 ```
 
-### 4. Verify Changes
+### 5. Verify Changes
 
 - Review all modified files
 - Ensure no unintended changes were introduced
@@ -217,7 +228,8 @@ The project uses Ruff for linting with the following configuration:
 
 - **Line length:** 100 characters (E501 is ignored)
 - **Target Python version:** 3.11+
-- **Enabled rules:** E (pycodestyle errors), F (pyflakes), I (isort), N (pep8-naming), W (pycodestyle warnings)
+- **Enabled rules:** E (pycodestyle), F (pyflakes), I (isort), N (pep8-naming), W (warnings), B (bugbear), UP (pyupgrade), SIM (simplify), RUF (ruff-specific), C4 (comprehensions), PIE (misc), PT (pytest-style), ASYNC (async bugs), S (security/bandit), T20 (print detection)
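+- **Type checking:** mypy with Pydantic plugin; `disallow_untyped_defs` and `warn_unused_ignores` are additionally enforced on core modules (`mcpbr.models`, `mcpbr.config`, `mcpbr.evaluation`, `mcpbr.pricing`)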
 
 ### Common Linting Issues to Avoid
 
@@ -226,6 +238,10 @@ The project uses Ruff for linting with the following configuration:
 3. **Undefined names** - All variables and functions must be defined before use
 4. **Line too long** - While E501 is ignored, try to keep lines under 100 chars when reasonable
 5. **Trailing whitespace** - Remove trailing whitespace from all lines
+6. **Mutable default args** (B006) - Don't use `[]` or `{}` as default arguments
+7. **Exception chaining** (B904) - Use `raise X from err` inside `except` blocks
+8. **Modern Python** (UP) - Use Python 3.11+ patterns (e.g., `X | Y` unions, builtin generics such as `list[str]`)
+9. **Simplifications** (SIM) - Collapse nested `with`/`if` statements, use `contextlib.suppress()`
 
 ### Code Style
 
@@ -422,11 +438,12 @@ Checklist for CHANGELOG:
 
 1. ✅ All linting checks pass (`uvx ruff check src/ tests/`)
 2. ✅ Code is formatted (`uvx ruff format src/ tests/`)
-3. ✅ All tests pass (`uv run pytest -m "not integration"`)
-4. ✅ **CHANGELOG.md is updated** (for user-visible changes)
-5. ✅ Code is documented
-6. ✅ README is updated (if applicable)
-7. ✅ Changes are committed with descriptive commit messages
+3. ✅ Type checking passes (`uv run mypy src/mcpbr/`)
+4. ✅ All tests pass (`uv run pytest -m "not integration"`)
+5. ✅ **CHANGELOG.md is updated** (for user-visible changes)
+6. ✅ Code is documented
+7. ✅ README is updated (if applicable)
+8. ✅ Changes are committed with descriptive commit messages
 
 ### PR Title Format
 
@@ -537,9 +554,10 @@ git push
 ### ✅ DO: Check Linting First
 
 ```bash
-# Good: Check linting before commit
+# Good: Check linting and types before commit
 uvx ruff check --fix src/ tests/
 uvx ruff format src/ tests/
+uv run mypy src/mcpbr/
 uv run pytest -m "not integration"
 git commit -m "feat: add new feature"
 git push
@@ -590,14 +608,17 @@ uvx ruff check --fix src/ tests/
 uvx ruff format src/ tests/
 uvx ruff check src/ tests/  # Verify all fixed
 
-# 5. Run tests
+# 5. Run type checking
+uv run mypy src/mcpbr/
+
+# 6. Run tests
 uv run pytest -m "not integration"
 
-# 6. Commit changes (include CHANGELOG.md)
+# 7. Commit changes (include CHANGELOG.md)
 git add src/ tests/ CHANGELOG.md
 git commit -m "feat: add my new feature"
 
-# 7. Push and create PR
+# 8. Push and create PR
 git push -u origin feature/my-new-feature
 gh pr create --title "feat: add my new feature" --body "Implements #123"
 ```
@@ -615,9 +636,10 @@ The project uses GitHub Actions for CI/CD. All PRs must pass:
 
 1. **Lint Check** - `uvx ruff check src/ tests/`
 2. **Format Check** - `uvx ruff format --check src/ tests/`
-3. **Build Check** - Package builds successfully
-4. **Test (Python 3.11)** - All tests pass on Python 3.11
-5. **Test (Python 3.12)** - All tests pass on Python 3.12
+3. **Type Check** - `mypy src/mcpbr/`
+4. **Build Check** - Package builds successfully
+5. **Test (Python 3.11)** - All tests pass on Python 3.11
+6. **Test (Python 3.12)** - All tests pass on Python 3.12
 
 You can view check results on any PR:
 ```bash
@@ -626,11 +648,11 @@ gh pr checks <PR_NUMBER>
 
 ## Summary
 
-**Remember:** The most important rule is to run linting, formatting, and tests BEFORE committing. This ensures high code quality and prevents CI/CD failures.
+**Remember:** The most important rule is to run linting, formatting, type checking, and tests BEFORE committing. This ensures high code quality and prevents CI/CD failures.
 
 **Pre-commit command:**
 ```bash
-uvx ruff check --fix src/ tests/ && uvx ruff format src/ tests/ && uv run pytest -m "not integration"
+uvx ruff check --fix src/ tests/ && uvx ruff format src/ tests/ && uv run mypy src/mcpbr/ && uv run pytest -m "not integration"
 ```
 
 Happy coding! 🚀
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.14.0] - 2026-02-13
+
+### Added
+
+- **Strict code quality enforcement**: Expanded Ruff linting rules (B, UP, SIM, RUF, C4, PIE, PT,
+  ASYNC, S, T20) and added mypy type checking with Pydantic plugin across all 134 source files
+  - Added mypy pre-commit hook and CI type-check job
+  - Zero ruff violations (72 fixed across 36 files)
+  - Zero mypy errors (267 fixed across 39 files)
+  - All 4293 tests pass with no regressions
+
+### Fixed
+
+- **72 ruff lint violations** across 36 files: B904 (raise-without-from), SIM102/SIM105/SIM115/
+  SIM116/SIM117 (simplifications), RUF059/RUF003 (unused vars, Unicode), B007 (unused loop vars),
+  PT019 (pytest fixtures), S-rules (security: S310 URL validation, S108 temp dirs, S311 non-crypto
+  random, S110 exception handling, S608 SQL, S112 try-except-continue, S104 binding, S602 shell)
+- **267 mypy type errors** across 39 files: union-attr (128), assignment (33), no-any-return (28),
+  arg-type (23), and others. Fixed with proper type narrowing, assertions, annotations, and
+  type-safe patterns across infrastructure providers (GCP, AWS, Azure, Cloudflare, K8s), core
+  modules (harness, CLI, docker_env), and utility modules (providers, notifications, benchmarks)
+
+[0.14.0]: https://github.com/greynewell/mcpbr/releases/tag/v0.14.0
+
 ## [0.13.0] - 2026-02-13
 
 ### Fixed

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@greynewell/mcpbr",
-  "version": "0.13.4",
+  "version": "0.14.0",
   "description": "Model Context Protocol Benchmark Runner - CLI tool for evaluating MCP servers",
   "keywords": [
     "mcpbr",

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mcpbr"
-version = "0.13.4"
+version = "0.14.0"
 description = "Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks"
 readme = "README.md"
 license = "MIT"
@@ -46,6 +46,12 @@ dev = [
     "ruff>=0.1.0",
     "pre-commit>=3.0.0",
     "slack_sdk>=3.27.0",
+    "mypy>=1.11.0",
+    "types-docker",
+    "types-paramiko",
+    "types-PyYAML",
+    "types-requests",
+    "types-psutil",
 ]
 docs = [
     "mkdocs>=1.5.0",
@@ -90,12 +96,94 @@ line-length = 100
 target-version = "py311"
 
 [tool.ruff.lint]
-select = ["E", "F", "I", "N", "W"]
-ignore = ["E501"]
+select = [
+    "E",     # pycodestyle errors
+    "F",     # pyflakes
+    "I",     # isort
+    "N",     # pep8-naming
+    "W",     # pycodestyle warnings
+    "B",     # flake8-bugbear
+    "UP",    # pyupgrade (Python 3.11+)
+    "SIM",   # simplify
+    "RUF",   # ruff-specific
+    "C4",    # flake8-comprehensions
+    "PIE",   # misc linting
+    "PT",    # pytest-style
+    "ASYNC", # async bugs
+    "S",     # bandit (security)
+    "T20",   # print detection
+]
+ignore = [
+    "E501",    # line too long (handled by formatter)
+    "B008",    # function call in default argument (Click pattern)
+    "S101",    # assert usage (used for type narrowing in src; tests ignore all S rules)
+    "S603",    # subprocess call - check for untrusted input
+    "S607",    # start process with partial path
+    "T201",    # print statement (CLI tool uses print)
+    "SIM108",  # ternary operator (readability preference)
+    "PT011",   # pytest.raises too broad
+    "PT012",   # pytest.raises multiple statements
+    "RUF012",  # mutable class variable (Pydantic models)
+    "ASYNC109",# async function with a timeout parameter
+    "ASYNC110",# async sleep in a while loop (busy-wait polling)
+    "ASYNC221",# blocking subprocess run in async function
+    "ASYNC230",# blocking open() call in async function
+    "ASYNC240",# blocking os.path / pathlib call in async function
+    "ASYNC251",# blocking time.sleep() call in async function
+]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**/*.py" = ["S", "T20"]
+"src/mcpbr/infrastructure/**/*.py" = ["S603", "S607", "S108"]
+"scripts/**/*.py" = ["T20", "S"]
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 testpaths = ["tests"]
 markers = [
     "integration: marks tests as integration tests (deselect with '-m not integration')",
 ]
+
+[tool.mypy]
+python_version = "3.11"
+warn_return_any = true
+warn_unreachable = true
+no_implicit_optional = true
+strict_equality = true
+check_untyped_defs = true
+disallow_incomplete_defs = true
+plugins = ["pydantic.mypy"]
+
+[[tool.mypy.overrides]]
+module = [
+    "datasets",
+    "datasets.*",
+    "google.generativeai",
+    "google.generativeai.*",
+    "wandb",
+    "wandb.*",
+    "slack_sdk",
+    "slack_sdk.*",
+    "uvicorn",
+    "uvicorn.*",
+    "fastapi",
+    "fastapi.*",
+    "tomli",
+    "tomli.*",
+    "weasyprint",
+    "weasyprint.*",
+    "terminal_bench",
+    "terminal_bench.*",
+]
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = [
+    "mcpbr.models",
+    "mcpbr.config",
+    "mcpbr.evaluation",
+    "mcpbr.pricing",
+]
+disallow_untyped_defs = true
+warn_unused_ignores = true
diff --git a/scripts/sync_version.py b/scripts/sync_version.py
@@ -21,8 +21,6 @@
 class VersionNotFoundError(Exception):
     """Raised when version cannot be found in pyproject.toml."""
 
-    pass
-
 
 def get_version_from_pyproject(pyproject_path: Path) -> str:
     """Extract version from pyproject.toml."""