Team-SEBAF · YUDINDIN1005 · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,3 +30,6 @@ jobs:
 
       - name: Test (pytest)
         run: pytest -q
+
+      - name: Eval smoke (v0)
+        run: python scripts/run_eval_v0.py --suite smoke
diff --git a/.gitignore b/.gitignore
@@ -25,8 +25,14 @@ htmlcov/
 .DS_Store
 Thumbs.db
 
-# --- Local scripts ---
-scripts/
+# --- Helper scripts ---
+scripts/*
+!scripts/run_eval_v0.py
 
 # --- Runtime data artifacts ---
-data/
+data/*
+!data/evalsets/
+!data/evalsets/**
+
+# --- Local eval outputs ---
+data/evalsets/**/_last_*.jsonl
diff --git a/README.md b/README.md
@@ -7,6 +7,13 @@ AnsimOn AI core repository.
 - Validator (policy-level rules)
 - Evaluation / regression tests
 
+## Eval (regression) v0
+- 고정 입력(평가셋 v0)을 기반으로 구조화 결과/태그/RequirementState/Event I/O 계약 회귀 감지
+- 운영: CI는 smoke만 실행, 전체(full)는 로컬/수동 실행
+- 로컬 실행(전체): `python scripts/run_eval_v0.py`
+- CI 실행(스모크): `python scripts/run_eval_v0.py --suite smoke`
+- 출력: 케이스별 `PASS|WARN|FAIL`, `reason_codes`, `usage_metrics`(v0: `duration_ms`, `input_chars`, `output_chars`, `cache_hit`)
+
 ## Pipeline Overview
 - 입력(Evidence text) → v1.3 구조화(JSON) → anchor 적용/근거 재현(evidence_span/anchor)
 - Validator v0.1-α: 스키마/공통 필드 정합성 검증(의미 해석/법적 판단 없음)

diff --git a/data/evalsets/v0/eval_full_v0.json b/data/evalsets/v0/eval_full_v0.json
@@ -0,0 +1,196 @@
+{
+  "version": "evalset_v0",
+  "name": "eval_full_v0",
+  "cases": [
+    {
+      "case_id": "FULL_001_anchor_ok",
+      "input": {
+        "kind": "text",
+        "text": "전남친이 지난달부터 거의 매일 전화하고 집 앞에 와서 기다렸어요."
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "EVALUATABLE",
+          "reason_codes_contains": []
+        },
+        "event_io": {
+          "policy": "allow",
+          "can_create_event": true,
+          "caution_tag": null
+        },
+        "tag_validation": {
+          "status": "pass",
+          "codes_contains": []
+        }
+      }
+    },
+    {
+      "case_id": "FULL_002_anchor_missing",
+      "input": {
+        "kind": "text",
+        "text": "전남친이 지난달부터 전화하고 집 앞에 와서 기다렸어요."
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "UNSTABLE",
+          "reason_codes_contains": [
+            "W_ANCHOR_NOT_FOUND",
+            "W_CONFIDENCE_WITHOUT_ANCHOR"
+          ]
+        },
+        "event_io": {
+          "policy": "allow_with_caution",
+          "can_create_event": true,
+          "caution_tag": "UNSTABLE"
+        },
+        "tag_validation": {
+          "status": "warn",
+          "codes_contains": [
+            "W_ANCHOR_NOT_FOUND",
+            "W_CONFIDENCE_WITHOUT_ANCHOR"
+          ]
+        }
+      }
+    },
+    {
+      "case_id": "FULL_003_invalid_confidence",
+      "input": {
+        "kind": "text",
+        "text": "dummy"
+      },
+      "mock_llm_output_json": {
+        "evidence_metadata": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "parties": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "period": {"confidence": "sure", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "frequency": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "channel": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "locations": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "action_types": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "refusal_signal": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "threat_indicators": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "impact_on_victim": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "report_or_record": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"}
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "INVALID",
+          "reason_codes_contains": ["E_STRUCT_INVALID"]
+        },
+        "event_io": {
+          "policy": "deny",
+          "can_create_event": false,
+          "caution_tag": null
+        },
+        "tag_validation": {
+          "status": "fail",
+          "codes_contains": ["E_STRUCT_INVALID"]
+        }
+      }
+    },
+    {
+      "case_id": "FULL_004_anchor_without_span",
+      "input": {
+        "kind": "text",
+        "text": "dummy"
+      },
+      "mock_llm_output_json": {
+        "evidence_metadata": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "parties": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "period": {"confidence": "low", "evidence_span": null, "evidence_anchor": {"modality": "text", "start_char": 0, "end_char": 3}, "value": "unknown"},
+        "frequency": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "channel": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "locations": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "action_types": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "refusal_signal": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "threat_indicators": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "impact_on_victim": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "report_or_record": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"}
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "INVALID",
+          "reason_codes_contains": ["E_STRUCT_INVALID"]
+        },
+        "event_io": {
+          "policy": "deny",
+          "can_create_event": false,
+          "caution_tag": null
+        },
+        "tag_validation": {
+          "status": "fail",
+          "codes_contains": ["E_STRUCT_INVALID"]
+        }
+      }
+    },
+    {
+      "case_id": "FULL_005_missing_required_top_key",
+      "input": {
+        "kind": "text",
+        "text": "dummy"
+      },
+      "mock_llm_output_json": {
+        "evidence_metadata": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "parties": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "frequency": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "channel": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "locations": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "action_types": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "refusal_signal": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "threat_indicators": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "impact_on_victim": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "report_or_record": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"}
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "INVALID",
+          "reason_codes_contains": ["E_STRUCT_INVALID"]
+        },
+        "event_io": {
+          "policy": "deny",
+          "can_create_event": false,
+          "caution_tag": null
+        },
+        "tag_validation": {
+          "status": "fail",
+          "codes_contains": ["E_STRUCT_INVALID"]
+        }
+      }
+    },
+    {
+      "case_id": "FULL_006_invalid_anchor_modality",
+      "input": {
+        "kind": "text",
+        "text": "dummy"
+      },
+      "mock_llm_output_json": {
+        "evidence_metadata": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "parties": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": {}},
+        "period": {"confidence": "low", "evidence_span": "", "evidence_anchor": {"modality": "audio", "start_char": 0, "end_char": 1}, "value": "unknown"},
+        "frequency": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "channel": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "locations": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "action_types": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "refusal_signal": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"},
+        "threat_indicators": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "impact_on_victim": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": []},
+        "report_or_record": {"confidence": "low", "evidence_span": null, "evidence_anchor": null, "value": "unknown"}
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "INVALID",
+          "reason_codes_contains": ["E_STRUCT_INVALID"]
+        },
+        "event_io": {
+          "policy": "deny",
+          "can_create_event": false,
+          "caution_tag": null
+        },
+        "tag_validation": {
+          "status": "fail",
+          "codes_contains": ["E_STRUCT_INVALID"]
+        }
+      }
+    }
+  ]
+}
diff --git a/data/evalsets/v0/eval_smoke_v0.json b/data/evalsets/v0/eval_smoke_v0.json
@@ -0,0 +1,56 @@
+{
+  "version": "evalset_v0",
+  "name": "eval_smoke_v0",
+  "cases": [
+    {
+      "case_id": "SMOKE_001_anchor_ok",
+      "input": {
+        "kind": "text",
+        "text": "전남친이 지난달부터 거의 매일 전화하고 집 앞에 와서 기다렸어요."
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "EVALUATABLE",
+          "reason_codes_contains": []
+        },
+        "event_io": {
+          "policy": "allow",
+          "can_create_event": true,
+          "caution_tag": null
+        },
+        "tag_validation": {
+          "status": "pass",
+          "codes_contains": []
+        }
+      }
+    },
+    {
+      "case_id": "SMOKE_002_anchor_missing",
+      "input": {
+        "kind": "text",
+        "text": "전남친이 지난달부터 전화하고 집 앞에 와서 기다렸어요."
+      },
+      "expected": {
+        "requirement_state": {
+          "state": "UNSTABLE",
+          "reason_codes_contains": [
+            "W_ANCHOR_NOT_FOUND",
+            "W_CONFIDENCE_WITHOUT_ANCHOR"
+          ]
+        },
+        "event_io": {
+          "policy": "allow_with_caution",
+          "can_create_event": true,
+          "caution_tag": "UNSTABLE"
+        },
+        "tag_validation": {
+          "status": "warn",
+          "codes_contains": [
+            "W_ANCHOR_NOT_FOUND",
+            "W_CONFIDENCE_WITHOUT_ANCHOR"
+          ]
+        }
+      }
+    }
+  ]
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,4 +11,4 @@ dependencies = [
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["src"]
-cache_dir = ".pytest_cache"
+addopts = "-p no:cacheprovider"
diff --git a/scripts/run_eval_v0.py b/scripts/run_eval_v0.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from ansimon_ai.eval.runner_v0 import load_evalset_v0, run_evalset_v0
+
+def _default_evalset_path(suite: str) -> Path:
+    """Resolve a ``--suite`` value to the evalset JSON file to load.
+
+    Args:
+        suite: Either a built-in suite alias (``smoke``/``eval_smoke_v0`` or
+            ``full``/``eval_full_v0``, matched case-insensitively) or a
+            filesystem path to an evalset JSON file.
+
+    Returns:
+        The repository-relative path of the built-in evalset when ``suite``
+        is an alias; otherwise ``suite`` itself as a ``Path`` when it names
+        an existing file.
+
+    Raises:
+        SystemExit: If ``suite`` is neither a known alias nor an existing file.
+    """
+    raw = suite.strip()
+    # Only alias matching is case-insensitive. The path fallback must keep the
+    # caller's casing; lowercasing it would make a path such as
+    # "Data/MyEval.json" impossible to find on a case-sensitive filesystem.
+    alias = raw.lower()
+    if alias in {"smoke", "eval_smoke_v0"}:
+        return Path("data/evalsets/v0/eval_smoke_v0.json")
+    if alias in {"full", "eval_full_v0"}:
+        return Path("data/evalsets/v0/eval_full_v0.json")
+
+    candidate = Path(raw)
+    # is_file() rather than exists(): an empty string resolves to "." and a
+    # directory can never be an evalset JSON, so reject both here.
+    if candidate.is_file():
+        return candidate
+
+    raise SystemExit(f"unknown suite: {raw}")
+
+def main(argv: list[str]) -> int:
+    """Run an evalset and print per-case results followed by a summary.
+
+    Args:
+        argv: Command-line arguments without the program name. ``--suite``
+            selects the evalset (default ``full``); ``--out`` optionally
+            names a JSONL file that receives one result per line.
+
+    Returns:
+        ``1`` if at least one case has status ``"fail"``, otherwise ``0``.
+    """
+    cli = argparse.ArgumentParser(description="Run evaluation set v0")
+    cli.add_argument(
+        "--suite",
+        help="smoke|full or path to evalset json (default: full)",
+        default="full",
+    )
+    cli.add_argument(
+        "--out",
+        help="optional output path (jsonl)",
+        default=None,
+    )
+    options = cli.parse_args(argv)
+
+    evalset = load_evalset_v0(_default_evalset_path(options.suite))
+
+    class _MemoryCache(dict):
+        """Dict-backed in-memory cache with a ``get``/``set`` pair.
+
+        NOTE(review): presumably this is the interface ``run_evalset_v0``
+        expects of its ``cache`` argument -- confirm against the runner.
+        """
+
+        def get(self, key):
+            # Missing keys yield None, like dict.get without a default.
+            return dict.get(self, key)
+
+        def set(self, key, value):
+            self[key] = value
+
+    results = run_evalset_v0(evalset=evalset, cache=_MemoryCache())
+
+    # Collect statuses once, then tally each outcome kind from that list.
+    statuses = [outcome.status for outcome in results]
+    fail_count = statuses.count("fail")
+    warn_count = statuses.count("warn")
+    pass_count = statuses.count("pass")
+
+    for outcome in results:
+        metrics = outcome.usage_metrics
+        fields = (
+            f"{outcome.case_id} {outcome.status.upper()}",
+            f"duration_ms={metrics.duration_ms}",
+            f"input_chars={metrics.input_chars}",
+            f"output_chars={metrics.output_chars}",
+            f"cache_hit={metrics.cache_hit}",
+            f"reason_codes={outcome.reason_codes}",
+        )
+        print(" ".join(fields))
+
+    summary = (
+        f"Summary: pass={pass_count} warn={warn_count} fail={fail_count} "
+        f"(suite={evalset.name}, cases={len(results)})"
+    )
+    print("\n" + summary)
+
+    if options.out:
+        destination = Path(options.out)
+        # Create missing parent directories so any --out location is writable.
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        with destination.open("w", encoding="utf-8") as sink:
+            sink.writelines(
+                json.dumps(outcome.to_dict(), ensure_ascii=False) + "\n"
+                for outcome in results
+            )
+
+    # A non-zero exit status signals that at least one case failed.
+    return int(fail_count > 0)
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    sys.exit(main(sys.argv[1:]))
diff --git a/src/ansimon_ai/eval/__init__.py b/src/ansimon_ai/eval/__init__.py