
Commit 9e18876

authored
feat(eval): add universal evaluation framework (#21)
* feat(eval): add universal evaluation framework

  Introduce a top-level eval/ package with composable Scorer interface, Observation types, experiment runner (A/B comparison), text quality metrics, LLM-as-judge, and on-disk format. Subsystem-specific scorers live alongside their domains (agent/eval/, knowledge/eval/, rag/eval/) while the core framework stays dependency-free.

* docs: add eval framework section to README

  Document the universal evaluation framework including core abstractions, built-in scorers across all subsystems, A/B experiment runner, stream timing collection, and on-disk format.
1 parent 29f7794 commit 9e18876

25 files changed

Lines changed: 2430 additions & 1 deletion

README.md

Lines changed: 122 additions & 1 deletion
@@ -39,6 +39,7 @@
- **4 LLM providers** (Ollama, OpenAI, Anthropic, Google) behind one `Provider` interface
- **Provider resilience** — retry + fallback composition out of the box
- **Structured output** — constrain LLM responses to JSON schema
- **Universal evaluation** — composable `Scorer` interface, A/B experiment runner, text quality metrics, LLM-as-judge, and subsystem-specific scorers for agent, RAG, and knowledge graph

### Why one SDK?

@@ -192,6 +193,7 @@ fmt.Println(result.AssembledContext.Prompt) // context with citations
- [agent — AI Agent Framework](#agent--ai-agent-framework) (providers, deltas, tools, sub-agents, markers, feedback/RLHF, compaction, tree, TUI)
- [kg — Knowledge Graph SDK](#kg--knowledge-graph-sdk)
- [rag — RAG Pipeline SDK](#rag--rag-pipeline-sdk)
- [eval — Universal Evaluation Framework](#eval--universal-evaluation-framework)
- [Examples](#examples)
- [Agent Skill](#agent-skill)

@@ -577,7 +579,7 @@ rag.WithHyDE(myLLM, 3) // generate 3 hypothetical docs

### Evaluation Metrics

9 metrics across retrieval, generation, and end-to-end evaluation. These are also available as composable `Scorer` adapters for the [universal eval framework](#eval--universal-evaluation-framework) — see `rag/eval` scorer functions like `ContextPrecisionScorer()`, `FaithfulnessScorer()`, etc.

| Metric | Type | Description |
|--------|------|-------------|
@@ -635,6 +637,125 @@ kgTools := kgtool.NewTools(graph)

---

## eval — Universal Evaluation Framework

Composable evaluation framework that works across all SAIGE subsystems. The core `eval/` package has zero subsystem dependencies — subsystem-specific scorers live alongside their domains.

### Core Abstractions

| Type | Purpose |
|------|---------|
| `Observation` | Universal eval case — Input, Output, GroundTruth as `json.RawMessage`, typed Annotations map |
| `Scorer` | Interface computing a named metric from an Observation |
| `Subject` | Function that populates an Observation's Output and Annotations |
| `Score` | Named metric value with optional reason |
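
A custom metric is just a function. Below is a minimal sketch of a `Scorer` built with `NewScorerFunc`, the same constructor the `agent/eval` scorers use; the byte-equality check itself is illustrative, not part of the framework:

```go
import (
	"bytes"
	"context"

	"github.com/urmzd/saige/eval"
)

// ExactMatchScorer scores 1.0 when Output equals GroundTruth byte-for-byte.
// Byte equality on json.RawMessage is deliberately strict: formatting
// differences (whitespace, key order) count as a miss.
func ExactMatchScorer() eval.Scorer {
	return eval.NewScorerFunc("exact_match", func(_ context.Context, obs eval.Observation) (eval.Score, error) {
		v := 0.0
		if bytes.Equal(obs.Output, obs.GroundTruth) {
			v = 1.0
		}
		return eval.Score{Name: "exact_match", Value: v}, nil
	})
}
```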

### Built-in Scorers

**Text Quality** (pure functions, no LLM):

| Scorer | Description |
|--------|-------------|
| `SequenceSimilarityScorer` | Character-level LCS ratio between output and ground truth |
| `TokenF1Scorer` | Word-token precision/recall/F1 |
| `RougeLScorer` | ROUGE-L F1 at the token level |
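
Scorers can also be invoked standalone on a single observation. The `Score(ctx, obs)` call below matches how the scorer tests exercise them:

```go
obs := eval.Observation{
	Output:      json.RawMessage(`"go is fast and simple"`),
	GroundTruth: json.RawMessage(`"go is fast"`),
}
score, err := eval.TokenF1Scorer().Score(context.Background(), obs)
if err != nil {
	// handle error
}
_ = score.Value // token-level F1 in [0, 1]
```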

**LLM-as-Judge**:

| Scorer | Description |
|--------|-------------|
| `NewJudgeScorer` | Pointwise scoring with customizable rubric |
| `NewPairwiseJudgeScorer` | A/B comparison between two outputs |

**Agent** (`agent/eval`); these scorers read well-known annotation keys from the observation, as the sketch after this table shows:

| Scorer | Description |
|--------|-------------|
| `TTFTScorer` | Time to first token (ms) |
| `TTLTScorer` | Time to last token (ms) |
| `MedianITLScorer` | Median inter-token latency (ms) |
| `ToolCallCountScorer` | Number of tool calls |
| `ToolSuccessRateScorer` | Fraction of successful tool calls |
| `TurnCountScorer` | Agent loop iterations |
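
Each agent scorer reads a well-known annotation key (`agent.stream_timing`, `agent.tool_calls`, `agent.turn_count`) that the `Subject` is expected to populate. A minimal sketch, with hard-coded records standing in for a real agent run:

```go
import (
	"context"
	"encoding/json"

	agenteval "github.com/urmzd/saige/agent/eval"
	"github.com/urmzd/saige/eval"
)

subject := eval.Subject(func(ctx context.Context, obs *eval.Observation) error {
	// ... invoke the agent, recording tool calls and loop turns ...
	calls := []agenteval.ToolCallRecord{
		{Name: "search", Result: "ok", DurationMs: 120},
		{Name: "fetch", Error: "timeout", DurationMs: 95},
	}
	callsJSON, err := json.Marshal(calls)
	if err != nil {
		return err
	}
	turnsJSON, _ := json.Marshal(3) // marshaling a literal int cannot fail

	if obs.Annotations == nil {
		obs.Annotations = map[string]json.RawMessage{}
	}
	obs.Annotations[agenteval.AnnotationToolCalls] = callsJSON
	obs.Annotations[agenteval.AnnotationTurnCount] = turnsJSON
	return nil
})
```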

**Knowledge Graph** (`knowledge/eval`):

| Scorer | Description |
|--------|-------------|
| `EntityRecallScorer` | Fraction of expected entities extracted |
| `EntityPrecisionScorer` | Fraction of extracted entities matching expected |
| `RelationRecallScorer` | Relation extraction recall |
| `RelationPrecisionScorer` | Relation extraction precision |
| `FactSearchRecallScorer` | Fraction of relevant facts found by search |

**RAG** (`rag/eval`):

The existing 9 RAG metrics are also available as composable `Scorer` adapters: `ContextPrecisionScorer`, `ContextRecallScorer`, `NDCGScorer`, `MRRScorer`, `HitRateScorer`, `FaithfulnessScorer`, `AnswerRelevancyScorer`, `AnswerCorrectnessScorer`.
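
Constructor arities below follow the snippets elsewhere in this README (`NDCGScorer(10)`, `MRRScorer()`, `ContextPrecisionScorer()`, `FaithfulnessScorer()`); a minimal sketch mixing RAG adapters into a run:

```go
import rageval "github.com/urmzd/saige/rag/eval"

result, err := eval.Run(ctx, "rag-eval", observations, []eval.Scorer{
	rageval.ContextPrecisionScorer(),
	rageval.FaithfulnessScorer(),
	rageval.NDCGScorer(10), // NDCG@10
})
```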

### Evaluate a Single System

```go
import (
	"context"
	"encoding/json"

	"github.com/urmzd/saige/eval"
)

ctx := context.Background()

observations := []eval.Observation{
	{ID: "q1", Input: json.RawMessage(`"What is Go?"`), GroundTruth: json.RawMessage(`"A programming language."`)},
}

// Define a subject that calls the system under test.
subject := eval.Subject(func(ctx context.Context, obs *eval.Observation) error {
	// Call your system, populate obs.Output, obs.Annotations, obs.Timing
	obs.Output = json.RawMessage(`"Go is a statically typed language."`)
	return nil
})

eval.Populate(ctx, observations, subject)

result, _ := eval.Run(ctx, "my-eval", observations, []eval.Scorer{
	eval.TokenF1Scorer(),
	eval.RougeLScorer(),
	eval.NewJudgeScorer(llm, eval.WithJudgeRubric("Score for accuracy.")),
})
```

### A/B Experiment

Compare two approaches on the same inputs:

```go
result, _ := eval.RunExperiment(ctx, inputs, baseSubject, expSubject,
	[]eval.Scorer{rageval.NDCGScorer(10), rageval.MRRScorer()},
	eval.WithOutputDir("experiments/bm25-vs-hyde"),
	eval.WithExperimentName("bm25-vs-hyde"),
)
// result.Deltas["ndcg"] shows the improvement
```

### Stream Timing (Agent)

Instrument a delta channel to collect TTFT, TTLT, and median ITL:

```go
import agenteval "github.com/urmzd/saige/agent/eval"

stream := myAgent.Invoke(ctx, messages)
timing, text, deltas := agenteval.CollectStreamTiming(stream.Deltas())
// timing.TTFTMs, timing.TTLTMs, timing.MedianITL
```
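
To make the collected timing visible to `TTFTScorer`, `TTLTScorer`, and `MedianITLScorer`, marshal it into the observation under the `agent.stream_timing` key inside your `Subject`. A minimal sketch:

```go
timingJSON, err := json.Marshal(timing)
if err != nil {
	return err
}
if obs.Annotations == nil {
	obs.Annotations = map[string]json.RawMessage{}
}
obs.Annotations[agenteval.AnnotationStreamTiming] = timingJSON
```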

### On-Disk Format

Experiment results persist as structured JSON for reproducibility:

```
experiments/bm25-vs-hyde/
  result.json
  inputs/000.json
  outputs/base/000.json
  outputs/exp/000.json
```
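
Everything on disk is plain JSON, so results can be inspected with the standard library alone. The exact schema of `result.json` is not documented here, so this sketch decodes it generically; the `deltas` key is an assumption mirrored from `result.Deltas` above:

```go
data, err := os.ReadFile("experiments/bm25-vs-hyde/result.json")
if err != nil {
	log.Fatal(err)
}
var result map[string]any
if err := json.Unmarshal(data, &result); err != nil {
	log.Fatal(err)
}
fmt.Println(result["deltas"]) // assumed key, mirroring result.Deltas in the Go API
```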

---

## Examples

| Example | Path | Description |

agent/eval/scorers.go

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
```go
package eval

import (
	"context"
	"encoding/json"

	topeval "github.com/urmzd/saige/eval"
)

// Annotation keys used by agent subjects.
const (
	AnnotationStreamTiming = "agent.stream_timing" // StreamTiming
	AnnotationToolCalls    = "agent.tool_calls"    // []ToolCallRecord
	AnnotationTurnCount    = "agent.turn_count"    // int
)

// ToolCallRecord captures a tool invocation for evaluation.
type ToolCallRecord struct {
	Name       string         `json:"name"`
	Arguments  map[string]any `json:"arguments"`
	Result     string         `json:"result"`
	Error      string         `json:"error,omitempty"`
	DurationMs int64          `json:"duration_ms"`
}

// TTFTScorer reports time-to-first-token in milliseconds.
func TTFTScorer() topeval.Scorer {
	return topeval.NewScorerFunc("ttft_ms", func(_ context.Context, obs topeval.Observation) (topeval.Score, error) {
		st, err := extractStreamTiming(obs)
		if err != nil || st == nil {
			return topeval.Score{}, err
		}
		return topeval.Score{Name: "ttft_ms", Value: float64(st.TTFTMs)}, nil
	})
}

// TTLTScorer reports time-to-last-token in milliseconds.
func TTLTScorer() topeval.Scorer {
	return topeval.NewScorerFunc("ttlt_ms", func(_ context.Context, obs topeval.Observation) (topeval.Score, error) {
		st, err := extractStreamTiming(obs)
		if err != nil || st == nil {
			return topeval.Score{}, err
		}
		return topeval.Score{Name: "ttlt_ms", Value: float64(st.TTLTMs)}, nil
	})
}

// MedianITLScorer reports median inter-token latency in milliseconds.
func MedianITLScorer() topeval.Scorer {
	return topeval.NewScorerFunc("median_itl_ms", func(_ context.Context, obs topeval.Observation) (topeval.Score, error) {
		st, err := extractStreamTiming(obs)
		if err != nil || st == nil {
			return topeval.Score{}, err
		}
		return topeval.Score{Name: "median_itl_ms", Value: st.MedianITL}, nil
	})
}

// ToolCallCountScorer reports the number of tool calls made.
func ToolCallCountScorer() topeval.Scorer {
	return topeval.NewScorerFunc("tool_call_count", func(_ context.Context, obs topeval.Observation) (topeval.Score, error) {
		calls, err := extractToolCalls(obs)
		if err != nil || calls == nil {
			return topeval.Score{}, err
		}
		return topeval.Score{Name: "tool_call_count", Value: float64(len(calls))}, nil
	})
}

// ToolSuccessRateScorer reports the fraction of tool calls without errors.
func ToolSuccessRateScorer() topeval.Scorer {
	return topeval.NewScorerFunc("tool_success_rate", func(_ context.Context, obs topeval.Observation) (topeval.Score, error) {
		calls, err := extractToolCalls(obs)
		if err != nil || calls == nil {
			return topeval.Score{}, err
		}
		if len(calls) == 0 {
			return topeval.Score{Name: "tool_success_rate", Value: 1.0}, nil
		}
		var success int
		for _, c := range calls {
			if c.Error == "" {
				success++
			}
		}
		return topeval.Score{Name: "tool_success_rate", Value: float64(success) / float64(len(calls))}, nil
	})
}

// TurnCountScorer reports the number of agent loop iterations.
func TurnCountScorer() topeval.Scorer {
	return topeval.NewScorerFunc("turn_count", func(_ context.Context, obs topeval.Observation) (topeval.Score, error) {
		raw, ok := obs.Annotations[AnnotationTurnCount]
		if !ok {
			return topeval.Score{}, nil
		}
		var count int
		if err := json.Unmarshal(raw, &count); err != nil {
			return topeval.Score{}, err
		}
		return topeval.Score{Name: "turn_count", Value: float64(count)}, nil
	})
}

// extractStreamTiming decodes the StreamTiming annotation; it returns
// (nil, nil) when the annotation is absent so scorers can skip silently.
func extractStreamTiming(obs topeval.Observation) (*StreamTiming, error) {
	raw, ok := obs.Annotations[AnnotationStreamTiming]
	if !ok {
		return nil, nil
	}
	var st StreamTiming
	if err := json.Unmarshal(raw, &st); err != nil {
		return nil, err
	}
	return &st, nil
}

// extractToolCalls decodes the ToolCallRecord slice annotation; it returns
// (nil, nil) when the annotation is absent.
func extractToolCalls(obs topeval.Observation) ([]ToolCallRecord, error) {
	raw, ok := obs.Annotations[AnnotationToolCalls]
	if !ok {
		return nil, nil
	}
	var calls []ToolCallRecord
	if err := json.Unmarshal(raw, &calls); err != nil {
		return nil, err
	}
	return calls, nil
}
```

agent/eval/scorers_test.go

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
```go
package eval

import (
	"context"
	"encoding/json"
	"math"
	"testing"

	topeval "github.com/urmzd/saige/eval"
)

func assertClose(t *testing.T, name string, got, want, eps float64) {
	t.Helper()
	if math.Abs(got-want) > eps {
		t.Errorf("%s: got %f, want %f (±%f)", name, got, want, eps)
	}
}

func TestTTFTScorer(t *testing.T) {
	st := StreamTiming{TTFTMs: 42}
	stJSON, _ := json.Marshal(st)

	obs := topeval.Observation{
		ID:          "t1",
		Annotations: map[string]json.RawMessage{AnnotationStreamTiming: stJSON},
	}

	score, err := TTFTScorer().Score(context.Background(), obs)
	if err != nil {
		t.Fatal(err)
	}
	assertClose(t, "ttft", score.Value, 42.0, 0.001)
}

func TestTTFTScorerMissingAnnotation(t *testing.T) {
	obs := topeval.Observation{ID: "t2"}
	score, err := TTFTScorer().Score(context.Background(), obs)
	if err != nil {
		t.Fatal(err)
	}
	if score.Name != "" {
		t.Errorf("expected empty score for missing annotation, got %q", score.Name)
	}
}

func TestToolSuccessRateScorer(t *testing.T) {
	calls := []ToolCallRecord{
		{Name: "search", Result: "ok"},
		{Name: "fetch", Error: "timeout"},
		{Name: "parse", Result: "done"},
	}
	callsJSON, _ := json.Marshal(calls)

	obs := topeval.Observation{
		ID:          "t3",
		Annotations: map[string]json.RawMessage{AnnotationToolCalls: callsJSON},
	}

	score, err := ToolSuccessRateScorer().Score(context.Background(), obs)
	if err != nil {
		t.Fatal(err)
	}
	// 2 out of 3 succeeded.
	assertClose(t, "success_rate", score.Value, 2.0/3.0, 0.001)
}

func TestToolCallCountScorer(t *testing.T) {
	calls := []ToolCallRecord{{Name: "a"}, {Name: "b"}}
	callsJSON, _ := json.Marshal(calls)

	obs := topeval.Observation{
		ID:          "t4",
		Annotations: map[string]json.RawMessage{AnnotationToolCalls: callsJSON},
	}

	score, err := ToolCallCountScorer().Score(context.Background(), obs)
	if err != nil {
		t.Fatal(err)
	}
	assertClose(t, "count", score.Value, 2.0, 0.001)
}

func TestTurnCountScorer(t *testing.T) {
	countJSON, _ := json.Marshal(5)
	obs := topeval.Observation{
		ID:          "t5",
		Annotations: map[string]json.RawMessage{AnnotationTurnCount: countJSON},
	}

	score, err := TurnCountScorer().Score(context.Background(), obs)
	if err != nil {
		t.Fatal(err)
	}
	assertClose(t, "turns", score.Value, 5.0, 0.001)
}
```
