diff --git a/CHANGELOG.md b/CHANGELOG.md
index 224ce863..3c1ebf70 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
## [Unreleased]
+### Ask this book — user-book RAG eval, P1 (live grounding validation) — backend (2026-06-22)
+
+Automated live grounding validation for **any user-uploaded book** — the user-book sibling to the catalog `RagEvalRunner` (AI-027). The catalog eval scores fixed golden sets keyed by `editionId`; user books have no goldens, so the new `UserBookRagEvalRunner` **synthesises probes from the book's own chunks**: it seed-retrieves a spread of chunks (`RetrieveUserBookAsync`, no gate), asks the generator for one self-contained question per chunk (`FeatureTag eval.userbook.gen`), runs the **real Ask path** per question, and judges the resulting answer's citations with the **shared `CitationJudge`** — same rubric + SupportRate (D1≥4) as the catalog. Two behaviour probes round it out: a **warm greeting** ("hi") checked *structurally* (answers, no citations, no `[n]` marker, not refused — no judge call) and a fixed **off-book** question judged for invented facts (passes iff the answer declines or stays grounded). **Empty/un-embedded book → short-circuit**: NO generator/judge LLM call, persist a failed 0-row with a note (mirrors the catalog no-LLM-on-empty invariant). Refactor: the catalog citation judge (`JudgeCitationsAsync`) + the `EvalRun` row factory (`MakeRun`) are extracted into one internal `CitationJudge` helper that both runners call — the rubric never forks; the catalog `RagEvalRunner` + its tests stay byte-for-byte green. New endpoint `POST /admin/rag/userbook/{id}/eval?judge=openai` (admin-auth inherited) resolves the owner via `db.UserBooks`, **logs the target userId** (privacy: admin eval reads private user content), runs 6 probes, persists, and returns `UserBookRagEvalDto` (citation score/supportRate, retrieval fraction, behaviour pass, per-probe breakdown). 404 on unknown book, 503 with no OpenAI key. Persists `rag.userbook.citation` / `rag.userbook.behavior` / `rag.userbook.retrieval` `eval_run` rows. `dotnet build -c Release` clean; AiEvals 61 + UnitTests 886 green (synthesised-probe aggregate, greeting structural pass/fail, off-book judge pass/fail, **empty-chunks → asserted zero generator/judge calls** via a throwing fake, SupportRate math, persist on/off). P2 admin UI is a separate slice.
+
### Ask this book — conversational, streaming web chat — backend (AI-028) (2026-06-19)
Backend for the conversational "Ask this book" upgrade: **model bump + multi-turn memory + warm-companion prompt + SSE streaming**, with grounding, citations, and the spoiler gate intact. `rag.ask` now routes to a dedicated keyed provider `openai-rag` on **gpt-4.1-mini** (was gpt-4.1-nano), mirroring `openai-explain` (`OpenAI:RagAsk:Model`, `Ai:Routes:rag.ask → openai-rag`, decorator-loop entry, `ModelRegistrySeeder` row). The system prompt is rewritten from "answer ONLY from excerpts else refuse" to a **warm reading companion** that is still strictly grounded — every book-fact claim must come from the numbered excerpts and cite `[n]` (citation contract + parser unchanged), but greetings/meta ("hi", "what can you do") get a warm invite with **no forced citation and no refusal**, and a genuine question with no matching excerpt gets a graceful "I don't see that in what you've read so far" rather than an invented fact. **Multi-turn**: `AskRequest` gains `History: AskTurnDto[]` (role `"user"`/`"assistant"`); the server defensively clamps to the **last 6 turns**, caps each turn at 4000 chars, normalizes roles, and assembles a real chat (system → numbered-excerpts context block → prior turns → new question last). Retrieval still runs on the latest question only, so the grounding eval is byte-identical with `[]` history. **SSE** (content-negotiated, mirrors Explain): `Accept: text/event-stream` → `delta` events (token fragments) then a terminal `done` carrying `{ citations, lastReadOrd, insufficient }` (camelCase, identical citation shape to the JSON path); empty-chunks → one friendly `delta` + `done {insufficient:true}` with **no model call**; provider/mid-stream failure → terminal `error`. JSON path returns the unchanged `AskResponse` (eval + mobile keep working). Ask `MaxOutputTokens` raised 320 → 400 for conversational length. `dotnet build -c Release` clean; 868 unit tests green (history clamp, multi-turn message assembly, SSE event sequencing over a fake delta stream, companion greeting-vs-content prompt structure) + integration (catalog spoiler-gate, owner-404, SSE content-type + framing, JSON history passthrough — skip-on-unavailable). **Note: the grounding golden eval (`RagEvalRunner`) MUST be re-run on mini post-deploy** — the companion prompt loosened the refusal rule, so this is the real hallucination-risk gate (paid; not runnable in CI). Frontend = parallel agent (AI-026e).
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/CitationJudge.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/CitationJudge.cs
new file mode 100644
index 00000000..e5ff8c77
--- /dev/null
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/CitationJudge.cs
@@ -0,0 +1,134 @@
+using Application.Rag;
+using Domain.Entities;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using TextStack.Ai.Core;
+using TextStack.Ai.Evals;
+using TextStack.Ai.Llm;
+using TextStack.Ai.Rag;
+
+namespace TextStack.Ai.EvalSuite;
+
+///
+/// Shared citation-correctness machinery for the RAG evals (AI-027b + user-book P1). Both the catalog
+/// and the call ONE copy of the judge
+/// rubric + scoring + the row factory, so the support metric never forks. The
+/// "support" axis MUST stay Dim1 — SupportRate reads .
+///
+internal static class CitationJudge
+{
+ internal const string CitationFeature = "rag.citation";
+ internal const int SupportPassThreshold = 4; // judge ≥4/5 on the support axis = a correct citation
+ internal const string NoJudge = "n/a";
+
+ internal static readonly Rubric Rubric = new(
+ "support: does the cited excerpt actually contain or directly imply the specific claim it is attached to?",
+ "relevance: is the excerpt genuinely on-topic for the answer, not a loosely-related passage?",
+ "faithfulness: does the answer avoid asserting anything the cited excerpts do not support (no outside knowledge)?");
+
+ private static readonly ChatMessage[] JudgePlaceholderMessages = [new ChatMessage(ChatRole.User, string.Empty)];
+
+ ///
+ /// Generates a grounded answer per question (over its already-retrieved chunks) and judges each
+ /// citation against the FULL text of the excerpt it points to. Returns the mean 1–5 score and the
+ /// support rate (citations scored ≥ on the support axis). Shared
+ /// verbatim by both runners.
+ ///
+ internal static async Task JudgeCitationsAsync(
+ IRagAskService ask,
+ ILlmService judge,
+ IReadOnlyList<(string Question, IReadOnlyList Chunks)> retrieved,
+ CancellationToken ct)
+ {
+ var chatConfig = new ChatConfiguration(new LlmServiceChatClient(judge, defaultFeatureTag: "eval.judge"));
+ var scores = new List();
+ var supported = 0;
+ var answersGenerated = 0;
+
+ foreach (var (question, chunks) in retrieved)
+ {
+ ct.ThrowIfCancellationRequested();
+ // lastReadOrd is irrelevant here — chunks are supplied directly (no user gating).
+ var answer = await ask.AskFromChunksAsync(question, chunks, [], [], lastReadOrd: int.MaxValue, ct);
+ if (answer.Insufficient || answer.Citations.Count == 0)
+ continue;
+ answersGenerated++;
+
+ foreach (var cited in answer.Citations)
+ {
+ ct.ThrowIfCancellationRequested();
+ var evidence =
+ $"Question: {question}\n\nAnswer:\n{answer.Answer}\n\n" +
+ $"Cited excerpt [{cited.Marker}] (the answer attributes a claim to this passage):\n{cited.Chunk.Text}";
+
+ var evaluator = new RubricEvaluator(CitationFeature, Rubric);
+ var result = await evaluator.EvaluateAsync(
+ JudgePlaceholderMessages,
+ new ChatResponse(new ChatMessage(ChatRole.Assistant, answer.Answer)),
+ chatConfig, [new RubricEvidenceContext(evidence)], ct);
+
+ var score = new JudgeScore(
+ ReadAxis(result, Rubric.Dim1),
+ ReadAxis(result, Rubric.Dim2),
+ ReadAxis(result, Rubric.Dim3),
+ string.Empty);
+ scores.Add(score);
+ if (score.D1 >= SupportPassThreshold)
+ supported++;
+ }
+ }
+
+ if (scores.Count == 0)
+ return new RagCitationSummary(0, 0, 0, answersGenerated);
+
+ var summary = JudgeRunner.Aggregate(scores);
+ var supportRate = (double)supported / scores.Count;
+ return new RagCitationSummary(summary.MeanOverall, supportRate, scores.Count, answersGenerated);
+ }
+
+ ///
+ /// Lightweight faithfulness check for the user-book off-book probe: does
+ /// introduce facts not grounded in the book? Returns true when the answer is clean (gracefully
+ /// declines or stays grounded — no invented facts), false when it hallucinates outside knowledge.
+ /// One judge call; parses a leading yes/no.
+ ///
+ internal static async Task JudgeNoInventedFactsAsync(
+ ILlmService judge, string question, string answer, CancellationToken ct)
+ {
+ var prompt =
+ "You are grading whether an assistant answer about a book introduces facts NOT grounded in " +
+ "that book. A good answer either gracefully declines (says it can't find this in the book) " +
+ "or stays grounded. A bad answer asserts specific outside facts (dates, names, scores, " +
+ "events) as if from the book.\n\n" +
+ $"Question: {question}\n\nAnswer:\n{answer}\n\n" +
+ "Does the answer introduce facts not grounded in the book? Reply with exactly YES or NO.";
+ var request = new LlmRequest(
+ SystemPrompt: "You are a strict faithfulness grader. Reply YES or NO only.",
+ Messages: [new LlmMessage("user", prompt)],
+ MaxOutputTokens: 4,
+ FeatureTag: "eval.judge");
+ var response = await judge.CompleteAsync(request, ct);
+ var verdict = response.Text.Trim().ToLowerInvariant();
+ // "yes" = invented facts present → NOT clean. Anything else (incl. "no") = clean.
+ return !verdict.StartsWith("yes", StringComparison.Ordinal);
+ }
+
+ // RubricEvaluator names each axis "{feature}.{label}" (label = text before ':').
+ private static int ReadAxis(EvaluationResult result, string dim) =>
+ (int)Math.Round(result.Get($"{CitationFeature}.{dim.Split(':')[0].Trim()}").Value ?? 0);
+
+ /// Shared row factory — one copy for every RAG eval feature.
+ internal static EvalRun MakeRun(
+ string feature, string modelId, string judgeModelId, decimal score, int n, string? gitSha, string breakdown) => new()
+ {
+ Id = Guid.NewGuid(),
+ Feature = feature,
+ ModelId = modelId,
+ JudgeModelId = judgeModelId,
+ Score = Math.Round(score, 3),
+ N = n,
+ BreakdownJson = breakdown,
+ GitSha = gitSha,
+ CreatedAt = DateTimeOffset.UtcNow,
+ };
+}
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs
index 1ce45dbe..2d3c3b81 100644
--- a/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs
@@ -1,12 +1,8 @@
using Application.Common.Interfaces;
using Application.Rag;
-using Domain.Entities;
-using Microsoft.Extensions.AI;
-using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.Logging;
using TextStack.Ai.Core;
using TextStack.Ai.Evals;
-using TextStack.Ai.Llm;
using TextStack.Ai.Rag;
namespace TextStack.Ai.EvalSuite;
@@ -49,17 +45,7 @@ public sealed class RagEvalRunner(ILogger logger)
{
// Retrieval scores 0–1 (recall / 1−leak), unlike the 1–5 judged features — the feature key disambiguates.
private const string RetrievalModelId = "hybrid-retrieval";
- private const string NoJudge = "n/a";
- private const int SupportPassThreshold = 4; // judge ≥4/5 on the support axis = a correct citation
- private const string CitationFeature = "rag.citation";
-
- // The "support" axis MUST stay Dim1 — SupportRate reads JudgeScore.D1.
- private static readonly Rubric CitationRubric = new(
- "support: does the cited excerpt actually contain or directly imply the specific claim it is attached to?",
- "relevance: is the excerpt genuinely on-topic for the answer, not a loosely-related passage?",
- "faithfulness: does the answer avoid asserting anything the cited excerpts do not support (no outside knowledge)?");
-
- private static readonly ChatMessage[] JudgePlaceholderMessages = [new ChatMessage(ChatRole.User, string.Empty)];
+ private const string NoJudge = CitationJudge.NoJudge;
public async Task RunAsync(
IRagService rag,
@@ -110,7 +96,7 @@ public async Task RunAsync(
// Citation correctness (027b) — only when a generator + judge are supplied.
RagCitationSummary? citation = null;
if (ask is not null && judge is not null)
- citation = await JudgeCitationsAsync(ask, judge, retrievedByQuestion, ct);
+ citation = await CitationJudge.JudgeCitationsAsync(ask, judge, retrievedByQuestion, ct);
logger.LogInformation(
"RAG eval edition={Edition} recall@{K}={Recall:0.00} (N={RecallN}) spoilerLeakRate={Leak:0.00} (N={SpoilerN}) citation={Cit}",
@@ -119,12 +105,12 @@ public async Task RunAsync(
if (persist && db is not null)
{
- db.EvalRuns.Add(MakeRun("rag.retrieval", RetrievalModelId, NoJudge, (decimal)recall, recallCases.Count, gitSha,
+ db.EvalRuns.Add(CitationJudge.MakeRun("rag.retrieval", RetrievalModelId, NoJudge, (decimal)recall, recallCases.Count, gitSha,
$"{{\"recallAtK\":{recall:0.000},\"k\":{k},\"hits\":{recallDetail.Count(c => c.Hit)}}}"));
- db.EvalRuns.Add(MakeRun("rag.spoiler", RetrievalModelId, NoJudge, (decimal)(1.0 - leakRate), spoilerCases.Count, gitSha,
+ db.EvalRuns.Add(CitationJudge.MakeRun("rag.spoiler", RetrievalModelId, NoJudge, (decimal)(1.0 - leakRate), spoilerCases.Count, gitSha,
$"{{\"leakRate\":{leakRate:0.000},\"leakingCases\":{spoilerDetail.Count(c => c.LeakCount > 0)}}}"));
if (citation is not null)
- db.EvalRuns.Add(MakeRun(CitationFeature, RagAskService.FeatureTag, judgeModelId ?? NoJudge,
+ db.EvalRuns.Add(CitationJudge.MakeRun(CitationJudge.CitationFeature, RagAskService.FeatureTag, judgeModelId ?? NoJudge,
(decimal)citation.Score, citation.CitationsJudged, gitSha,
$"{{\"supportRate\":{citation.SupportRate:0.000},\"answers\":{citation.AnswersGenerated}}}"));
await db.SaveChangesAsync(ct);
@@ -132,79 +118,4 @@ public async Task RunAsync(
return new RagEvalResult(recall, recallCases.Count, leakRate, spoilerCases.Count, recallDetail, spoilerDetail, citation);
}
-
- ///
- /// Generates a grounded answer per question (over its already-retrieved chunks) and judges each
- /// citation against the FULL text of the excerpt it points to. Returns the mean 1–5 score and the
- /// support rate (citations scored ≥ on the support axis).
- ///
- private async Task JudgeCitationsAsync(
- IRagAskService ask,
- ILlmService judge,
- IReadOnlyList<(string Question, IReadOnlyList Chunks)> retrieved,
- CancellationToken ct)
- {
- var chatConfig = new ChatConfiguration(new LlmServiceChatClient(judge, defaultFeatureTag: "eval.judge"));
- var scores = new List();
- var supported = 0;
- var answersGenerated = 0;
-
- foreach (var (question, chunks) in retrieved)
- {
- ct.ThrowIfCancellationRequested();
- // lastReadOrd is irrelevant here — chunks are supplied directly (no user gating).
- var answer = await ask.AskFromChunksAsync(question, chunks, [], [], lastReadOrd: int.MaxValue, ct);
- if (answer.Insufficient || answer.Citations.Count == 0)
- continue;
- answersGenerated++;
-
- foreach (var cited in answer.Citations)
- {
- ct.ThrowIfCancellationRequested();
- var evidence =
- $"Question: {question}\n\nAnswer:\n{answer.Answer}\n\n" +
- $"Cited excerpt [{cited.Marker}] (the answer attributes a claim to this passage):\n{cited.Chunk.Text}";
-
- var evaluator = new RubricEvaluator(CitationFeature, CitationRubric);
- var result = await evaluator.EvaluateAsync(
- JudgePlaceholderMessages,
- new ChatResponse(new ChatMessage(ChatRole.Assistant, answer.Answer)),
- chatConfig, [new RubricEvidenceContext(evidence)], ct);
-
- var score = new JudgeScore(
- ReadAxis(result, CitationRubric.Dim1),
- ReadAxis(result, CitationRubric.Dim2),
- ReadAxis(result, CitationRubric.Dim3),
- string.Empty);
- scores.Add(score);
- if (score.D1 >= SupportPassThreshold)
- supported++;
- }
- }
-
- if (scores.Count == 0)
- return new RagCitationSummary(0, 0, 0, answersGenerated);
-
- var summary = JudgeRunner.Aggregate(scores);
- var supportRate = (double)supported / scores.Count;
- return new RagCitationSummary(summary.MeanOverall, supportRate, scores.Count, answersGenerated);
- }
-
- // RubricEvaluator names each axis "{feature}.{label}" (label = text before ':').
- private static int ReadAxis(EvaluationResult result, string dim) =>
- (int)Math.Round(result.Get($"{CitationFeature}.{dim.Split(':')[0].Trim()}").Value ?? 0);
-
- private static EvalRun MakeRun(
- string feature, string modelId, string judgeModelId, decimal score, int n, string? gitSha, string breakdown) => new()
- {
- Id = Guid.NewGuid(),
- Feature = feature,
- ModelId = modelId,
- JudgeModelId = judgeModelId,
- Score = Math.Round(score, 3),
- N = n,
- BreakdownJson = breakdown,
- GitSha = gitSha,
- CreatedAt = DateTimeOffset.UtcNow,
- };
}
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/UserBookRagEvalRunner.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/UserBookRagEvalRunner.cs
new file mode 100644
index 00000000..06eda4fe
--- /dev/null
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/UserBookRagEvalRunner.cs
@@ -0,0 +1,239 @@
+using Application.Common.Interfaces;
+using Application.Rag;
+using Microsoft.Extensions.Logging;
+using TextStack.Ai.Core;
+using TextStack.Ai.Evals;
+using TextStack.Ai.Llm;
+using TextStack.Ai.Rag;
+
+namespace TextStack.Ai.EvalSuite;
+
+/// One generated grounding probe's outcome — the question we synthesised and whether Ask cited it.
+public sealed record UserBookProbeCase(string Question, int Citations, bool Insufficient);
+
+///
+/// The two behavioural probes (greeting + off-book) surfaced for the admin UI — each pass/fail with a
+/// short note so the owner can see WHY a companion run failed without re-reading private content.
+///
+public sealed record UserBookBehaviorCase(string Kind, string Question, bool Pass, string Note);
+
+///
+/// Result of a user-book RAG eval (P1): the citation summary over the generated grounding probes, the
+/// fraction of those probes that retrieved ≥1 chunk (), the combined behaviour
+/// pass-fraction (greeting + off-book), and per-probe detail. is null when the
+/// book has no indexed chunks (empty-chunks short-circuit — no LLM call was made).
+///
+public sealed record UserBookRagEvalResult(
+ RagCitationSummary? Citation,
+ double Retrieval,
+ int ProbeN,
+ double BehaviorPass,
+ IReadOnlyList ProbeCases,
+ IReadOnlyList BehaviorCases,
+ string? Note);
+
+///
+/// Live grounding validation for ANY user-uploaded book (P1, sibling to ).
+/// The catalog eval scores fixed golden sets keyed by editionId; user books have no goldens, so this
+/// runner SYNTHESISES probes from the book's own chunks instead:
+///
+/// - Generated grounding (N probes): seed retrieval for a spread of chunks, ask the
+/// generator to write one self-contained question per chunk, then run the REAL Ask path per question
+/// (full-book retrieval, no gate) and judge the answer's citations with the shared
+/// — same rubric + SupportRate as the catalog.
+/// - Greeting ("hi"): a warm greeting must answer, not cite, and not refuse — a purely
+/// structural check (no judge call).
+/// - Off-book: a clearly off-book question must either decline or stay grounded — judged
+/// for invented facts.
+///
+/// Empty/un-embedded book → short-circuit: NO generator/judge LLM call, persist a failed 0-row with a
+/// note (mirrors the catalog "no-LLM-on-empty" invariant). Persists rag.userbook.citation,
+/// rag.userbook.behavior (and rag.userbook.retrieval) rows.
+///
+public sealed class UserBookRagEvalRunner(ILogger logger)
+{
+ public const string CitationFeature = "rag.userbook.citation";
+ public const string BehaviorFeature = "rag.userbook.behavior";
+ public const string RetrievalFeature = "rag.userbook.retrieval";
+ private const string GeneratorModelId = "userbook-probe-gen";
+
+ // A broad seed that surfaces a spread of the book's chunks regardless of subject matter.
+ private const string SeedQuery = "main ideas and key topics summary";
+
+ // A fixed off-book question — its answer must decline or stay grounded, never invent.
+ private const string OffBookQuestion =
+ "What does this book say about the 2026 FIFA World Cup final score?";
+
+ private const string ProbeGenSystemPrompt =
+ "You write a single reading-comprehension question grounded in one passage.";
+
+ public async Task RunAsync(
+ IRagService rag,
+ IRagAskService ask,
+ ILlmService generator,
+ ILlmService judge,
+ string judgeModelId,
+ Guid userId,
+ Guid userBookId,
+ int probeCount,
+ int k,
+ bool persist,
+ IAppDbContext? db,
+ string? gitSha,
+ CancellationToken ct)
+ {
+ // Seed retrieval for a spread of chunks. Empty => un-embedded/empty book: short-circuit with NO
+ // generator/judge LLM call (mirrors the catalog no-LLM-on-empty invariant).
+ var seed = await rag.RetrieveUserBookAsync(userId, userBookId, SeedQuery, probeCount, maxChapterOrd: null, ct);
+ if (seed.Count == 0)
+ {
+ const string note = "No indexed chunks for this user book — not embedded yet (no LLM call made).";
+ logger.LogInformation(
+ "user-book RAG eval userBook={UserBook} skipped — no indexed chunks", userBookId);
+ if (persist && db is not null)
+ {
+ db.EvalRuns.Add(CitationJudge.MakeRun(CitationFeature, GeneratorModelId, judgeModelId, 0m, 0, gitSha,
+ $"{{\"note\":\"empty-chunks\",\"probes\":0}}"));
+ db.EvalRuns.Add(CitationJudge.MakeRun(BehaviorFeature, GeneratorModelId, judgeModelId, 0m, 0, gitSha,
+ $"{{\"note\":\"empty-chunks\"}}"));
+ await db.SaveChangesAsync(ct);
+ }
+ return new UserBookRagEvalResult(
+ Citation: null, Retrieval: 0, ProbeN: 0, BehaviorPass: 0, ProbeCases: [], BehaviorCases: [], Note: note);
+ }
+
+ // 1) Generate one self-contained question per seed chunk (one generator call each).
+ var questions = new List(seed.Count);
+ foreach (var chunk in seed)
+ {
+ ct.ThrowIfCancellationRequested();
+ questions.Add(await GenerateProbeAsync(generator, chunk.Text, ct));
+ }
+
+ // 2) Run the REAL Ask path per question (full-book retrieval, no gate) and keep the retrieved
+ // chunks so the shared CitationJudge can score the resulting answer's citations.
+ var retrievedByQuestion = new List<(string Question, IReadOnlyList Chunks)>();
+ var probeCases = new List();
+ var probesWithChunks = 0;
+ foreach (var question in questions)
+ {
+ ct.ThrowIfCancellationRequested();
+ var chunks = await rag.RetrieveUserBookAsync(userId, userBookId, question, k, maxChapterOrd: null, ct);
+ if (chunks.Count > 0)
+ probesWithChunks++;
+ retrievedByQuestion.Add((question, chunks));
+ }
+
+ var citation = await CitationJudge.JudgeCitationsAsync(ask, judge, retrievedByQuestion, ct);
+
+ // Per-probe detail: re-run Ask once for the breakdown (cheap, mirrors the judge's own Ask call —
+ // we surface citation count + insufficiency per probe for the admin view).
+ foreach (var (question, chunks) in retrievedByQuestion)
+ {
+ ct.ThrowIfCancellationRequested();
+ var answer = await ask.AskFromChunksAsync(question, chunks, [], [], lastReadOrd: int.MaxValue, ct);
+ probeCases.Add(new UserBookProbeCase(question, answer.Citations.Count, answer.Insufficient));
+ }
+
+ var retrievalRate = (double)probesWithChunks / questions.Count;
+
+ // 3) Behaviour probes: greeting (structural) + off-book (judged for invented facts).
+ var greeting = await EvaluateGreetingAsync(rag, ask, userId, userBookId, k, ct);
+ var offBook = await EvaluateOffBookAsync(rag, ask, judge, userId, userBookId, k, ct);
+ var behaviorCases = new[] { greeting, offBook };
+ var behaviorPass = behaviorCases.Count(c => c.Pass) / (double)behaviorCases.Length;
+
+ logger.LogInformation(
+ "user-book RAG eval userBook={UserBook} probes={Probes} citation={Cit}/support {Support:0.00} retrieval={Retrieval:0.00} behavior={Behavior:0.00}",
+ userBookId, questions.Count, citation.Score, citation.SupportRate, retrievalRate, behaviorPass);
+
+ if (persist && db is not null)
+ {
+ db.EvalRuns.Add(CitationJudge.MakeRun(CitationFeature, GeneratorModelId, judgeModelId,
+ (decimal)citation.Score, citation.CitationsJudged, gitSha,
+ $"{{\"supportRate\":{citation.SupportRate:0.000},\"answers\":{citation.AnswersGenerated},\"probes\":{questions.Count}}}"));
+ db.EvalRuns.Add(CitationJudge.MakeRun(BehaviorFeature, GeneratorModelId, judgeModelId,
+ (decimal)behaviorPass, behaviorCases.Length, gitSha,
+ $"{{\"greeting\":{greeting.Pass.ToString().ToLowerInvariant()},\"offBook\":{offBook.Pass.ToString().ToLowerInvariant()}}}"));
+ db.EvalRuns.Add(CitationJudge.MakeRun(RetrievalFeature, GeneratorModelId, CitationJudge.NoJudge,
+ (decimal)retrievalRate, questions.Count, gitSha,
+ $"{{\"probesWithChunks\":{probesWithChunks}}}"));
+ await db.SaveChangesAsync(ct);
+ }
+
+ return new UserBookRagEvalResult(
+ citation, retrievalRate, questions.Count, behaviorPass, probeCases, behaviorCases, Note: null);
+ }
+
+ /// One generator call: a self-contained, non-yes/no question answerable only from the passage.
+ private static async Task GenerateProbeAsync(ILlmService generator, string passage, CancellationToken ct)
+ {
+ var prompt =
+ "Write one self-contained question answerable ONLY from this passage. No yes/no. " +
+ "Output the question only.\n\nPassage:\n" + passage;
+ var request = new LlmRequest(
+ SystemPrompt: ProbeGenSystemPrompt,
+ Messages: [new LlmMessage("user", prompt)],
+ MaxOutputTokens: 80,
+ FeatureTag: "eval.userbook.gen");
+ var response = await generator.CompleteAsync(request, ct);
+ return response.Text.Trim();
+ }
+
+ ///
+ /// Greeting probe: a warm "hi" must answer (non-empty), NOT cite (no citations, no [n] marker
+ /// in the text), and NOT refuse (not insufficient). Purely structural — no judge call.
+ ///
+ private static async Task EvaluateGreetingAsync(
+ IRagService rag, IRagAskService ask, Guid userId, Guid userBookId, int k, CancellationToken ct)
+ {
+ const string question = "hi";
+ var chunks = await rag.RetrieveUserBookAsync(userId, userBookId, question, k, maxChapterOrd: null, ct);
+ var answer = await ask.AskFromChunksAsync(question, chunks, [], [], lastReadOrd: int.MaxValue, ct);
+
+ var nonEmpty = !string.IsNullOrWhiteSpace(answer.Answer);
+ var noCitations = answer.Citations.Count == 0;
+ var notRefused = !answer.Insufficient;
+ var noMarker = !HasCitationMarker(answer.Answer);
+ var pass = nonEmpty && noCitations && notRefused && noMarker;
+
+ var note = pass
+ ? "Warm greeting: answered, no citations, not refused."
+ : $"Failed structural greeting check (nonEmpty={nonEmpty}, noCitations={noCitations}, notRefused={notRefused}, noMarker={noMarker}).";
+ return new UserBookBehaviorCase("greeting", question, pass, note);
+ }
+
+ ///
+ /// Off-book probe: a clearly off-book question must gracefully decline OR stay grounded with no
+ /// invented facts. Passes iff the answer is insufficient OR the judge finds no invented facts.
+ ///
+ private static async Task EvaluateOffBookAsync(
+ IRagService rag, IRagAskService ask, ILlmService judge, Guid userId, Guid userBookId, int k, CancellationToken ct)
+ {
+ var chunks = await rag.RetrieveUserBookAsync(userId, userBookId, OffBookQuestion, k, maxChapterOrd: null, ct);
+ var answer = await ask.AskFromChunksAsync(OffBookQuestion, chunks, [], [], lastReadOrd: int.MaxValue, ct);
+
+ if (answer.Insufficient)
+ return new UserBookBehaviorCase("off_book", OffBookQuestion, true,
+ "Declined an off-book question (insufficient context).");
+
+ var noInvented = await CitationJudge.JudgeNoInventedFactsAsync(judge, OffBookQuestion, answer.Answer, ct);
+ var note = noInvented
+ ? "Stayed grounded on an off-book question (no invented facts)."
+ : "Introduced facts not grounded in the book on an off-book question.";
+ return new UserBookBehaviorCase("off_book", OffBookQuestion, noInvented, note);
+ }
+
+ private static bool HasCitationMarker(string text)
+ {
+ for (var i = 0; i + 2 < text.Length; i++)
+ if (text[i] == '[' && char.IsDigit(text[i + 1]))
+ {
+ var j = i + 1;
+ while (j < text.Length && char.IsDigit(text[j])) j++;
+ if (j < text.Length && text[j] == ']')
+ return true;
+ }
+ return false;
+ }
+}
diff --git a/backend/src/Api/Endpoints/AdminRagEndpoints.cs b/backend/src/Api/Endpoints/AdminRagEndpoints.cs
index 1fb78847..a375d1b3 100644
--- a/backend/src/Api/Endpoints/AdminRagEndpoints.cs
+++ b/backend/src/Api/Endpoints/AdminRagEndpoints.cs
@@ -3,6 +3,7 @@
using Application.Rag;
using Contracts.Admin;
using Microsoft.AspNetCore.Mvc;
+using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using TextStack.Ai.Core;
using TextStack.Ai.EvalSuite;
@@ -27,6 +28,79 @@ public static void MapAdminRagEndpoints(this WebApplication app)
group.MapGet("/{editionId:guid}/search", Search);
group.MapGet("/{editionId:guid}/context", Context);
group.MapPost("/{editionId:guid}/eval", RunEval);
+ group.MapPost("/userbook/{id:guid}/eval", RunUserBookEval);
+ }
+
+ // User-book RAG eval (P1): live grounding validation for ANY user-uploaded book — no goldens, so it
+ // synthesises probes from the book's own chunks (generated grounding + greeting + off-book). Reads
+ // PRIVATE user content under admin auth, so it logs the target userId. `judge` = openai (default,
+ // Eval:JudgeModel) | ollama. Persists rag.userbook.citation / .behavior / .retrieval EvalRun rows.
+ private static async Task RunUserBookEval(
+ Guid id,
+ [FromQuery] int? k,
+ [FromQuery] string? judge,
+ IServiceProvider services,
+ IConfiguration config,
+ UserBookRagEvalRunner runner,
+ IAppDbContext db,
+ ILogger logger,
+ CancellationToken ct)
+ {
+ var userId = await db.UserBooks
+ .Where(b => b.Id == id)
+ .Select(b => (Guid?)b.UserId)
+ .FirstOrDefaultAsync(ct);
+ if (userId is null)
+ return Results.NotFound("Book not found");
+
+ // Privacy note: admin-triggered eval reads this user's private uploaded content.
+ logger.LogInformation("admin RAG eval reads private user content for userId {UserId}", userId.Value);
+
+ if (!TryResolve(services, out var rag, out var unavailable))
+ return unavailable;
+
+ IRagAskService ask;
+ ILlmService generator;
+ try
+ {
+ ask = services.GetRequiredService();
+ generator = services.GetRequiredService();
+ }
+ catch (InvalidOperationException)
+ {
+ return Results.Problem("Embeddings are not configured (no OpenAI key).", statusCode: 503);
+ }
+
+ var useOllama = (judge ?? "openai").Trim().ToLowerInvariant() == "ollama";
+ var judgeKey = useOllama ? "ollama" : "openai-judge";
+ var judgeModelId = useOllama
+ ? config["Ollama:Model"] ?? "gemma4:e2b"
+ : config["Eval:JudgeModel"] ?? "gpt-4.1";
+ var judgeClient = services.GetRequiredKeyedService(judgeKey);
+
+ var limit = Math.Clamp(k ?? IRagService.DefaultK, 1, MaxK);
+ var gitSha = Environment.GetEnvironmentVariable("GIT_SHA");
+
+ var result = await runner.RunAsync(
+ rag, ask, generator, judgeClient, judgeModelId, userId.Value, id,
+ probeCount: 6, limit, persist: true, db, gitSha, ct);
+
+ var citation = result.Citation is null
+ ? null
+ : new RagCitationDto(
+ Math.Round(result.Citation.Score, 3),
+ Math.Round(result.Citation.SupportRate, 4),
+ result.Citation.CitationsJudged,
+ result.Citation.AnswersGenerated);
+
+ return Results.Ok(new UserBookRagEvalDto(
+ citation,
+ Math.Round(result.Retrieval, 4),
+ result.ProbeN,
+ Math.Round(result.BehaviorPass, 4),
+ result.ProbeCases.Select(c => new UserBookProbeDto(c.Question, c.Citations, c.Insufficient)).ToList(),
+ result.BehaviorCases.Select(c => new UserBookBehaviorDto(c.Kind, c.Question, c.Pass, c.Note)).ToList(),
+ result.Note));
}
// Phase 4 DoD gate (AI-027): runs the RAG eval against a real, embedded edition. Retrieval
diff --git a/backend/src/Api/Program.cs b/backend/src/Api/Program.cs
index c2780178..10f19229 100644
--- a/backend/src/Api/Program.cs
+++ b/backend/src/Api/Program.cs
@@ -86,6 +86,7 @@
builder.Services.AddSingleton();
builder.Services.AddSingleton();
builder.Services.AddSingleton();
+builder.Services.AddSingleton();
builder.Services.AddSingleton();
builder.Services.AddSingleton();
builder.Services.AddSingleton();
diff --git a/backend/src/Contracts/Admin/RagDtos.cs b/backend/src/Contracts/Admin/RagDtos.cs
index d1968145..edea1976 100644
--- a/backend/src/Contracts/Admin/RagDtos.cs
+++ b/backend/src/Contracts/Admin/RagDtos.cs
@@ -51,3 +51,24 @@ public record RagEvalDto(
RagCitationDto? Citation,
IReadOnlyList RecallCases,
IReadOnlyList SpoilerCases);
+
+/// One synthesised grounding probe's outcome in the user-book eval (P1): citation count + refusal.
+public record UserBookProbeDto(string Question, int Citations, bool Insufficient);
+
+/// One behaviour probe's outcome (greeting | off_book) with a short pass/fail note.
+public record UserBookBehaviorDto(string Kind, string Question, bool Pass, string Note);
+
+///
+/// Result of the user-book RAG eval (P1): citation correctness over the generated grounding probes, the
+/// fraction of probes that retrieved ≥1 chunk, the combined greeting+off-book behaviour pass-fraction,
+/// and per-probe detail. is null + set when the book has no
+/// indexed chunks (empty short-circuit — no LLM call made).
+///
+public record UserBookRagEvalDto(
+ RagCitationDto? Citation,
+ double Retrieval,
+ int ProbeN,
+ double BehaviorPass,
+ IReadOnlyList Probes,
+ IReadOnlyList Behavior,
+ string? Note);
diff --git a/tests/TextStack.AiEvals/UserBookRagEvalRunnerTests.cs b/tests/TextStack.AiEvals/UserBookRagEvalRunnerTests.cs
new file mode 100644
index 00000000..fcee50d7
--- /dev/null
+++ b/tests/TextStack.AiEvals/UserBookRagEvalRunnerTests.cs
@@ -0,0 +1,283 @@
+using Application.Rag;
+using Microsoft.Extensions.Logging.Abstractions;
+using TextStack.Ai.Core;
+using TextStack.Ai.EvalSuite;
+using TextStack.Ai.Rag;
+
+namespace TextStack.AiEvals;
+
+///
+/// Deterministic coverage for (user-book RAG eval, P1) with fake
+/// retrieval + generator + judge + Ask (no DB, no embeddings, no key). Proves the synthesised-probe
+/// path: seed retrieval → one generated question per chunk → real Ask path per question → shared
+/// CitationJudge → greeting (structural) + off-book (judged) behaviour. Counting fakes assert the
+/// empty-chunks short-circuit makes NO generator/judge LLM call (the catalog no-LLM-on-empty invariant).
+///
+public class UserBookRagEvalRunnerTests
+{
+ private static readonly Guid UserId = Guid.NewGuid();
+ private static readonly Guid BookId = Guid.NewGuid();
+
+ private static UserBookRagEvalRunner Runner() =>
+ new(NullLogger.Instance);
+
+ private static RetrievedChunk Chunk(string text) =>
+ new(Guid.NewGuid(), Guid.NewGuid(), 0, 0, text, 0, text.Length, 1.0);
+
+ /// Returns chunks for any user-book query; empty on demand.
+ private sealed class FakeUserBookRag(int count) : IRagService
+ {
+ public Task> RetrieveAsync(
+ Guid editionId, string query, int k, int? maxChapterOrd, CancellationToken ct) =>
+ throw new NotSupportedException();
+
+ public Task> RetrieveUserBookAsync(
+ Guid userId, Guid userBookId, string query, int k, int? maxChapterOrd, CancellationToken ct)
+ {
+ Assert.Equal(UserId, userId);
+ Assert.Equal(BookId, userBookId);
+ Assert.Null(maxChapterOrd); // user books are never gated
+ // `count` is the book's available chunk pool; a retrieval returns up to k of them.
+ var chunks = Enumerable.Range(0, Math.Min(count, k)).Select(i => Chunk($"passage {i} for '{query}'")).ToList();
+ return Task.FromResult>(chunks);
+ }
+ }
+
+ /// Echoes a one-citation answer over the first chunk; empty chunks → insufficient (no cite).
+ private sealed class FakeAsk : IRagAskService
+ {
+ public Task AskAsync(Guid u, Guid s, Guid e, string q, int k, Guid? currentChapterId, IReadOnlyList history, CancellationToken ct) =>
+ throw new NotSupportedException();
+
+ public Task AskFromChunksAsync(
+ string question, IReadOnlyList chunks, IReadOnlyList notes, IReadOnlyList history, int lastReadOrd, CancellationToken ct)
+ {
+ var citations = chunks.Count == 0
+ ? Array.Empty()
+ : [new AskCitationSource(1, chunks[0])];
+ return Task.FromResult(new AskAnswer($"Grounded answer [1]. ({question})", citations, lastReadOrd, Insufficient: chunks.Count == 0));
+ }
+ }
+
+ /// A warm-greeting Ask: always non-empty, no citation, no [n] marker, never insufficient.
+ private sealed class WarmGreetingAsk : IRagAskService
+ {
+ public Task AskAsync(Guid u, Guid s, Guid e, string q, int k, Guid? currentChapterId, IReadOnlyList history, CancellationToken ct) =>
+ throw new NotSupportedException();
+
+ public Task AskFromChunksAsync(
+ string question, IReadOnlyList chunks, IReadOnlyList notes, IReadOnlyList history, int lastReadOrd, CancellationToken ct) =>
+ Task.FromResult(new AskAnswer("Hello! Happy to chat about this book.", [], lastReadOrd, Insufficient: false));
+ }
+
+ /// An Ask that cites even on a greeting — fails the structural greeting check.
+ private sealed class CitingGreetingAsk : IRagAskService
+ {
+ public Task AskAsync(Guid u, Guid s, Guid e, string q, int k, Guid? currentChapterId, IReadOnlyList history, CancellationToken ct) =>
+ throw new NotSupportedException();
+
+ public Task AskFromChunksAsync(
+ string question, IReadOnlyList chunks, IReadOnlyList notes, IReadOnlyList history, int lastReadOrd, CancellationToken ct)
+ {
+ var cite = chunks.Count == 0 ? Array.Empty() : [new AskCitationSource(1, chunks[0])];
+ return Task.FromResult(new AskAnswer("Hi [1].", cite, lastReadOrd, Insufficient: false));
+ }
+ }
+
+ /// Generator that returns a fixed question; counts calls (to assert no-call-on-empty).
+ private sealed class CountingGenerator : ILlmService
+ {
+ public int Calls;
+ public Task CompleteAsync(LlmRequest request, CancellationToken ct)
+ {
+ Calls++;
+ return Task.FromResult(new LlmResponse(
+ "What is the main idea of this passage?", [], new LlmUsage(0, 0, 0m), "gen-fake", Guid.NewGuid()));
+ }
+
+ public IAsyncEnumerable StreamAsync(LlmRequest request, CancellationToken ct) =>
+ throw new NotSupportedException();
+ }
+
+ ///
+ /// Judge returning a fixed rubric verdict (support/relevance/faithfulness) for citation grading, and
+ /// a fixed YES/NO for the off-book faithfulness grade. Counts calls (to assert no-call-on-empty).
+ ///
+ private sealed class CountingJudge(int d1, int d2, int d3, string offBookVerdict) : ILlmService
+ {
+ public int Calls;
+ public Task CompleteAsync(LlmRequest request, CancellationToken ct)
+ {
+ Calls++;
+ // The off-book faithfulness grader uses MaxOutputTokens=4 and asks for YES/NO; everything
+ // else is the rubric JSON judge.
+ var text = request.MaxOutputTokens <= 4
+ ? offBookVerdict
+ : $"{{\"d1\": {d1}, \"d2\": {d2}, \"d3\": {d3}, \"rationale\": \"ok\"}}";
+ return Task.FromResult(new LlmResponse(text, [], new LlmUsage(0, 0, 0m), "judge-fake", Guid.NewGuid()));
+ }
+
+ public IAsyncEnumerable StreamAsync(LlmRequest request, CancellationToken ct) =>
+ throw new NotSupportedException();
+ }
+
+ /// A fake that throws if any LLM call is made — proves the empty-chunks short-circuit.
+ private sealed class ThrowingLlm : ILlmService
+ {
+ public Task CompleteAsync(LlmRequest request, CancellationToken ct) =>
+ throw new InvalidOperationException("LLM must not be called on empty chunks");
+
+ public IAsyncEnumerable StreamAsync(LlmRequest request, CancellationToken ct) =>
+ throw new NotSupportedException();
+ }
+
+ [Fact]
+ public async Task RunAsync_IndexedBook_GeneratesNProbesAndAggregatesCitations()
+ {
+ var gen = new CountingGenerator();
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 6), new FakeAsk(), gen, new CountingJudge(5, 4, 5, "no"),
+ judgeModelId: "judge-fake", UserId, BookId, probeCount: 6, k: 8,
+ persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ // Seed retrieval returns 6 chunks → 6 generated questions → 6 generator calls.
+ Assert.Equal(6, gen.Calls);
+ Assert.Equal(6, result.ProbeN);
+ Assert.Equal(6, result.ProbeCases.Count);
+ Assert.NotNull(result.Citation);
+ Assert.Equal(6, result.Citation!.CitationsJudged); // one citation per probe answer
+ Assert.Equal((5 + 4 + 5) / 3.0, result.Citation.Score, 3);
+ Assert.Equal(1.0, result.Citation.SupportRate, 12); // D1=5 ≥4 for every citation
+ Assert.Equal(1.0, result.Retrieval, 12); // every probe retrieved ≥1 chunk
+ Assert.Null(result.Note);
+ }
+
+ [Fact]
+ public async Task RunAsync_LowSupportAxis_ZeroSupportRate()
+ {
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new FakeAsk(), new CountingGenerator(),
+ new CountingJudge(2, 5, 5, "no"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ Assert.NotNull(result.Citation);
+ Assert.Equal(0.0, result.Citation!.SupportRate, 12); // D1=2 < 4 → no citation supported
+ }
+
+ [Fact]
+ public async Task RunAsync_WarmGreeting_StructuralPass()
+ {
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new WarmGreetingAsk(), new CountingGenerator(),
+ new CountingJudge(5, 5, 5, "no"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ var greeting = result.BehaviorCases.Single(c => c.Kind == "greeting");
+ Assert.True(greeting.Pass); // non-empty, 0 citations, not insufficient, no [n] marker
+ }
+
+ [Fact]
+ public async Task RunAsync_GreetingThatCites_StructuralFail()
+ {
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new CitingGreetingAsk(), new CountingGenerator(),
+ new CountingJudge(5, 5, 5, "no"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ var greeting = result.BehaviorCases.Single(c => c.Kind == "greeting");
+ Assert.False(greeting.Pass); // cited + has [1] marker → fails the warm-greeting check
+ }
+
+ [Fact]
+ public async Task RunAsync_OffBookGrounded_JudgePass()
+ {
+ // Judge says "no" (no invented facts) → off-book probe passes. WarmGreetingAsk keeps the off-book
+ // answer non-insufficient so it actually reaches the judge.
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new WarmGreetingAsk(), new CountingGenerator(),
+ new CountingJudge(5, 5, 5, offBookVerdict: "no"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ var offBook = result.BehaviorCases.Single(c => c.Kind == "off_book");
+ Assert.True(offBook.Pass);
+ }
+
+ [Fact]
+ public async Task RunAsync_OffBookHallucinates_JudgeFail()
+ {
+ // Judge says "yes" (invented facts) → off-book probe fails. WarmGreetingAsk makes the greeting
+ // pass and keeps the off-book answer non-insufficient so the "yes" verdict is what fails it.
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new WarmGreetingAsk(), new CountingGenerator(),
+ new CountingJudge(5, 5, 5, offBookVerdict: "yes"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ var offBook = result.BehaviorCases.Single(c => c.Kind == "off_book");
+ Assert.False(offBook.Pass);
+ Assert.Equal(0.5, result.BehaviorPass, 12); // greeting passes, off-book fails → 1 of 2
+ }
+
+ [Fact]
+ public async Task RunAsync_Persist_WritesCitationBehaviorRetrievalRows()
+ {
+ var db = new CapturingDb();
+ await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new FakeAsk(), new CountingGenerator(),
+ new CountingJudge(5, 5, 5, "no"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: true, db, gitSha: "abc123", TestContext.Current.CancellationToken);
+
+ Assert.Equal(3, db.Added.Count);
+ Assert.Contains(db.Added, r => r.Feature == UserBookRagEvalRunner.CitationFeature);
+ Assert.Contains(db.Added, r => r.Feature == UserBookRagEvalRunner.BehaviorFeature);
+ Assert.Contains(db.Added, r => r.Feature == UserBookRagEvalRunner.RetrievalFeature);
+ Assert.All(db.Added, r => Assert.Equal("abc123", r.GitSha));
+ Assert.Equal(1, db.SaveCalls);
+ }
+
+ [Fact]
+ public async Task RunAsync_PersistOff_WritesNothing()
+ {
+ var db = new CapturingDb();
+ await Runner().RunAsync(
+ new FakeUserBookRag(count: 1), new FakeAsk(), new CountingGenerator(),
+ new CountingJudge(5, 5, 5, "no"), judgeModelId: "judge-fake", UserId, BookId,
+ probeCount: 6, k: 8, persist: false, db, gitSha: null, TestContext.Current.CancellationToken);
+
+ Assert.Empty(db.Added);
+ Assert.Equal(0, db.SaveCalls);
+ }
+
+ [Fact]
+ public async Task RunAsync_EmptyChunks_PersistsTwoFailedRows()
+ {
+ var db = new CapturingDb();
+ await Runner().RunAsync(
+ new FakeUserBookRag(count: 0), new FakeAsk(), new ThrowingLlm(), new ThrowingLlm(),
+ judgeModelId: "judge-fake", UserId, BookId, probeCount: 6, k: 8,
+ persist: true, db, gitSha: null, TestContext.Current.CancellationToken);
+
+ // citation + behavior rows, both 0-score with an empty-chunks note; NO retrieval row.
+ Assert.Equal(2, db.Added.Count);
+ Assert.All(db.Added, r => Assert.Equal(0m, r.Score));
+ Assert.All(db.Added, r => Assert.Contains("empty-chunks", r.BreakdownJson));
+ Assert.Equal(1, db.SaveCalls);
+ }
+
+ [Fact]
+ public async Task RunAsync_EmptyChunks_ShortCircuitsWithNoLlmCall()
+ {
+ var result = await Runner().RunAsync(
+ new FakeUserBookRag(count: 0), new FakeAsk(), new ThrowingLlm(), new ThrowingLlm(),
+ judgeModelId: "judge-fake", UserId, BookId, probeCount: 6, k: 8,
+ persist: false, db: null, gitSha: null, TestContext.Current.CancellationToken);
+
+ // Empty seed → no generator/judge call (ThrowingLlm would have thrown), failed 0-row + note.
+ Assert.Null(result.Citation);
+ Assert.Equal(0, result.ProbeN);
+ Assert.Equal(0.0, result.Retrieval, 12);
+ Assert.Equal(0.0, result.BehaviorPass, 12);
+ Assert.Empty(result.ProbeCases);
+ Assert.Empty(result.BehaviorCases);
+ Assert.NotNull(result.Note);
+ }
+}