smart-mcp-proxy · Dumbris · May 31, 2026 · May 31, 2026
diff --git a/cmd/scan-eval/eval.go b/cmd/scan-eval/eval.go
@@ -0,0 +1,139 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/security"
+)
+
+// detectorSensitiveData is the detector id stamped on every verdict and listed
+// in verdictReport.Detectors. It names the deterministic, in-process
+// sensitive-data/secret detector; Docker scanners (--scanners) are deferred.
+const detectorSensitiveData = "sensitive-data"
+
+// corpusEntry is one corpus item, mirroring contracts/security-corpus.schema.json.
+type corpusEntry struct {
+	ID          string `json:"id"`          // echoed into verdictEntry.ID
+	Description string `json:"description"` // text scanned by the detector
+	Label       string `json:"label"`       // echoed into verdictEntry.Label
+	Category    string `json:"category"`    // echoed into verdictEntry.Category
+	Provenance  struct {
+		Source  string `json:"source"`
+		License string `json:"license"`
+	} `json:"provenance"`
+}
+
+// corpus is the D2 security corpus document. Only entries is mandated by the
+// schema; corpus_version/version are optional. Unknown fields are tolerated
+// when decoding so the tool stays dataset-agnostic across corpus revisions.
+type corpus struct {
+	CorpusVersion string        `json:"corpus_version"` // preferred by resolvedVersion
+	Version       string        `json:"version"`        // fallback when corpus_version is empty
+	Entries       []corpusEntry `json:"entries"`        // loadCorpus rejects an empty list
+}
+
+// resolvedVersion picks the version string echoed into the verdict report:
+// corpus_version wins, version is the fallback, and "unknown" is the last
+// resort when both fields are empty.
+func (c *corpus) resolvedVersion() string {
+	if c.CorpusVersion != "" {
+		return c.CorpusVersion
+	}
+	if c.Version != "" {
+		return c.Version
+	}
+	return "unknown"
+}
+
+// detectionView is the per-detection projection emitted in verdicts: only
+// type, category and severity are carried over. Detector-internal fields
+// (location, is_likely_example) are left out because the scorer ignores them.
+type detectionView struct {
+	Type     string `json:"type"`     // copied from the detection's Type
+	Category string `json:"category"` // copied from the detection's Category
+	Severity string `json:"severity"` // copied from the detection's Severity
+}
+
+// detectorVerdict is one detector's call on one corpus entry.
+type detectorVerdict struct {
+	Detector    string          `json:"detector"`     // id of the detector that produced the verdict
+	Flagged     bool            `json:"flagged"`      // mirrors the scan result's Detected flag
+	MaxSeverity string          `json:"max_severity"` // value of MaxSeverity() on the scan result
+	Detections  []detectionView `json:"detections"`   // evaluate allocates it even when empty
+}
+
+// verdictEntry echoes an entry's ground truth and carries every detector's verdict.
+type verdictEntry struct {
+	ID       string            `json:"id"`       // corpus entry id, copied verbatim
+	Label    string            `json:"label"`    // corpus label, copied verbatim
+	Category string            `json:"category"` // corpus category, copied verbatim
+	Verdicts []detectorVerdict `json:"verdicts"` // evaluate currently emits exactly one
+}
+
+// verdictReport is the top-level output document (contracts/scan-verdict.schema.json),
+// the contract consumed by the Python SecurityScorer (B3).
+type verdictReport struct {
+	CorpusVersion string         `json:"corpus_version"` // result of corpus.resolvedVersion
+	Detectors     []string       `json:"detectors"`      // ids of the detectors that were run
+	Entries       []verdictEntry `json:"entries"`        // one per corpus entry, in corpus order
+}
+
+// loadCorpus reads and decodes a D2 security corpus JSON file. A read/parse
+// failure or an empty entry set is a config error (callers map it to exit 4).
+func loadCorpus(path string) (*corpus, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("reading corpus %q: %w", path, err)
+	}
+	var c corpus
+	if err := json.Unmarshal(data, &c); err != nil {
+		return nil, fmt.Errorf("parsing corpus %q: %w", path, err)
+	}
+	if len(c.Entries) == 0 {
+		return nil, fmt.Errorf("corpus %q has no entries", path)
+	}
+	return &c, nil
+}
+
+// evaluate scans every corpus entry's description with detector and projects
+// each result onto the verdict contract. Entries keep corpus order and
+// detections keep the order Scan returned them in, so output is byte-identical
+// across runs provided the detector itself is deterministic (INV-5).
+func evaluate(c *corpus, detector *security.Detector) *verdictReport {
+	entries := make([]verdictEntry, len(c.Entries))
+	for i := range c.Entries {
+		entry := &c.Entries[i]
+		// Description text goes in as the response payload; the arguments slot
+		// stays empty (the detector treats arguments/response identically).
+		scan := detector.Scan("", entry.Description)
+
+		// Allocate even when empty so detections encodes as [] and not null.
+		views := make([]detectionView, len(scan.Detections))
+		for j, d := range scan.Detections {
+			views[j] = detectionView{
+				Type:     d.Type,
+				Category: d.Category,
+				Severity: d.Severity,
+			}
+		}
+
+		entries[i] = verdictEntry{
+			ID:       entry.ID,
+			Label:    entry.Label,
+			Category: entry.Category,
+			Verdicts: []detectorVerdict{{
+				Detector:    detectorSensitiveData,
+				Flagged:     scan.Detected,
+				MaxSeverity: scan.MaxSeverity(),
+				Detections:  views,
+			}},
+		}
+	}
+	return &verdictReport{
+		CorpusVersion: c.resolvedVersion(),
+		Detectors:     []string{detectorSensitiveData},
+		Entries:       entries,
+	}
+}
diff --git a/cmd/scan-eval/eval_test.go b/cmd/scan-eval/eval_test.go
@@ -0,0 +1,191 @@
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/smart-mcp-proxy/mcpproxy-go/internal/security"
+)
+
+const minCorpus = "testdata/security_corpus_min.json" // corpus fixture loaded by the tests in this file
+
+func findEntry(t *testing.T, r *verdictReport, id string) verdictEntry {
+	t.Helper() // attribute a failure to the calling test's line
+	for i := range r.Entries {
+		if candidate := r.Entries[i]; candidate.ID == id {
+			return candidate
+		}
+	}
+	t.Fatalf("entry %q not found in report", id)
+	return verdictEntry{} // not reached: Fatalf stops the test; keeps the compiler happy
+}
+
+// sensitiveDataVerdict returns e's first sensitive-data verdict, failing the test if there is none.
+func sensitiveDataVerdict(t *testing.T, e verdictEntry) detectorVerdict {
+	t.Helper()
+	for i := range e.Verdicts {
+		if verdict := e.Verdicts[i]; verdict.Detector == detectorSensitiveData {
+			return verdict
+		}
+	}
+	t.Fatalf("entry %q has no %q verdict", e.ID, detectorSensitiveData)
+	return detectorVerdict{} // not reached: Fatalf has already stopped the test
+}
+
+// TestEvaluate_SchemaShape — TDD #1: evaluate() over the fixture echoes
+// id/label/category and emits exactly one sensitive-data verdict per entry.
+func TestEvaluate_SchemaShape(t *testing.T) {
+	c, err := loadCorpus(minCorpus)
+	if err != nil {
+		t.Fatalf("loadCorpus: %v", err)
+	}
+	report := evaluate(c, security.NewDetector(nil))
+	if report.CorpusVersion != "test-min-v1" {
+		t.Errorf("corpus_version = %q, want %q", report.CorpusVersion, "test-min-v1")
+	}
+	if len(report.Detectors) != 1 || report.Detectors[0] != detectorSensitiveData {
+		t.Errorf("detectors = %v, want [%q]", report.Detectors, detectorSensitiveData)
+	}
+	if len(report.Entries) != len(c.Entries) {
+		t.Fatalf("entries = %d, want %d", len(report.Entries), len(c.Entries))
+	}
+	for i, e := range report.Entries {
+		src := c.Entries[i]
+		if e.ID != src.ID || e.Label != src.Label || e.Category != src.Category {
+			t.Errorf("entry %d ground truth not echoed: got (%q,%q,%q) want (%q,%q,%q)",
+				i, e.ID, e.Label, e.Category, src.ID, src.Label, src.Category)
+		}
+		if len(e.Verdicts) != 1 {
+			t.Errorf("entry %q: verdicts = %d, want exactly 1", e.ID, len(e.Verdicts))
+		}
+		if v := sensitiveDataVerdict(t, e); v.Detections == nil {
+			t.Errorf("entry %q: detections must be non-nil (B3 contract requires the array)", e.ID)
+		}
+	}
+}
+
+// TestEvaluate_TruePositive — TDD #2 / INV-3 positive: the fixture's malicious
+// entry, whose description embeds an AWS key, must be flagged as critical.
+func TestEvaluate_TruePositive(t *testing.T) {
+	corpusDoc, loadErr := loadCorpus(minCorpus)
+	if loadErr != nil {
+		t.Fatalf("loadCorpus: %v", loadErr)
+	}
+	entry := findEntry(t, evaluate(corpusDoc, security.NewDetector(nil)), "tp-aws-key-001")
+	verdict := sensitiveDataVerdict(t, entry)
+
+	if !verdict.Flagged {
+		t.Fatalf("tp-aws-key-001: flagged = false, want true (TP)")
+	}
+	if verdict.MaxSeverity != "critical" {
+		t.Errorf("tp-aws-key-001: max_severity = %q, want %q", verdict.MaxSeverity, "critical")
+	}
+
+	// Other detections may accompany it; only the AWS key one is required.
+	sawAWSKey := false
+	for _, det := range verdict.Detections {
+		sawAWSKey = sawAWSKey || det.Type == "aws_access_key"
+	}
+	if !sawAWSKey {
+		t.Errorf("tp-aws-key-001: expected an aws_access_key detection, got %+v", verdict.Detections)
+	}
+}
+
+// TestEvaluate_TrueNegative — TDD #3 / INV-3 negative: the fixture's plain
+// benign description must come back unflagged (no false positive).
+func TestEvaluate_TrueNegative(t *testing.T) {
+	corpusDoc, loadErr := loadCorpus(minCorpus)
+	if loadErr != nil {
+		t.Fatalf("loadCorpus: %v", loadErr)
+	}
+	entry := findEntry(t, evaluate(corpusDoc, security.NewDetector(nil)), "benign-weather-001")
+	verdict := sensitiveDataVerdict(t, entry)
+
+	if verdict.Flagged {
+		t.Errorf("benign-weather-001: flagged = true, want false (TN). detections=%+v", verdict.Detections)
+	}
+	if sev := verdict.MaxSeverity; sev != "" {
+		t.Errorf("benign-weather-001: max_severity = %q, want empty", sev)
+	}
+	if len(verdict.Detections) > 0 {
+		t.Errorf("benign-weather-001: detections = %+v, want none", verdict.Detections)
+	}
+}
+
+// TestRun_MissingCorpus — TDD #4: a missing --corpus flag, a nonexistent corpus
+// file and an unknown flag must each make run return exitConfigError (exit 4).
+func TestRun_MissingCorpus(t *testing.T) {
+	scenarios := []struct {
+		name string
+		args []string
+	}{
+		{name: "no --corpus flag", args: []string{}},
+		{name: "nonexistent file", args: []string{"--corpus", filepath.Join(t.TempDir(), "nope.json")}},
+		{name: "unparsable flag", args: []string{"--corpus", minCorpus, "--bogus"}},
+	}
+	for _, sc := range scenarios {
+		t.Run(sc.name, func(t *testing.T) {
+			var stdout, stderr bytes.Buffer
+			if got := run(sc.args, &stdout, &stderr); got != exitConfigError {
+				t.Errorf("run(%v) = %d, want %d. stderr=%q", sc.args, got, exitConfigError, stderr.String())
+			}
+		})
+	}
+}
+
+// TestRun_EmptyCorpus — a corpus with an empty entries array is a config error.
+func TestRun_EmptyCorpus(t *testing.T) {
+	p := filepath.Join(t.TempDir(), "empty.json")
+	if err := os.WriteFile(p, []byte(`{"entries":[]}`), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	var out, errBuf bytes.Buffer
+	if code := run([]string{"--corpus", p}, &out, &errBuf); code != exitConfigError {
+		t.Errorf("run(empty corpus) = %d, want %d. stderr=%q", code, exitConfigError, errBuf.String())
+	}
+}
+
+// TestRun_Deterministic — TDD #5 / INV-5 spirit: running twice over the same
+// corpus must yield byte-identical stdout that decodes as a verdict report.
+func TestRun_Deterministic(t *testing.T) {
+	var first, second bytes.Buffer
+	if code := run([]string{"--corpus", minCorpus}, &first, &bytes.Buffer{}); code != exitOK {
+		t.Fatalf("run #1 = %d, want %d", code, exitOK)
+	}
+	if code := run([]string{"--corpus", minCorpus}, &second, &bytes.Buffer{}); code != exitOK {
+		t.Fatalf("run #2 = %d, want %d", code, exitOK)
+	}
+	if !bytes.Equal(first.Bytes(), second.Bytes()) {
+		t.Errorf("non-deterministic output across runs")
+	}
+	var decoded verdictReport
+	if err := json.Unmarshal(first.Bytes(), &decoded); err != nil {
+		t.Fatalf("stdout is not valid verdict JSON: %v", err)
+	}
+	if got := len(decoded.Entries); got != 4 {
+		t.Errorf("entries = %d, want 4", got)
+	}
+}
+
+// TestRun_WritesToFile — the --out file must hold exactly the bytes a plain run prints to stdout.
+func TestRun_WritesToFile(t *testing.T) {
+	var printed bytes.Buffer
+	if code := run([]string{"--corpus", minCorpus}, &printed, &bytes.Buffer{}); code != exitOK {
+		t.Fatalf("stdout run = %d", code)
+	}
+
+	target := filepath.Join(t.TempDir(), "verdict.json")
+	if code := run([]string{"--corpus", minCorpus, "--out", target}, &bytes.Buffer{}, &bytes.Buffer{}); code != exitOK {
+		t.Fatalf("file run = %d", code)
+	}
+	written, err := os.ReadFile(target)
+	if err != nil {
+		t.Fatalf("reading --out file: %v", err)
+	}
+	if !bytes.Equal(written, printed.Bytes()) {
+		t.Errorf("--out file differs from stdout output")
+	}
+}