Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions cmd/scan-eval/eval.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package main

import (
"encoding/json"
"fmt"
"os"

"github.com/smart-mcp-proxy/mcpproxy-go/internal/security"
)

// detectorSensitiveData is the id of the deterministic, in-process
// sensitive-data/secret detector bridged in this PR (Gate-2 approved scope).
// Docker bundled scanners are a deferred opt-in extension point (--scanners).
//
// evaluate stamps this id on every detectorVerdict it emits and lists it as
// the only element of verdictReport.Detectors.
const detectorSensitiveData = "sensitive-data"

// corpusEntry mirrors one item of contracts/security-corpus.schema.json.
//
// evaluate scans Description and copies ID, Label and Category into the
// verdict report unchanged; Provenance is decoded but not read by evaluate.
type corpusEntry struct {
// ID identifies the entry; the tests look report entries up by this value.
ID string `json:"id"`
// Description is the text evaluate hands to the detector.
Description string `json:"description"`
// Label is echoed into the verdict entry as ground truth.
Label string `json:"label"`
// Category is echoed into the verdict entry as ground truth.
Category string `json:"category"`
// Provenance carries the entry's source and license strings.
Provenance struct {
Source string `json:"source"`
License string `json:"license"`
} `json:"provenance"`
}

// corpus is the D2 security corpus document. corpus_version/version are
// optional; the schema only mandates entries. Unknown fields are tolerated so
// the tool stays dataset-agnostic across corpus revisions.
type corpus struct {
// CorpusVersion is the preferred version field; resolvedVersion returns it
// whenever it is non-empty.
CorpusVersion string `json:"corpus_version"`
// Version is the fallback version field, used only when CorpusVersion is
// empty.
Version string `json:"version"`
// Entries holds the corpus items in file order. loadCorpus rejects a
// document whose Entries is empty.
Entries []corpusEntry `json:"entries"`
}

// resolvedVersion picks the version string that is echoed into the verdict
// report: corpus_version wins when it is set, version is the fallback, and
// the literal "unknown" is returned when both are empty.
func (c *corpus) resolvedVersion() string {
	if preferred := c.CorpusVersion; preferred != "" {
		return preferred
	}
	if fallback := c.Version; fallback != "" {
		return fallback
	}
	return "unknown"
}

// detectionView is the per-detection projection emitted in verdicts. It drops
// detector-internal fields (location, is_likely_example) the scorer does not
// need, keeping the contract minimal.
//
// evaluate fills each field from the same-named field of a detector detection.
type detectionView struct {
// Type is the detection's type identifier (the tests look for
// "aws_access_key").
Type string `json:"type"`
// Category is copied from the detection's Category field.
Category string `json:"category"`
// Severity is copied from the detection's Severity field.
Severity string `json:"severity"`
}

// detectorVerdict is one detector's call on one entry.
type detectorVerdict struct {
// Detector is the id of the detector that produced this verdict.
Detector string `json:"detector"`
// Flagged is set by evaluate from the scan result's Detected field.
Flagged bool `json:"flagged"`
// MaxSeverity is set by evaluate from the scan result's MaxSeverity(); the
// tests expect "" for an entry that is not flagged.
MaxSeverity string `json:"max_severity"`
// Detections lists the projected detections. evaluate always allocates the
// slice, so it is non-nil even when nothing was detected.
Detections []detectionView `json:"detections"`
}

// verdictEntry echoes ground truth and carries every detector's verdict.
//
// ID, Label and Category are copied verbatim from the corresponding
// corpusEntry by evaluate.
type verdictEntry struct {
ID string `json:"id"`
Label string `json:"label"`
Category string `json:"category"`
// Verdicts holds one detectorVerdict per detector; evaluate currently emits
// exactly one, for detectorSensitiveData.
Verdicts []detectorVerdict `json:"verdicts"`
}

// verdictReport is the top-level output (contracts/scan-verdict.schema.json),
// the contract consumed by the Python SecurityScorer (B3).
type verdictReport struct {
// CorpusVersion is the value of corpus.resolvedVersion() for the evaluated
// corpus.
CorpusVersion string `json:"corpus_version"`
// Detectors lists the ids of the detectors that were run.
Detectors []string `json:"detectors"`
// Entries holds one verdictEntry per corpus entry, in corpus order.
Entries []verdictEntry `json:"entries"`
}

// loadCorpus loads the D2 security corpus stored as JSON at path.
//
// It returns a non-nil error, and a nil corpus, when the file cannot be read,
// when its content is not valid JSON for the corpus shape, or when the decoded
// document contains no entries. Callers treat all three as a config error
// (mapped to exit 4).
func loadCorpus(path string) (*corpus, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("reading corpus %q: %w", path, readErr)
	}

	doc := new(corpus)
	if parseErr := json.Unmarshal(raw, doc); parseErr != nil {
		return nil, fmt.Errorf("parsing corpus %q: %w", path, parseErr)
	}

	// The schema mandates entries; an empty set leaves nothing to evaluate.
	if len(doc.Entries) < 1 {
		return nil, fmt.Errorf("corpus %q has no entries", path)
	}
	return doc, nil
}

// evaluate feeds the description of each corpus entry to the detector and
// converts the scan result into the verdict contract.
//
// The report lists entries in corpus order, and each entry's detections in
// the order the detector returned them, so repeated runs over an unchanged
// corpus are byte-identical (INV-5). Every entry carries exactly one verdict,
// tagged detectorSensitiveData, whose Detections slice is always non-nil.
func evaluate(c *corpus, detector *security.Detector) *verdictReport {
	version := c.resolvedVersion()
	entries := make([]verdictEntry, 0, len(c.Entries))

	for i := range c.Entries {
		entry := &c.Entries[i]

		// The corpus holds tool description text. It is passed as the second
		// Scan argument with an empty first one (per the original author's
		// note, the detector treats arguments and response identically).
		scan := detector.Scan("", entry.Description)

		flagged := scan.Detected
		worst := scan.MaxSeverity()

		// Allocate even for zero detections so the slice is never nil.
		views := make([]detectionView, 0, len(scan.Detections))
		for _, det := range scan.Detections {
			views = append(views, detectionView{
				Type:     det.Type,
				Category: det.Category,
				Severity: det.Severity,
			})
		}

		entries = append(entries, verdictEntry{
			ID:       entry.ID,
			Label:    entry.Label,
			Category: entry.Category,
			Verdicts: []detectorVerdict{{
				Detector:    detectorSensitiveData,
				Flagged:     flagged,
				MaxSeverity: worst,
				Detections:  views,
			}},
		})
	}

	return &verdictReport{
		CorpusVersion: version,
		Detectors:     []string{detectorSensitiveData},
		Entries:       entries,
	}
}
191 changes: 191 additions & 0 deletions cmd/scan-eval/eval_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package main

import (
"bytes"
"encoding/json"
"os"
"path/filepath"
"testing"

"github.com/smart-mcp-proxy/mcpproxy-go/internal/security"
)

// minCorpus is the path of the fixture corpus used by the tests in this file.
// The assertions here expect it to resolve to version "test-min-v1" and to
// hold four entries, including "tp-aws-key-001" and "benign-weather-001".
const minCorpus = "testdata/security_corpus_min.json"

// findEntry returns the first entry of r whose ID equals id. When no entry
// matches it fails the calling test immediately via t.Fatalf.
func findEntry(t *testing.T, r *verdictReport, id string) verdictEntry {
	t.Helper()
	for i := range r.Entries {
		if r.Entries[i].ID != id {
			continue
		}
		return r.Entries[i]
	}
	t.Fatalf("entry %q not found in report", id)
	return verdictEntry{}
}

// sensitiveDataVerdict picks out of e the first verdict produced by the
// sensitive-data detector. If e carries no such verdict the calling test is
// failed immediately via t.Fatalf.
func sensitiveDataVerdict(t *testing.T, e verdictEntry) detectorVerdict {
	t.Helper()
	for i := range e.Verdicts {
		if candidate := e.Verdicts[i]; candidate.Detector == detectorSensitiveData {
			return candidate
		}
	}
	t.Fatalf("entry %q has no %q verdict", e.ID, detectorSensitiveData)
	return detectorVerdict{}
}

// TestEvaluate_SchemaShape (TDD #1) runs evaluate() over the fixture corpus
// and checks the report's shape: the corpus version and detector list are
// filled in, every corpus entry is echoed in order with its id/label/category,
// and each entry has a sensitive-data verdict with a non-nil detections slice.
func TestEvaluate_SchemaShape(t *testing.T) {
	c, err := loadCorpus(minCorpus)
	if err != nil {
		t.Fatalf("loadCorpus: %v", err)
	}

	report := evaluate(c, security.NewDetector(nil))

	if got := report.CorpusVersion; got != "test-min-v1" {
		t.Errorf("corpus_version = %q, want %q", got, "test-min-v1")
	}

	onlySensitiveData := len(report.Detectors) == 1 && report.Detectors[0] == detectorSensitiveData
	if !onlySensitiveData {
		t.Errorf("detectors = %v, want [%q]", report.Detectors, detectorSensitiveData)
	}

	// Fatal on a length mismatch: the loop below indexes both slices in step.
	if got, want := len(report.Entries), len(c.Entries); got != want {
		t.Fatalf("entries = %d, want %d", got, want)
	}

	for i := range report.Entries {
		got, want := report.Entries[i], c.Entries[i]

		echoed := got.ID == want.ID && got.Label == want.Label && got.Category == want.Category
		if !echoed {
			t.Errorf("entry %d ground truth not echoed: got (%q,%q,%q) want (%q,%q,%q)",
				i, got.ID, got.Label, got.Category, want.ID, want.Label, want.Category)
		}

		if verdict := sensitiveDataVerdict(t, got); verdict.Detections == nil {
			t.Errorf("entry %q: detections must be non-nil (B3 contract requires the array)", got.ID)
		}
	}
}

// TestEvaluate_TruePositive (TDD #2 / INV-3 positive) checks that the fixture
// entry "tp-aws-key-001" is flagged, carries max severity "critical", and
// reports at least one detection of type "aws_access_key".
func TestEvaluate_TruePositive(t *testing.T) {
	c, err := loadCorpus(minCorpus)
	if err != nil {
		t.Fatalf("loadCorpus: %v", err)
	}
	report := evaluate(c, security.NewDetector(nil))

	entry := findEntry(t, report, "tp-aws-key-001")
	v := sensitiveDataVerdict(t, entry)

	if !v.Flagged {
		t.Fatalf("tp-aws-key-001: flagged = false, want true (TP)")
	}
	if got := v.MaxSeverity; got != "critical" {
		t.Errorf("tp-aws-key-001: max_severity = %q, want %q", got, "critical")
	}

	// Stop scanning at the first aws_access_key detection.
	sawAWSKey := false
	for i := 0; i < len(v.Detections) && !sawAWSKey; i++ {
		sawAWSKey = v.Detections[i].Type == "aws_access_key"
	}
	if !sawAWSKey {
		t.Errorf("tp-aws-key-001: expected an aws_access_key detection, got %+v", v.Detections)
	}
}

// TestEvaluate_TrueNegative (TDD #3 / INV-3 negative) checks that the fixture
// entry "benign-weather-001" is a clean true negative: not flagged, with an
// empty max severity and no detections.
func TestEvaluate_TrueNegative(t *testing.T) {
	c, err := loadCorpus(minCorpus)
	if err != nil {
		t.Fatalf("loadCorpus: %v", err)
	}
	report := evaluate(c, security.NewDetector(nil))

	entry := findEntry(t, report, "benign-weather-001")
	v := sensitiveDataVerdict(t, entry)

	if v.Flagged {
		t.Errorf("benign-weather-001: flagged = true, want false (TN). detections=%+v", v.Detections)
	}
	if severity := v.MaxSeverity; severity != "" {
		t.Errorf("benign-weather-001: max_severity = %q, want empty", severity)
	}
	if count := len(v.Detections); count != 0 {
		t.Errorf("benign-weather-001: detections = %+v, want none", v.Detections)
	}
}

// TestRun_MissingCorpus (TDD #4) checks that run exits with exitConfigError
// (exit 4, matching repo convention) for three bad invocations: no --corpus
// flag, a corpus path that does not exist, and an unknown extra flag.
func TestRun_MissingCorpus(t *testing.T) {
	type testCase struct {
		name string
		args []string
	}

	missingPath := filepath.Join(t.TempDir(), "nope.json")
	cases := []testCase{
		{name: "no --corpus flag", args: []string{}},
		{name: "nonexistent file", args: []string{"--corpus", missingPath}},
		{name: "unparsable flag", args: []string{"--corpus", minCorpus, "--bogus"}},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			var stdout, stderr bytes.Buffer
			code := run(tc.args, &stdout, &stderr)
			if code == exitConfigError {
				return
			}
			t.Errorf("run(%v) = %d, want %d. stderr=%q", tc.args, code, exitConfigError, stderr.String())
		})
	}
}

// TestRun_EmptyCorpus writes a corpus file whose entries array is empty and
// checks that run reports it as a config error.
func TestRun_EmptyCorpus(t *testing.T) {
	corpusPath := filepath.Join(t.TempDir(), "empty.json")
	writeErr := os.WriteFile(corpusPath, []byte(`{"entries":[]}`), 0o644)
	if writeErr != nil {
		t.Fatal(writeErr)
	}

	var stdout, stderr bytes.Buffer
	code := run([]string{"--corpus", corpusPath}, &stdout, &stderr)
	if code != exitConfigError {
		t.Errorf("run(empty corpus) = %d, want %d", code, exitConfigError)
	}
}

// TestRun_Deterministic (TDD #5 / INV-5 spirit) runs the tool twice over the
// unchanged fixture corpus and checks that both runs succeed, that their
// stdout is byte-identical, and that the output decodes as a verdictReport
// holding four entries.
func TestRun_Deterministic(t *testing.T) {
	var first, second bytes.Buffer

	code := run([]string{"--corpus", minCorpus}, &first, &bytes.Buffer{})
	if code != exitOK {
		t.Fatalf("run #1 = %d, want %d", code, exitOK)
	}
	code = run([]string{"--corpus", minCorpus}, &second, &bytes.Buffer{})
	if code != exitOK {
		t.Fatalf("run #2 = %d, want %d", code, exitOK)
	}

	if !bytes.Equal(first.Bytes(), second.Bytes()) {
		t.Errorf("non-deterministic output across runs")
	}

	var decoded verdictReport
	if err := json.Unmarshal(first.Bytes(), &decoded); err != nil {
		t.Fatalf("stdout is not valid verdict JSON: %v", err)
	}
	if n := len(decoded.Entries); n != 4 {
		t.Errorf("entries = %d, want 4", n)
	}
}

// TestRun_WritesToFile checks that running with --out writes to the named file
// exactly the bytes that a run without --out prints to stdout.
func TestRun_WritesToFile(t *testing.T) {
	// Reference run: capture what goes to stdout.
	var want bytes.Buffer
	code := run([]string{"--corpus", minCorpus}, &want, &bytes.Buffer{})
	if code != exitOK {
		t.Fatalf("stdout run = %d", code)
	}

	// Second run: same corpus, output redirected to a file.
	target := filepath.Join(t.TempDir(), "verdict.json")
	code = run([]string{"--corpus", minCorpus, "--out", target}, &bytes.Buffer{}, &bytes.Buffer{})
	if code != exitOK {
		t.Fatalf("file run = %d", code)
	}

	written, err := os.ReadFile(target)
	if err != nil {
		t.Fatalf("reading --out file: %v", err)
	}
	if !bytes.Equal(written, want.Bytes()) {
		t.Errorf("--out file differs from stdout output")
	}
}
Loading
Loading