From 61650c15d07569cb86032f8b236312f647baf616 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 18:47:34 -0700 Subject: [PATCH 01/28] =?UTF-8?q?docs(specs):=20design=20=E2=80=94=20secti?= =?UTF-8?q?on-number=20expansion=20across=20all=20ingest=20formats?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...6-06-05-section-number-expansion-design.md | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 docs/superpowers/specs/2026-06-05-section-number-expansion-design.md diff --git a/docs/superpowers/specs/2026-06-05-section-number-expansion-design.md b/docs/superpowers/specs/2026-06-05-section-number-expansion-design.md new file mode 100644 index 0000000..54fb3f8 --- /dev/null +++ b/docs/superpowers/specs/2026-06-05-section-number-expansion-design.md @@ -0,0 +1,168 @@ +# Design: Section-Number Expansion — Suffixed Shapes Across All Formats + +**Date:** 2026-06-05 +**Status:** Approved (user-confirmed) +**Branch:** `feat/section-number-expansion` (worktree off origin/main @ `ba99b64`) + +## Problem + +SpecR canonically handles CSI section numbers as `NN NN NN` (e.g. `26 00 13`). Real-world +specifications — regardless of ingest format (.SEC, DOCX, plaintext) — use two longer shapes +that SpecR must respect as **distinct section identities**: + +- **Level 4 (dotted suffix):** `26 00 13.10`, `26 00 13.20` — distinct sections, not variants +- **Level 5 (agency suffix):** `01 32 01.00 10` — trailing pair identifies the agency + (10 = Army Corps, 20 = NAVFAC, 30/40 = NASA/AFCEC) + +The UFGS reference library is the on-disk proof of scale, but suffixed numbers arrive through +every ingest path: DOCX section headers, prose cross-references in any format, and `.txt` +uploads. 
Corpus ground truth (`docs/references/UFGS`, 665 `.SEC` files, windows-1252): + +| Shape | Count | +|---|---| +| `SECTION NN NN NN` | 422 | +| `SECTION NN NN NN.NN` | 162 | +| `SECTION NN NN NN.NN NN` | 76 | +| Whitespace dirt (leading/double spaces) | 3 | +| Bare SCN without `SECTION ` prefix | 2 | + +36% of the reference corpus carries a suffix. This is core data, not an edge case. + +## Audit Findings (25-agent sweep, 61 touchpoints, 18 adversarially verified) + +### Broken today (suffix rejected or silently truncated) + +| Site | Failure with `26 00 13.10` | +|---|---| +| `src/ast/schemas.ts` `SpecTreeSchema.section`, `PatchSpecBodySchema.section` | `/^\d{2} \d{2} \d{2}$/` → PATCH /specs/:id returns 422 | +| `src/parser/refs/rules.ts` (`csi-section-keyword` pattern) + `src/parser/refs/extract.ts` `buildRef` | Regex `\b` stops at the dot; captures base only → **cross-ref silently links to the wrong section** | +| `src/lib/infer-section.ts` `KEYWORD_RE` | Truncates `01 33 23.33` → `01 33 23` — collides two real, distinct UFGS sections (silent data corruption on upsert) | +| `src/lib/infer-section.ts` `BARE_NUM_RE` | `$`-anchored → suffixed bare header never matches → inference returns `unknown` | +| `src/lib/infer-section.ts` `INLINE_TITLE_RE` | `\b\s+` fails at the dot → inline title extraction lost entirely | +| `src/parser/text/index.ts` `SECTION_EXTRACT_RE` / `BARE_SECTION_RE` | `.txt` header truncates section and garbles title | +| `src/api/parse.ts` `workerOutputSchema.tree.section` | No regex at all → POST /parse accepts what PATCH rejects (ingestion routes disagree) | +| `src/api/generate.ts` `safeFilename` | Dot mangled to dash (cosmetic) | + +### Already suffix-safe (must not regress; pin with tests) + +- SEC `<SCN>`/`<SRF>` extraction (`[^<]+` captures verbatim; tests pin `27 05 13.43`) +- DB columns: all `varchar(20)` — longest real form `01 32 01.00 10` is 14 chars +- Division derivation `slice(0, 2)`; division filter `LIKE 'NN %'` +- Generator (markdown + 
DOCX) and MCP resources: opaque string interpolation +- Lexicographic `ORDER BY` on section strings — provably correct for this fixed-width grammar +- Exact-equality joins/lookups (`specs.section = spec_sections.section_number`, ref resolution, + broken-ref repair) — suffix-to-suffix matches work; exact-match semantics chosen (below) + +## Locked Decisions + +1. **Format scope: full expanded shape.** `NN NN NN`, `NN NN NN.NN`, `NN NN NN.NN NN` are all + first-class canonical section numbers, accepted from every ingest format (.SEC, DOCX, .txt). +2. **Linking: exact match only.** A ref to `26 00 13` never resolves to `26 00 13.10` (or vice + versa). A base ref with no exact target stays an honest broken ref. No family fallback. +3. **Catalog: seed from UFGS.** `spec_sections` gains suffixed entries via the (fixed) seed path. +4. **Approach: single source of truth** (Approach A). One pure module owns the grammar; all + consumers import it. Structured `SectionNumber` type with decomposed columns (Approach C) + rejected — exact-match semantics leaves its power unused. Recorded as ADR-020. + +## Architecture + +### New module: `src/lib/section-number.ts` (pure, no I/O) + +```typescript +/** Anchored canonical validator: single spaces, exact shape. */ +export const SECTION_NUMBER_RE = /^\d{2} \d{2} \d{2}(?:\.\d{2}(?: \d{2})?)?$/; + +/** + * Composable regex fragment for scanners. Tolerates NBSP and multi-space runs + * between groups; wraps the entire section number in ONE capture group so a + * consumer embeds it as `new RegExp(`\\bSECTION\\s+${sectionNumberFragment()}`, 'i')` + * and recovers the value via normalizeSectionNumber(match[1]). + */ +export function sectionNumberFragment(): string; + +/** NBSP→space, collapse whitespace runs, trim. Returns canonical form, or null if + * the result does not match SECTION_NUMBER_RE. */ +export function normalizeSectionNumber(raw: string): string | null; + +/** Scan free text for section-number citations. 
Returns normalized values + offsets. */ +export function findSectionNumbers(text: string): readonly SectionMatch[]; + +/** Zod schema for API/AST validation: z.string() refined by SECTION_NUMBER_RE. */ +export const SectionNumberSchema: z.ZodString; +``` + +**Known ambiguity (documented, not solved):** in free prose, a trailing two-digit pair after a +dotted suffix is captured as an agency suffix only when it is not followed by another digit. +`Section 26 00 13.10 20 mm pipe` therefore mis-captures `26 00 13.10 20`. Rare; accepted; pinned +by a test marked `// KNOWN AMBIGUITY` per repo convention. The `.SEC` `<SRF>` path is immune +(tagged, verbatim). + +### Consumer adoption (8 sites) + +| Site | Change | +|---|---| +| `src/ast/schemas.ts` | Both section regexes → `SectionNumberSchema` | +| `src/api/parse.ts` (workerOutputSchema) | `section: z.string()` → `SectionNumberSchema.or(z.literal('unknown'))` — closes the parse-vs-PATCH inconsistency while allowing section-less docs | +| `src/parser/refs/rules.ts` + `extract.ts` | Pattern embeds `sectionNumberFragment()`; `buildRef` normalizes the single capture. Fixes silent truncation | +| `src/lib/infer-section.ts` | `KEYWORD_RE`, `BARE_NUM_RE`, `INLINE_TITLE_RE` rebuilt on the fragment. Fixes truncation, no-match, and lost inline titles | +| `src/parser/text/index.ts` + `signals.ts` | `SECTION_EXTRACT_RE` / `BARE_SECTION_RE` rebuilt on the fragment; suffixed `.txt` headers keep suffix and title | +| `src/parser/sec/index.ts` | SCN: tolerate optional `SECTION ` prefix (captures the 2 bare-SCN corpus files), then `normalizeSectionNumber`. 
SRF: normalize-or-verbatim — never reject a tagged ref; an unnormalizable SRF stays trimmed-verbatim (exact-match resolution simply won't find it) | +| `src/db/seed.ts` | Same prefix tolerance + normalize before upsert | +| `src/api/generate.ts` | `safeFilename` allows `.` → `26-00-13.10.docx`; no leading/trailing dots | + +### DB migration `013_section_number_normalize_and_check.ts` (one file, two steps) + +1. **Normalize existing rows** — SQL `regexp_replace` (NBSP→space, collapse runs, trim) on + `specs.section` and `spec_sections.section_number`. If normalization would violate a unique + constraint (two rows differing only in whitespace), the migration **aborts loudly** — no + silent merging. +2. **CHECK constraints:** + - `specs.section`: `value ~ '^\d{2} \d{2} \d{2}(\.\d{2}( \d{2})?)?$' OR value = 'unknown'` + (`'unknown'` is the inference-failure sentinel persisted by the parse path) + - `spec_sections.section_number`: pure shape check, no `'unknown'` escape (the catalog is + seeded only from successfully extracted SCN values) + **Deliberately no constraint** on `spec_references.target_spec_section`: that column records + what the source document said (descriptive, not canonical); rejecting a malformed citation at + insert would lose the ref. Exact-match resolution already leaves it unresolved. + +Down migration: drops the CHECK constraints. The whitespace normalization is acknowledged +lossy/irreversible (down is a no-op for data). + +No column-width changes (`varchar(20)` fits the 14-char agency form). After the parser fixes, +re-running `pnpm seed` populates suffixed catalog entries — fulfilling decision 3. + +## Testing + +- **Module unit suite:** normalization table — NBSP, corpus dirt (leading/double spaces), agency + form, rejections (`26 00 13.1`, `26 00 13.10 5`, `2600 13`, `26 00 13.10.20`). 
+- **Regression tests named by symptom:** + - `'refs: Section 26 00 13.10 citation — suffix retained, not truncated to base'` + - `'infer-section: keyword scan keeps .33 — 01 33 23.33 is not 01 33 23'` + - `'infer-section: bare suffixed header 26 00 13.10 inferred, not none'` + - `'text parser: SECTION 27 05 13.43 - TITLE — suffix kept, title extracted'` + - `'sec parser: bare SCN without SECTION prefix yields section'` + - `'generate: filename preserves dotted suffix'` +- **Suffix-safety pins** for already-working paths: SRF verbatim, division slice/LIKE, markdown + H1, DOCX title paragraph, MCP section table. +- **Integration:** agency-suffixed `.SEC` end-to-end (parse → persist → catalog join in the database → + exact-match ref resolution → broken-ref repair); PATCH accepts suffixed section over HTTP; + migration up/down round-trip. +- **KNOWN AMBIGUITY pin:** prose agency-pair capture vs trailing measurements. + +## Delivery — 4 sub-MVP PRs off this branch (500-LOC gate each) + +1. `feat(lib): section-number module — expanded-shape validator + normalizer` (+ ADR-020) +2. `feat(parser): adopt section-number module in refs/inference/text parsers` +3. `feat(api): accept suffixed sections in schemas, parse worker, PATCH, filenames` + (also refreshes the four stale `NN NN NN`-only examples in ARCHITECTURE.md) +4. `feat(db): normalization + shape CHECK constraints migration, seed prefix tolerance` + +Each PR independently green in CI; PRs 2–4 depend on 1. 
+ +## Out of Scope (explicit) + +- Family/fuzzy cross-reference matching (exact match only; rejected by decision 2) +- Structured `SectionNumber` type / decomposed DB columns (rejected; see ADR-020) +- Sort-order changes (lexicographic order is already correct for this grammar) +- Mockup-branch SPA changes (separate branch; linkifier parity handled there later) +- MCP tool changes (division filter is unaffected by suffixes) From 4701aba9c2f9e35a966da24eda9ba6c7a2ce6ef3 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 19:16:06 -0700 Subject: [PATCH 02/28] =?UTF-8?q?docs(plans):=20section-number=20expansion?= =?UTF-8?q?=20implementation=20plan=20=E2=80=94=204=20sub-MVP=20PRs,=20TDD?= =?UTF-8?q?=20task=20breakdown?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2026-06-05-section-number-expansion.md | 1521 +++++++++++++++++ 1 file changed, 1521 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-05-section-number-expansion.md diff --git a/docs/superpowers/plans/2026-06-05-section-number-expansion.md b/docs/superpowers/plans/2026-06-05-section-number-expansion.md new file mode 100644 index 0000000..d88037e --- /dev/null +++ b/docs/superpowers/plans/2026-06-05-section-number-expansion.md @@ -0,0 +1,1521 @@ +# Section-Number Expansion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Accept the full expanded CSI section-number grammar — `NN NN NN`, `NN NN NN.NN`, `NN NN NN.NN NN` — as first-class section identities across every parse, inference, linking, validation, and storage path. + +**Architecture:** One new pure module `src/lib/section-number.ts` owns the grammar (anchored validator, composable regex fragment, whitespace normalizer, Zod schema). Eight consumer sites adopt it. 
A DB migration normalizes existing rows and adds CHECK constraints as the last line of defense. Spec: `docs/superpowers/specs/2026-06-05-section-number-expansion-design.md`. + +**Tech Stack:** TypeScript strict, Zod v4 (`.exactOptional()`, `z.uuid()` style), vitest (`pnpm test` = `vitest run --project unit`), node-pg-migrate, PostgreSQL. + +**Branch:** All work on `feat/section-number-expansion` (this worktree, based on origin/main `ba99b64`). PR cutting at the end (Task 16). + +**Key grammar facts** (corpus-verified, 665 UFGS `.SEC` files): +- 422 base / 162 dotted / 76 agency-suffixed; whitespace dirt exists (leading/double spaces); 2 SCN tags lack the `SECTION ` prefix. +- `26 00 13` and `26 00 13.10` and `26 00 13.20` are THREE DIFFERENT sections. Truncation is data corruption. +- Linking is **exact match only** (locked decision). Lexicographic sort is already correct for this grammar — do not touch ordering. +- JS `\s` already matches NBSP (`\u00A0`); normalization still canonicalizes runs to single ASCII spaces. 
+ +--- + +## File Structure + +| File | Responsibility | +|---|---| +| **Create** `src/lib/section-number.ts` | Grammar owner: `SECTION_NUMBER_RE`, `sectionNumberFragment()`, `normalizeSectionNumber()`, `findSectionNumbers()`, `SectionNumberSchema` | +| **Create** `src/lib/section-number.test.ts` | Module unit suite | +| **Create** `docs/adr/020-section-number-expanded-shape.md` | Decision record | +| **Modify** `CLAUDE.md` | Add ADR-020 to the ADR list | +| **Modify** `src/parser/refs/rules.ts` | `csi-section-keyword` pattern embeds fragment | +| **Modify** `src/parser/refs/extract.ts` | `buildRef` normalizes single capture | +| **Modify** `src/lib/infer-section.ts` | `KEYWORD_RE` / `INLINE_TITLE_RE` / `BARE_NUM_RE` rebuilt on fragment | +| **Modify** `src/parser/text/index.ts` | `SECTION_EXTRACT_RE` / `BARE_SECTION_RE` rebuilt on fragment | +| **Modify** `src/parser/sec/index.ts` | SCN + SRF normalize-or-verbatim | +| **Modify** `src/ast/schemas.ts` | Both section regexes → schema from module | +| **Modify** `src/api/parse.ts` | Worker output schema tightened; body section override validated | +| **Modify** `src/api/generate.ts` | `safeFilename` preserves dots in section | +| **Modify** `ARCHITECTURE.md` | Refresh 4 stale `NN NN NN`-only examples | +| **Create** `src/db/migrations/013_section_number_normalize_and_check.ts` | Normalize rows + CHECK constraints | +| **Modify** `src/db/seed.ts` | SCN regex prefix-optional + normalize | +| Tests modified alongside each consumer | See per-task sections | + +**Module boundary rule:** consumers import from `'../lib/section-number.js'` (or `'../../lib/section-number.js'`) — `lib/` is importable from any module. Never import lib internals from each other's internals. 
+ +--- + +## PR 1 — `feat(lib): section-number module — expanded-shape validator + normalizer` + +### Task 1: The grammar module + +**Files:** +- Create: `src/lib/section-number.ts` +- Test: `src/lib/section-number.test.ts` + +- [ ] **Step 1: Write the failing test** + +```typescript +// src/lib/section-number.test.ts +import { describe, it, expect } from 'vitest'; +import { + SECTION_NUMBER_RE, + sectionNumberFragment, + normalizeSectionNumber, + findSectionNumbers, + SectionNumberSchema, +} from './section-number.js'; + +describe('SECTION_NUMBER_RE', () => { + it.each(['26 00 13', '26 00 13.10', '26 00 13.20', '01 32 01.00 10', '27 05 13.43'])( + 'accepts canonical %s', + (s) => { + expect(SECTION_NUMBER_RE.test(s)).toBe(true); + } + ); + + it.each([ + '26 00 13.1', // one-digit suffix + '26 00 13.100', // three-digit suffix + '26 00 13.10 5', // one-digit agency + '26 00 13 10', // agency without dotted suffix + '2600 13', // missing separator + '26 00 13.10.20', // double dot + '26  00 13', // double internal space (canonical form is single-space) + ' 26 00 13', // leading space + 'unknown', // sentinel is NOT a section number + ])('rejects %s', (s) => { + expect(SECTION_NUMBER_RE.test(s)).toBe(false); + }); +}); + +describe('normalizeSectionNumber', () => { + it('passes canonical forms through', () => { + expect(normalizeSectionNumber('26 00 13')).toBe('26 00 13'); + expect(normalizeSectionNumber('01 32 01.00 10')).toBe('01 32 01.00 10'); + }); + + it('canonicalizes corpus whitespace dirt: leading/trailing/double spaces', () => { + expect(normalizeSectionNumber(' 26 00 13 ')).toBe('26 00 13'); + expect(normalizeSectionNumber('26  00 13.10')).toBe('26 00 13.10'); + }); + + it('canonicalizes NBSP separators', () => { + expect(normalizeSectionNumber('26\u00A000\u00A013.10')).toBe('26 00 13.10'); + }); + + it('returns null for non-section strings', () => { + expect(normalizeSectionNumber('PAINTING')).toBeNull(); + expect(normalizeSectionNumber('26 00 
13.1')).toBeNull(); + expect(normalizeSectionNumber('')).toBeNull(); + expect(normalizeSectionNumber('unknown')).toBeNull(); + }); +}); + +describe('sectionNumberFragment', () => { + it('embeds into a keyword scanner and captures the full number as group 1', () => { + const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); + expect(re.exec('SECTION 26 00 13.10 PANELBOARDS')?.[1]).toBe('26 00 13.10'); + expect(re.exec('SECTION 01 32 01.00 10 QUALITY')?.[1]).toBe('01 32 01.00 10'); + expect(re.exec('SECTION 26 00 13 GENERAL')?.[1]).toBe('26 00 13'); + }); + + it('does not capture a trailing pair as agency without a dotted suffix', () => { + const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); + // "20 AMP" must not become an agency suffix — agency requires the dot first + expect(re.exec('SECTION 26 00 13 20 AMP PANELBOARDS')?.[1]).toBe('26 00 13'); + }); + + it('does not match digits glued to longer numbers', () => { + const re = new RegExp(`^${sectionNumberFragment()}$`); + expect(re.test('26 00 134')).toBe(false); + expect(re.test('126 00 13')).toBe(false); + expect(re.test('26 00 13.1010')).toBe(false); + }); + + it('does not capture agency from a following 4-digit year', () => { + const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); + expect(re.exec('SECTION 26 00 13.10 2024 EDITION')?.[1]).toBe('26 00 13.10'); + }); + + // KNOWN AMBIGUITY: a bare two-digit token after a dotted suffix is + // indistinguishable from an agency suffix in free prose. We accept the + // false positive; tagged .SEC refs are immune (verbatim path). 
+ it('KNOWN AMBIGUITY: "26 00 13.10 20 mm" captures 20 as agency', () => { + const re = new RegExp(String.raw`\bSection\s+${sectionNumberFragment()}`, 'i'); + expect(re.exec('See Section 26 00 13.10 20 mm pipe')?.[1]).toBe('26 00 13.10 20'); + }); +}); + +describe('findSectionNumbers', () => { + it('finds and normalizes all citations with offsets', () => { + const text = 'See 26 00 13.10 and also 09\u00A091 00.'; + const found = findSectionNumbers(text); + expect(found.map((f) => f.value)).toEqual(['26 00 13.10', '09 91 00']); + expect(found[0]?.index).toBe(4); + }); + + it('returns empty array when nothing matches', () => { + expect(findSectionNumbers('no numbers here')).toEqual([]); + }); +}); + +describe('SectionNumberSchema', () => { + it('accepts expanded shapes', () => { + expect(SectionNumberSchema.safeParse('01 32 01.00 10').success).toBe(true); + }); + it('rejects malformed and sentinel values', () => { + expect(SectionNumberSchema.safeParse('27210').success).toBe(false); + expect(SectionNumberSchema.safeParse('unknown').success).toBe(false); + }); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pnpm test src/lib/section-number.test.ts` +Expected: FAIL — `Cannot find module './section-number.js'` + +- [ ] **Step 3: Write the implementation** + +```typescript +// src/lib/section-number.ts +import { z } from 'zod'; + +/** + * Canonical CSI/UFGS section-number grammar (expanded shape): + * NN NN NN — MasterFormat Level 3 (26 00 13) + * NN NN NN.NN — Level 4 dotted suffix (26 00 13.10) + * NN NN NN.NN NN — Level 5 agency suffix, UFGS (01 32 01.00 10) + * Each shape is a DISTINCT section identity. See ADR-020. + */ +export const SECTION_NUMBER_RE = /^\d{2} \d{2} \d{2}(?:\.\d{2}(?: \d{2})?)?$/; + +// Scanner fragment. Differences from SECTION_NUMBER_RE, all deliberate: +// - `\s+` separators: tolerates NBSP/multi-space/newline dirt found in real +// documents (JS `\s` includes  ); normalizeSectionNumber canonicalizes. 
+// - Agency separator is horizontal-only ([^\S\r\n]) so a 2-digit token on the +// NEXT LINE is never absorbed as an agency suffix. +// - (? { + const rule = SECTION_REF_RULES.find((r) => r.id === 'csi-section-keyword')!; + const fresh = new RegExp(rule.pattern.source, rule.pattern.flags.replace('g', '')); + expect(fresh.exec('See Section 26 00 13.10 for switchgear')?.[1]).toBe('26 00 13.10'); + }); + + it('csi-section-keyword: captures agency suffix — Section 01 32 01.00 10', () => { + const rule = SECTION_REF_RULES.find((r) => r.id === 'csi-section-keyword')!; + const fresh = new RegExp(rule.pattern.source, rule.pattern.flags.replace('g', '')); + expect(fresh.exec('per Section 01 32 01.00 10 requirements')?.[1]).toBe('01 32 01.00 10'); + }); +``` + +Append to `src/parser/refs/extract.test.ts` (match its existing tree-builder helpers; if it has a `makeTree`/`makeNode` helper, reuse it — the assertion is what matters): + +```typescript + it('refs: Section 26 00 13.10 citation — suffix retained, not truncated to base', () => { + const tree = makeTreeWithText('Comply with Section 26 00 13.10 and Section 09 91 00.'); + const refs = extractRefsFromTree(tree); + const sections = refs + .filter((r) => r.targetType === 'section') + .map((r) => r.targetSpecSection); + expect(sections).toContain('26 00 13.10'); + expect(sections).toContain('09 91 00'); + expect(sections).not.toContain('26 00 13'); + }); + + it('refs: NBSP-separated citation normalizes to canonical spacing', () => { + const tree = makeTreeWithText('See Section 26\u00A000\u00A013.10 now.'); + const refs = extractRefsFromTree(tree); + expect(refs.find((r) => r.targetType === 'section')?.targetSpecSection).toBe('26 00 13.10'); + }); +``` + +(If `extract.test.ts` lacks a one-string tree helper, add at top of the file:) + +```typescript +function makeTreeWithText(text: string): SpecTree { + return { + id: '00000000-0000-4000-8000-000000000001', + section: '27 21 00', + title: 'Test', + parts: [ + { + id: 
'00000000-0000-4000-8000-000000000002', + type: 'part', + text, + children: [], + meta: {}, + }, + ], + }; +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/parser/refs/` +Expected: FAIL — captured group is `'26'` (old group 1) / truncated `'26 00 13'` + +- [ ] **Step 3: Implement** + +In `src/parser/refs/rules.ts` — add import at top, replace the rule pattern and examples: + +```typescript +import { sectionNumberFragment } from '../../lib/section-number.js'; +``` + +```typescript + { + id: 'csi-section-keyword', + description: + 'Matches "Section XX XX XX[.XX[ XX]]" — standard CSI cross-reference with keyword ' + + 'prefix, including Level 4 dotted suffixes and UFGS Level 5 agency suffixes. ' + + 'Most reliable pattern; matches how spec writers are trained to cite other sections.', + pattern: new RegExp(String.raw`\bSection\s+${sectionNumberFragment()}`, 'gi'), + targetType: 'section', + examples: [ + 'See Section 09 91 00', + 'Section 27 21 00 applies to this work', + 'See Section 26 00 13.10', + 'per Section 01 32 01.00 10', + ], + knownFalsePositives: ['Section 26 00 13.10 20 mm pipe — trailing pair reads as agency'], + }, +``` + +In `src/parser/refs/extract.ts` — add import, change the section branch of `buildRef` (the whole section number is now capture group 1): + +```typescript +import { normalizeSectionNumber } from '../../lib/section-number.js'; +``` + +```typescript +function buildRef(sourceNodeId: string, rule: ExtractionRule, match: RegExpMatchArray): SecRef { + if (rule.targetType === 'section') { + const raw = match[1] ?? ''; + return { + sourceNodeId, + targetType: 'section', + targetSpecSection: normalizeSectionNumber(raw) ?? 
raw.trim(), + referenceText: match[0], + }; + } + return { + sourceNodeId, + targetType: 'standard', + standardCode: `${match[1]} ${match[2]}`, + referenceText: match[0], + }; +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/parser/refs/` +Expected: PASS, including the pre-existing malformed-rejection tests (`Section 9 91 00`, `Section 091 00` still rejected by the fragment's `\d{2}` groups and guards) + +- [ ] **Step 5: Commit** + +```bash +git add src/parser/refs/ +git commit -m "fix(parser): prose section refs capture dotted and agency suffixes — no more base truncation" +``` + +### Task 4: Content inference (`infer-section.ts`) + +Fixes silent truncation (`01 33 23.33` → `01 33 23`), bare-number no-match, and lost inline titles. + +**Files:** +- Modify: `src/lib/infer-section.ts` (lines 13-15 regexes; `findInlineTitle`; `scanKeyword`; `scanBareNumber`) +- Test: `src/lib/infer-section.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Append to the existing describe block in `src/lib/infer-section.test.ts` (reuse its `makeTree` helper): + +```typescript + it('infer-section: keyword scan keeps .33 — 01 33 23.33 is not 01 33 23', () => { + const tree = makeTree([{ text: 'SECTION 01 33 23.33 AVIATION FUEL DISTRIBUTION' }]); + const result = inferSectionMeta(tree); + expect(result.method).toBe('content-high'); + expect(result.inferredSection).toBe('01 33 23.33'); + }); + + it('infer-section: keyword scan keeps agency suffix — 01 32 01.00 10', () => { + const tree = makeTree([{ text: 'SECTION 01 32 01.00 10' }, { text: 'QUALITY CONTROL' }]); + const result = inferSectionMeta(tree); + expect(result.inferredSection).toBe('01 32 01.00 10'); + expect(result.inferredTitle).toBe('QUALITY CONTROL'); + }); + + it('infer-section: inline title extracted from suffixed header', () => { + const tree = makeTree([{ text: 'SECTION 01 33 23.33 AVIATION FUEL DISTRIBUTION' }]); + expect(inferSectionMeta(tree).inferredTitle).toBe('AVIATION FUEL 
DISTRIBUTION'); + }); + + it('infer-section: bare suffixed header 26 00 13.10 inferred, not none', () => { + const tree = makeTree([{ text: '26 00 13.10' }, { text: 'PANELBOARDS' }]); + const result = inferSectionMeta(tree); + expect(result.method).toBe('content-medium'); + expect(result.inferredSection).toBe('26 00 13.10'); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/lib/infer-section.test.ts` +Expected: FAIL — `'01 33 23'` (truncated), `'none'` method for bare suffixed + +- [ ] **Step 3: Implement** + +In `src/lib/infer-section.ts`, add import and replace the three regex constants: + +```typescript +import { normalizeSectionNumber, sectionNumberFragment } from './section-number.js'; + +const KEYWORD_RE = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); +const INLINE_TITLE_RE = new RegExp( + String.raw`\bSECTION\s+${sectionNumberFragment()}\s+(\S.*)`, + 'i' +); +const BARE_NUM_RE = new RegExp(`^${sectionNumberFragment()}$`); +``` + +`findInlineTitle` — the title is now capture group 2 (group 1 is the section number): + +```typescript +function findInlineTitle(nodeText: string): string | null { + const inlineMatch = INLINE_TITLE_RE.exec(nodeText); + if (inlineMatch?.[2] !== undefined && isValidTitle(inlineMatch[2])) { + return inlineMatch[2].trim(); + } + return null; +} +``` + +`scanKeyword` and `scanBareNumber` — normalize the single capture; skip (keep scanning) if normalization fails: + +```typescript +function scanKeyword(nodes: readonly SpecNode[]): SectionInference | null { + for (let i = 0; i < nodes.length; i++) { + const m = KEYWORD_RE.exec(nodes[i]?.text ?? ''); + const section = m === null ? null : normalizeSectionNumber(m[1] ?? 
''); + if (section !== null) { + return { + method: 'content-high', + confidence: 'high', + inferredSection: section, + inferredTitle: findTitle(nodes, i), + titleMatch: 'unknown', + }; + } + } + return null; +} + +function scanBareNumber(nodes: readonly SpecNode[]): SectionInference | null { + for (let i = 0; i < nodes.length; i++) { + const m = BARE_NUM_RE.exec((nodes[i]?.text ?? '').trim()); + const section = m === null ? null : normalizeSectionNumber(m[1] ?? ''); + if (section !== null) { + return { + method: 'content-medium', + confidence: 'medium', + inferredSection: section, + inferredTitle: findTitle(nodes, i), + titleMatch: 'unknown', + }; + } + } + return null; +} +``` + +(`isValidTitle` needs no change — it calls `.test()` on the same constants.) + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/lib/infer-section.test.ts` +Expected: PASS — new tests AND all pre-existing tests (title-window boundaries, embedded-sentence rejection, 50-node cap) + +- [ ] **Step 5: Commit** + +```bash +git add src/lib/infer-section.ts src/lib/infer-section.test.ts +git commit -m "fix(lib): section inference keeps dotted and agency suffixes — truncation collided distinct sections" +``` + +### Task 5: Plaintext parser (`parser/text`) + +**Files:** +- Modify: `src/parser/text/index.ts` (lines 41-42 regexes; `extractSectionMeta` lines 62-79) +- Test: `src/parser/text/index.test.ts` (exists — find its header-meta describe block and append) + +- [ ] **Step 1: Write the failing tests** + +Append to `src/parser/text/index.test.ts`: + +```typescript + it('text parser: SECTION 27 05 13.43 - TITLE — suffix kept, title extracted', () => { + const result = parseText('SECTION 27 05 13.43 - TELEVISION DISTRIBUTION\n\nPART 1 GENERAL\n'); + expect(result.tree.section).toBe('27 05 13.43'); + expect(result.tree.title).toBe('TELEVISION DISTRIBUTION'); + }); + + it('text parser: agency-suffixed header with dash title', () => { + const result = parseText('SECTION 01 32 
01.00 10 - QUALITY CONTROL\n\nPART 1 GENERAL\n'); + expect(result.tree.section).toBe('01 32 01.00 10'); + expect(result.tree.title).toBe('QUALITY CONTROL'); + }); + + it('text parser: bare suffixed header line', () => { + const result = parseText('26 00 13.10 - PANELBOARDS\n\nPART 1 GENERAL\n'); + expect(result.tree.section).toBe('26 00 13.10'); + expect(result.tree.title).toBe('PANELBOARDS'); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/parser/text/` +Expected: FAIL — section `'27 05 13'`, title `'unknown'` (the `.43` garbles the dash branch) + +- [ ] **Step 3: Implement** + +In `src/parser/text/index.ts`, add import and replace the two regex constants (title moves to group 2): + +```typescript +import { normalizeSectionNumber, sectionNumberFragment } from '../../lib/section-number.js'; + +const SECTION_EXTRACT_RE = new RegExp( + String.raw`SECTION\s+${sectionNumberFragment()}(?:\s*[-–—]\s*(.+))?`, + 'i' +); +const BARE_SECTION_RE = new RegExp(String.raw`^${sectionNumberFragment()}(?:\s*[-–—]\s*(.+))?`); +``` + +In `extractSectionMeta`, replace the match-handling block: + +```typescript + const m = SECTION_EXTRACT_RE.exec(trimmed) ?? BARE_SECTION_RE.exec(trimmed); + if (m !== null) { + const section = normalizeSectionNumber(m[1] ?? ''); + if (section !== null) { + return { + section, + title: (m[2] ?? '').trim() || 'unknown', + }; + } + } +``` + +(`src/parser/text/signals.ts` `SECTION_HEADER_RE` needs NO change — it is an unanchored prefix classifier that already accepts suffixed headers; the pin test below proves it.) 
+ +Append a pin test to `src/parser/text/index.test.ts`: + +```typescript + it('text parser: suffixed SECTION line classified as header, not body content', () => { + const result = parseText( + 'SECTION 27 05 13.43 - TELEVISION DISTRIBUTION\nPART 1 GENERAL\n1.1 SUMMARY\n' + ); + // header line must not appear as a structural/continuation node + const texts: string[] = []; + const walk = (n: { text: string; children: readonly { text: string }[] }): void => { + texts.push(n.text); + n.children.forEach((c) => walk(c as never)); + }; + result.tree.parts.forEach((p) => walk(p as never)); + expect(texts.some((t) => t.includes('TELEVISION DISTRIBUTION'))).toBe(false); + }); +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/parser/text/` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/parser/text/ +git commit -m "fix(parser): .txt header extraction keeps suffixed section numbers and their titles" +``` + +### Task 6: SEC parser normalization (`parser/sec`) + +SCN/SRF already preserve suffixes verbatim — this task adds **canonicalization** (whitespace dirt exists in 3 corpus files) without ever rejecting a tagged value. 
+ +**Files:** +- Modify: `src/parser/sec/index.ts` (SCN at lines 184-189, `pushSrfRefs` at lines 74-83) +- Test: `src/parser/sec/index.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Append to `src/parser/sec/index.test.ts` (reuse its fixture-string style): + +```typescript + it('sec parser: SCN with whitespace dirt normalizes to canonical form', () => { + const xml = `SECTION 26 00 13.10 PANELBOARDS`; + const { tree } = parseSec(xml); + expect(tree.section).toBe('26 00 13.10'); + }); + + it('sec parser: SRF target normalizes NBSP separators to canonical form', () => { + const xml = `SECTION 27 41 00TPART 1XSee 26\u00A000\u00A013.10 now.`; + const { refs } = parseSec(xml); + const sRef = refs.find((r) => r.targetType === 'section'); + expect(sRef?.targetSpecSection).toBe('26 00 13.10'); + }); + + it('sec parser: unnormalizable SRF content kept verbatim (never dropped)', () => { + const xml = `SECTION 27 41 00TPART 1XSee APPENDIX B now.`; + const { refs } = parseSec(xml); + const sRef = refs.find((r) => r.targetType === 'section'); + expect(sRef?.targetSpecSection).toBe('APPENDIX B'); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/parser/sec/` +Expected: FAIL — first test gets `'26 00 13.10 '`-ish raw (double-space/trailing preserved), second gets NBSP-joined string + +(If the first test happens to pass due to `trimValues`, keep it as a pin — at minimum the NBSP test fails.) + +- [ ] **Step 3: Implement** + +In `src/parser/sec/index.ts`, add import: + +```typescript +import { normalizeSectionNumber } from '../../lib/section-number.js'; +``` + +Replace the SCN extraction in `parseSec`: + +```typescript + // SCN/STL are parsed with processEntities: false — decode here. + // Normalize-or-verbatim: canonicalize section whitespace when the value is a + // valid expanded-shape number; keep verbatim otherwise (downstream schema + // gates decide what to do with non-conforming values). 
+ const scnRaw = decodeXmlEntities( + requireString(sec['SCN'], 'SCN') + .replace(/^SECTION\s+/i, '') + .trim() + ); + const section = normalizeSectionNumber(scnRaw) ?? scnRaw; +``` + +Replace `pushSrfRefs`: + +```typescript +function pushSrfRefs(raw: string, nodeId: string, refs: SecRef[]): void { + for (const sec of extractSrfSections(raw)) { + refs.push({ + sourceNodeId: nodeId, + targetType: 'section', + // Normalize-or-verbatim: a tagged ref is never rejected; exact-match + // resolution simply won't find non-conforming targets. + targetSpecSection: normalizeSectionNumber(sec) ?? sec, + referenceText: stripTags(raw).slice(0, 200), + }); + } +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/parser/sec/` +Expected: PASS — including pre-existing suffix-pin tests (`27 05 13.43`) + +- [ ] **Step 5: Run the full unit suite and commit** + +Run: `pnpm test` +Expected: PASS (517 pre-existing + all new) + +```bash +git add src/parser/sec/ +git commit -m "feat(parser): SEC SCN/SRF section numbers normalize to canonical expanded shape" +``` + +**PR 2 cut point** (~420 LOC). 
+ +--- + +## PR 3 — `feat(api): accept suffixed sections in schemas, parse worker, PATCH, filenames` + +### Task 7: AST schemas + +**Files:** +- Modify: `src/ast/schemas.ts` (lines 64-78) +- Test: `src/ast/schemas.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Append to the relevant describe blocks in `src/ast/schemas.test.ts`: + +```typescript + it('SpecTreeSchema: accepts dotted and agency-suffixed sections', () => { + const base = { id: '00000000-0000-4000-8000-000000000001', title: 'T', parts: [] }; + expect(SpecTreeSchema.safeParse({ ...base, section: '26 00 13.10' }).success).toBe(true); + expect(SpecTreeSchema.safeParse({ ...base, section: '01 32 01.00 10' }).success).toBe(true); + }); + + it('SpecTreeSchema: accepts the unknown sentinel (parser output for section-less docs)', () => { + const base = { id: '00000000-0000-4000-8000-000000000001', title: 'T', parts: [] }; + expect(SpecTreeSchema.safeParse({ ...base, section: 'unknown' }).success).toBe(true); + }); + + it('PatchSpecBodySchema: accepts suffixed sections, rejects the unknown sentinel', () => { + expect(PatchSpecBodySchema.safeParse({ section: '26 00 13.10' }).success).toBe(true); + expect(PatchSpecBodySchema.safeParse({ section: '01 32 01.00 10' }).success).toBe(true); + expect(PatchSpecBodySchema.safeParse({ section: 'unknown' }).success).toBe(false); + expect(PatchSpecBodySchema.safeParse({ section: '26 00 13.1' }).success).toBe(false); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/ast/schemas.test.ts` +Expected: FAIL — suffixed sections rejected by the old regex + +- [ ] **Step 3: Implement** + +In `src/ast/schemas.ts`, add import and replace both `section` fields: + +```typescript +import { SectionNumberSchema } from '../lib/section-number.js'; +``` + +```typescript +export const SpecTreeSchema = z.object({ + id: z.uuid(), + // Canonical expanded shape, or the 'unknown' sentinel emitted by parsers + // when no section number is found (content 
inference may fill it later). + section: SectionNumberSchema.or(z.literal('unknown')), + title: z.string().check(z.minLength(1)), + parts: z.array(SpecNodeSchema), + warnings: z.array(ParseWarningSchema).exactOptional(), +}); + +export const PatchSpecBodySchema = z.object({ + title: z.string().check(z.minLength(1)).exactOptional(), + // PATCH must set a real section — the sentinel is not assignable by clients. + section: SectionNumberSchema.exactOptional(), +}); +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/ast/schemas.test.ts` +Expected: PASS — including pre-existing malformed-rejection tests (`'27210'`) + +- [ ] **Step 5: Commit** + +```bash +git add src/ast/schemas.ts src/ast/schemas.test.ts +git commit -m "feat(api): AST schemas accept expanded section shapes; PATCH rejects sentinel" +``` + +### Task 8: Parse worker output schema + body section override + +**Files:** +- Modify: `src/api/parse.ts` (workerOutputSchema line 94; `parseHandler` lines 54-71) +- Test: `src/api/parse.test.ts` + +- [ ] **Step 1: Update the worker mock, then write the failing tests** + +**CRITICAL pre-step:** `src/api/parse.test.ts` lines 10-16 mock `parsePool.run` to resolve +`{ tree: { id: '', section: 'test', title: 'T', parts: [] }, refs: [] }`. 
The tightened +schema rejects `section: 'test'` — change the mock's section to `'27 21 00'`, or the +existing async-job tests fail out-of-band: + +```typescript +vi.mock('../lib/parse-pool.js', () => ({ + parsePool: { + run: vi.fn().mockResolvedValue({ + tree: { id: '', section: '27 21 00', title: 'T', parts: [] }, + refs: [], + }), + }, +})); +``` + +Then append to the `describe('parseHandler', ...)` block (mirrors the file's `makeRes()` + +literal-`Request` style; a `.txt` upload skips archive/MIME validation per `validateUpload`): + +```typescript + it('parse: dirty section override normalized before persist', async () => { + const { persistParsedSpec } = await import('../db/index.js'); + const { parseHandler } = await import('./parse.js'); + const req = { + file: { originalname: 'spec.txt', mimetype: 'text/plain', buffer: Buffer.from('x') }, + body: { section: '26 00 13.10' }, + } as unknown as Request; + const res = makeRes(); + await parseHandler(req, res); + expect(res.status).toHaveBeenCalledWith(202); + await vi.waitFor(() => { + expect(persistParsedSpec).toHaveBeenCalledWith( + expect.objectContaining({ tree: expect.objectContaining({ section: '26 00 13.10' }) }) + ); + }); + }); + + it('parse: malformed section override → 400 before job creation', async () => { + const { createJob } = await import('../lib/jobs.js'); + const { parseHandler } = await import('./parse.js'); + const req = { + file: { originalname: 'spec.txt', mimetype: 'text/plain', buffer: Buffer.from('x') }, + body: { section: '26 00 13.1' }, + } as unknown as Request; + const res = makeRes(); + await parseHandler(req, res); + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ error: 'invalid section override format' }) + ); + expect(createJob).not.toHaveBeenCalled(); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/api/parse.test.ts` +Expected: FAIL — override is passed through raw today (no 
400, no normalization) + +- [ ] **Step 3: Implement** + +In `src/api/parse.ts`, add import: + +```typescript +import { SectionNumberSchema, normalizeSectionNumber } from '../lib/section-number.js'; +``` + +Tighten the worker schema (`section` line only): + +```typescript +const workerOutputSchema = z.object({ + tree: z.object({ + id: z.string(), + section: SectionNumberSchema.or(z.literal('unknown')), + title: z.string(), + parts: z.array(z.unknown()), + warnings: z.array(ParseWarningSchema).optional(), + }), + refs: z.array(SecRefSchema).default([]), + capabilities: z.array(z.string()).optional(), +}); +``` + +`parseBody(req.body)` is currently called inline in the `processParseJob` dispatch — refactor minimally and immutably (full replacement handler): + +```typescript +export async function parseHandler(req: Request, res: Response): Promise<void> { + if (!req.file) { + res.status(400).json({ success: false, error: 'file required' }); + return; + } + + const ext = path.extname(req.file.originalname).toLowerCase(); + const validationError = await validateUpload(req, ext); + if (validationError !== null) { + res.status(400).json({ success: false, error: validationError }); + return; + } + + const rawBody = parseBody(req.body); + const normalizedSection = + rawBody.section !== undefined ? normalizeSectionNumber(rawBody.section) : undefined; + if (rawBody.section !== undefined && normalizedSection === null) { + res.status(400).json({ success: false, error: 'invalid section override format' }); + return; + } + const body: ParseBody = { + ...rawBody, + ...(normalizedSection != null ? 
{ section: normalizedSection } : {}), + }; + + const jobId = createJob(); + // Pass buffer and ext, not the full file object, so the request closure can be GC'd + void processParseJob(jobId, req.file.buffer, ext, body); + res.status(202).json({ success: true, data: { jobId } }); +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/api/parse.test.ts` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/api/parse.ts src/api/parse.test.ts +git commit -m "feat(api): parse worker schema gates expanded shapes; section override normalized, 400 on malformed" +``` + +### Task 9: Download filename + +**Files:** +- Modify: `src/api/generate.ts` (`safeFilename`, line 9-16) +- Test: `src/api/generate.test.ts` (create if absent; check for an existing unit test file first) + +- [ ] **Step 1: Write the failing test** + +If `src/api/generate.test.ts` does not exist, create it; `safeFilename` is module-private, so export it for testability: + +```typescript +// src/api/generate.test.ts +import { describe, it, expect } from 'vitest'; +import { safeFilename } from './generate.js'; + +describe('safeFilename', () => { + it('generate: filename preserves dotted suffix', () => { + expect(safeFilename('26 00 13.10', 'Panelboards')).toBe('26-00-13.10-Panelboards.docx'); + }); + + it('generate: agency form keeps dot, spaces become dashes', () => { + expect(safeFilename('01 32 01.00 10', 'QC')).toBe('01-32-01.00-10-QC.docx'); + }); + + it('generate: base form unchanged behavior', () => { + expect(safeFilename('27 21 00', 'Structured Cabling')).toBe('27-21-00-Structured-Cabling.docx'); + }); +}); +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pnpm test src/api/generate.test.ts` +Expected: FAIL — `safeFilename` not exported / dot becomes dash + +- [ ] **Step 3: Implement** + +In `src/api/generate.ts`: + +```typescript +// Exported for unit testing. +export function safeFilename(section: string, title: string): string { + // '.' 
is allowed in the section part so '26 00 13.10' stays distinguishable + // from a hypothetical '26 00 13 10' in the suggested filename. + const s = section.replace(/[^a-zA-Z0-9.-]/g, '-').replace(/-+/g, '-'); + const t = title + .replace(/[^a-zA-Z0-9-]/g, '-') + .replace(/-+/g, '-') + .slice(0, 60); + return `${s}-${t}.docx`; +} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pnpm test src/api/generate.test.ts` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/api/generate.ts src/api/generate.test.ts +git commit -m "fix(api): download filename preserves section dotted suffix" +``` + +### Task 9b: Generator suffix-safety pins (no production change) + +The generator renders `tree.section` opaquely and is already suffix-safe — pin it so a future +"helpful" refactor can't regress it. + +**Files:** +- Test: `src/generator/markdown.test.ts`, `src/generator/index.test.ts` + +- [ ] **Step 1: Write the pin tests** (these should pass immediately — they are regression pins, not TDD reds) + +Append to `src/generator/markdown.test.ts` (reuse its tree-fixture helper style): + +```typescript + it('renderMarkdown: suffixed section renders verbatim in H1', () => { + const tree = makeTree({ section: '27 05 13.43', title: 'TV Distribution' }); + expect(renderMarkdown(tree)).toContain('# SECTION 27 05 13.43 — TV Distribution'); + }); +``` + +Append to `src/generator/index.test.ts`: + +```typescript + it('generateDocx: agency-suffixed section survives into document.xml', async () => { + const buffer = await generateDocx(makeTree({ section: '01 32 01.00 10', title: 'QC' })); + const xml = await readDocumentXml(buffer); // reuse the file's existing unzip helper + expect(xml).toContain('01 32 01.00 10'); + }); +``` + +(Adapt fixture-builder names to each file's existing helpers — both files already construct +SpecTree fixtures inline; only the `section` value is new.) 
+ +- [ ] **Step 2: Run tests — expect immediate PASS** + +Run: `pnpm test src/generator/` +Expected: PASS (pins confirm already-correct behavior) + +- [ ] **Step 3: Commit** + +```bash +git add src/generator/ +git commit -m "test(generator): pin suffixed-section rendering in markdown H1 and DOCX title" +``` + +### Task 10: PATCH over HTTP (integration) + ARCHITECTURE examples + +**Files:** +- Modify: `src/api/specs.integration.test.ts` +- Modify: `ARCHITECTURE.md` (lines ~228, ~265, ~347, ~417 — locate by content, not line number) + +- [ ] **Step 1: Write the integration test** (runs only under `pnpm test:integration`; needs PostgreSQL via `docker compose up -d postgres`) + +Append at the END of the `describe('PATCH /specs/:id (integration)', ...)` block in +`src/api/specs.integration.test.ts` — it uses `fetch` against `baseUrl` with a seeded +`testSpecId` (NOT supertest). Append last because these tests mutate the seeded spec's +section, and the earlier title-update test asserts `section === '27 21 00'`: + +```typescript + it('accepts a dotted-suffix section', async () => { + const res = await fetch(`${baseUrl}/specs/${testSpecId}`, { + method: 'PATCH', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ section: '27 21 00.10' }), + }); + const body = (await res.json()) as Record; + expect(res.status).toBe(200); + expect((body['data'] as Record)['section']).toBe('27 21 00.10'); + }); + + it('accepts an agency-suffix section', async () => { + const res = await fetch(`${baseUrl}/specs/${testSpecId}`, { + method: 'PATCH', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ section: '27 21 00.10 20' }), + }); + const body = (await res.json()) as Record; + expect(res.status).toBe(200); + expect((body['data'] as Record)['section']).toBe('27 21 00.10 20'); + }); +``` + +(Keep the existing `returns 422 for invalid section format` test untouched — `'27210'` must still 422.) 
+ +- [ ] **Step 2: Update ARCHITECTURE.md examples** + +Find and update the four stale comments (exact current text → replacement): + +1. `section: string // CSI section number, e.g. "27 21 00"` → + `section: string // CSI section number, e.g. "27 21 00", "26 00 13.10", "01 32 01.00 10"` +2. `section VARCHAR(20), -- "27 21 00"` → + `section VARCHAR(20), -- "27 21 00" | "26 00 13.10" | "01 32 01.00 10" (expanded shape, ADR-020)` +3. `target_spec_section VARCHAR(20), -- "09 91 00" — for section refs` → + `target_spec_section VARCHAR(20), -- "09 91 00" / "26 00 13.10" — for section refs` +4. In the ref-table example row containing `See Section 09 91 00`: leave the example itself, but if a nearby comment claims the `NN NN NN` shape is the only valid form, amend it to mention expanded shapes. + +- [ ] **Step 3: Run integration tests (requires DB)** + +Run: `docker compose up -d postgres && pnpm migrate && pnpm seed && pnpm test:integration` +Expected: PASS (new PATCH tests + all pre-existing) + +- [ ] **Step 4: Commit** + +```bash +git add src/api/specs.integration.test.ts ARCHITECTURE.md +git commit -m "test(api): PATCH accepts expanded section shapes over HTTP; refresh ARCHITECTURE examples" +``` + +**PR 3 cut point** (~300 LOC). 
+ +--- + +## PR 4 — `feat(db): normalization + shape CHECK constraints migration, seed prefix tolerance` + +### Task 11: Seed SCN tolerance + normalization + +**Files:** +- Modify: `src/db/seed.ts` (SCN_RE line 15, `extractSectionMeta` lines 18-29) +- Test: `src/db/seed.test.ts` + +- [ ] **Step 1: Write the failing tests** + +Append to `src/db/seed.test.ts`: + +```typescript + it('seed: extractSectionMeta retains .43 suffix, division still 27', () => { + const content = `SECTION 27 05 13.43TV DISTRIBUTION`; + expect(extractSectionMeta(content)).toEqual({ + sectionNumber: '27 05 13.43', + title: 'TV DISTRIBUTION', + division: '27', + }); + }); + + it('seed: bare SCN without SECTION prefix yields section', () => { + const content = `01 31 23.13 20SUSTAINABILITY REPORTING`; + expect(extractSectionMeta(content)?.sectionNumber).toBe('01 31 23.13 20'); + }); + + it('seed: whitespace dirt in SCN normalizes to canonical form', () => { + const content = `SECTION 26 00 13.10 X`; + expect(extractSectionMeta(content)?.sectionNumber).toBe('26 00 13.10'); + }); + + it('seed: unnormalizable SCN content is skipped (null), not seeded dirty', () => { + const content = `SECTION TBDX`; + expect(extractSectionMeta(content)).toBeNull(); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pnpm test src/db/seed.test.ts` +Expected: FAIL — bare-SCN test returns null (prefix required today); dirty test returns dirty string + +- [ ] **Step 3: Implement** + +In `src/db/seed.ts`: + +```typescript +import { normalizeSectionNumber } from '../lib/section-number.js'; + +// Prefix-optional: 2 corpus files carry a bare SCN without the 'SECTION ' keyword. 
+const SCN_RE = /<SCN>\s*(?:SECTION\s+)?([^<]+)<\/SCN>/i; +const STL_RE = /<STL>([^<]+)<\/STL>/; + +export function extractSectionMeta(content: string): SectionRecord | null { + const scnMatch = SCN_RE.exec(content); + const stlMatch = STL_RE.exec(content); + + if (!scnMatch?.[1] || !stlMatch?.[1]) return null; + + // Catalog rows must be canonical — the shape CHECK constraint (migration 013) + // enforces this at the DB layer; skipping here keeps the seed loud-and-clean. + const sectionNumber = normalizeSectionNumber(scnMatch[1]); + if (sectionNumber === null) return null; + + const title = stlMatch[1].trim(); + const division = sectionNumber.slice(0, 2); + + return { sectionNumber, title, division }; +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pnpm test src/db/seed.test.ts` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/db/seed.ts src/db/seed.test.ts +git commit -m "feat(db): seed tolerates bare SCN, normalizes section numbers before upsert" +``` + +### Task 12: Migration 013 — normalize + CHECK constraints + +**Files:** +- Create: `src/db/migrations/013_section_number_normalize_and_check.ts` +- Test: `src/db/queries/specs.integration.test.ts` (constraint behavior; integration project) + +- [ ] **Step 1: Write the migration** + +```typescript +// src/db/migrations/013_section_number_normalize_and_check.ts +import type { MigrationBuilder } from 'node-pg-migrate'; + +// Expanded shape (ADR-020): NN NN NN | NN NN NN.NN | NN NN NN.NN NN +const SHAPE = String.raw`^\d{2} \d{2} \d{2}(\.\d{2}( \d{2})?)?$`; + +// NBSP→space, collapse whitespace runs, trim — SQL mirror of +// normalizeSectionNumber() in src/lib/section-number.ts. +const NORM = (col: string): string => + `btrim(regexp_replace(replace(${col}, chr(160), ' '), '\\s+', ' ', 'g'))`; + +export const up = (pgm: MigrationBuilder): void => { + // Step 1: normalize existing rows. 
If two rows collapse to the same key, + // the existing UNIQUE constraints (specs_section_source_unique, + // spec_sections_section_number_key) abort this migration loudly — by design; + // resolve duplicates manually before re-running. + pgm.sql(`UPDATE specs SET section = ${NORM('section')} WHERE section <> ${NORM('section')}`); + pgm.sql( + `UPDATE spec_sections SET section_number = ${NORM('section_number')} WHERE section_number <> ${NORM('section_number')}` + ); + + // Step 2: shape gates. specs.section additionally admits the 'unknown' + // sentinel written by the parse path for section-less documents. + pgm.addConstraint('specs', 'specs_section_shape_check', { + check: `section ~ '${SHAPE}' OR section = 'unknown'`, + }); + pgm.addConstraint('spec_sections', 'spec_sections_section_number_shape_check', { + check: `section_number ~ '${SHAPE}'`, + }); + // Deliberately NO constraint on spec_references.target_spec_section: it + // records what the source document said (descriptive, not canonical). +}; + +export const down = (pgm: MigrationBuilder): void => { + pgm.dropConstraint('spec_sections', 'spec_sections_section_number_shape_check'); + pgm.dropConstraint('specs', 'specs_section_shape_check'); + // Whitespace normalization is lossy and is not reversed. 
+}; +``` + +- [ ] **Step 2: Write the failing integration tests** + +Append to `src/db/queries/specs.integration.test.ts` (or the project's DB-constraint test home): + +```typescript + it('db: specs.section CHECK accepts expanded shapes and the unknown sentinel', async () => { + await pool.query( + `INSERT INTO specs (section, title, source) VALUES ('99 88 77.10 20', 'Shape OK', 'arcat')` + ); + await pool.query( + `INSERT INTO specs (section, title, source) VALUES ('unknown', 'Sentinel OK', 'arcat')` + ); + await pool.query( + `DELETE FROM specs WHERE section IN ('99 88 77.10 20', 'unknown') AND source = 'arcat'` + ); + }); + + it('db: specs.section CHECK rejects malformed sections', async () => { + await expect( + pool.query(`INSERT INTO specs (section, title, source) VALUES ('99 8877', 'Bad', 'arcat')`) + ).rejects.toThrow(/specs_section_shape_check/); + }); + + it('db: spec_sections shape CHECK rejects the sentinel (catalog is canonical-only)', async () => { + await expect( + pool.query( + `INSERT INTO spec_sections (section_number, title, division) VALUES ('unknown', 'Bad', 'un')` + ) + ).rejects.toThrow(/spec_sections_section_number_shape_check/); + }); +``` + +- [ ] **Step 3: Run migration + integration tests** + +```bash +docker compose up -d postgres +pnpm migrate # applies 013 +pnpm seed # now seeds suffixed + bare-SCN catalog entries +pnpm test:integration +``` +Expected: migrate applies cleanly; seed logs a HIGHER record count than before (suffixed entries now included); integration tests PASS. + +Verify the seed delta explicitly: + +```bash +docker compose exec postgres psql -U specr -d specr -c \ + "SELECT count(*) FILTER (WHERE section_number ~ '\\.') AS suffixed, count(*) AS total FROM spec_sections;" +``` +Expected: `suffixed` ≈ 238+ (162 dotted + 76 agency, minus any duplicate SCNs), `total` ≈ 660+. 
+ +- [ ] **Step 4: Verify down-migration reversibility** + +```bash +pnpm migrate:down # drops both constraints +pnpm migrate # re-applies +``` +Expected: both run cleanly. + +- [ ] **Step 5: Commit** + +```bash +git add src/db/migrations/013_section_number_normalize_and_check.ts src/db/queries/specs.integration.test.ts +git commit -m "feat(db): migration 013 — normalize section whitespace, add expanded-shape CHECK constraints" +``` + +### Task 13: End-to-end integration — agency-suffixed .SEC round trip + +**Files:** +- Test: `src/lib/file-loader.integration.test.ts` (harness: `loadFiles([absolutePath])` against real corpus files, `pool` from `'../db/index.js'`, `PROJECT_ROOT` const already defined at top) + +- [ ] **Step 1: Write the tests** + +Append inside `describe('loadFiles() integration', ...)`: + +```typescript + const AGENCY_FIXTURE = path.join( + PROJECT_ROOT, + 'docs/references/UFGS/DIVISION_01/01_32_01.00_10.SEC' + ); + + it('e2e: agency-suffixed corpus file loads with section intact', async () => { + const result = await loadFiles([AGENCY_FIXTURE]); + expect(result.succeeded).toBe(1); + + const row = await pool.query<{ section: string }>( + `SELECT section FROM specs WHERE section = '01 32 01.00 10' AND source = 'ufgs' LIMIT 1` + ); + expect(row.rows[0]?.section).toBe('01 32 01.00 10'); + }); + + it('e2e: ref targeting an agency-suffixed section resolves by exact match', async () => { + const { persistParsedSpec } = await import('../db/index.js'); + const target = await pool.query<{ id: string }>( + `SELECT id FROM specs WHERE section = '01 32 01.00 10' AND source = 'ufgs' LIMIT 1` + ); + expect(target.rows[0]?.id).toBeDefined(); + + const sourceNodeId = '00000000-0000-4000-8000-00000000aaaa'; + const specId = await persistParsedSpec({ + tree: { + id: '00000000-0000-4000-8000-00000000bbbb', + section: '99 88 77', + title: 'Ref Source', + parts: [ + { id: sourceNodeId, type: 'part', text: 'See Section 01 32 01.00 10.', children: [], meta: {} }, + ], + 
}, + refs: [ + { + sourceNodeId, + targetType: 'section', + targetSpecSection: '01 32 01.00 10', + referenceText: 'See Section 01 32 01.00 10.', + }, + ], + }); + + const refRow = await pool.query<{ target_spec_id: string | null }>( + `SELECT target_spec_id FROM spec_references WHERE source_spec_id = $1`, + [specId] + ); + expect(refRow.rows[0]?.target_spec_id).toBe(target.rows[0]?.id); + + await pool.query(`DELETE FROM specs WHERE id = $1`, [specId]); + }); + + it('e2e: catalog join + division filter — suffixed section listed inDatabase for division 01', async () => { + const { listSpecSections } = await import('../db/index.js'); + const sections = await listSpecSections('01'); + const entry = sections.find((s) => s.section === '01 32 01.00 10'); + // catalog row exists (pnpm seed) AND exact-equality join sees the loaded spec + expect(entry).toBeDefined(); + expect(entry?.inDatabase).toBe(true); + }); +``` + +- [ ] **Step 2: Run integration tests** + +Run: `pnpm test:integration` +Expected: PASS + +- [ ] **Step 3: Commit** + +```bash +git add -A src/ +git commit -m "test(integration): agency-suffixed .SEC end-to-end — parse, persist, exact-match ref resolution" +``` + +**PR 4 cut point** (~280 LOC). + +--- + +## Task 14: Full verification sweep + +- [ ] **Step 1: Full local gate** + +```bash +pnpm lint && pnpm build && pnpm test +docker compose up -d postgres && pnpm migrate && pnpm seed && pnpm test:integration +``` +Expected: all green. If `pnpm lint` flags `max-lines-per-function` on touched files, extract helpers — do not suppress rules. 
+ +- [ ] **Step 2: Corpus smoke test** — load one real agency-suffixed UFGS corpus file and confirm its suffixed section persists: + +```bash +pnpm load:files 'docs/references/UFGS/DIVISION_01/01_32_01.00_10.SEC' 2>&1 | tail -3 +docker compose exec postgres psql -U specr -d specr -c \ + "SELECT section, title FROM specs WHERE section = '01 32 01.00 10';" +``` +Expected: one row, section intact with agency suffix. 
+ +- [ ] **Step 3: Commit any stragglers; do NOT push yet** + +## Task 15: Update the parser edge-cases memory note + +The session memory `project_parser_edge_cases.md` tracks open parser bugs — after this work, the suffix-truncation class is fixed; record that (done by the orchestrating session, not a subagent). + +## Task 16: PR cutting (stacked) + +The branch now holds 4 contiguous commit groups. Cut stacked PRs: + +```bash +# identify group-boundary SHAs +git log --oneline ba99b64..HEAD + +# PR 1 +git branch feat/section-number-lib <last-sha-of-PR-1-group> +git push -u origin feat/section-number-lib +gh pr create --base main --head feat/section-number-lib \ + --title "feat(lib): section-number module — expanded-shape validator + normalizer" \ + --body "" + +# PR 2 (stacked on PR 1) +git branch feat/section-number-parsers <last-sha-of-PR-2-group> +git push -u origin feat/section-number-parsers +gh pr create --base feat/section-number-lib --head feat/section-number-parsers \ + --title "feat(parser): adopt section-number module in refs/inference/text parsers" ... + +# PR 3, PR 4: same pattern, each based on the previous branch +``` + +After PR 1 merges, retarget PR 2's base to `main` (`gh pr edit --base main`), and so on down the stack. Each PR body needs: summary, exact test-plan commands, explicit out-of-scope note ("This PR does NOT include …" per CLAUDE.md). + +**LOC-gate note for PR 1:** the branch history starts with the design spec + this plan +(~900 doc lines) which land inside PR 1's diff. `docs/superpowers/` is not in the LOC-check +exclusion list, so CI will warn. State in the PR body that the code delta is ~330 LOC and the +remainder is design/plan documentation. + +--- + +## Self-Review Notes (already applied) + +- `signals.ts` deliberately unchanged — prefix classifier already suffix-tolerant; pinned by test in Task 5. 
+- `SpecTreeSchema` admits `'unknown'` (deviation from the design doc's shorthand table, consistent with its migration section + worker schema — parsers legitimately emit the sentinel). +- MCP, generator, division slice/LIKE, sort order: no changes (audit: suffix-safe). +- `spec_references.target_spec_section`: intentionally unconstrained (design decision). +- Existing tests expected to keep passing without modification: rules.test.ts malformed-rejection, infer-section window/embedded tests, sec refs `27 05 13.43` pins, PATCH 422-on-malformed. From 823cc0c2e1bc836441b1a5ac3ff59d50c145fd33 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 20:22:23 -0700 Subject: [PATCH 03/28] =?UTF-8?q?feat(lib):=20section-number=20module=20?= =?UTF-8?q?=E2=80=94=20expanded-shape=20validator=20+=20normalizer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lib/section-number.test.ts | 113 +++++++++++++++++++++++++++++++++ src/lib/section-number.ts | 57 +++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 src/lib/section-number.test.ts create mode 100644 src/lib/section-number.ts diff --git a/src/lib/section-number.test.ts b/src/lib/section-number.test.ts new file mode 100644 index 0000000..354884a --- /dev/null +++ b/src/lib/section-number.test.ts @@ -0,0 +1,113 @@ +// src/lib/section-number.test.ts +import { describe, it, expect } from 'vitest'; +import { + SECTION_NUMBER_RE, + sectionNumberFragment, + normalizeSectionNumber, + findSectionNumbers, + SectionNumberSchema, +} from './section-number.js'; + +describe('SECTION_NUMBER_RE', () => { + it.each(['26 00 13', '26 00 13.10', '26 00 13.20', '01 32 01.00 10', '27 05 13.43'])( + 'accepts canonical %s', + (s) => { + expect(SECTION_NUMBER_RE.test(s)).toBe(true); + } + ); + + it.each([ + '26 00 13.1', // one-digit suffix + '26 00 13.100', // three-digit suffix + '26 00 13.10 5', // one-digit agency + '26 00 13 10', // agency without dotted suffix + '2600 
13', // missing separator + '26 00 13.10.20', // double dot + '26 00 13', // double internal space (canonical form is single-space) + ' 26 00 13', // leading space + 'unknown', // sentinel is NOT a section number + ])('rejects %s', (s) => { + expect(SECTION_NUMBER_RE.test(s)).toBe(false); + }); +}); + +describe('normalizeSectionNumber', () => { + it('passes canonical forms through', () => { + expect(normalizeSectionNumber('26 00 13')).toBe('26 00 13'); + expect(normalizeSectionNumber('01 32 01.00 10')).toBe('01 32 01.00 10'); + }); + + it('canonicalizes corpus whitespace dirt: leading/trailing/double spaces', () => { + expect(normalizeSectionNumber(' 26 00 13 ')).toBe('26 00 13'); + expect(normalizeSectionNumber('26 00 13.10')).toBe('26 00 13.10'); + }); + + it('canonicalizes NBSP separators', () => { + expect(normalizeSectionNumber('26 00 13.10')).toBe('26 00 13.10'); + }); + + it('returns null for non-section strings', () => { + expect(normalizeSectionNumber('PAINTING')).toBeNull(); + expect(normalizeSectionNumber('26 00 13.1')).toBeNull(); + expect(normalizeSectionNumber('')).toBeNull(); + expect(normalizeSectionNumber('unknown')).toBeNull(); + }); +}); + +describe('sectionNumberFragment', () => { + it('embeds into a keyword scanner and captures the full number as group 1', () => { + const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); + expect(re.exec('SECTION 26 00 13.10 PANELBOARDS')?.[1]).toBe('26 00 13.10'); + expect(re.exec('SECTION 01 32 01.00 10 QUALITY')?.[1]).toBe('01 32 01.00 10'); + expect(re.exec('SECTION 26 00 13 GENERAL')?.[1]).toBe('26 00 13'); + }); + + it('does not capture a trailing pair as agency without a dotted suffix', () => { + const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); + // "20 AMP" must not become an agency suffix — agency requires the dot first + expect(re.exec('SECTION 26 00 13 20 AMP PANELBOARDS')?.[1]).toBe('26 00 13'); + }); + + it('does not match digits glued to 
longer numbers', () => { + const re = new RegExp(`^${sectionNumberFragment()}$`); + expect(re.test('26 00 134')).toBe(false); + expect(re.test('126 00 13')).toBe(false); + expect(re.test('26 00 13.1010')).toBe(false); + }); + + it('does not capture agency from a following 4-digit year', () => { + const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); + expect(re.exec('SECTION 26 00 13.10 2024 EDITION')?.[1]).toBe('26 00 13.10'); + }); + + // KNOWN AMBIGUITY: a bare two-digit token after a dotted suffix is + // indistinguishable from an agency suffix in free prose. We accept the + // false positive; tagged .SEC refs are immune (verbatim path). + it('KNOWN AMBIGUITY: "26 00 13.10 20 mm" captures 20 as agency', () => { + const re = new RegExp(String.raw`\bSection\s+${sectionNumberFragment()}`, 'i'); + expect(re.exec('See Section 26 00 13.10 20 mm pipe')?.[1]).toBe('26 00 13.10 20'); + }); +}); + +describe('findSectionNumbers', () => { + it('finds and normalizes all citations with offsets', () => { + const text = 'See 26 00 13.10 and also 09 91 00.'; + const found = findSectionNumbers(text); + expect(found.map((f) => f.value)).toEqual(['26 00 13.10', '09 91 00']); + expect(found[0]?.index).toBe(4); + }); + + it('returns empty array when nothing matches', () => { + expect(findSectionNumbers('no numbers here')).toEqual([]); + }); +}); + +describe('SectionNumberSchema', () => { + it('accepts expanded shapes', () => { + expect(SectionNumberSchema.safeParse('01 32 01.00 10').success).toBe(true); + }); + it('rejects malformed and sentinel values', () => { + expect(SectionNumberSchema.safeParse('27210').success).toBe(false); + expect(SectionNumberSchema.safeParse('unknown').success).toBe(false); + }); +}); diff --git a/src/lib/section-number.ts b/src/lib/section-number.ts new file mode 100644 index 0000000..a3ed0dd --- /dev/null +++ b/src/lib/section-number.ts @@ -0,0 +1,57 @@ +// src/lib/section-number.ts +import { z } from 'zod'; + +/** + * 
Canonical CSI/UFGS section-number grammar (expanded shape): + * NN NN NN — MasterFormat Level 3 (26 00 13) + * NN NN NN.NN — Level 4 dotted suffix (26 00 13.10) + * NN NN NN.NN NN — Level 5 agency suffix, UFGS (01 32 01.00 10) + * Each shape is a DISTINCT section identity. See ADR-020. + */ +export const SECTION_NUMBER_RE = /^\d{2} \d{2} \d{2}(?:\.\d{2}(?: \d{2})?)?$/; + +// Scanner fragment. Differences from SECTION_NUMBER_RE, all deliberate: +// - `\s+` separators: tolerates NBSP/multi-space/newline dirt found in real +// documents (JS `\s` includes ); normalizeSectionNumber canonicalizes. +// - Agency separator is horizontal-only ([^\S\r\n]) so a 2-digit token on the +// NEXT LINE is never absorbed as an agency suffix. +// - (? Date: Fri, 5 Jun 2026 20:27:27 -0700 Subject: [PATCH 04/28] test(lib): pin section-number fragment capture-group + multiline separator contracts --- src/lib/section-number.test.ts | 17 +++++++++++++++++ src/lib/section-number.ts | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/lib/section-number.test.ts b/src/lib/section-number.test.ts index 354884a..d165543 100644 --- a/src/lib/section-number.test.ts +++ b/src/lib/section-number.test.ts @@ -62,6 +62,11 @@ describe('sectionNumberFragment', () => { expect(re.exec('SECTION 26 00 13 GENERAL')?.[1]).toBe('26 00 13'); }); + it('exposes exactly ONE capture group (group 1 = whole number)', () => { + // length === 2 → [full match, group 1]; consumer-added groups start at 2 + expect(new RegExp(sectionNumberFragment()).exec('26 00 13.10 20')?.length).toBe(2); + }); + it('does not capture a trailing pair as agency without a dotted suffix', () => { const re = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); // "20 AMP" must not become an agency suffix — agency requires the dot first @@ -100,6 +105,18 @@ describe('findSectionNumbers', () => { it('returns empty array when nothing matches', () => { expect(findSectionNumbers('no numbers 
here')).toEqual([]); }); + + it('matches across a newline inter-group separator (\\s+) and normalizes', () => { + // inter-group separators use \s+, which spans the newline + const found = findSectionNumbers('26\n00 13'); + expect(found.map((f) => f.value)).toEqual(['26 00 13']); + }); + + it('does not absorb a next-line pair as agency (horizontal-only separator)', () => { + // agency separator is [^\S\r\n]+ — a 2-digit token on the NEXT line is left out + const found = findSectionNumbers('see 26 00 13.10\n20 items'); + expect(found.map((f) => f.value)).toEqual(['26 00 13.10']); + }); }); describe('SectionNumberSchema', () => { diff --git a/src/lib/section-number.ts b/src/lib/section-number.ts index a3ed0dd..4294c67 100644 --- a/src/lib/section-number.ts +++ b/src/lib/section-number.ts @@ -21,7 +21,14 @@ export const SECTION_NUMBER_RE = /^\d{2} \d{2} \d{2}(?:\.\d{2}(?: \d{2})?)?$/; // and recover the value via normalizeSectionNumber(match[1]). const FRAGMENT = String.raw`(? Date: Fri, 5 Jun 2026 20:29:01 -0700 Subject: [PATCH 05/28] docs(adr): ADR-020 expanded section-number shape as opaque normalized string --- CLAUDE.md | 1 + docs/adr/020-section-number-expanded-shape.md | 39 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 docs/adr/020-section-number-expanded-shape.md diff --git a/CLAUDE.md b/CLAUDE.md index 7cff09d..8e7ae71 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -249,6 +249,7 @@ docs/adr/ 017-project-manual-publishing.md 018-document-concurrency-state-model.md 019-scope-boundaries-content-neutral-platform.md + 020-section-number-expanded-shape.md ``` **ADR format:** diff --git a/docs/adr/020-section-number-expanded-shape.md b/docs/adr/020-section-number-expanded-shape.md new file mode 100644 index 0000000..7308685 --- /dev/null +++ b/docs/adr/020-section-number-expanded-shape.md @@ -0,0 +1,39 @@ +# ADR-020: Expanded Section-Number Shape as Opaque Normalized String + +## Status: Accepted + +## Context + +CSI MasterFormat Level 4 (`26 
00 13.10`) and UFGS Level 5 agency suffixes +(`01 32 01.00 10`; 10 = Army Corps, 20 = NAVFAC, 30/40 = NASA/AFCEC) appear in +36% of the UFGS reference corpus and arrive through every ingest format (.SEC, +DOCX, plaintext). SpecR previously validated only `NN NN NN`, silently +truncating suffixes in prose-ref extraction and content inference — collapsing +distinct sections (e.g. `01 33 23` vs `01 33 23.33`) into one identity. + +Two viable designs: +1. Opaque normalized string, grammar owned by one module. +2. Structured `SectionNumber` type with decomposed DB columns + (division/l2/l3/suffix/agency). + +## Decision + +Opaque normalized string (`src/lib/section-number.ts` owns the grammar). +Canonical form: single ASCII spaces, `NN NN NN`, `NN NN NN.NN`, or +`NN NN NN.NN NN`. Cross-reference linking remains **exact match only** — a ref +to `26 00 13` never resolves to `26 00 13.10` or vice versa. DB CHECK +constraints enforce shape on `specs.section` (plus the `'unknown'` inference +sentinel) and `spec_sections.section_number`; +`spec_references.target_spec_section` stays unconstrained because it records +what the source document said. + +## Consequences + +- One module to change when the grammar grows; consumers embed its fragment. +- Exact-match keeps broken refs honest (a base ref to a missing base section + is genuinely broken) at the cost of no family fallback. +- Structured queries (e.g. "all agency variants of X") require LIKE prefixes + rather than column equality — acceptable; no current feature needs them. +- Free-prose ambiguity: `Section 26 00 13.10 20 mm` mis-reads `20` as an + agency suffix. Documented as KNOWN AMBIGUITY; tagged .SEC refs are immune. +- Lexicographic ORDER BY remains correct for the fixed-width grammar. 
From 86be5ecc25fbd83fed6d12a2cbe8cea6ae514069 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 20:32:42 -0700 Subject: [PATCH 06/28] =?UTF-8?q?fix(parser):=20prose=20section=20refs=20c?= =?UTF-8?q?apture=20dotted=20and=20agency=20suffixes=20=E2=80=94=20no=20mo?= =?UTF-8?q?re=20base=20truncation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/refs/extract.test.ts | 32 ++++++++++++++++++++++++++++++++ src/parser/refs/extract.ts | 4 +++- src/parser/refs/rules.test.ts | 12 ++++++++++++ src/parser/refs/rules.ts | 16 ++++++++++++---- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/parser/refs/extract.test.ts b/src/parser/refs/extract.test.ts index 9857a5c..80e53f2 100644 --- a/src/parser/refs/extract.test.ts +++ b/src/parser/refs/extract.test.ts @@ -3,6 +3,23 @@ import { v4 as uuidv4 } from 'uuid'; import { extractRefsFromTree } from './extract.js'; import type { SpecNode, SpecTree } from '../../ast/types.js'; +function makeTreeWithText(text: string): SpecTree { + return { + id: '00000000-0000-4000-8000-000000000001', + section: '27 21 00', + title: 'Test', + parts: [ + { + id: '00000000-0000-4000-8000-000000000002', + type: 'part', + text, + children: [], + meta: {}, + }, + ], + }; +} + function makeNode( type: SpecNode['type'], text: string, @@ -132,4 +149,19 @@ describe('extractRefsFromTree', () => { expect(orgs).toContain(expected); } }); + + it('refs: Section 26 00 13.10 citation — suffix retained, not truncated to base', () => { + const tree = makeTreeWithText('Comply with Section 26 00 13.10 and Section 09 91 00.'); + const refs = extractRefsFromTree(tree); + const sections = refs.filter((r) => r.targetType === 'section').map((r) => r.targetSpecSection); + expect(sections).toContain('26 00 13.10'); + expect(sections).toContain('09 91 00'); + expect(sections).not.toContain('26 00 13'); + }); + + it('refs: NBSP-separated citation normalizes to canonical spacing', () => { + const 
tree = makeTreeWithText('See Section 26 00 13.10 now.'); + const refs = extractRefsFromTree(tree); + expect(refs.find((r) => r.targetType === 'section')?.targetSpecSection).toBe('26 00 13.10'); + }); }); diff --git a/src/parser/refs/extract.ts b/src/parser/refs/extract.ts index d0c5b66..e72df17 100644 --- a/src/parser/refs/extract.ts +++ b/src/parser/refs/extract.ts @@ -5,6 +5,7 @@ import { buildStandardRefRules, type ExtractionRule, } from './rules.js'; +import { normalizeSectionNumber } from '../../lib/section-number.js'; const DEFAULT_RULES: readonly ExtractionRule[] = [ ...SECTION_REF_RULES, @@ -46,10 +47,11 @@ function toGlobalPattern(rule: ExtractionRule): RegExp { function buildRef(sourceNodeId: string, rule: ExtractionRule, match: RegExpMatchArray): SecRef { if (rule.targetType === 'section') { + const raw = match[1] ?? ''; return { sourceNodeId, targetType: 'section', - targetSpecSection: `${match[1]} ${match[2]} ${match[3]}`, + targetSpecSection: normalizeSectionNumber(raw) ?? 
raw.trim(), referenceText: match[0], }; } diff --git a/src/parser/refs/rules.test.ts b/src/parser/refs/rules.test.ts index 099b7f3..25ef749 100644 --- a/src/parser/refs/rules.test.ts +++ b/src/parser/refs/rules.test.ts @@ -27,6 +27,18 @@ describe('SECTION_REF_RULES', () => { expect(rule.examples.length).toBeGreaterThan(0); } }); + + it('csi-section-keyword: captures dotted suffix — Section 26 00 13.10 not truncated to base', () => { + const rule = SECTION_REF_RULES.find((r) => r.id === 'csi-section-keyword')!; + const fresh = new RegExp(rule.pattern.source, rule.pattern.flags.replace('g', '')); + expect(fresh.exec('See Section 26 00 13.10 for switchgear')?.[1]).toBe('26 00 13.10'); + }); + + it('csi-section-keyword: captures agency suffix — Section 01 32 01.00 10', () => { + const rule = SECTION_REF_RULES.find((r) => r.id === 'csi-section-keyword')!; + const fresh = new RegExp(rule.pattern.source, rule.pattern.flags.replace('g', '')); + expect(fresh.exec('per Section 01 32 01.00 10 requirements')?.[1]).toBe('01 32 01.00 10'); + }); }); describe('STANDARD_ORG_PATTERNS', () => { diff --git a/src/parser/refs/rules.ts b/src/parser/refs/rules.ts index ecf3e15..57cdf3b 100644 --- a/src/parser/refs/rules.ts +++ b/src/parser/refs/rules.ts @@ -2,6 +2,8 @@ // Operates on any text content reachable through SpecTree walks. // Rules are data — not code — so agents can inspect, propose, and fix them. +import { sectionNumberFragment } from '../../lib/section-number.js'; + // ─── Rule type ──────────────────────────────────────────────────────────────── export interface ExtractionRule { @@ -19,12 +21,18 @@ export const SECTION_REF_RULES: readonly ExtractionRule[] = [ { id: 'csi-section-keyword', description: - 'Matches "Section XX XX XX" — standard CSI cross-reference with keyword prefix. ' + + 'Matches "Section XX XX XX[.XX[ XX]]" — standard CSI cross-reference with keyword ' + + 'prefix, including Level 4 dotted suffixes and UFGS Level 5 agency suffixes. 
' + 'Most reliable pattern; matches how spec writers are trained to cite other sections.', - pattern: /\bSection\s+(\d{2})\s+(\d{2})\s+(\d{2})\b/gi, + pattern: new RegExp(String.raw`\bSection\s+${sectionNumberFragment()}`, 'gi'), targetType: 'section', - examples: ['See Section 09 91 00', 'Section 27 21 00 applies to this work'], - knownFalsePositives: [], + examples: [ + 'See Section 09 91 00', + 'Section 27 21 00 applies to this work', + 'See Section 26 00 13.10', + 'per Section 01 32 01.00 10', + ], + knownFalsePositives: ['Section 26 00 13.10 20 mm pipe — trailing pair reads as agency'], }, ]; From 33273bf4b6a44569e23bf8fd1704bfe9c272c9fa Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 20:39:39 -0700 Subject: [PATCH 07/28] =?UTF-8?q?fix(lib):=20section=20inference=20keeps?= =?UTF-8?q?=20dotted=20and=20agency=20suffixes=20=E2=80=94=20truncation=20?= =?UTF-8?q?collided=20distinct=20sections?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lib/infer-section.test.ts | 26 ++++++++++++++++++++++++++ src/lib/infer-section.ts | 24 +++++++++++++++--------- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/lib/infer-section.test.ts b/src/lib/infer-section.test.ts index bc1abbe..5586d3f 100644 --- a/src/lib/infer-section.test.ts +++ b/src/lib/infer-section.test.ts @@ -133,6 +133,32 @@ describe('inferSectionMeta', () => { expect(() => inferSectionMeta(tree)).not.toThrow(); expect(inferSectionMeta(tree).confidence).toBe('none'); }); + + it('infer-section: keyword scan keeps .33 — 01 33 23.33 is not 01 33 23', () => { + const tree = makeTree([{ text: 'SECTION 01 33 23.33 AVIATION FUEL DISTRIBUTION' }]); + const result = inferSectionMeta(tree); + expect(result.method).toBe('content-high'); + expect(result.inferredSection).toBe('01 33 23.33'); + }); + + it('infer-section: keyword scan keeps agency suffix — 01 32 01.00 10', () => { + const tree = makeTree([{ text: 'SECTION 01 32 01.00 10' }, { text: 
'QUALITY CONTROL' }]); + const result = inferSectionMeta(tree); + expect(result.inferredSection).toBe('01 32 01.00 10'); + expect(result.inferredTitle).toBe('QUALITY CONTROL'); + }); + + it('infer-section: inline title extracted from suffixed header', () => { + const tree = makeTree([{ text: 'SECTION 01 33 23.33 AVIATION FUEL DISTRIBUTION' }]); + expect(inferSectionMeta(tree).inferredTitle).toBe('AVIATION FUEL DISTRIBUTION'); + }); + + it('infer-section: bare suffixed header 26 00 13.10 inferred, not none', () => { + const tree = makeTree([{ text: '26 00 13.10' }, { text: 'PANELBOARDS' }]); + const result = inferSectionMeta(tree); + expect(result.method).toBe('content-medium'); + expect(result.inferredSection).toBe('26 00 13.10'); + }); }); describe('computeTitleMatch', () => { diff --git a/src/lib/infer-section.ts b/src/lib/infer-section.ts index a1a1660..e75aa27 100644 --- a/src/lib/infer-section.ts +++ b/src/lib/infer-section.ts @@ -1,4 +1,5 @@ import type { SpecTree, SpecNode } from '../ast/types.js'; +import { normalizeSectionNumber, sectionNumberFragment } from './section-number.js'; export interface SectionInference { readonly method: 'metadata' | 'content-high' | 'content-medium' | 'none'; @@ -10,9 +11,12 @@ export interface SectionInference { readonly titleMatch: 'exact' | 'close' | 'divergent' | 'unknown'; } -const KEYWORD_RE = /\bSECTION\s+(\d{2})\s+(\d{2})\s+(\d{2})\b/i; -const INLINE_TITLE_RE = /\bSECTION\s+\d{2}\s+\d{2}\s+\d{2}\b\s+(.*)/i; -const BARE_NUM_RE = /^(\d{2})\s+(\d{2})\s+(\d{2})$/; +const KEYWORD_RE = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); +const INLINE_TITLE_RE = new RegExp( + String.raw`\bSECTION\s+${sectionNumberFragment()}\s+(\S.*)`, + 'i' +); +const BARE_NUM_RE = new RegExp(`^${sectionNumberFragment()}$`); const MAX_NODES = 50; const TITLE_MIN_LENGTH = 3; const TITLE_MAX_LENGTH = 150; @@ -43,8 +47,8 @@ function isValidTitle(text: string): boolean { function findInlineTitle(nodeText: string): string | null 
{ const inlineMatch = INLINE_TITLE_RE.exec(nodeText); - if (inlineMatch?.[1] !== undefined && isValidTitle(inlineMatch[1])) { - return inlineMatch[1].trim(); + if (inlineMatch?.[2] !== undefined && isValidTitle(inlineMatch[2])) { + return inlineMatch[2].trim(); } return null; } @@ -101,11 +105,12 @@ const NONE_RESULT: SectionInference = { function scanKeyword(nodes: readonly SpecNode[]): SectionInference | null { for (let i = 0; i < nodes.length; i++) { const m = KEYWORD_RE.exec(nodes[i]?.text ?? ''); - if (m !== null) { + const section = m === null ? null : normalizeSectionNumber(m[1] ?? ''); + if (section !== null) { return { method: 'content-high', confidence: 'high', - inferredSection: `${m[1]} ${m[2]} ${m[3]}`, + inferredSection: section, inferredTitle: findTitle(nodes, i), titleMatch: 'unknown', }; @@ -117,11 +122,12 @@ function scanKeyword(nodes: readonly SpecNode[]): SectionInference | null { function scanBareNumber(nodes: readonly SpecNode[]): SectionInference | null { for (let i = 0; i < nodes.length; i++) { const m = BARE_NUM_RE.exec((nodes[i]?.text ?? '').trim()); - if (m !== null) { + const section = m === null ? null : normalizeSectionNumber(m[1] ?? 
''); + if (section !== null) { return { method: 'content-medium', confidence: 'medium', - inferredSection: `${m[1]} ${m[2]} ${m[3]}`, + inferredSection: section, inferredTitle: findTitle(nodes, i), titleMatch: 'unknown', }; From 10e3544f731d715fea1dfeba62a08119c9d14538 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 20:48:17 -0700 Subject: [PATCH 08/28] =?UTF-8?q?fix(lib):=20strip=20dash=20separator=20in?= =?UTF-8?q?=20inferred=20inline=20titles=20=E2=80=94=20parity=20with=20tex?= =?UTF-8?q?t=20parser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lib/infer-section.test.ts | 12 ++++++++++++ src/lib/infer-section.ts | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/lib/infer-section.test.ts b/src/lib/infer-section.test.ts index 5586d3f..3f5f2a0 100644 --- a/src/lib/infer-section.test.ts +++ b/src/lib/infer-section.test.ts @@ -159,6 +159,18 @@ describe('inferSectionMeta', () => { expect(result.method).toBe('content-medium'); expect(result.inferredSection).toBe('26 00 13.10'); }); + + it('infer-section: dash-separated inline title — dash stripped, parity with text parser', () => { + const tree = makeTree([{ text: 'SECTION 26 00 13.10 - PANELBOARDS' }]); + const result = inferSectionMeta(tree); + expect(result.inferredSection).toBe('26 00 13.10'); + expect(result.inferredTitle).toBe('PANELBOARDS'); + }); + + it('infer-section: em-dash separated inline title on base section', () => { + const tree = makeTree([{ text: 'SECTION 26 09 33 — MOTOR CONTROLLERS' }]); + expect(inferSectionMeta(tree).inferredTitle).toBe('MOTOR CONTROLLERS'); + }); }); describe('computeTitleMatch', () => { diff --git a/src/lib/infer-section.ts b/src/lib/infer-section.ts index e75aa27..f29357f 100644 --- a/src/lib/infer-section.ts +++ b/src/lib/infer-section.ts @@ -13,7 +13,7 @@ export interface SectionInference { const KEYWORD_RE = new RegExp(String.raw`\bSECTION\s+${sectionNumberFragment()}`, 'i'); const 
INLINE_TITLE_RE = new RegExp( - String.raw`\bSECTION\s+${sectionNumberFragment()}\s+(\S.*)`, + String.raw`\bSECTION\s+${sectionNumberFragment()}(?:\s*[-–—]\s*|\s+)(\S.*)`, 'i' ); const BARE_NUM_RE = new RegExp(`^${sectionNumberFragment()}$`); From f89c2c4b83036a41fbb943e93a32b6c56a9bc0e1 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 20:50:45 -0700 Subject: [PATCH 09/28] fix(parser): .txt header extraction keeps suffixed section numbers and their titles --- src/parser/text/index.test.ts | 31 +++++++++++++++++++++++++++++++ src/parser/text/index.ts | 19 +++++++++++++------ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/parser/text/index.test.ts b/src/parser/text/index.test.ts index fabbbd2..f829e91 100644 --- a/src/parser/text/index.test.ts +++ b/src/parser/text/index.test.ts @@ -132,4 +132,35 @@ describe('parseText — section extraction edge cases', () => { expect(result.tree.parts.every((p) => p.type === 'part')).toBe(true); expect(result.tree.parts).toHaveLength(1); }); + + it('text parser: SECTION 27 05 13.43 - TITLE — suffix kept, title extracted', () => { + const result = parseText('SECTION 27 05 13.43 - TELEVISION DISTRIBUTION\n\nPART 1 GENERAL\n'); + expect(result.tree.section).toBe('27 05 13.43'); + expect(result.tree.title).toBe('TELEVISION DISTRIBUTION'); + }); + + it('text parser: agency-suffixed header with dash title', () => { + const result = parseText('SECTION 01 32 01.00 10 - QUALITY CONTROL\n\nPART 1 GENERAL\n'); + expect(result.tree.section).toBe('01 32 01.00 10'); + expect(result.tree.title).toBe('QUALITY CONTROL'); + }); + + it('text parser: bare suffixed header line', () => { + const result = parseText('26 00 13.10 - PANELBOARDS\n\nPART 1 GENERAL\n'); + expect(result.tree.section).toBe('26 00 13.10'); + expect(result.tree.title).toBe('PANELBOARDS'); + }); + + it('text parser: suffixed SECTION line classified as header, not body content', () => { + const result = parseText( + 'SECTION 27 05 13.43 - TELEVISION 
DISTRIBUTION\nPART 1 GENERAL\n1.1 SUMMARY\n' + ); + const texts: string[] = []; + const walk = (n: SpecNode): void => { + texts.push(n.text); + n.children.forEach((c) => walk(c)); + }; + result.tree.parts.forEach((p) => walk(p)); + expect(texts.some((t) => t.includes('TELEVISION DISTRIBUTION'))).toBe(false); + }); }); diff --git a/src/parser/text/index.ts b/src/parser/text/index.ts index 3e5cd05..803ad86 100644 --- a/src/parser/text/index.ts +++ b/src/parser/text/index.ts @@ -1,5 +1,6 @@ import { v4 as uuidv4 } from 'uuid'; import { inferSectionMeta } from '../../lib/infer-section.js'; +import { normalizeSectionNumber, sectionNumberFragment } from '../../lib/section-number.js'; import type { SpecNode, SpecNodeMeta, @@ -38,8 +39,11 @@ const WARNING_SUGGESTIONS: Readonly> = { 'More PART headings than a CSI spec normally has (typically 3). Headings may be over-matched.', }; -const SECTION_EXTRACT_RE = /SECTION\s+(\d{2})\s+(\d{2})\s+(\d{2})(?:\s*[-–—]\s*(.+))?/i; -const BARE_SECTION_RE = /^(\d{2})\s+(\d{2})\s+(\d{2})(?:\s*[-–—]\s*(.+))?/; +const SECTION_EXTRACT_RE = new RegExp( + String.raw`SECTION\s+${sectionNumberFragment()}(?:\s*[-–—]\s*(.+))?`, + 'i' +); +const BARE_SECTION_RE = new RegExp(String.raw`^${sectionNumberFragment()}(?:\s*[-–—]\s*(.+))?`); /** Scan up to this many non-blank lines for the SECTION header. * 10 instead of 5: UFGS files have a metadata header block before the SECTION line @@ -68,10 +72,13 @@ function extractSectionMeta( const trimmed = line.trim(); const m = SECTION_EXTRACT_RE.exec(trimmed) ?? BARE_SECTION_RE.exec(trimmed); if (m !== null) { - return { - section: `${m[1]} ${m[2]} ${m[3]}`, - title: (m[4] ?? '').trim() || 'unknown', - }; + const section = normalizeSectionNumber(m[1] ?? ''); + if (section !== null) { + return { + section, + title: (m[2] ?? 
'').trim() || 'unknown', + }; + } } if (++scanned >= MAX_HEADER_SCAN) break; } From f156694381c3323bab60cc7b00ed71a2cfef26fb Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 20:59:38 -0700 Subject: [PATCH 10/28] feat(parser): SEC SCN/SRF section numbers normalize to canonical expanded shape --- src/parser/sec/index.test.ts | 27 +++++++++++++++++++++++++++ src/parser/sec/index.ts | 13 ++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/parser/sec/index.test.ts b/src/parser/sec/index.test.ts index f22627a..68fd163 100644 --- a/src/parser/sec/index.test.ts +++ b/src/parser/sec/index.test.ts @@ -323,3 +323,30 @@ describe('parseSec — XML entity decoding', () => { expect(txt?.text).toContain('bogus reference � survives'); }); }); + +describe('parseSec — SCN/SRF whitespace canonicalization', () => { + it('sec parser: SCN with whitespace dirt normalizes to canonical form', () => { + const xml = `SECTION 26 00 13.10 PANELBOARDS`; + const { tree } = parseSec(xml); + expect(tree.section).toBe('26 00 13.10'); + }); + + it('sec parser: SRF target normalizes NBSP separators to canonical form', () => { + // NBSP (U+00A0) separators -- written as escape sequences to avoid no-irregular-whitespace + const nbsp = '\u00a0'; + const srfContent = `26${nbsp}00${nbsp}13.10`; + const xml = + `SECTION 27 41 00T` + + `PART 1XSee ${srfContent} now.`; + const { refs } = parseSec(xml); + const sRef = refs.find((r) => r.targetType === 'section'); + expect(sRef?.targetSpecSection).toBe('26 00 13.10'); + }); + + it('sec parser: unnormalizable SRF content kept verbatim (never dropped)', () => { + const xml = `SECTION 27 41 00TPART 1XSee APPENDIX B now.`; + const { refs } = parseSec(xml); + const sRef = refs.find((r) => r.targetType === 'section'); + expect(sRef?.targetSpecSection).toBe('APPENDIX B'); + }); +}); diff --git a/src/parser/sec/index.ts b/src/parser/sec/index.ts index 6672380..fbbe056 100644 --- a/src/parser/sec/index.ts +++ 
b/src/parser/sec/index.ts @@ -4,6 +4,7 @@ import type { SpecNode, SpecTree, NodeType, SecRef } from '../../ast/types.js'; import { ParserError } from '../error.js'; import type { NteNode, PrtNode, RefNode, SptNode } from './elements.js'; import { decodeXmlEntities } from './entities.js'; +import { normalizeSectionNumber } from '../../lib/section-number.js'; export type { SecRef }; @@ -76,7 +77,9 @@ function pushSrfRefs(raw: string, nodeId: string, refs: SecRef[]): void { refs.push({ sourceNodeId: nodeId, targetType: 'section', - targetSpecSection: sec, + // Normalize-or-verbatim: a tagged ref is never rejected; exact-match + // resolution simply won't find non-conforming targets. + targetSpecSection: normalizeSectionNumber(sec) ?? sec, referenceText: stripTags(raw).slice(0, 200), }); } @@ -181,12 +184,16 @@ export function parseSec(xml: string): ParsedSec { const sec = (root as Record)['SEC'] as Record | undefined; if (!sec) throw new ParserError('SEC root element not found'); - // SCN/STL are parsed with processEntities: false — decode here - const section = decodeXmlEntities( + // SCN/STL are parsed with processEntities: false — decode here. + // Normalize-or-verbatim: canonicalize section whitespace when the value is a + // valid expanded-shape number; keep verbatim otherwise (downstream schema + // gates decide what to do with non-conforming values). + const scnRaw = decodeXmlEntities( requireString(sec['SCN'], 'SCN') .replace(/^SECTION\s+/i, '') .trim() ); + const section = normalizeSectionNumber(scnRaw) ?? 
scnRaw; const title = decodeXmlEntities(requireString(sec['STL'], 'STL')); const refs: SecRef[] = []; From 972d3729536396d5bf4260476e971598ccddb57d Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:04:28 -0700 Subject: [PATCH 11/28] test(parser): pin internal SCN whitespace normalization --- src/parser/sec/index.test.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/parser/sec/index.test.ts b/src/parser/sec/index.test.ts index 68fd163..8d79773 100644 --- a/src/parser/sec/index.test.ts +++ b/src/parser/sec/index.test.ts @@ -331,6 +331,12 @@ describe('parseSec — SCN/SRF whitespace canonicalization', () => { expect(tree.section).toBe('26 00 13.10'); }); + it('sec parser: SCN with internal whitespace dirt normalizes (prefix-strip alone cannot fix)', () => { + const xml = `SECTION 26 00 13.10PANELBOARDS`; + const { tree } = parseSec(xml); + expect(tree.section).toBe('26 00 13.10'); + }); + it('sec parser: SRF target normalizes NBSP separators to canonical form', () => { // NBSP (U+00A0) separators -- written as escape sequences to avoid no-irregular-whitespace const nbsp = '\u00a0'; From 94a156b4d6830df20653c4ee92f15cf61927cc92 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:07:41 -0700 Subject: [PATCH 12/28] =?UTF-8?q?docs(parser):=20correct=20SCN=20comment?= =?UTF-8?q?=20=E2=80=94=20gates=20not=20yet=20landed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parser/sec/index.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/parser/sec/index.ts b/src/parser/sec/index.ts index fbbe056..809e08b 100644 --- a/src/parser/sec/index.ts +++ b/src/parser/sec/index.ts @@ -186,8 +186,10 @@ export function parseSec(xml: string): ParsedSec { // SCN/STL are parsed with processEntities: false — decode here. 
// Normalize-or-verbatim: canonicalize section whitespace when the value is a - // valid expanded-shape number; keep verbatim otherwise (downstream schema - // gates decide what to do with non-conforming values). + // valid expanded-shape number; keep verbatim otherwise. Tagged values are + // never rejected here — exact-match linkage simply won't find + // non-conforming sections (validation gates arrive with the API schema + + // DB CHECK constraint work). const scnRaw = decodeXmlEntities( requireString(sec['SCN'], 'SCN') .replace(/^SECTION\s+/i, '') From 5259fd5362012a73a13f5030f0d80d77650ea239 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:09:32 -0700 Subject: [PATCH 13/28] feat(api): AST schemas accept expanded section shapes; PATCH rejects sentinel --- src/ast/schemas.test.ts | 22 ++++++++++++++++++++++ src/ast/schemas.ts | 11 ++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/ast/schemas.test.ts b/src/ast/schemas.test.ts index e5d92d2..77857f8 100644 --- a/src/ast/schemas.test.ts +++ b/src/ast/schemas.test.ts @@ -78,6 +78,28 @@ describe('SpecTreeSchema — valid inputs', () => { }); }); +describe('SpecTreeSchema — expanded section shapes', () => { + it('SpecTreeSchema: accepts dotted and agency-suffixed sections', () => { + const base = { id: '00000000-0000-4000-8000-000000000001', title: 'T', parts: [] }; + expect(SpecTreeSchema.safeParse({ ...base, section: '26 00 13.10' }).success).toBe(true); + expect(SpecTreeSchema.safeParse({ ...base, section: '01 32 01.00 10' }).success).toBe(true); + }); + + it('SpecTreeSchema: accepts the unknown sentinel (parser output for section-less docs)', () => { + const base = { id: '00000000-0000-4000-8000-000000000001', title: 'T', parts: [] }; + expect(SpecTreeSchema.safeParse({ ...base, section: 'unknown' }).success).toBe(true); + }); +}); + +describe('PatchSpecBodySchema — expanded section shapes', () => { + it('PatchSpecBodySchema: accepts suffixed sections, rejects the unknown 
sentinel', () => { + expect(PatchSpecBodySchema.safeParse({ section: '26 00 13.10' }).success).toBe(true); + expect(PatchSpecBodySchema.safeParse({ section: '01 32 01.00 10' }).success).toBe(true); + expect(PatchSpecBodySchema.safeParse({ section: 'unknown' }).success).toBe(false); + expect(PatchSpecBodySchema.safeParse({ section: '26 00 13.1' }).success).toBe(false); + }); +}); + describe('SpecTreeSchema — invalid inputs', () => { it('rejects section not matching DD NN NN format', () => { expect(() => diff --git a/src/ast/schemas.ts b/src/ast/schemas.ts index ef25d55..1244455 100644 --- a/src/ast/schemas.ts +++ b/src/ast/schemas.ts @@ -1,5 +1,6 @@ import { z } from 'zod'; import type { SpecNode } from './types.js'; +import { SectionNumberSchema } from '../lib/section-number.js'; export const NodeTypeSchema = z.enum([ 'spec', @@ -63,7 +64,9 @@ export const ParseWarningSchema = z.object({ export const SpecTreeSchema = z.object({ id: z.uuid(), - section: z.string().regex(/^\d{2} \d{2} \d{2}$/), + // Canonical expanded shape, or the 'unknown' sentinel emitted by parsers + // when no section number is found (content inference may fill it later). + section: z.union([SectionNumberSchema, z.literal('unknown')]), title: z.string().check(z.minLength(1)), parts: z.array(SpecNodeSchema), warnings: z.array(ParseWarningSchema).exactOptional(), @@ -71,10 +74,8 @@ export const SpecTreeSchema = z.object({ export const PatchSpecBodySchema = z.object({ title: z.string().check(z.minLength(1)).exactOptional(), - section: z - .string() - .regex(/^\d{2} \d{2} \d{2}$/) - .exactOptional(), + // PATCH must set a real section — the sentinel is not assignable by clients. 
+ section: SectionNumberSchema.exactOptional(), }); export const CreateProjectBodySchema = z.object({ From 5c6d40284ef16d95c3392bab6090c9113b676167 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:18:05 -0700 Subject: [PATCH 14/28] feat(api): parse worker schema gates expanded shapes; section override normalized, 400 on malformed --- src/api/parse.test.ts | 42 +++++++++++++++++++++++++++++++++++++++--- src/api/parse.ts | 17 +++++++++++++++-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/api/parse.test.ts b/src/api/parse.test.ts index 4a9a93f..a37c243 100644 --- a/src/api/parse.test.ts +++ b/src/api/parse.test.ts @@ -9,9 +9,10 @@ vi.mock('../parser/index.js', () => ({ })); vi.mock('../lib/parse-pool.js', () => ({ parsePool: { - run: vi - .fn() - .mockResolvedValue({ tree: { id: '', section: 'test', title: 'T', parts: [] }, refs: [] }), + run: vi.fn().mockResolvedValue({ + tree: { id: '', section: '27 21 00', title: 'T', parts: [] }, + refs: [], + }), }, })); vi.mock('../lib/jobs.js', () => ({ @@ -136,6 +137,41 @@ describe('parseHandler', () => { await parseHandler(req, res); expect(res.status).toHaveBeenCalledWith(202); }); + + it('parse: dirty section override normalized before persist', async () => { + const { persistParsedSpec } = await import('../db/index.js'); + const { updateJob } = await import('../lib/jobs.js'); + vi.mocked(updateJob).mockImplementation(() => {}); + const { parseHandler } = await import('./parse.js'); + const req = { + file: { originalname: 'spec.txt', mimetype: 'text/plain', buffer: Buffer.from('x') }, + body: { section: '26 00 13.10' }, + } as unknown as Request; + const res = makeRes(); + await parseHandler(req, res); + expect(res.status).toHaveBeenCalledWith(202); + await vi.waitFor(() => { + expect(persistParsedSpec).toHaveBeenCalledTimes(1); + }); + const callArg = vi.mocked(persistParsedSpec).mock.calls[0]?.[0]; + expect(callArg?.tree.section).toBe('26 00 13.10'); + }); + + it('parse: malformed 
section override → 400 before job creation', async () => { + const { createJob } = await import('../lib/jobs.js'); + const { parseHandler } = await import('./parse.js'); + const req = { + file: { originalname: 'spec.txt', mimetype: 'text/plain', buffer: Buffer.from('x') }, + body: { section: '26 00 13.1' }, + } as unknown as Request; + const res = makeRes(); + await parseHandler(req, res); + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ error: 'invalid section override format' }) + ); + expect(createJob).not.toHaveBeenCalled(); + }); }); describe('parseJobHandler', () => { diff --git a/src/api/parse.ts b/src/api/parse.ts index 8665e3a..b283fcf 100644 --- a/src/api/parse.ts +++ b/src/api/parse.ts @@ -10,6 +10,7 @@ import { persistParsedSpec } from '../db/index.js'; import { logger } from '../lib/logger.js'; import type { SpecNode, SpecTree } from '../ast/types.js'; import { ParseWarningSchema, SecRefSchema } from '../ast/schemas.js'; +import { SectionNumberSchema, normalizeSectionNumber } from '../lib/section-number.js'; interface ParseBody { readonly section?: string; @@ -64,9 +65,21 @@ export async function parseHandler(req: Request, res: Response): Promise { return; } + const rawBody = parseBody(req.body); + const normalizedSection = + rawBody.section !== undefined ? normalizeSectionNumber(rawBody.section) : undefined; + if (rawBody.section !== undefined && normalizedSection === null) { + res.status(400).json({ success: false, error: 'invalid section override format' }); + return; + } + const body: ParseBody = { + ...rawBody, + ...(normalizedSection != null ? 
{ section: normalizedSection } : {}), + }; + const jobId = createJob(); // Pass buffer and ext, not the full file object, so the request closure can be GC'd - void processParseJob(jobId, req.file.buffer, ext, parseBody(req.body)); + void processParseJob(jobId, req.file.buffer, ext, body); res.status(202).json({ success: true, data: { jobId } }); } @@ -91,7 +104,7 @@ function countNodes(nodes: readonly SpecNode[]): number { const workerOutputSchema = z.object({ tree: z.object({ id: z.string(), - section: z.string(), + section: z.union([SectionNumberSchema, z.literal('unknown')]), title: z.string(), parts: z.array(z.unknown()), warnings: z.array(ParseWarningSchema).optional(), From c71b6f03c9202af3ebb6705c9e8d22eb1c743144 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:24:53 -0700 Subject: [PATCH 15/28] fix(parser): normalize dc:subject so free-text degrades to 'unknown', not a job-killing section --- src/parser/docx/index.test.ts | 29 +++++++++++++++++++++++++++++ src/parser/docx/index.ts | 8 +++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/parser/docx/index.test.ts b/src/parser/docx/index.test.ts index ea2c4da..f3831cf 100644 --- a/src/parser/docx/index.test.ts +++ b/src/parser/docx/index.test.ts @@ -155,6 +155,35 @@ describe('parseDocx — error handling', () => { }); }); +describe('parseDocx — dc:subject section normalization (#gate)', () => { + function coreWith(subject: string): string { + return ` + + ${subject} + Structured Cabling +`; + } + + it('degrades free-text dc:subject to unknown (does not leak prose as section)', async () => { + const buffer = await makeDocx({ coreXml: coreWith('Division 26 - Electrical') }); + const tree = await parseDocx(buffer); + expect(tree.section).toBe('unknown'); + }); + + it('keeps a conforming dc:subject section number', async () => { + const buffer = await makeDocx({ coreXml: coreWith('26 00 13.10') }); + const tree = await parseDocx(buffer); + expect(tree.section).toBe('26 00 
13.10'); + }); + + it('normalizes a dirty (multi-space) dc:subject section number', async () => { + const buffer = await makeDocx({ coreXml: coreWith('26 00 13.10') }); + const tree = await parseDocx(buffer); + expect(tree.section).toBe('26 00 13.10'); + }); +}); + // ── ARCAT-realistic end-to-end: numbering-generated PART prefixes, style-only // part linkage (reverse pStyle), preamble, specifier notes, no core.xml ── diff --git a/src/parser/docx/index.ts b/src/parser/docx/index.ts index 31fcce6..9f33e90 100644 --- a/src/parser/docx/index.ts +++ b/src/parser/docx/index.ts @@ -7,6 +7,7 @@ import { parseDocument } from './document.js'; import { classifyParagraphs, buildTree, auditTreeStructure } from './inference.js'; import type { SpecTree } from '../../ast/types.js'; import type { NumberingMap, StyleMap } from './types.js'; +import { normalizeSectionNumber } from '../../lib/section-number.js'; // SECURITY (issue #19): add uncompressed size check after JSZip.loadAsync — // reject if total uncompressed bytes > 50MB to prevent ZIP bomb exhaustion. @@ -25,8 +26,13 @@ function parseCoreMetadata(xml: string): { section: string; title: string } { const props = parsed['cp:coreProperties'] as Record | undefined; const subject = props?.['dc:subject']; const titleVal = props?.['dc:title']; + // dc:subject is free-text in Word — normalize so non-conforming values degrade + // to 'unknown' and the orchestrator's content inference takes over (instead of + // leaking prose downstream where the worker section-gate would kill the job). + const section = + typeof subject === 'string' ? (normalizeSectionNumber(subject) ?? 'unknown') : 'unknown'; return { - section: typeof subject === 'string' && subject.trim() ? subject.trim() : 'unknown', + section, title: typeof titleVal === 'string' && titleVal.trim() ? 
titleVal.trim() : 'unknown', }; } catch { From 8da2c2ba2756fb257c23b05bbc1e5e54ae7e5524 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:26:31 -0700 Subject: [PATCH 16/28] fix(api): friendly job error for section-gate failures; refresh stale parseDocx mock --- src/api/parse.test.ts | 42 +++++++++++++++++++++++++++++++++++++++++- src/api/parse.ts | 15 ++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/api/parse.test.ts b/src/api/parse.test.ts index a37c243..1722ed3 100644 --- a/src/api/parse.test.ts +++ b/src/api/parse.test.ts @@ -3,7 +3,7 @@ import type { Request, Response } from 'express'; vi.mock('../parser/index.js', () => ({ parseSec: vi.fn(), - parseDocx: vi.fn().mockResolvedValue({ id: '', section: 'test', title: 'T', parts: [] }), + parseDocx: vi.fn().mockResolvedValue({ id: '', section: '27 21 00', title: 'T', parts: [] }), assertDocxSafe: vi.fn().mockResolvedValue(undefined), assertSecSafe: vi.fn(), })); @@ -317,3 +317,43 @@ describe('processParseJob refs persistence (#53)', () => { expect(callArg?.refs).toEqual([]); }); }); + +describe('processParseJob section-gate error message', () => { + it('surfaces a friendly message (not a Zod blob) when the worker section fails the gate', async () => { + const { parsePool } = await import('../lib/parse-pool.js'); + const { updateJob, createJob } = await import('../lib/jobs.js'); + vi.mocked(createJob).mockReturnValue('gate-job-id'); + + vi.mocked(parsePool.run).mockResolvedValueOnce({ + tree: { + id: '00000000-0000-0000-0000-000000000004', + section: 'garbage', + title: 'T', + parts: [], + }, + refs: [], + }); + vi.mocked(updateJob).mockImplementation(() => {}); + + const { parseHandler } = await import('./parse.js'); + const req = { + file: { + originalname: 'spec.sec', + mimetype: 'text/xml', + buffer: Buffer.from('', 'utf-8'), + }, + body: {}, + } as unknown as Request; + await parseHandler(req, makeRes()); + + await vi.waitFor(() => { + 
expect(updateJob).toHaveBeenCalledWith( + 'gate-job-id', + expect.objectContaining({ + status: 'failed', + error: 'parsed section number is not a valid CSI section (expected NN NN NN[.NN[ NN]])', + }) + ); + }); + }); +}); diff --git a/src/api/parse.ts b/src/api/parse.ts index b283fcf..ed83878 100644 --- a/src/api/parse.ts +++ b/src/api/parse.ts @@ -113,6 +113,19 @@ const workerOutputSchema = z.object({ capabilities: z.array(z.string()).optional(), }); +const SECTION_GATE_MESSAGE = + 'parsed section number is not a valid CSI section (expected NN NN NN[.NN[ NN]])'; + +// A worker-output section that fails the gate surfaces as a raw Zod issue blob. +// Translate that one case to a human-readable message; everything else keeps its +// original error text (still context-chained via SpecrError where applicable). +function jobErrorMessage(err: unknown): string { + if (err instanceof z.ZodError && err.issues.some((i) => i.path.includes('section'))) { + return SECTION_GATE_MESSAGE; + } + return err instanceof Error ? err.message : 'parse failed'; +} + async function processParseJob( jobId: string, buffer: Buffer, @@ -157,7 +170,7 @@ async function processParseJob( logger.error({ err, jobId }, 'parse job failed'); updateJob(jobId, { status: 'failed', - error: err instanceof Error ? err.message : 'parse failed', + error: jobErrorMessage(err), }); } } From fa48fabb8a0fad7d54861019e99d2b63cadbaae7 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:28:37 -0700 Subject: [PATCH 17/28] fix(api): download filename preserves section dotted suffix safeFilename now allows '.' in the section part so '26 00 13.10' renders as '26-00-13.10-Panelboards.docx' rather than mangling the dot to a dash. Function exported for unit testing. 
--- src/api/generate.test.ts | 28 ++++++++++++++++++++++++++++ src/api/generate.ts | 7 +++++-- 2 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 src/api/generate.test.ts diff --git a/src/api/generate.test.ts b/src/api/generate.test.ts new file mode 100644 index 0000000..1119ed9 --- /dev/null +++ b/src/api/generate.test.ts @@ -0,0 +1,28 @@ +import { describe, it, expect, vi } from 'vitest'; + +vi.mock('../db/index.js', () => ({ + getSpecTree: vi.fn(), +})); +vi.mock('../generator/index.js', () => ({ + generateDocx: vi.fn(), +})); +vi.mock('../lib/logger.js', () => ({ + logger: { info: vi.fn(), error: vi.fn(), debug: vi.fn(), warn: vi.fn() }, +})); + +describe('safeFilename', () => { + it('generate: filename preserves dotted suffix', async () => { + const { safeFilename } = await import('./generate.js'); + expect(safeFilename('26 00 13.10', 'Panelboards')).toBe('26-00-13.10-Panelboards.docx'); + }); + + it('generate: agency form keeps dot, spaces become dashes', async () => { + const { safeFilename } = await import('./generate.js'); + expect(safeFilename('01 32 01.00 10', 'QC')).toBe('01-32-01.00-10-QC.docx'); + }); + + it('generate: base form unchanged behavior', async () => { + const { safeFilename } = await import('./generate.js'); + expect(safeFilename('27 21 00', 'Structured Cabling')).toBe('27-21-00-Structured-Cabling.docx'); + }); +}); diff --git a/src/api/generate.ts b/src/api/generate.ts index d825f25..bcb88e4 100644 --- a/src/api/generate.ts +++ b/src/api/generate.ts @@ -6,8 +6,11 @@ import { logger } from '../lib/logger.js'; const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; -function safeFilename(section: string, title: string): string { - const s = section.replace(/[^a-zA-Z0-9-]/g, '-').replace(/-+/g, '-'); +// Exported for unit testing. +export function safeFilename(section: string, title: string): string { + // '.' 
is allowed in the section part so '26 00 13.10' stays distinguishable + // from a hypothetical '26 00 1310' in the suggested filename. + const s = section.replace(/[^a-zA-Z0-9.-]/g, '-').replace(/-+/g, '-'); const t = title .replace(/[^a-zA-Z0-9-]/g, '-') .replace(/-+/g, '-') From 17f94409c4d419e2c3f70cd6060f3c5b8aa16414 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:30:05 -0700 Subject: [PATCH 18/28] test(generator): pin suffixed-section rendering in markdown H1 and DOCX title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regression pins — no production change. Verifies that renderMarkdown emits the section verbatim in the H1 header and that generateDocx writes it unchanged into document.xml, so future refactors cannot silently mangle dotted or agency suffixes (e.g. '27 05 13.43', '01 32 01.00 10'). --- src/generator/index.test.ts | 12 ++++++++++++ src/generator/markdown.test.ts | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/src/generator/index.test.ts b/src/generator/index.test.ts index 4e7d9e9..d626b35 100644 --- a/src/generator/index.test.ts +++ b/src/generator/index.test.ts @@ -156,6 +156,18 @@ describe('generateDocx', () => { expect(Buffer.isBuffer(buffer)).toBe(true); expect(buffer.length).toBeGreaterThan(0); }); + + it('generateDocx: agency-suffixed section survives into document.xml', async () => { + const suffixedTree: SpecTree = { + id: '00000000-0000-0000-0000-000000000001', + section: '01 32 01.00 10', + title: 'Project Schedule', + parts: [], + }; + const buffer = await generateDocx(suffixedTree); + const xml = await getDocXml(buffer); + expect(xml).toContain('01 32 01.00 10'); + }); }); describe('generateDocx — content controls', () => { diff --git a/src/generator/markdown.test.ts b/src/generator/markdown.test.ts index a4d32c0..12fcf76 100644 --- a/src/generator/markdown.test.ts +++ b/src/generator/markdown.test.ts @@ -157,6 +157,15 @@ describe('renderMarkdown', () => { };
expect(renderMarkdown(empty)).toBe('# SECTION 00 00 00 — Empty'); }); + it('renderMarkdown: suffixed section renders verbatim in H1', () => { + const suffixed: SpecTree = { + id: '00000000-0000-0000-0000-000000000001', + section: '27 05 13.43', + title: 'TV Distribution', + parts: [], + }; + expect(renderMarkdown(suffixed)).toBe('# SECTION 27 05 13.43 — TV Distribution'); + }); it('renders continuation without label', () => { const withCont: SpecTree = { id: '00000000-0000-0000-0000-000000000001', From 96071188defe4823e5bf60c5c1349dbaa6ec78fe Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:36:21 -0700 Subject: [PATCH 19/28] test(api): PATCH accepts expanded section shapes over HTTP; refresh ARCHITECTURE examples --- ARCHITECTURE.md | 6 +++--- src/api/specs.integration.test.ts | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index dbae0c1..102dccf 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -225,7 +225,7 @@ interface CsiNode { interface CsiTree { id: string // spec ID - section: string // CSI section number, e.g. "27 21 00" + section: string // CSI section number, e.g. 
"27 21 00", "26 00 13.10", "01 32 01.00 10" title: string parts: CsiNode[] // root-level Part nodes } @@ -262,7 +262,7 @@ interface ApiResponse { -- Specs CREATE TABLE specs ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - section VARCHAR(20), -- "27 21 00" + section VARCHAR(20), -- "27 21 00" | "26 00 13.10" | "01 32 01.00 10" (expanded shape, ADR-020) title TEXT, source VARCHAR(20), -- 'ufgs' | 'arcat' | 'cpi' | 'unknown' created_at TIMESTAMPTZ DEFAULT now(), @@ -344,7 +344,7 @@ CREATE TABLE spec_references ( source_spec_id UUID REFERENCES specs(id) ON DELETE CASCADE, source_paragraph_id UUID REFERENCES paragraphs(id) ON DELETE CASCADE, target_type VARCHAR(20) NOT NULL, -- 'section' | 'paragraph' | 'standard' - target_spec_section VARCHAR(20), -- "09 91 00" — for section refs + target_spec_section VARCHAR(20), -- "09 91 00" / "26 00 13.10" — for section refs target_spec_id UUID REFERENCES specs(id) ON DELETE SET NULL, target_paragraph_id UUID REFERENCES paragraphs(id) ON DELETE SET NULL, standard_code TEXT, -- "ASTM C150" — for standard refs diff --git a/src/api/specs.integration.test.ts b/src/api/specs.integration.test.ts index 195b3d2..6205a3d 100644 --- a/src/api/specs.integration.test.ts +++ b/src/api/specs.integration.test.ts @@ -106,4 +106,26 @@ describe('PATCH /specs/:id (integration)', () => { }); expect(res.status).toBe(404); }); + + it('accepts a dotted-suffix section', async () => { + const res = await fetch(`${baseUrl}/specs/${testSpecId}`, { + method: 'PATCH', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ section: '27 21 00.10' }), + }); + const body = (await res.json()) as Record; + expect(res.status).toBe(200); + expect((body['data'] as Record)['section']).toBe('27 21 00.10'); + }); + + it('accepts an agency-suffix section', async () => { + const res = await fetch(`${baseUrl}/specs/${testSpecId}`, { + method: 'PATCH', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ section: '27 21 00.10 
20' }), + }); + const body = (await res.json()) as Record; + expect(res.status).toBe(200); + expect((body['data'] as Record)['section']).toBe('27 21 00.10 20'); + }); }); From c45a2278a8c6ab9b2fa913b0c3e4a35dca7ab05d Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:43:00 -0700 Subject: [PATCH 20/28] feat(db): seed tolerates bare SCN, normalizes section numbers before upsert --- src/db/seed.test.ts | 24 ++++++++++++++++++++++++ src/db/seed.ts | 14 +++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/db/seed.test.ts b/src/db/seed.test.ts index db59f9d..d2364cf 100644 --- a/src/db/seed.test.ts +++ b/src/db/seed.test.ts @@ -28,4 +28,28 @@ describe('extractSectionMeta', () => { expect(result?.sectionNumber).toBe('27 21 00'); expect(result?.title).toBe('Structured Cabling'); }); + + it('seed: extractSectionMeta retains .43 suffix, division still 27', () => { + const content = `SECTION 27 05 13.43TV DISTRIBUTION`; + expect(extractSectionMeta(content)).toEqual({ + sectionNumber: '27 05 13.43', + title: 'TV DISTRIBUTION', + division: '27', + }); + }); + + it('seed: bare SCN without SECTION prefix yields section', () => { + const content = `01 31 23.13 20SUSTAINABILITY REPORTING`; + expect(extractSectionMeta(content)?.sectionNumber).toBe('01 31 23.13 20'); + }); + + it('seed: whitespace dirt in SCN normalizes to canonical form', () => { + const content = `SECTION 26 00 13.10 X`; + expect(extractSectionMeta(content)?.sectionNumber).toBe('26 00 13.10'); + }); + + it('seed: unnormalizable SCN content is skipped (null), not seeded dirty', () => { + const content = `SECTION TBDX`; + expect(extractSectionMeta(content)).toBeNull(); + }); }); diff --git a/src/db/seed.ts b/src/db/seed.ts index 878bbb3..b7f6ede 100644 --- a/src/db/seed.ts +++ b/src/db/seed.ts @@ -2,6 +2,7 @@ import { readdir, readFile } from 'node:fs/promises'; import { join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; import type { Pool } from 'pg'; 
+import { normalizeSectionNumber } from '../lib/section-number.js'; // Provenance: see docs/adr/013-csi-sections-seed-public-domain-derivation.md const UFGS_DIR = join(process.cwd(), 'docs/references/UFGS'); @@ -12,16 +13,23 @@ export interface SectionRecord { readonly division: string; } -const SCN_RE = /SECTION ([^<]+)<\/SCN>/; +// Try prefix form first; fall back to bare number (2 corpus files omit 'SECTION '). +// Capture starts at \d so it cannot overlap with the preceding \s+ — no backtracking. +const SCN_PREFIX_RE = /SECTION\s+(\d[^<]*)<\/SCN>/i; +const SCN_BARE_RE = /(\d[^<]*)<\/SCN>/; const STL_RE = /([^<]+)<\/STL>/; export function extractSectionMeta(content: string): SectionRecord | null { - const scnMatch = SCN_RE.exec(content); + const scnMatch = SCN_PREFIX_RE.exec(content) ?? SCN_BARE_RE.exec(content); const stlMatch = STL_RE.exec(content); if (!scnMatch?.[1] || !stlMatch?.[1]) return null; - const sectionNumber = scnMatch[1].trim(); + // Catalog rows must be canonical — the shape CHECK constraint (migration 013) + // enforces this at the DB layer; skipping here keeps the seed loud-and-clean. 
+ const sectionNumber = normalizeSectionNumber(scnMatch[1]); + if (sectionNumber === null) return null; + const title = stlMatch[1].trim(); const division = sectionNumber.slice(0, 2); From 970c3efb165e603034b9d24f17049f8a1f99d5b6 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:53:18 -0700 Subject: [PATCH 21/28] fix(db): seed tolerates leading whitespace before SCN SECTION keyword --- src/db/seed.test.ts | 5 +++++ src/db/seed.ts | 11 ++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/db/seed.test.ts b/src/db/seed.test.ts index d2364cf..f00f842 100644 --- a/src/db/seed.test.ts +++ b/src/db/seed.test.ts @@ -52,4 +52,9 @@ describe('extractSectionMeta', () => { const content = `SECTION TBDX`; expect(extractSectionMeta(content)).toBeNull(); }); + + it('seed: leading whitespace before SECTION keyword tolerated (26_29_23.SEC corpus shape)', () => { + const content = ` SECTION 26 29 23X`; + expect(extractSectionMeta(content)?.sectionNumber).toBe('26 29 23'); + }); }); diff --git a/src/db/seed.ts b/src/db/seed.ts index b7f6ede..e6edc31 100644 --- a/src/db/seed.ts +++ b/src/db/seed.ts @@ -13,14 +13,15 @@ export interface SectionRecord { readonly division: string; } -// Try prefix form first; fall back to bare number (2 corpus files omit 'SECTION '). -// Capture starts at \d so it cannot overlap with the preceding \s+ — no backtracking. -const SCN_PREFIX_RE = /SECTION\s+(\d[^<]*)<\/SCN>/i; -const SCN_BARE_RE = /(\d[^<]*)<\/SCN>/; +// Optional leading whitespace + optional 'SECTION ' keyword: real corpus files +// carry both a bare SCN (2 files omit the keyword) and a leading space before it +// (e.g. 26_29_23.SEC: ` SECTION 26 29 23`). The capture is anchored to +// a digit, so [^<]* cannot backtrack past — no ReDoS. +const SCN_RE = /\s*(?:SECTION\s+)?(\d[^<]*)<\/SCN>/i; const STL_RE = /([^<]+)<\/STL>/; export function extractSectionMeta(content: string): SectionRecord | null { - const scnMatch = SCN_PREFIX_RE.exec(content) ?? 
SCN_BARE_RE.exec(content); + const scnMatch = SCN_RE.exec(content); const stlMatch = STL_RE.exec(content); if (!scnMatch?.[1] || !stlMatch?.[1]) return null; From 7a0797c4be62b2de2583f08564f46ed5a2ddbcc3 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 21:58:53 -0700 Subject: [PATCH 22/28] feat(db): warn on skipped section files during seed --- src/db/seed.test.ts | 21 +++++++++++++++- src/db/seed.ts | 59 ++++++++++++++++++++++++++++++++------------- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/src/db/seed.test.ts b/src/db/seed.test.ts index f00f842..9f7a2bb 100644 --- a/src/db/seed.test.ts +++ b/src/db/seed.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { extractSectionMeta } from './seed.js'; +import { extractSectionMeta, collectFromContents } from './seed.js'; describe('extractSectionMeta', () => { it('extracts section number and title from valid SEC content', () => { @@ -58,3 +58,22 @@ describe('extractSectionMeta', () => { expect(extractSectionMeta(content)?.sectionNumber).toBe('26 29 23'); }); }); + +describe('collectFromContents', () => { + it('counts every scanned content vs the records it keeps', () => { + const good = `SECTION 01 11 00SUMMARY`; + const bad = `SECTION TBDX`; + const result = collectFromContents([good, bad]); + expect(result.scanned).toBe(2); + expect(result.records).toHaveLength(1); + expect(result.records[0]?.sectionNumber).toBe('01 11 00'); + }); + + it('scanned equals kept when every content is canonical', () => { + const a = `SECTION 01 11 00A`; + const b = `SECTION 27 21 00B`; + const result = collectFromContents([a, b]); + expect(result.scanned).toBe(2); + expect(result.records).toHaveLength(2); + }); +}); diff --git a/src/db/seed.ts b/src/db/seed.ts index e6edc31..fe0e473 100644 --- a/src/db/seed.ts +++ b/src/db/seed.ts @@ -13,6 +13,11 @@ export interface SectionRecord { readonly division: string; } +export interface CollectResult { + readonly records: readonly 
SectionRecord[]; + readonly scanned: number; +} + // Optional leading whitespace + optional 'SECTION ' keyword: real corpus files // carry both a bare SCN (2 files omit the keyword) and a leading space before it // (e.g. 26_29_23.SEC: ` SECTION 26 29 23`). The capture is anchored to @@ -37,31 +42,44 @@ export function extractSectionMeta(content: string): SectionRecord | null { return { sectionNumber, title, division }; } -async function collectDivisionRecords(divPath: string): Promise { - const entries = await readdir(divPath, { withFileTypes: true }); - const records: SectionRecord[] = []; - - for (const entry of entries) { - if (!entry.isFile() || !entry.name.toLowerCase().endsWith('.sec')) continue; - const content = await readFile(join(divPath, entry.name), 'latin1'); - const record = extractSectionMeta(content); - if (record !== null) records.push(record); - } +/** + * Pure extraction over a batch of file contents. `scanned` counts every input; + * `records` holds only the canonical ones. A gap between the two is the + * silent-truncation signal the seed warns on. 
+ */ +export function collectFromContents(contents: readonly string[]): CollectResult { + const records = contents + .map((content) => extractSectionMeta(content)) + .filter((record): record is SectionRecord => record !== null); + + return { records, scanned: contents.length }; +} - return records; +async function collectDivisionRecords(divPath: string): Promise { + const entries = await readdir(divPath, { withFileTypes: true }); + const secFiles = entries.filter( + (entry) => entry.isFile() && entry.name.toLowerCase().endsWith('.sec') + ); + const contents = await Promise.all( + secFiles.map((entry) => readFile(join(divPath, entry.name), 'latin1')) + ); + + return collectFromContents(contents); } -async function collectRecords(): Promise { +async function collectRecords(): Promise { const entries = await readdir(UFGS_DIR, { withFileTypes: true }); - const all: SectionRecord[] = []; + const records: SectionRecord[] = []; + let scanned = 0; for (const entry of entries) { if (!entry.isDirectory()) continue; - const divRecords = await collectDivisionRecords(join(UFGS_DIR, entry.name)); - all.push(...divRecords); + const division = await collectDivisionRecords(join(UFGS_DIR, entry.name)); + records.push(...division.records); + scanned += division.scanned; } - return all; + return { records, scanned }; } async function seed(pool: Pool): Promise { @@ -70,9 +88,16 @@ async function seed(pool: Pool): Promise { logger.info('seeding CSI section reference data'); - const records = await collectRecords(); + const { records, scanned } = await collectRecords(); logger.info({ count: records.length }, 'collected section records'); + if (scanned > records.length) { + logger.warn( + { scanned, kept: records.length, skipped: scanned - records.length }, + 'section files skipped during seed' + ); + } + try { for (const { sectionNumber, title, division } of records) { await pool.query( From be6d98a355b6518da129358ad4f34fdc9745f042 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 
2026 22:05:19 -0700 Subject: [PATCH 23/28] =?UTF-8?q?feat(db):=20migration=20013=20=E2=80=94?= =?UTF-8?q?=20normalize=20section=20whitespace,=20add=20expanded-shape=20C?= =?UTF-8?q?HECK=20constraints?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../013_section_number_normalize_and_check.ts | 37 +++++++++++++++++++ src/db/queries/specs.integration.test.ts | 30 +++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 src/db/migrations/013_section_number_normalize_and_check.ts diff --git a/src/db/migrations/013_section_number_normalize_and_check.ts b/src/db/migrations/013_section_number_normalize_and_check.ts new file mode 100644 index 0000000..28e741c --- /dev/null +++ b/src/db/migrations/013_section_number_normalize_and_check.ts @@ -0,0 +1,37 @@ +import type { MigrationBuilder } from 'node-pg-migrate'; + +// Expanded shape (ADR-020): NN NN NN | NN NN NN.NN | NN NN NN.NN NN +const SHAPE = String.raw`^\d{2} \d{2} \d{2}(\.\d{2}( \d{2})?)?$`; + +// NBSP→space, collapse whitespace runs, trim — SQL mirror of +// normalizeSectionNumber() in src/lib/section-number.ts. +const norm = (col: string): string => + `btrim(regexp_replace(replace(${col}, chr(160), ' '), '\\s+', ' ', 'g'))`; + +export const up = (pgm: MigrationBuilder): void => { + // Step 1: normalize existing rows. If two rows collapse to the same key, + // the existing UNIQUE constraints (specs_section_source_unique, + // csi_sections_section_number_key on spec_sections) abort this migration + // loudly — by design; resolve duplicates manually before re-running. + pgm.sql(`UPDATE specs SET section = ${norm('section')} WHERE section <> ${norm('section')}`); + pgm.sql( + `UPDATE spec_sections SET section_number = ${norm('section_number')} WHERE section_number <> ${norm('section_number')}` + ); + + // Step 2: shape gates. specs.section additionally admits the 'unknown' + // sentinel written by the parse path for section-less documents. 
+ pgm.addConstraint('specs', 'specs_section_shape_check', { + check: `section ~ '${SHAPE}' OR section = 'unknown'`, + }); + pgm.addConstraint('spec_sections', 'spec_sections_section_number_shape_check', { + check: `section_number ~ '${SHAPE}'`, + }); + // Deliberately NO constraint on spec_references.target_spec_section: it + // records what the source document said (descriptive, not canonical). +}; + +export const down = (pgm: MigrationBuilder): void => { + pgm.dropConstraint('spec_sections', 'spec_sections_section_number_shape_check'); + pgm.dropConstraint('specs', 'specs_section_shape_check'); + // Whitespace normalization is lossy and is not reversed. +}; diff --git a/src/db/queries/specs.integration.test.ts b/src/db/queries/specs.integration.test.ts index d6afdd0..47b228e 100644 --- a/src/db/queries/specs.integration.test.ts +++ b/src/db/queries/specs.integration.test.ts @@ -102,3 +102,33 @@ describe('createSpec', () => { } }); }); + +describe('migration 013 — section shape CHECK constraints', () => { + it('db: specs.section CHECK accepts expanded shapes and the unknown sentinel', async () => { + const r1 = await pool.query( + `INSERT INTO specs (section, title, source) VALUES ('99 88 77.10 20', 'Shape OK', 'arcat') RETURNING section` + ); + expect(r1.rows[0]).toMatchObject({ section: '99 88 77.10 20' }); + const r2 = await pool.query( + `INSERT INTO specs (section, title, source) VALUES ('unknown', 'Sentinel OK', 'arcat') RETURNING section` + ); + expect(r2.rows[0]).toMatchObject({ section: 'unknown' }); + await pool.query( + `DELETE FROM specs WHERE section IN ('99 88 77.10 20', 'unknown') AND source = 'arcat'` + ); + }); + + it('db: specs.section CHECK rejects malformed sections', async () => { + await expect( + pool.query(`INSERT INTO specs (section, title, source) VALUES ('99 8877', 'Bad', 'arcat')`) + ).rejects.toThrow(/specs_section_shape_check/); + }); + + it('db: spec_sections shape CHECK rejects the sentinel (catalog is canonical-only)', async () => 
{ + await expect( + pool.query( + `INSERT INTO spec_sections (section_number, title, division) VALUES ('unknown', 'Bad', 'un')` + ) + ).rejects.toThrow(/spec_sections_section_number_shape_check/); + }); +}); From 9c9f5bc20b3e5c8694aef1c396c47fbde5deccb3 Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 22:14:34 -0700 Subject: [PATCH 24/28] test(db): leak-proof cleanup in shape-check accept test --- src/db/queries/specs.integration.test.ts | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/db/queries/specs.integration.test.ts b/src/db/queries/specs.integration.test.ts index 47b228e..8ee711f 100644 --- a/src/db/queries/specs.integration.test.ts +++ b/src/db/queries/specs.integration.test.ts @@ -105,17 +105,20 @@ describe('createSpec', () => { describe('migration 013 — section shape CHECK constraints', () => { it('db: specs.section CHECK accepts expanded shapes and the unknown sentinel', async () => { - const r1 = await pool.query( - `INSERT INTO specs (section, title, source) VALUES ('99 88 77.10 20', 'Shape OK', 'arcat') RETURNING section` - ); - expect(r1.rows[0]).toMatchObject({ section: '99 88 77.10 20' }); - const r2 = await pool.query( - `INSERT INTO specs (section, title, source) VALUES ('unknown', 'Sentinel OK', 'arcat') RETURNING section` - ); - expect(r2.rows[0]).toMatchObject({ section: 'unknown' }); - await pool.query( - `DELETE FROM specs WHERE section IN ('99 88 77.10 20', 'unknown') AND source = 'arcat'` - ); + try { + const r1 = await pool.query( + `INSERT INTO specs (section, title, source) VALUES ('99 88 77.10 20', 'Shape OK', 'arcat') RETURNING section` + ); + expect(r1.rows[0]).toMatchObject({ section: '99 88 77.10 20' }); + const r2 = await pool.query( + `INSERT INTO specs (section, title, source) VALUES ('unknown', 'Sentinel OK', 'arcat') RETURNING section` + ); + expect(r2.rows[0]).toMatchObject({ section: 'unknown' }); + } finally { + await pool.query( + `DELETE FROM specs WHERE section 
IN ('99 88 77.10 20', 'unknown') AND source = 'arcat'` + ); + } }); it('db: specs.section CHECK rejects malformed sections', async () => { From 1f899ba04df449f0e488fe05f5bd782b00c29f0e Mon Sep 17 00:00:00 2001 From: thewrz Date: Fri, 5 Jun 2026 22:18:05 -0700 Subject: [PATCH 25/28] =?UTF-8?q?test(integration):=20agency-suffixed=20.S?= =?UTF-8?q?EC=20end-to-end=20=E2=80=94=20load,=20persist,=20exact-match=20?= =?UTF-8?q?refs,=20catalog=20join?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lib/file-loader.integration.test.ts | 68 +++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/lib/file-loader.integration.test.ts b/src/lib/file-loader.integration.test.ts index acbcd72..5ec7c47 100644 --- a/src/lib/file-loader.integration.test.ts +++ b/src/lib/file-loader.integration.test.ts @@ -87,4 +87,72 @@ describe('loadFiles() integration', () => { ); expect(row.rows).toHaveLength(0); }); + + const AGENCY_FIXTURE = path.join( + PROJECT_ROOT, + 'docs/references/UFGS/DIVISION_01/01_32_01.00_10.SEC' + ); + + it('e2e: agency-suffixed corpus file loads with section intact', async () => { + const result = await loadFiles([AGENCY_FIXTURE]); + expect(result.succeeded).toBe(1); + + const row = await pool.query<{ section: string }>( + `SELECT section FROM specs WHERE section = '01 32 01.00 10' AND source = 'ufgs' LIMIT 1` + ); + expect(row.rows[0]?.section).toBe('01 32 01.00 10'); + }); + + it('e2e: ref targeting an agency-suffixed section resolves by exact match', async () => { + const { persistParsedSpec } = await import('../db/index.js'); + const target = await pool.query<{ id: string }>( + `SELECT id FROM specs WHERE section = '01 32 01.00 10' AND source = 'ufgs' LIMIT 1` + ); + expect(target.rows[0]?.id).toBeDefined(); + + const sourceNodeId = '00000000-0000-4000-8000-00000000aaaa'; + const specId = await persistParsedSpec({ + tree: { + id: '00000000-0000-4000-8000-00000000bbbb', + section: '99 88 77', + 
title: 'Ref Source', + parts: [ + { + id: sourceNodeId, + type: 'part', + text: 'See Section 01 32 01.00 10.', + children: [], + meta: { source: 'ufgs' }, + }, + ], + }, + refs: [ + { + sourceNodeId, + targetType: 'section', + targetSpecSection: '01 32 01.00 10', + referenceText: 'See Section 01 32 01.00 10.', + }, + ], + }); + + try { + const refRow = await pool.query<{ target_spec_id: string | null }>( + `SELECT target_spec_id FROM spec_references WHERE source_spec_id = $1`, + [specId] + ); + expect(refRow.rows[0]?.target_spec_id).toBe(target.rows[0]?.id); + } finally { + await pool.query(`DELETE FROM specs WHERE id = $1`, [specId]); + } + }); + + it('e2e: catalog join + division filter — suffixed section listed in database for division 01', async () => { + const { listSpecSections } = await import('../db/index.js'); + const sections = await listSpecSections('01'); + const entry = sections.find((s) => s.section === '01 32 01.00 10'); + // catalog row exists (pnpm seed) AND exact-equality join sees the loaded spec + expect(entry).toBeDefined(); + expect(entry?.inDatabase).toBe(true); + }); }); From b9f6978ac0f0ee6ab27d61d2024a1263199fbecc Mon Sep 17 00:00:00 2001 From: thewrz Date: Sat, 6 Jun 2026 12:15:02 -0700 Subject: [PATCH 26/28] docs(plans): fix markdownlint MD038/MD040 in plan doc --- .../plans/2026-06-05-section-number-expansion.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/superpowers/plans/2026-06-05-section-number-expansion.md b/docs/superpowers/plans/2026-06-05-section-number-expansion.md index d88037e..ada60e3 100644 --- a/docs/superpowers/plans/2026-06-05-section-number-expansion.md +++ b/docs/superpowers/plans/2026-06-05-section-number-expansion.md @@ -11,7 +11,7 @@ **Branch:** All work on `feat/section-number-expansion` (this worktree, based on origin/main `ba99b64`). PR cutting at the end (Task 16). 
**Key grammar facts** (corpus-verified, 665 UFGS `.SEC` files): -- 422 base / 162 dotted / 76 agency-suffixed; whitespace dirt exists (leading/double spaces); 2 SCN tags lack the `SECTION ` prefix. +- 422 base / 162 dotted / 76 agency-suffixed; whitespace dirt exists (leading/double spaces); 2 SCN tags lack the `SECTION` keyword prefix. - `26 00 13` and `26 00 13.10` and `26 00 13.20` are THREE DIFFERENT sections. Truncation is data corruption. - Linking is **exact match only** (locked decision). Lexicographic sort is already correct for this grammar — do not touch ordering. - JS `\s` already matches NBSP (` `); normalization still canonicalizes runs to single ASCII spaces. @@ -302,9 +302,9 @@ what the source document said. - [ ] **Step 2: Add to CLAUDE.md ADR list** In `CLAUDE.md`, in the `docs/adr/` listing, after the line -` 019-scope-boundaries-content-neutral-platform.md`, add: +`019-scope-boundaries-content-neutral-platform.md`, add: -``` +```text 020-section-number-expanded-shape.md ``` From fb18cf3c860aace2b03f8ad2eec0911b729fe7f9 Mon Sep 17 00:00:00 2001 From: thewrz Date: Sat, 6 Jun 2026 12:15:02 -0700 Subject: [PATCH 27/28] =?UTF-8?q?ci:=20run=20PR=20checks=20for=20all=20bas?= =?UTF-8?q?e=20branches=20=E2=80=94=20stacked=20sub-MVP=20PRs=20need=20ind?= =?UTF-8?q?ependent=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 870be6d..d30f9ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,8 +3,9 @@ name: CI on: push: branches: ["main"] + # No base-branch filter: stacked sub-MVP PRs target feature branches and + # must pass CI independently (CLAUDE.md PR discipline). 
pull_request: - branches: ["main"] permissions: contents: read From f188957a2c69cc2040ed9cfac0da0498ba1746b7 Mon Sep 17 00:00:00 2001 From: thewrz Date: Sat, 6 Jun 2026 12:26:26 -0700 Subject: [PATCH 28/28] =?UTF-8?q?fix(api):=20Zod-validate=20/parse=20body?= =?UTF-8?q?=20fields=20=E2=80=94=20non-string=20section/title=20now=20400?= =?UTF-8?q?=20instead=20of=20silent=20drop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/api/parse.test.ts | 29 +++++++++++++++++++++++++++++ src/api/parse.ts | 21 ++++++++++++--------- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/api/parse.test.ts b/src/api/parse.test.ts index 1722ed3..64d13a6 100644 --- a/src/api/parse.test.ts +++ b/src/api/parse.test.ts @@ -172,6 +172,35 @@ describe('parseHandler', () => { ); expect(createJob).not.toHaveBeenCalled(); }); + + it('parse: non-string body fields → 400, not silently dropped', async () => { + const { createJob } = await import('../lib/jobs.js'); + const { parseHandler } = await import('./parse.js'); + const req = { + file: { originalname: 'spec.txt', mimetype: 'text/plain', buffer: Buffer.from('x') }, + body: { section: 12345 }, + } as unknown as Request; + const res = makeRes(); + await parseHandler(req, res); + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ error: 'invalid request body' }) + ); + expect(createJob).not.toHaveBeenCalled(); + }); + + it('parse: non-object body treated as empty (multer yields {} for fieldless multipart)', async () => { + const { createJob } = await import('../lib/jobs.js'); + vi.mocked(createJob).mockReturnValue('no-body-job'); + const { parseHandler } = await import('./parse.js'); + const req = { + file: { originalname: 'spec.txt', mimetype: 'text/plain', buffer: Buffer.from('x') }, + body: undefined, + } as unknown as Request; + const res = makeRes(); + await parseHandler(req, res); + 
expect(res.status).toHaveBeenCalledWith(202); + }); }); describe('parseJobHandler', () => { diff --git a/src/api/parse.ts b/src/api/parse.ts index ed83878..823bf40 100644 --- a/src/api/parse.ts +++ b/src/api/parse.ts @@ -17,14 +17,12 @@ interface ParseBody { readonly title?: string; } -function parseBody(raw: unknown): ParseBody { - if (typeof raw !== 'object' || raw === null) return {}; - const r = raw as Record; - return { - ...(typeof r['section'] === 'string' ? { section: r['section'] } : {}), - ...(typeof r['title'] === 'string' ? { title: r['title'] } : {}), - }; -} +// Multipart text fields from multer. Non-strict (unknown keys stripped) to +// match PatchSpecBodySchema; non-string section/title is a 400, not a silent drop. +const ParseBodySchema = z.object({ + section: z.string().exactOptional(), + title: z.string().exactOptional(), +}); const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; const ALLOWED_EXT = new Set(['.docx', '.sec', '.txt']); @@ -65,7 +63,12 @@ export async function parseHandler(req: Request, res: Response): Promise { return; } - const rawBody = parseBody(req.body); + const bodyResult = ParseBodySchema.safeParse(req.body ?? {}); + if (!bodyResult.success) { + res.status(400).json({ success: false, error: 'invalid request body' }); + return; + } + const rawBody: ParseBody = bodyResult.data; const normalizedSection = rawBody.section !== undefined ? normalizeSectionNumber(rawBody.section) : undefined; if (rawBody.section !== undefined && normalizedSection === null) {