From 14f2d29d0b21af58c02f57e86c9ee7c9b7d0241b Mon Sep 17 00:00:00 2001 From: Alan Shurafa Date: Fri, 5 Jun 2026 22:23:24 -0400 Subject: [PATCH] [recipes] Add import verification --- recipes/import-verification/README.md | 135 +++++++ .../fixtures/sample-thoughts.json | 73 ++++ recipes/import-verification/metadata.json | 21 ++ .../import-verification/verify-imports.mjs | 339 ++++++++++++++++++ 4 files changed, 568 insertions(+) create mode 100644 recipes/import-verification/README.md create mode 100644 recipes/import-verification/fixtures/sample-thoughts.json create mode 100644 recipes/import-verification/metadata.json create mode 100644 recipes/import-verification/verify-imports.mjs diff --git a/recipes/import-verification/README.md b/recipes/import-verification/README.md new file mode 100644 index 00000000..8aebf5a9 --- /dev/null +++ b/recipes/import-verification/README.md @@ -0,0 +1,135 @@ +# Import Verification + +> Read-only checks for imported Open Brain thoughts. + +## What It Does + +This recipe verifies that imports landed in `public.thoughts` with enough metadata to audit, filter, and troubleshoot them later. It checks source coverage, metadata completeness, missing embeddings, duplicate fingerprints, sample rows, and optional text probes. It does not write to your database. + +Use it after running an import recipe such as ChatGPT Conversation Import, Obsidian Vault Import, Gmail import, Google Activity Import, Readwise Import, or a custom importer. 
+ +## Prerequisites + +- Working Open Brain setup ([guide](../../docs/01-getting-started.md)) +- Node.js 18+ +- Supabase project URL and service-role key for live database checks + +## Credential Tracker + +```text +IMPORT VERIFICATION -- CREDENTIAL TRACKER +-------------------------------------- + +FROM YOUR OPEN BRAIN SETUP + Supabase Project URL: ____________ + Supabase Service Role Key: ____________ + +OPTIONAL + Source to verify: ____________ (example: chatgpt, obsidian, gmail) + Probe text: ____________ (a phrase you expect to find) + +-------------------------------------- +``` + +## Steps + +1. Open this recipe folder: + + ```bash + cd recipes/import-verification + ``` + +2. Run the fixture check first. This proves the script works without using credentials: + + ```bash + node verify-imports.mjs --fixture fixtures/sample-thoughts.json + ``` + +3. Export your Supabase credentials: + + ```bash + export SUPABASE_URL="https://YOUR_PROJECT_REF.supabase.co" + export SUPABASE_SERVICE_ROLE_KEY="your-service-role-key" + ``` + + You can also put these values in `.env.local` in this recipe folder. Do not commit `.env.local`. + +4. Verify recent imports: + + ```bash + node verify-imports.mjs --limit 1000 + ``` + +5. Verify one source: + + ```bash + node verify-imports.mjs --source chatgpt --limit 1000 --sample 5 + ``` + +6. Add a text probe when you know a phrase should exist: + + ```bash + node verify-imports.mjs --source obsidian --probe "home maintenance" --limit 1000 + ``` + +7. Use strict mode for CI-style checks: + + ```bash + node verify-imports.mjs --source readwise --strict + ``` + +## Options + +| Flag | Default | Description | +| ---- | ------- | ----------- | +| `--source SOURCE` | all sources | Filter scanned rows to a source slug such as `chatgpt`, `obsidian`, or `gmail`. | +| `--limit N` | `1000` | Maximum recent rows to scan. | +| `--sample N` | `5` | Number of sample thoughts to print. 
| +| `--probe TEXT` | none | Check whether scanned thoughts contain a phrase. This is a text probe, not semantic MCP search. | +| `--json` | off | Print machine-readable JSON instead of human-readable output. | +| `--strict` | off | Exit with code `1` when missing metadata, missing embeddings, duplicate fingerprints, or failed probes are found. | +| `--fixture FILE` | none | Analyze a local JSON fixture instead of connecting to Supabase. | +| `--help` | off | Show usage. | + +## What Gets Checked + +- **Rows by source**: counts scanned rows by `source_type`, `metadata.source_type`, or `metadata.source`. +- **Metadata completeness**: checks for `source`, `source_type`, `source_label`, `imported_at`, `importer_name`, `importer_version`, `input_hash`, `content_fingerprint`, `sensitivity_tier`, and `provenance`. +- **Embeddings**: flags rows with missing or empty embeddings. +- **Duplicate fingerprints**: finds repeated `content_fingerprint` values in the scanned rows. +- **Samples**: prints representative imported rows with ID, source, created date, and content preview. +- **Probe text**: checks whether any scanned content contains a phrase you expect to find. + +Older importers may not have every metadata field yet. By default the script reports those gaps without failing. Use `--strict` when you are validating a new importer that should meet the current contract. + +## Expected Outcome + +For a healthy import, you should see: + +- The expected source has non-zero rows. +- Recent imported rows have source metadata. +- Embeddings are present unless the importer intentionally skipped embeddings. +- Duplicate fingerprints are zero or explainable. +- A probe phrase finds at least one matching row when provided. + +## Exit Codes + +| Code | Meaning | +| ---- | ------- | +| `0` | Checks ran. In non-strict mode this can include warnings. | +| `1` | Strict mode found verification failures. 
| +| `2` | Missing configuration, unreadable fixture, JSON parse error, or Supabase query failure. | + +## Troubleshooting + +**Issue: `SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY are required`** +Solution: Export both variables or create a local `.env.local` file in this recipe folder. Use the service-role key, not the anon key. + +**Issue: Source count is zero** +Solution: Check the source slug used by the importer. Some older recipes store only `metadata.source`, while newer schemas may also have a top-level `source_type` column. + +**Issue: Missing metadata warnings** +Solution: Older imports may predate the current metadata contract. The warnings are useful for cleanup, but they are not necessarily a failed import unless you use `--strict`. + +**Issue: Probe text fails** +Solution: Increase `--limit`, use a simpler phrase, or verify with semantic MCP search. The probe is a local text check over scanned rows, not vector search. diff --git a/recipes/import-verification/fixtures/sample-thoughts.json b/recipes/import-verification/fixtures/sample-thoughts.json new file mode 100644 index 00000000..122f3f10 --- /dev/null +++ b/recipes/import-verification/fixtures/sample-thoughts.json @@ -0,0 +1,73 @@ +[ + { + "id": "00000000-0000-4000-8000-000000000001", + "content": "[ChatGPT: Database Migration Strategy | 2025-09-15] Chose PostgreSQL for the reporting service because the team needed relational joins, transactional consistency, and familiar operational tooling.", + "created_at": "2026-06-06T00:00:00Z", + "embedding": [0.01, 0.02, 0.03], + "source_type": "chatgpt", + "content_fingerprint": "sha256:9ec0acdf6f8c1f82d681d0e3b8ec8a7f1ea61d5b4179a6149ddccde2b1a1e33a", + "metadata": { + "source": "chatgpt", + "source_type": "chatgpt", + "source_label": "ChatGPT export - conversations.json", + "source_id": "conversation_abc123:thought_1", + "source_path": "chatgpt-export/conversations.json", + "source_locator": "messages 12-18", + "original_created_at": 
"2025-09-15T14:22:00Z", + "imported_at": "2026-06-06T00:00:00Z", + "importer_name": "chatgpt-conversation-import", + "importer_version": "1.0.0", + "input_hash": "sha256:2b45a3b4a377d0f5f27b0f8ad77e8f4fc4c8795a739e4fbd9610b7f742b2c4f1", + "content_fingerprint": "sha256:9ec0acdf6f8c1f82d681d0e3b8ec8a7f1ea61d5b4179a6149ddccde2b1a1e33a", + "sensitivity_tier": "standard", + "provenance": { + "method": "llm_extraction", + "source_record": "conversation abc123", + "source_locator": "messages 12-18", + "artifact": "chatgpt-export/conversations.json", + "extractor_model": "openai/gpt-4o-mini", + "review_status": "unreviewed" + }, + "type": "decision", + "topics": ["database", "architecture", "reporting"], + "people": [], + "action_items": [], + "confidence": "firm" + } + }, + { + "id": "00000000-0000-4000-8000-000000000002", + "content": "[Obsidian: Home Maintenance | Projects] Replace HVAC filters quarterly and keep receipts in the home maintenance folder.", + "created_at": "2026-06-06T00:05:00Z", + "embedding": [0.04, 0.05, 0.06], + "source_type": "obsidian", + "content_fingerprint": "sha256:5f6c9dd716d345f5356e833052e7f2ecfb6a17674f44b61d8ed847a6fc777100", + "metadata": { + "source": "obsidian", + "source_type": "obsidian", + "source_label": "House vault", + "source_id": "Projects/Home Maintenance.md", + "source_path": "Projects/Home Maintenance.md", + "source_locator": "heading: Seasonal maintenance", + "original_created_at": "2025-12-01T09:00:00Z", + "imported_at": "2026-06-06T00:05:00Z", + "importer_name": "obsidian-vault-import", + "importer_version": "1.0.0", + "input_hash": "sha256:083da49e5e445bd40357f6f591af393b806d1f6a2b1d8d1f6a5c9bf8a8c76f22", + "content_fingerprint": "sha256:5f6c9dd716d345f5356e833052e7f2ecfb6a17674f44b61d8ed847a6fc777100", + "sensitivity_tier": "personal", + "provenance": { + "method": "chunked_record", + "source_record": "Projects/Home Maintenance.md", + "source_locator": "heading: Seasonal maintenance", + "artifact": "Projects/Home 
Maintenance.md", + "review_status": "unreviewed" + }, + "type": "task", + "topics": ["home", "maintenance"], + "people": [], + "action_items": ["Replace HVAC filters quarterly"], + "confidence": "firm" + } + } +] diff --git a/recipes/import-verification/metadata.json b/recipes/import-verification/metadata.json new file mode 100644 index 00000000..6b94e309 --- /dev/null +++ b/recipes/import-verification/metadata.json @@ -0,0 +1,21 @@ +{ + "name": "Import Verification", + "description": "Read-only verification script for checking Open Brain import coverage, metadata completeness, embeddings, duplicate fingerprints, and sample rows.", + "category": "recipes", + "author": { + "name": "Alan Shurafa", + "github": "alanshurafa" + }, + "version": "1.0.0", + "requires": { + "open_brain": true, + "services": ["Supabase"], + "tools": ["Node.js 18+"] + }, + "requires_skills": [], + "tags": ["import", "verification", "audit", "metadata", "read-only"], + "difficulty": "beginner", + "estimated_time": "10 minutes", + "created": "2026-06-06", + "updated": "2026-06-06" +} diff --git a/recipes/import-verification/verify-imports.mjs b/recipes/import-verification/verify-imports.mjs new file mode 100644 index 00000000..a3461d29 --- /dev/null +++ b/recipes/import-verification/verify-imports.mjs @@ -0,0 +1,339 @@ +#!/usr/bin/env node + +import fs from "node:fs"; +import path from "node:path"; + +const REQUIRED_METADATA_FIELDS = [ + "source", + "source_type", + "source_label", + "imported_at", + "importer_name", + "importer_version", + "input_hash", + "content_fingerprint", + "sensitivity_tier", + "provenance", +]; + +const OPTIONAL_COLUMNS = ["source_type", "content_fingerprint"]; +const BASE_COLUMNS = ["id", "content", "metadata", "created_at", "embedding"]; + +function usage() { + console.log(`Usage: + node verify-imports.mjs [options] + +Options: + --source SOURCE Filter scanned rows to a source slug + --limit N Maximum recent rows to scan (default: 1000) + --sample N Number of sample 
/**
 * Parse command-line arguments into an options object.
 *
 * Switches (`--help`, `--json`, `--strict`) take no value. Value flags
 * (`--source`, `--limit`, `--sample`, `--probe`, `--fixture`) accept either
 * `--flag value` or `--flag=value`.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{source: string, limit: number, sample: number, probe: string,
 *   json: boolean, strict: boolean, fixture: string, help: boolean}}
 * @throws {Error} On an unknown argument, a value flag with no value, or a
 *   non-integer `--limit` / `--sample`.
 */
function parseArgs(argv) {
  const args = {
    source: "",
    limit: 1000,
    sample: 5,
    probe: "",
    json: false,
    strict: false,
    fixture: "",
    help: false,
  };

  const switches = new Set(["help", "json", "strict"]);
  // Value flags mapped to the converter applied to their raw text.
  const valueFlags = {
    source: (raw) => raw,
    probe: (raw) => raw,
    fixture: (raw) => raw,
    limit: (raw) => parsePositiveInt(raw, "limit"),
    sample: (raw) => parsePositiveInt(raw, "sample"),
  };

  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (!arg.startsWith("--")) {
      throw new Error(`Unknown argument: ${arg}`);
    }

    const equalsAt = arg.indexOf("=");
    const name = equalsAt === -1 ? arg.slice(2) : arg.slice(2, equalsAt);

    if (equalsAt === -1 && switches.has(name)) {
      args[name] = true;
      continue;
    }
    if (!Object.hasOwn(valueFlags, name)) {
      throw new Error(`Unknown argument: ${arg}`);
    }

    let raw;
    if (equalsAt !== -1) {
      raw = arg.slice(equalsAt + 1);
    } else {
      raw = argv[i + 1];
      // Without this guard `--source --strict` would silently consume the
      // next flag as the value. Values that really start with "--" can still
      // be passed with the `--flag=value` form.
      if (raw === undefined || raw.startsWith("--")) {
        throw new Error(`--${name} requires a value`);
      }
      i += 1;
    }
    args[name] = valueFlags[name](raw);
  }

  return args;
}

/**
 * Convert a flag value to a non-negative integer.
 *
 * Despite the name, zero is accepted (the error text says "non-negative").
 * Only plain decimal digits are allowed, so inputs such as `12abc`, `1.5`
 * or `-1` are rejected instead of being truncated by `parseInt`.
 *
 * @param {string} value - Raw text from the command line.
 * @param {string} name - Flag name used in the error message.
 * @returns {number} The parsed integer.
 * @throws {Error} When the value is not a non-negative integer.
 */
function parsePositiveInt(value, name) {
  const text = String(value ?? "").trim();
  const parsed = /^\d+$/.test(text) ? Number(text) : Number.NaN;
  if (!Number.isSafeInteger(parsed)) {
    throw new Error(`--${name} must be a non-negative integer`);
  }
  return parsed;
}
/**
 * Load `KEY=value` pairs from a dotenv-style file into `process.env`.
 *
 * A missing file is ignored. Blank lines, `#` comments and lines that do not
 * look like an assignment are skipped. Variables that already hold a
 * non-empty value in the environment are never overwritten.
 *
 * @param {string} filePath - Path of the env file to read.
 * @returns {void}
 */
function loadDotEnv(filePath) {
  if (!fs.existsSync(filePath)) return;

  for (const line of fs.readFileSync(filePath, "utf8").split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    // Accept an optional `export ` prefix and spaces around `=` so that
    // shell-style lines pasted into the file are not silently dropped.
    const match = /^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$/.exec(trimmed);
    if (!match) continue;

    const [, key, rawValue] = match;
    if (process.env[key]) continue;

    // Strip quotes only when the same quote character wraps the whole value;
    // a lone or mismatched quote is part of the value.
    const quoted = /^(["'])(.*)\1$/.exec(rawValue);
    process.env[key] = quoted ? quoted[2] : rawValue;
  }
}

/**
 * Build the request headers sent with every Supabase REST call.
 *
 * @param {string} serviceKey - Key used for both `apikey` and the bearer token.
 * @returns {{apikey: string, Authorization: string, "Content-Type": string}}
 */
function authHeaders(serviceKey) {
  return {
    apikey: serviceKey,
    Authorization: `Bearer ${serviceKey}`,
    "Content-Type": "application/json",
  };
}

/**
 * Fetch up to `args.limit` of the most recent rows from the `thoughts` table.
 *
 * Credentials come from `SUPABASE_URL` plus `SUPABASE_SERVICE_ROLE_KEY` (or
 * `SUPABASE_SECRET_KEY`), read from the environment after loading
 * `.env.local` and `.env` from the current working directory.
 *
 * @param {{limit: number}} args - Parsed options; only `limit` is read here.
 * @returns {Promise<object[]>} Rows in descending `created_at` order.
 * @throws {Error} When credentials are missing or a page request fails.
 */
async function fetchThoughts(args) {
  loadDotEnv(path.join(process.cwd(), ".env.local"));
  loadDotEnv(path.join(process.cwd(), ".env"));

  const supabaseUrl = process.env.SUPABASE_URL;
  const serviceKey = process.env.SUPABASE_SERVICE_ROLE_KEY || process.env.SUPABASE_SECRET_KEY;

  if (!supabaseUrl || !serviceKey) {
    throw new Error("SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY are required for live checks");
  }

  const columns = await detectColumns(supabaseUrl, serviceKey);
  const endpoint = `${supabaseUrl.replace(/\/$/, "")}/rest/v1/thoughts`;
  const rows = [];
  const pageSize = Math.min(1000, Math.max(1, args.limit || 1000));

  for (let offset = 0; rows.length < args.limit; offset += pageSize) {
    const limit = Math.min(pageSize, args.limit - rows.length);
    const url = new URL(endpoint);
    url.searchParams.set("select", columns.join(","));
    // `id` is a tiebreaker: rows with identical `created_at` values would
    // otherwise have no fixed order, so offset paging could return a row on
    // two pages (a false duplicate fingerprint) or skip it entirely.
    url.searchParams.set("order", "created_at.desc,id.desc");
    url.searchParams.set("limit", String(limit));
    url.searchParams.set("offset", String(offset));

    const response = await fetch(url, { headers: authHeaders(serviceKey) });
    if (!response.ok) {
      throw new Error(`Supabase query failed: ${response.status} ${await response.text()}`);
    }

    const page = await response.json();
    rows.push(...page);
    // A short page means the table has no more rows.
    if (page.length < limit) break;
  }

  return rows;
}
/**
 * Work out which columns can be selected from the `thoughts` table.
 *
 * The base columns are mandatory. Each optional column is probed on its own,
 * so one missing optional column never hides another that does exist.
 *
 * @param {string} supabaseUrl - Supabase project URL.
 * @param {string} serviceKey - Key passed to `authHeaders`.
 * @returns {Promise<string[]>} Base columns followed by the available
 *   optional columns, in `OPTIONAL_COLUMNS` order.
 * @throws {Error} When even the base columns cannot be selected.
 */
async function detectColumns(supabaseUrl, serviceKey) {
  // Fast path: a single request when the schema has every column.
  const allColumns = [...BASE_COLUMNS, ...OPTIONAL_COLUMNS];
  if (await canSelectColumns(supabaseUrl, serviceKey, allColumns)) return allColumns;

  if (!(await canSelectColumns(supabaseUrl, serviceKey, BASE_COLUMNS))) {
    throw new Error(`Supabase thoughts query failed with required columns: ${BASE_COLUMNS.join(", ")}`);
  }

  const columns = [...BASE_COLUMNS];
  for (const optionalColumn of OPTIONAL_COLUMNS) {
    const candidate = [...BASE_COLUMNS, optionalColumn];
    if (await canSelectColumns(supabaseUrl, serviceKey, candidate)) {
      columns.push(optionalColumn);
    }
  }
  return columns;
}

/**
 * Check whether a one-row select of the given columns succeeds.
 *
 * @param {string} supabaseUrl - Supabase project URL.
 * @param {string} serviceKey - Key passed to `authHeaders`.
 * @param {string[]} columns - Column names to request.
 * @returns {Promise<boolean>} `true` when the response status is OK.
 */
async function canSelectColumns(supabaseUrl, serviceKey, columns) {
  const baseUrl = supabaseUrl.replace(/\/$/, "");
  const url = new URL(`${baseUrl}/rest/v1/thoughts`);
  url.searchParams.set("select", columns.join(","));
  url.searchParams.set("limit", "1");

  const { ok } = await fetch(url, { headers: authHeaders(serviceKey) });
  return ok;
}

/**
 * Read rows from a local JSON fixture.
 *
 * @param {string} filePath - Path of the JSON file.
 * @returns {object[]} The parsed array, or a one-element array when the file
 *   holds a single value.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
function readFixture(filePath) {
  const parsed = JSON.parse(fs.readFileSync(filePath, "utf8"));
  if (Array.isArray(parsed)) return parsed;
  return [parsed];
}

/**
 * Resolve a row's source slug.
 *
 * @param {object} row - A thought row.
 * @returns {string} The first non-empty value among the `source_type` column,
 *   `metadata.source_type` and `metadata.source`, else `"unknown"`.
 */
function rowSource(row) {
  const { source_type: metaSourceType, source: metaSource } = row.metadata || {};
  return row.source_type || metaSourceType || metaSource || "unknown";
}

/**
 * Resolve a row's content fingerprint.
 *
 * @param {object} row - A thought row.
 * @returns {string} The `content_fingerprint` column, else the metadata
 *   value, else an empty string.
 */
function rowFingerprint(row) {
  const fromMetadata = (row.metadata || {}).content_fingerprint;
  return row.content_fingerprint || fromMetadata || "";
}

/**
 * Produce a single-line preview of a row's content.
 *
 * @param {*} content - Row content; falsy values become an empty string.
 * @returns {string} Whitespace-collapsed text cut to 140 characters.
 */
function preview(content) {
  const singleLine = String(content || "").replace(/\s+/g, " ").trim();
  return singleLine.slice(0, 140);
}

/**
 * List the required metadata fields that a row lacks.
 *
 * `content_fingerprint` and `source_type` count as present when either the
 * metadata entry or the top-level column is set.
 *
 * @param {object} row - A thought row.
 * @returns {string[]} Missing field names in `REQUIRED_METADATA_FIELDS` order.
 */
function missingMetadataFields(row) {
  const metadata = row.metadata || {};
  return REQUIRED_METADATA_FIELDS.filter((field) => {
    if (field === "content_fingerprint" || field === "source_type") {
      return !metadata[field] && !row[field];
    }
    const value = metadata[field];
    return value === undefined || value === null || value === "";
  });
}

/**
 * Analyze scanned rows and build the verification report.
 *
 * @param {object[]} rows - Rows from Supabase or a fixture.
 * @param {{source?: string, sample?: number, probe?: string}} args - Options:
 *   source slug to filter on, number of samples to keep, and probe text
 *   (matched case-insensitively against row content).
 * @returns {object} Report with `scanned_rows`, `matched_rows`,
 *   `source_filter`, `by_source`, `missing_metadata`, `missing_embeddings`,
 *   `duplicate_fingerprints`, `probe`, `samples` and `warnings`.
 */
function analyzeRows(rows, args) {
  // Tolerate a missing probe so the function can be called without the CLI.
  const probeNeedle = String(args.probe ?? "").toLowerCase();
  const filtered = args.source
    ? rows.filter((row) => rowSource(row) === args.source)
    : rows;

  const bySource = new Map();
  const idsByFingerprint = new Map();
  const missingMetadata = [];
  const missingEmbeddings = [];
  const samples = [];
  let probeMatches = 0;

  for (const row of filtered) {
    const source = rowSource(row);
    bySource.set(source, (bySource.get(source) || 0) + 1);

    const missingFields = missingMetadataFields(row);
    if (missingFields.length) {
      missingMetadata.push({ id: row.id, source, missing_fields: missingFields });
    }

    const { embedding } = row;
    if (!embedding || (Array.isArray(embedding) && embedding.length === 0)) {
      missingEmbeddings.push({ id: row.id, source });
    }

    const fingerprint = rowFingerprint(row);
    if (fingerprint) {
      const ids = idsByFingerprint.get(fingerprint) || [];
      ids.push(row.id);
      idsByFingerprint.set(fingerprint, ids);
    }

    if (probeNeedle && String(row.content || "").toLowerCase().includes(probeNeedle)) {
      probeMatches += 1;
    }

    if (samples.length < args.sample) {
      samples.push({
        id: row.id,
        source,
        created_at: row.created_at,
        preview: preview(row.content),
      });
    }
  }

  const duplicateFingerprints = [];
  for (const [fingerprint, ids] of idsByFingerprint) {
    if (ids.length > 1) duplicateFingerprints.push({ fingerprint, ids });
  }

  const warnings = [];
  if (args.source && filtered.length === 0) {
    warnings.push(`No rows found for source '${args.source}'`);
  } else if (rows.length === 0) {
    // An empty scan verifies nothing; report it so strict mode cannot pass.
    warnings.push("No rows were scanned");
  }
  if (missingMetadata.length) warnings.push(`${missingMetadata.length} row(s) missing required metadata fields`);
  if (missingEmbeddings.length) warnings.push(`${missingEmbeddings.length} row(s) missing embeddings`);
  if (duplicateFingerprints.length) warnings.push(`${duplicateFingerprints.length} duplicate fingerprint group(s) found`);
  if (probeNeedle && probeMatches === 0) warnings.push(`Probe text not found: ${args.probe}`);

  const sortedSources = [...bySource.entries()].sort(([a], [b]) => a.localeCompare(b));

  return {
    scanned_rows: rows.length,
    matched_rows: filtered.length,
    source_filter: args.source || null,
    by_source: Object.fromEntries(sortedSources),
    missing_metadata: missingMetadata,
    missing_embeddings: missingEmbeddings,
    duplicate_fingerprints: duplicateFingerprints,
    probe: args.probe ? { query: args.probe, matches: probeMatches } : null,
    samples,
    warnings,
  };
}

/**
 * Print the report produced by `analyzeRows` in human-readable form.
 *
 * @param {object} result - Report object from `analyzeRows`.
 * @returns {void}
 */
function printHuman(result) {
  console.log("Import Verification");
  console.log("===================");
  console.log(`Scanned rows: ${result.scanned_rows}`);
  console.log(`Matched rows: ${result.matched_rows}`);
  if (result.source_filter) console.log(`Source filter: ${result.source_filter}`);

  console.log("\nRows by source:");
  const sourceCounts = Object.entries(result.by_source);
  if (sourceCounts.length === 0) {
    console.log(" none");
  } else {
    for (const [source, count] of sourceCounts) {
      console.log(` ${source}: ${count}`);
    }
  }

  console.log("\nFindings:");
  console.log(` Missing metadata rows: ${result.missing_metadata.length}`);
  console.log(` Missing embedding rows: ${result.missing_embeddings.length}`);
  console.log(` Duplicate fingerprint groups: ${result.duplicate_fingerprints.length}`);
  if (result.probe) console.log(` Probe matches: ${result.probe.matches}`);

  if (result.samples.length) {
    console.log("\nSamples:");
    for (const sample of result.samples) {
      // Include the created date so a sample can be tied to an import run.
      const created = sample.created_at ?? "unknown date";
      console.log(` - ${sample.id} [${sample.source}] ${created} ${sample.preview}`);
    }
  }

  if (result.warnings.length) {
    console.log("\nWarnings:");
    for (const warning of result.warnings) console.log(` - ${warning}`);
  }
}
/**
 * Command-line entry point.
 *
 * Parses arguments, loads rows from a fixture file or from Supabase, analyzes
 * them and prints the report as JSON or human-readable text.
 *
 * Exit codes are set through `process.exitCode` rather than `process.exit()`
 * so that pending stdout writes (for example a large `--json` report sent to
 * a pipe) can finish before the process ends:
 * - `1` when `--strict` is set and the report contains warnings
 * - `2` for bad arguments or any failure while loading or analyzing rows
 *
 * @returns {Promise<void>}
 */
async function main() {
  let args;
  try {
    args = parseArgs(process.argv.slice(2));
  } catch (error) {
    console.error(`Error: ${error.message}`);
    usage();
    process.exitCode = 2;
    return;
  }

  if (args.help) {
    usage();
    return;
  }

  try {
    const rows = args.fixture ? readFixture(args.fixture) : await fetchThoughts(args);
    const result = analyzeRows(rows, args);
    if (args.json) {
      console.log(JSON.stringify(result, null, 2));
    } else {
      printHuman(result);
    }
    if (args.strict && result.warnings.length) process.exitCode = 1;
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exitCode = 2;
  }
}

// Last-resort handler so an unexpected rejection is reported with the same
// exit code as other failures instead of being left unhandled.
main().catch((error) => {
  console.error(`Error: ${error.message}`);
  process.exitCode = 2;
});