From a671069acb0b0da241648c3e997d9ede91f28098 Mon Sep 17 00:00:00 2001
From: Mike Freedman <mike@timescale.com>
Date: Sun, 31 May 2026 00:44:56 -0400
Subject: [PATCH] fix(undo): record NULL version_id when inline capture is
 empty

ExecuteUndoTransaction's Step 3 inserts a type='undo' log entry for
every affected file, with version_id pointing at the row that the
archive BEFORE trigger wrote during Step 1's DELETE or Step 2's
UPSERT (captured inline immediately after each mutation). The capture
invariant assumes the archive trigger fires -- which it does in any
properly-configured history-enabled workspace.

If the trigger is missing or disabled (a malformed workspace), the
captured-version_id map ends up with no entry for the affected file
and Step 3 reads back the empty string. The previous code passed that
empty string as a UUID parameter, which PG rejected with the cryptic
error: ERROR: invalid input syntax for type uuid: "" (SQLSTATE 22P02).
The entire undo transaction rolled back.

Convert the empty string to a SQL NULL (via a nil interface{}) before
the INSERT, so the transaction completes successfully and records the
undo entry as "no captured snapshot." NULL is already a legitimate
version_id (type='create' log entries record NULL too -- there is no
before-state for a fresh row), and downstream undo-of-undo dispatch
already returns a clear "cannot undo operation: no version_id" error
when it encounters one. The defensive path costs one small helper
applied in both the DELETE and RESTORE Step 3 loops.

To keep the workspace-inconsistency signal loud despite the
transaction succeeding, emit a logging.Warn from the helper with the
affected file_id, the branch (delete vs restore), and a hint to
verify the archive trigger exists on the source table. Operators
investigating the warning find the malformed workspace; the undo
itself isn't blocked by the missing trigger.

Updates TestExecuteUndoTransaction_DeleteOnlyUndo's leading comment
to describe the dual purpose it now serves: the original empty-
RestoreFileIDs branch guard, plus the new defensive NULL-version_id
path. The test passes without any trigger installation, matching the
no-trigger scenario the production fix targets.
---
 internal/tigerfs/db/query.go      | 49 ++++++++++++++++++++++++++++---
 internal/tigerfs/db/query_test.go | 25 ++++++++++++----
 2 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/internal/tigerfs/db/query.go b/internal/tigerfs/db/query.go
index c26a1d1..ae7d08f 100644
--- a/internal/tigerfs/db/query.go
+++ b/internal/tigerfs/db/query.go
@@ -1191,6 +1191,37 @@ func (c *Client) QueryLogEntry(ctx context.Context, schema, logTable, logID stri
 	return &f, nil
 }
 
+// emptyVersionIDToNull converts an empty version_id captured during
+// ExecuteUndoTransaction Step 1/2 into a SQL NULL (returned as nil
+// interface{}) for the Step 3 INSERT, while logging a warning so the
+// underlying workspace inconsistency is surfaced to operators.
+//
+// The inline captures in Steps 1 and 2 only fail to populate the
+// captured map when the BEFORE archive trigger didn't write a history
+// row for the mutation -- typically because the trigger is missing or
+// disabled on a malformed workspace. Recording NULL on the resulting
+// undo log row keeps the transaction from failing with a confusing
+// "invalid input syntax for type uuid" PG error (the version_id column
+// can't accept the empty string literal). NULL is already a legitimate
+// value for the column (create-type log entries store NULL too), and
+// downstream undo-of-undo dispatch handles empty/NULL version_id
+// cleanly by returning a clear "no version_id" error.
+//
+// branch is "delete" or "restore" -- it labels which Step 3 loop hit
+// the empty capture, helping correlate the warning to the affected
+// path. fileID is the source row that triggered the empty capture.
+func emptyVersionIDToNull(versionID, fileID, branch string) interface{} {
+	if versionID != "" {
+		return versionID
+	}
+	logging.Warn("undo log entry has no version_id; archive trigger may not be firing",
+		zap.String("file_id", fileID),
+		zap.String("branch", branch),
+		zap.String("hint", "verify the archive BEFORE trigger exists on the source table; "+
+			"this undo log entry's version_id was recorded as NULL and the entry will not be reversible"))
+	return nil
+}
+
 // ExecuteUndoTransaction executes a batch of undo operations atomically.
 // Within a single transaction: deletes created rows, upserts from history,
 // and inserts undo log entries. BEFORE triggers fire on each restore,
@@ -1390,19 +1421,29 @@ func (c *Client) ExecuteUndoTransaction(ctx context.Context, params *UndoTransac
 		if i < len(params.DeleteFilenames) {
 			filename = params.DeleteFilenames[i]
 		}
-		versionIDVal := capturedDeleteVIDs[fileID]
+		// Pass nil (-> SQL NULL) when the inline capture in Step 1 produced no
+		// version_id. The Step 1 capture only fails to populate the map when
+		// the BEFORE-DELETE archive trigger didn't write a history row -- an
+		// invariant violation in a properly-configured workspace, but possible
+		// if the trigger is missing or disabled. Recording NULL instead of an
+		// empty string keeps the transaction from failing with a cryptic
+		// "invalid input syntax for type uuid" and lets the undo complete;
+		// the warning below surfaces the underlying workspace inconsistency.
+		// See ExecuteUndoTransaction's Step 1 comment for the capture invariant.
+		versionIDArg := emptyVersionIDToNull(capturedDeleteVIDs[fileID], fileID, "delete")
 		if _, err := tx.Exec(ctx,
 			fmt.Sprintf(
 				`INSERT INTO %s (%s, %s, %s, %s, %s, %s, %s) VALUES (uuidv7(), $1, 'undo', $2, $3, $4, $5)`,
 				logTable, qi("log_id"), qi("file_id"), qi("type"), qi("user_id"), qi("filename"), qi("version_id"), qi("description"),
 			),
-			fileID, params.UserID, filename, versionIDVal, params.Description,
+			fileID, params.UserID, filename, versionIDArg, params.Description,
 		); err != nil {
 			return fmt.Errorf("failed to insert undo log entry for delete: %w", err)
 		}
 	}
 	for i, fileID := range params.RestoreFileIDs {
-		versionIDVal := capturedRestoreVIDs[fileID]
+		// Same nil-conversion + warning as the DELETE branch above.
+		versionIDArg := emptyVersionIDToNull(capturedRestoreVIDs[fileID], fileID, "restore")
 		if _, err := tx.Exec(ctx,
 			fmt.Sprintf(
 				`INSERT INTO %s (%s, %s, %s, %s, %s, %s, %s) VALUES (uuidv7(), $1, 'undo', $2, $3, $4, $5)`,
@@ -1410,7 +1451,7 @@ func (c *Client) ExecuteUndoTransaction(ctx context.Context, params *UndoTransac
 				qi("log_id"), qi("file_id"), qi("type"), qi("user_id"),
 				qi("filename"), qi("version_id"), qi("description"),
 			),
-			fileID, params.UserID, params.RestoreFilenames[i], versionIDVal, params.Description,
+			fileID, params.UserID, params.RestoreFilenames[i], versionIDArg, params.Description,
 		); err != nil {
 			return fmt.Errorf("failed to insert undo log entry for restore: %w", err)
 		}
diff --git a/internal/tigerfs/db/query_test.go b/internal/tigerfs/db/query_test.go
index d2b9252..c29453e 100644
--- a/internal/tigerfs/db/query_test.go
+++ b/internal/tigerfs/db/query_test.go
@@ -2383,11 +2383,26 @@ func TestExecuteUndoTransaction_DefersFKMultiLevel(t *testing.T) {
 	}
 }
 
-// TestExecuteUndoTransaction_DeleteOnlyUndo guards the empty-RestoreFileIDs
-// branch of the post-undo modified_at bump. When a user undoes a single
-// 'create' op, the only work is to DELETE the created row from source -- no
-// rows are restored. The mtime-bump SQL uses ANY($1::uuid[]) and would
-// error on an empty array if the branch wasn't gated.
+// TestExecuteUndoTransaction_DeleteOnlyUndo guards two distinct edge cases
+// of a DELETE-only undo (single 'create' op being reversed):
+//
+//  1. The empty-RestoreFileIDs branch of the post-undo modified_at bump.
+//     That SQL uses ANY($1::uuid[]) and would error on an empty array if
+//     the branch wasn't gated.
+//
+//  2. The defensive NULL-version_id path in Step 3. The fixture deliberately
+//     doesn't install the archive BEFORE-DELETE trigger (setupUndoTestEnv
+//     keeps the schema trigger-free so other tests can seed history rows
+//     directly without interference), which means Step 1's DELETE fires no
+//     trigger and writes no history row. The inline version_id capture
+//     finds nothing. Step 3 must record the resulting undo log entry with
+//     version_id = NULL rather than failing with PG's "invalid input
+//     syntax for type uuid" on the empty string. In a properly-configured
+//     production workspace the archive trigger always fires and this
+//     defensive path is dead code; the test holds it down regardless so a
+//     future regression doesn't reintroduce the cryptic-error failure
+//     mode. emptyVersionIDToNull (in query.go) handles the conversion and
+//     emits a warning identifying the file_id and the branch.
 func TestExecuteUndoTransaction_DeleteOnlyUndo(t *testing.T) {
 	env, cleanup := setupUndoTestEnv(t)
 	defer cleanup()