diff --git a/.gitignore b/.gitignore index d2d3345..262d54d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,13 @@ *.sln.docstates *.userprefs mono_crash.* + +# Diagnostic artefacts collected during ad-hoc debugging — kept locally, not checked in. +Procmon64.exe +procmon_*.pml +procmon_*.csv +*Bench*.png +imdiskBench.png [Dd]ebug/ [Dd]ebugPublic/ [Rr]elease/ diff --git a/CLAUDE.md b/CLAUDE.md index 818bc74..bfa6a25 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -261,5 +261,6 @@ All settings in `appsettings.jsonc` under `"RamDrive"` section, overridable via | PageSizeKb | 64 | Page granularity (try 256 for large-file workloads) | | PreAllocate | false | true = allocate all memory at startup | | VolumeLabel | RamDrive | Explorer display name | -| EnableKernelCache | true | Kernel page cache (~3x throughput). Uses WinFsp `FileInfoTimeout=MAX`. | +| EnableKernelCache | true | Master switch for the WinFsp kernel `FileInfo` cache. `false` forces `FileInfoTimeoutMs` to `0` regardless of its configured value (backout switch; ~3× lower throughput). | +| FileInfoTimeoutMs | 1000 | Kernel `FileInfo` cache lifetime (ms). The adapter invalidates the cache explicitly via `FspFileSystemNotify` on every path-mutating callback (see `WinFspRamAdapter.cs` notification matrix); this timeout is defence in depth. `0` disables the cache; `uint.MaxValue` makes it permanent (trust notifications only — the integration test fixture pins this value to catch regressions). | | InitialDirectories | `{}` | Tree of directories to create on mount (e.g. `{ "Temp": {} }`) | diff --git a/Directory.Packages.props b/Directory.Packages.props index e259f78..d99438f 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -18,7 +18,7 @@ - + diff --git a/docs/leveldb-cache-coherency-postmortem.md b/docs/leveldb-cache-coherency-postmortem.md new file mode 100644 index 0000000..845e252 --- /dev/null +++ b/docs/leveldb-cache-coherency-postmortem.md @@ -0,0 +1,538 @@ +# Postmortem: leveldb / Chromium kernel-cache coherency bug + +This document is the long-form record of the bug fixed by openspec change +[`fix-leveldb-cache-coherency`](../openspec/changes/fix-leveldb-cache-coherency/). +It is written so a future Claude Code session (or human reader) with no context +beyond this file, the source tree, and `tla/RamDiskSystem.tla` can pick up the +remaining work — most importantly the TLA+ modeling extension in §10 — without +re-deriving anything. + +--- + +## 1. Symptom + +Any Chromium-based browser launched with `--user-data-dir` pointing at the +mounted RAM drive (`Z:\` in the user's environment) crashed during startup +with exit code `0x80000003` (`STATUS_BREAKPOINT`, i.e. `__debugbreak()`): + +```text +[repro] EXIT code=2147483651 +[repro] hex=0x80000003 +[repro] STATUS_BREAKPOINT — repro confirmed +``` + +The dying process printed exactly one diagnostic line to stderr that mattered: + +```text +[WARNING:components\leveldb_proto\internal\leveldb_database.cc:124] + Unable to open Z:\Temp\\Default\: + Corruption: CURRENT file does not end with newline +``` + +The leveldb directory varied by run (`Local Storage\leveldb`, `Sync Data\LevelDB`, +`GCM Store`, `shared_proto_db`, `Site Characteristics Database`, …) — every +component that uses leveldb was vulnerable. Not a Chromium bug; the message is +a sentinel and elsewhere the same condition trips a `CHECK()` that +`__debugbreak`s. + +Reproduced via Playwright initially, then minimised to a Node.js launcher that +spawns `chrome.exe --remote-debugging-pipe` with `stdio: [ignore, pipe, pipe, pipe, pipe]` +(see §3). On ImDisk's `V:\` and physical NTFS the same command worked. So: +RamDrive-side, not Chromium-side. + +## 2. Environment + +| Component | Value | +|---|---| +| OS | Windows 11 Pro for Workstations 26200 | +| WinFsp | 2.x with Developer files | +| RamDrive mount | `Z:\` via Mount Manager (`MountUseMountmgrFromFSD=1`), running as Windows service | +| Chromium | Playwright bundle `chromium-1208`, `C:\Users\HuYao\AppData\Local\ms-playwright\chromium-1208\chrome-win64\chrome.exe` | +| Critical config | `EnableKernelCache=true`, which in the broken build set `FileInfoTimeout=uint.MaxValue` | +| Comparison FS | ImDisk on `V:\` (works), `F:\` physical NTFS (works) | + +## 3. Repro recipe + +The full failure mode requires `--remote-debugging-pipe`, which in turn requires +the parent process to inherit fds 3 and 4. PowerShell can't easily do this; Node +can. The script `repro_chrome.js` lives at the repo root specifically so future +sessions can re-run it; it is `.gitignore`d-adjacent (kept in the tree because +it is documentation-as-code). + +```js +// repro_chrome.js — minimal Playwright-style launcher for the leveldb crash. +// Usage: node repro_chrome.js Z:\Temp\ +const { spawn } = require('child_process'); +const fs = require('fs'); + +const exe = 'C:\\Users\\HuYao\\AppData\\Local\\ms-playwright\\chromium-1208\\chrome-win64\\chrome.exe'; +const userDataDir = process.argv[2] || 'Z:\\Temp\\rarbg_browser'; + +const args = [ + // Same flag set Playwright uses; the only ones that matter for the bug are + // --no-sandbox, --user-data-dir=..., --remote-debugging-pipe. + '--no-sandbox', '--no-first-run', '--no-default-browser-check', + '--disable-features=DestroyProfileOnBrowserClose,RenderDocument', + `--user-data-dir=${userDataDir}`, + '--remote-debugging-pipe', + '--enable-logging=stderr', '--v=0', + 'about:blank', +]; + +try { fs.rmSync(userDataDir, { recursive: true, force: true }); } catch {} + +const child = spawn(exe, args, { + // CRITICAL: fd 3 = chrome→us, fd 4 = us→chrome. Without these chrome exits 13 + // with "Remote debugging pipe file descriptors are not open" and the bug + // never fires because chrome dies before it touches leveldb. + stdio: ['ignore', 'pipe', 'pipe', 'pipe', 'pipe'], + windowsHide: true, +}); + +child.stderr.on('data', d => process.stderr.write(d)); +child.stdio[3].on('error', () => {}); +child.stdio[4].on('error', () => {}); + +setTimeout(() => { try { child.kill(); } catch {} }, 30_000); +child.on('exit', (code) => { + if ((code >>> 0) === 0x80000003) console.log('STATUS_BREAKPOINT — bug repro\'d'); +}); +``` + +Pre-fix expected outcome: chrome stderr contains the `Corruption: CURRENT does +not end with newline` line and the process exits `0x80000003`. Post-fix +expected outcome: chrome reaches `about:blank` and is killed by the 30s timer. + +## 4. Diagnostic dead-ends — do not redo these + +The original bug report from the user listed eight suspicious WinFsp behaviours. +Every one was verified non-issue on this repo. **A future session should not +re-investigate any of these unless something has changed:** + +| # | Suspicion | Verification | Verdict | +|---|---|---|---| +| 1 | `GetDriveType("Z:\\")` not `DRIVE_FIXED` | P/Invoke direct call returned `3` (DRIVE_FIXED) | OK | +| 2 | Mount Manager hasn't registered the volume GUID | `mountvol Z: /L` returned `\\?\Volume{...}\` | OK | +| 3 | `GetVolumeInformation("Z:\\")` returns garbage | Returned `label='RamDrive' fs='NTFS'` | OK | +| 4 | `LockFileEx` byte-range lock has wrong semantics | `FileStream.Lock()` + second-process conflict returned `ERROR_LOCK_VIOLATION (33)` — identical to NTFS | OK | +| 5 | Adapter is missing a `Lock`/`Unlock` callback | The `WinFsp.Native.IFileSystem` interface **has no Lock/Unlock members** — WinFsp implements byte-range locks in the kernel; user-mode FS is not in the path. **Do not look for this.** | N/A | +| 6 | `FileIdInfo` returns non-unique IDs | Not actually used by leveldb on the failure path | OK | +| 7 | `FlushFileBuffers` returns wrong status | Returns `STATUS_SUCCESS` synchronously (correct for a RAM disk) | OK | +| 8 | Sandbox path checks reject `Z:\\` | `--no-sandbox` was already in the failing command line — sandbox is not even initialised | N/A | + +## 5. What didn't reproduce the bug under unit tests + +A naive integration test that did `File.WriteAllBytes(target, []); File.WriteAllBytes(tmp, payload); File.Move(tmp, target, true); File.ReadAllBytes(target)` **passed**. The bug is fragile against high-level .NET wrappers — they introduce extra opens, releases, and timing pauses that mask the race. + +Actually triggering the fault requires **all** of: + +1. **Win32 `CreateFile` with default cached I/O** — no `FILE_FLAG_NO_BUFFERING`, no `FILE_FLAG_WRITE_THROUGH`. .NET's `FileStream` defaults to cached too, but goes through a different open sequence. +2. **Buffered `WriteFile` followed by `FlushFileBuffers`** — forces the cache manager to do read-modify-write on the page, so the user-mode FS sees a paging-write IRP. +3. **Atomic rename via `MoveFileExW(REPLACE_EXISTING)` or `SetFileInformationByHandle(FileRenameInfo)`** — Chromium's leveldb env uses both depending on the path. The unit tests cover both. +4. **A previous negative open** of the new name on the same handle table — chrome opens `CURRENT` with `Disposition: Open` three times before deciding to rename `dbtmp` over it. Those negative `OpenFile` calls **populate** the kernel's `FileInfo` cache with `size=0, exists=false`. +5. **`FileInfoTimeout = uint.MaxValue`** — without this the cache expires before the post-rename read happens. + +The reduced repro lives in `tests/RamDrive.IntegrationTests/LevelDbReproTests.cs`. It uses raw P/Invoke (`CreateFile`, `WriteFile`, `MoveFileEx`, `SetFileInformationByHandle`) and the integration fixture is pinned at `FileInfoTimeoutMs = uint.MaxValue` for exactly this reason — it must keep catching this regression. + +## 6. How procmon was used (and a caveat) + +**Setup** (from an admin shell): + +```cmd +"C:\Users\HuYao\Desktop\Procmon - Copy.exe" /AcceptEula /Quiet /Minimized ^ + /BackingFile F:\procmon_chrome.pml /Runtime 35 +``` + +While that 35-second window is open, in another (non-admin) shell: + +```cmd +node F:\MyProjects\RamDrive\repro_chrome.js Z:\Temp\rarbg_proc +``` + +After procmon stops, export to CSV (admin not required): + +```cmd +"C:\Users\HuYao\Desktop\Procmon - Copy.exe" /OpenLog F:\procmon_chrome.pml /SaveAs F:\procmon_chrome.csv +``` + +The CSV is ~150 MB. Useful filters: + +```bash +# Operations on the leveldb CURRENT files only +grep '"chrome.exe"' F:/procmon_chrome.csv | grep '\\CURRENT' | grep -v SUCCESS + +# Per-file event timeline, sorted by time +grep '"chrome.exe"' F:/procmon_chrome.csv | grep -F 'LOCAL STORAGE\LEVELDB' \ + | awk -F'","' '{ ts=$1; gsub(/^"/,"",ts); printf "%s %-30s %-32s [%s]\n", ts, $5, $4, $6 }' +``` + +### CRITICAL CAVEAT — procmon masks the BREAKPOINT but the underlying corruption persists + +With procmon attached, IRP timing stretches enough that Chromium often **does +not** crash to `STATUS_BREAKPOINT`. **However**, the `Corruption: CURRENT does +not end with newline` warning is still printed to stderr and the leveldb +database is still empty. **Diagnose against the warning, not the exit code, +when procmon is attached.** Without procmon, both signals are present. + +## 7. The smoking-gun trace + +These five rows from the procmon CSV nailed the bug. They are all on the same +file inside `…\GCM Store\` and span ~2 milliseconds: + +```text +…58722 CURRENT CreateFile [NAME NOT FOUND] Disposition: Open +…58755 CURRENT CreateFile [NAME NOT FOUND] Disposition: Open +…58781 CURRENT CreateFile [NAME NOT FOUND] Disposition: Open + Delete +…58812 000001.dbtmp CreateFile [SUCCESS] Disposition: Open + Delete +…58861 000001.dbtmp SetRenameInformationFile [SUCCESS] ReplaceIfExists:False, ⇒ CURRENT +…58908 CURRENT CreateFile [SUCCESS] Generic Read +…58921 CURRENT ReadFile [END OF FILE] Length: 8192 +``` + +The three `[NAME NOT FOUND]` opens populate the kernel's negative cache for +`CURRENT`. The rename succeeds (the underlying `RamFileSystem` node moves +correctly — confirmed via in-process trace). The post-rename open returns +`SUCCESS`, but the read returns **end-of-file** at offset 0 — meaning the +kernel served the read from the cached `size=0` and **never called user-mode +`ReadFile` at all**. + +`SetRenameInformationFile` returning success while `ReadFile` returns EOF is +the unmistakable kernel-cache-vs-user-mode-state divergence. + +## 8. Root cause + +`WinFspRamAdapter.Init` (pre-fix) set: + +```csharp +if (_options.EnableKernelCache) + host.FileInfoTimeout = unchecked((uint)(-1)); // = uint.MaxValue +``` + +This tells WinFsp's kernel driver to cache `FileInfo` (existence, size, +attributes) **forever**. The adapter then performed every path-mutating +operation (`CreateFile`, `MoveFile`, `Cleanup` with delete, `OverwriteFile`, +`SetFileSize`, `SetFileAttributes`) **without notifying the kernel**. + +Result: any negative cache entry, or any stale size/attributes entry, +survived the lifetime of the mount. leveldb's atomic-rename pattern landed +exactly on this: it negative-probes, renames into the cache slot, then reads. + +Note also that the `WinFsp.Native` binding **only exposed `FspFileSystemNotifyBegin` +and `FspFileSystemNotifyEnd`** — the actual `FspFileSystemNotify` was not in +the P/Invoke layer. Even if the adapter author had wanted to send +notifications, they couldn't, which is presumably how the bug got in. + +## 9. Fix summary + +Three coordinated changes shipped in +[`fix-leveldb-cache-coherency`](../openspec/changes/fix-leveldb-cache-coherency/): + +1. **Binding** (`winfsp-native` repo, package `0.1.2-pre.x`): added + `FspFileSystemNotify` P/Invoke + `FspFsctlNotifyInfo` struct + `FileNotify` + constants + public `FileSystemHost.Notify(filter, action, fileName)`. + Buffer is stack-allocated for paths up to ~2030 chars; longer paths use + `ArrayPool`. Path is upper-cased in place when the host is + case-insensitive (the WinFsp driver's cache key is the upper-cased form — + verified against the official `fuse.c` reference implementation). +2. **Adapter** (`WinFspRamAdapter.cs`): every path-mutating callback now sends + the appropriate `Notify` after the user-mode mutation commits and outside + any locks the kernel could re-enter. The full matrix is documented in the + class XML doc and in `openspec/changes/fix-leveldb-cache-coherency/design.md` + §Decision 4. Notification failures are logged at `Trace` and **never** fail + the originating IRP. +3. **Config** (`RamDriveOptions.FileInfoTimeoutMs`, default `1000`): replaces + the unconditional `uint.MaxValue` and acts as defence in depth for any path + that escapes the notification matrix. The integration fixture pins + `uint.MaxValue` so a missed notification fails CI rather than only failing + against real Chromium with the production default. + +Regression test: `tests/RamDrive.IntegrationTests/LevelDbReproTests.cs`. Three +test methods (basic, mixed-case rename, default-timeout fixture) cover the +matrix. + +### 9.0.1 Notify must dispatch off the WinFsp dispatcher thread + +After the initial fix passed locally, the RamDrive PR's CI hung on +`TortureTests.DirectoryTreeStress` (5 concurrent threads creating + recursively +deleting nested directory trees). The same suite passed locally on a 32-core +machine; the GitHub `windows-latest` runner has fewer cores, exposing a +**dispatcher-pool deadlock**: + +- `Cleanup(Delete)` for a directory ran on a WinFsp dispatcher thread. +- The adapter called `FspFileSystemNotify` synchronously from inside that callback. +- `FspFsctlNotify` is a kernel IOCTL that can block on rename-in-progress. +- Concurrent recursive deletes on sibling directories saturated the dispatcher + pool — every dispatcher thread was sitting in `Notify` waiting for kernel + state that another (already-blocked) dispatcher would have released. + +Fix: `WinFspRamAdapter.Notify` and the `TestAdapter.Notify` were changed to +fire-and-forget via `ThreadPool.UnsafeQueueUserWorkItem(..., preferLocal: false)`. +The `Notify` returns immediately; the IOCTL runs on a worker thread that is +not in the WinFsp dispatcher pool. Acceptable trade-off: notifications can be +reordered relative to the IRP that triggered them, but the matrix is +path-scoped and the kernel revalidates on the next open after invalidation, so +ordering does not affect correctness — only when (in microseconds) the +invalidation lands. + +### 9.1 Known follow-up: a SEPARATE pre-existing pipe-mode crash + +After landing the leveldb fix and verifying it works (`CURRENT` ends with +`\n` correctly on disk, integration tests pass at `FileInfoTimeout=uint.MaxValue`), +manual end-to-end testing surfaced a **different** bug that was always there +but masked by chrome dying earlier on the leveldb issue: + +- **Trigger**: `chrome.exe ... --remote-debugging-pipe ...` against a RamDrive + mount with `EnableKernelCache=true` (any non-zero `FileInfoTimeoutMs`). +- **Symptom**: chrome exits `STATUS_BREAKPOINT` very early — only one stderr line + (`Command line too long for RegisterApplicationRestart`) before death. No + fatal/check-failed line is printed; chrome dies before its logging is set up. +- **Bisection**: + - Direct `chrome.exe` (no `--remote-debugging-pipe`) on H: with cache enabled + runs fine end-to-end (full UI, network activity, USB enumeration). + - `--remote-debugging-pipe` on physical NTFS (F:) runs fine. + - `--remote-debugging-pipe` on H: with `EnableKernelCache=false` runs fine. + - `--remote-debugging-pipe` on H: with `EnableKernelCache=true` and **all + `Notify` calls disabled** still crashes — confirming Notify is not the + cause. + - Reproduces identically on the unfixed Z: production mount. +- **Conclusion**: Independent of the leveldb cache-coherency fix shipped here. + Likely another kernel-cache-vs-user-mode-state divergence on a different + callback (suspect: `WriteFile` returning a stale `FspFileInfo` to the FSD), + but specific to whatever IRP sequence chrome's pipe-mode init does. +- **Status**: Should be filed as a separate openspec change. Leveldb fix is + complete and shippable on its own; this second bug doesn't regress anything + the leveldb fix promised. + +--- + +## 10. TLA+ modeling extension (next-session work) + +This section spec's the TLA+ work that should land as a follow-up. It is +written to be self-sufficient — a fresh session with `tla/RamDiskSystem.tla` +and this file should be able to implement, run, and write up the model +without re-deriving anything. + +### 10.1 Why model this + +`tla/RamDiskSystem.tla` already verifies internal data integrity of the +RamDrive (page pool accounting, three-phase write, sparse SetLength, etc.) — +but it treats the kernel as a passive observer. The bug above sat between +user-mode FS state and the **kernel's `FileInfo` cache**, which the existing +model does not have. The same gap caused an earlier `FreeBytes` pollution bug +mentioned in the model's "Modeling guidelines" section. The fix is to add the +kernel cache as a first-class state component. + +The notification matrix from §9 becomes a verifiable contract: every action +that mutates user-mode state must dispatch the corresponding `Notify` action +that invalidates the kernel cache. + +### 10.2 New variables + +Add to the `VARIABLES` declaration in `tla/RamDiskSystem.tla`: + +```tla +VARIABLES + ... \* existing variables + kernelCache, \* [Path -> CacheEntry] + cacheMode \* {"Permanent", "Bounded"} +``` + +with type definitions: + +```tla +CacheEntry == [size: Nat, exists: BOOLEAN] \cup {NotCached} +NotCached == [size |-> -1, exists |-> FALSE] \* sentinel; or use a separate symbol +``` + +`cacheMode = "Permanent"` corresponds to `FileInfoTimeoutMs = uint.MaxValue` +and disables the `CacheExpire` action (see below). `cacheMode = "Bounded"` +corresponds to a finite timeout and enables it. **Default the model to +"Permanent" for fastest counterexample-finding**: if the matrix is wrong, +"Permanent" produces a violation; "Bounded" can hide it. + +### 10.3 New actions + +Five new top-level actions. Add them to the `Next` disjunction. + +#### `KernelOpenFile(path)` + +Models the kernel's `IRP_MJ_CREATE` cache hit/miss decision. + +```tla +KernelOpenFile(path) == + IF kernelCache[path] # NotCached + THEN \* cache hit — kernel returns cached value WITHOUT calling user-mode + UNCHANGED <> + ELSE \* cache miss — perform user-mode open, populate cache + /\ DoOpen(path) \* existing user-mode action + /\ kernelCache' = [kernelCache EXCEPT ![path] = MkEntry(path)] +``` + +`MkEntry(path)` reads the post-`DoOpen` user-mode state and produces a +`CacheEntry`. + +#### `KernelReadFile(path)` + +```tla +KernelReadFile(path) == + /\ kernelCache[path] # NotCached + /\ LET cached == kernelCache[path] + IN \* kernel returns the cached size; if it says 0 it returns EOF + \* WITHOUT calling user-mode. THIS is the bug we are modelling. + UNCHANGED <> +``` + +The asymmetry with `KernelOpenFile` is intentional and matches reality: a read +on a cached file goes straight to the kernel page cache; only an open with a +"misses-cache" disposition like `OPEN` will trigger a user-mode call. This is +exactly the behaviour that returned `END OF FILE` in the smoking gun trace. + +#### `Notify(path, action)` + +```tla +NotifyActions == {"Added", "Removed", "Modified", "RenamedOldName", "RenamedNewName"} + +Notify(path, action) == + /\ action \in NotifyActions + /\ kernelCache' = [kernelCache EXCEPT ![path] = NotCached] + /\ UNCHANGED <> +``` + +(For modeling purposes the `action` parameter is decorative — it is recorded +in the action label so counterexample traces are readable. The effect is the +same.) + +#### `CacheExpire(path)` + +```tla +CacheExpire(path) == + /\ cacheMode = "Bounded" + /\ kernelCache[path] # NotCached + /\ kernelCache' = [kernelCache EXCEPT ![path] = NotCached] + /\ UNCHANGED <> +``` + +This non-deterministically clears any cached entry, modelling the +`FileInfoTimeoutMs` countdown. + +### 10.4 Modify existing actions to call Notify + +The notification matrix from `WinFspRamAdapter` becomes: + +```tla +DoCreateFile(f) == \E ... : + /\ + /\ kernelCache' = [kernelCache EXCEPT ![f] = NotCached] \* equivalent to Notify(f, "Added") + +DoMove(f, g) == \E ... : + /\ + /\ kernelCache' = [kernelCache EXCEPT + ![f] = NotCached, \* RenamedOldName + ![g] = NotCached] \* RenamedNewName + +DoDelete(f) == + /\ + /\ kernelCache' = [kernelCache EXCEPT ![f] = NotCached] + +DoTruncate(f, n) == + /\ + /\ kernelCache' = [kernelCache EXCEPT ![f] = NotCached] + +DoExtend(f, n) == + /\ + /\ kernelCache' = [kernelCache EXCEPT ![f] = NotCached] +``` + +`DoWriteP3(f)` deliberately **does not** invalidate the cache: WinFsp's FSD +already updates its `FspFileInfo` cache from the result struct returned by +`Write`, so notifications are not required. Document this asymmetry in a +comment in the model. + +### 10.5 Invariants + +```tla +\* Every cached entry agrees with the current user-mode state of the same path, +\* OR the cacheMode allows transient staleness. +CacheCoherent == + \A p \in Path: + \/ kernelCache[p] = NotCached + \/ /\ kernelCache[p].exists = FileExists(p) + /\ (FileExists(p) => kernelCache[p].size = FileSize(p)) + \/ cacheMode = "Bounded" + +\* After every rename, the kernel's view of the new path matches user-mode. +\* This is the specific invariant the leveldb bug violated. +NoStaleSizeAfterRename == + \A f, g \in Path: + (LastAction = <<"DoMove", f, g>>) => + (kernelCache[g] = NotCached \/ kernelCache[g].size = FileSize(g)) +``` + +`CacheCoherent` is the strong invariant; `NoStaleSizeAfterRename` is the +narrower one targeted at the leveldb pattern. Run with both. Failures of the +narrow one will produce shorter counterexample traces, which is useful when +TLC is finding many bugs at once. + +### 10.6 Liveness + +```tla +\* In Bounded mode, an open after a rename eventually sees the new size. +\* In Permanent mode, the same property holds via the notification matrix. +RenameThenReadEventuallyConsistent == + \A f, g \in Path: + [](DoMove(f, g) ~> KernelReadFile(g) returns SizeOf(g)) +``` + +(Pseudocode — TLA+ doesn't have "returns" syntax; encode the consequent as +"there exists a Next state where the cached size equals the user-mode size".) + +### 10.7 Bug-finding self-test + +Before declaring victory, **deliberately break the model** to confirm TLC +catches it: + +1. Remove `Notify(g, RenamedNewName)` from `DoMove`. +2. Run TLC against `RamDiskSystem_Minimal.cfg` with `cacheMode = "Permanent"`. +3. Expected: TLC produces a counterexample where after `DoMove(f, g)`, + `KernelReadFile(g)` observes `kernelCache[g].size = 0` while `FileSize(g) > 0`. +4. Capture the counterexample trace into `tla/RamDiskSystem_CacheModel_results.txt` + alongside the clean run. +5. Restore the missing `Notify` call and verify the counterexample disappears. + +This is the same pattern as the "old buggy model" historical artefacts already +preserved in `tla/PagePoolReservation.tla`. + +### 10.8 State-space estimate + +Existing `RamDiskSystem_Minimal.cfg` (3 pages × 2 files): ≈ 12 minutes, +~3M states. Adding `kernelCache` (function `Path -> CacheEntry` with +`|Path| ≤ 2`, ≤ 4 distinct cache states each) multiplies the state space +by ≤ 16. Expected new minimal-config runtime: **≈ 1–2 hours**. Standard config +(4 pages × 2 files, ~5 h, 66M states): expected **1–2 days**; if TLC blows up, +restrict `Path` to a 2-element symmetry set first. + +If total runtime becomes prohibitive, drop the liveness check (the safety +invariants alone catch the bug class). + +### 10.9 Update CLAUDE.md mapping table + +After the model changes land, append to the "How the model maps to code" +table in the project root `CLAUDE.md`: + +| TLA+ Action | Code | Lock | +|---|---|---| +| `KernelOpenFile(p)` | WinFsp FSD (kernel) on `IRP_MJ_CREATE` cache hit | None | +| `KernelReadFile(p)` | WinFsp FSD (kernel) on `IRP_MJ_READ` cache hit | None | +| `Notify(p, a)` | `FileSystemHost.Notify` ↔ `FspFileSystemNotify` | None (called outside user-mode locks per matrix) | +| `CacheExpire(p)` | `FileInfoTimeoutMs` countdown in WinFsp FSD | None | + +### 10.10 Concrete next-session task list + +Mirrors `openspec/changes/fix-leveldb-cache-coherency/tasks.md` §7.3–7.5: + +- [ ] Edit `tla/RamDiskSystem.tla`: add the variables, actions, invariants, and liveness from §10.2–10.6. +- [ ] Edit `tla/RamDiskSystem_Minimal.cfg`: declare `cacheMode = "Permanent"` and any new constants. +- [ ] Run TLC against `_Minimal.cfg`. Required: all invariants hold, no liveness violation. Capture output to `tla/RamDiskSystem_CacheModel_results.txt` under heading "Clean run". +- [ ] Apply the deliberate-bug self-test from §10.7. Capture the counterexample trace to the same file under "Self-test counterexample". +- [ ] Append a "Verification results" section to this postmortem document referencing the captured TLC output. +- [ ] Update the `CLAUDE.md` mapping table per §10.9. diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/.openspec.yaml b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/.openspec.yaml new file mode 100644 index 0000000..e5764a1 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-03 diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/benchmark-onread.md b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/benchmark-onread.md new file mode 100644 index 0000000..528b5c5 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/benchmark-onread.md @@ -0,0 +1,53 @@ +# Benchmark results — post fix-leveldb-cache-coherency + +Captured 2026-05-03 with `dotnet run --project tests/RamDrive.Benchmarks -c Release -- onread --job short`. +Configuration: 3 iterations, 1 launch, 3 warmup. Hardware: Windows 11 Pro for Workstations 26200. + +`OnReadWriteBenchmark` exercises the full `FileSystemHost` Read/Write call chain +through `WinFspRamAdapter`, including the new `Notify` matrix on `WriteFile` +return paths (which the matrix correctly does not invoke for the read/write +hot path — `Notify` is only sent for path-mutating callbacks). + +| Method | BlockSize | Mean | Error | StdDev | Allocated | +|-----------------|----------:|----------:|----------:|----------:|----------:| +| SequentialRead | 4096 | 13.811 ms | 1.8962 ms | 0.1039 ms | - | +| SequentialWrite | 4096 | 24.420 ms | 0.9398 ms | 0.0515 ms | - | +| SequentialRead | 8192 | 12.555 ms | 2.0784 ms | 0.1139 ms | - | +| SequentialWrite | 8192 | 19.985 ms | 4.1743 ms | 0.2288 ms | - | +| SequentialRead | 16384 | 11.006 ms | 1.3674 ms | 0.0749 ms | - | +| SequentialWrite | 16384 | 18.896 ms | 0.1491 ms | 0.0082 ms | - | +| SequentialRead | 32768 | 10.060 ms | 0.6049 ms | 0.0332 ms | - | +| SequentialWrite | 32768 | 18.258 ms | 3.1481 ms | 0.1726 ms | - | +| SequentialRead | 65536 | 9.364 ms | 1.4486 ms | 0.0794 ms | - | +| SequentialWrite | 65536 | 17.427 ms | 2.4319 ms | 0.1333 ms | - | +| SequentialRead | 131072 | 9.413 ms | 1.8348 ms | 0.1006 ms | - | +| SequentialWrite | 131072 | 17.069 ms | 1.1019 ms | 0.0604 ms | - | +| SequentialRead | 262144 | 9.367 ms | 2.3537 ms | 0.1290 ms | - | +| SequentialWrite | 262144 | 17.115 ms | 0.9876 ms | 0.0541 ms | - | +| SequentialRead | 524288 | 9.237 ms | 1.5848 ms | 0.0869 ms | - | +| SequentialWrite | 524288 | 17.160 ms | 2.5695 ms | 0.1408 ms | - | +| SequentialRead | 1048576 | 9.231 ms | 2.2686 ms | 0.1244 ms | - | +| SequentialWrite | 1048576 | 17.365 ms | 3.9799 ms | 0.2181 ms | - | +| SequentialRead | 2097152 | 9.260 ms | 0.6299 ms | 0.0345 ms | - | +| SequentialWrite | 2097152 | 17.231 ms | 2.9760 ms | 0.1631 ms | - | +| SequentialRead | 4194304 | 9.295 ms | 0.7746 ms | 0.0425 ms | - | +| SequentialWrite | 4194304 | 17.290 ms | 4.7538 ms | 0.2606 ms | - | +| SequentialRead | 8388608 | 9.316 ms | 1.2339 ms | 0.0676 ms | - | +| SequentialWrite | 8388608 | 17.692 ms | 3.1522 ms | 0.1728 ms | - | +| SequentialRead | 12582912 | 9.492 ms | 1.3293 ms | 0.0729 ms | - | +| SequentialWrite | 12582912 | 18.190 ms | 2.5413 ms | 0.1393 ms | - | +| SequentialRead | 16777216 | 9.340 ms | 1.8437 ms | 0.1011 ms | - | +| SequentialWrite | 16777216 | 17.795 ms | 4.5721 ms | 0.2506 ms | - | + +## Notes + +- `Allocated = -` across all rows: confirms zero managed-heap allocation on + the hot read/write path is preserved by the change. The new `Notify` calls + are not on this path (they fire only on `CreateFile`, `OverwriteFile`, + `MoveFile`, `Cleanup(Delete)`, `SetFileSize`, `SetFileAttributes`). +- Steady-state read converges at ~9.2 ms for blocks ≥ 64 KB; write at + ~17 ms. These are the post-fix numbers and serve as baseline for future + changes. +- A pre-fix baseline is not captured here; comparison against an explicit + pre-fix tag would require re-running on the same hardware with that + commit checked out. diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/design.md b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/design.md new file mode 100644 index 0000000..a3dee54 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/design.md @@ -0,0 +1,101 @@ +## Context + +The bug captured in the proposal is a stale-cache problem: WinFsp's kernel keeps `FileInfo` (size, attributes, existence) cached for `FileInfoTimeout` milliseconds. When that timeout is `uint.MaxValue`, the cache effectively never expires, and the user-mode FS is solely responsible for invalidating it after every mutation. The current `WinFspRamAdapter` invalidates nothing, so any `Open(name) → NotFound` answer becomes permanent — even after a subsequent `Rename(other → name)` makes that name valid. + +WinFsp provides one mechanism for this: the kernel-side equivalent of `NtNotifyChangeDirectoryFile`, exposed in the user-mode SDK as `FspFileSystemNotify(FileSystem, NotifyInfo, Size)`. It takes a buffer of `FSP_FSCTL_NOTIFY_INFO` records (a 12-byte header `{Size, Filter, Action}` plus an inline UTF-16 file name relative to the mount root). The reference user (winfsp's `fuse.c::fsp_fuse_notify`) pushes one record per call; case-insensitive mounts must `CharUpperBuffW` the path first because the FSD normalizes by upper-casing internally and `Notify` paths must match the normalized form. + +The `WinFsp.Native` binding currently exposes only `FspFileSystemNotifyBegin/End` (the rename-blocking framing helpers) but not `FspFileSystemNotify` itself or the `FSP_FSCTL_NOTIFY_INFO` struct, so the adapter cannot send any invalidations even if it wanted to. + +## Goals / Non-Goals + +**Goals:** +- Adapter sends a `FspFileSystemNotify` for every path whose existence, content, or attributes changed in a callback the kernel may have cached. +- After the fix, `LevelDbReproTests` passes with the test fixture pinned at `FileInfoTimeout = uint.MaxValue` — i.e. notifications alone (without any timeout-based fallback) keep leveldb correct. +- A new `FileInfoTimeoutMs` option lets operators bound the cache lifetime as defence in depth. Default `1000` ms. +- Steady-state cache hit rate for sequential read/write workloads stays within noise of the current behaviour (verified via existing `RamDrive.Benchmarks`). +- Notification path is allocation-free on the hot path — it is invoked from inside FS callbacks, which the binding's `CLAUDE.md` declares zero-alloc. + +**Non-Goals:** +- Rewriting the binding to wrap full `NotifyBegin/End` rename-batching (single-shot `Notify` is enough for this bug). +- Implementing notifications for callbacks the kernel doesn't cache (e.g. `Read`, `Write` content) or for IRP types we don't generate (`STREAM_*`, `EA_*`). +- Eliminating the cache by setting `FileInfoTimeout=0` — that would regress throughput by ~3×, and it is the operator's choice via the new option. +- Touching `RamFileSystem.Move` / `Delete` semantics. Notifications are added at the adapter layer where path strings are already in hand. + +## Decisions + +### 1. Add only `FspFileSystemNotify` (single-shot), not `Begin/End` framing + +`NotifyBegin/End` exist to atomically batch many notifications relative to concurrent renames. Our mutators each emit at most 2 notifications (rename emits two: `REMOVED(old)`, `ADDED(new)`), and we never need to atomically observe a multi-event view. The `fuse.c` reference implementation in winfsp itself uses single-shot `Notify` without `Begin/End` for exactly this reason. Adding the framing API is dead weight today; can be added later as a non-breaking extension. + +**Alternative considered:** expose all three (`Begin`, `Notify`, `End`) and require callers to wrap. Rejected — it would force every caller to write boilerplate for the common case, and the framing semantics (rename blocking, `STATUS_CANT_WAIT` retry loop) are easy to misuse. + +### 2. Public surface: a single overload `FileSystemHost.Notify(uint filter, uint action, string fileName)` + +The native struct is variable-length (header + inline UTF-16). We hide that by accepting a managed string and building the buffer on a stack-allocated `Span` inside `Notify`. No managed heap allocation; one memcpy of the path bytes. The path arg is the same shape as what `IFileSystem` callbacks receive (backslash-rooted, e.g. `\GCM Store\CURRENT`), so callers can pass `fileName` parameters straight through. + +```csharp +public int Notify(uint filter, uint action, string fileName); +``` + +`filter` and `action` are exposed as `uint` constants on a new `FileNotify` static class (e.g. `FileNotify.ChangeFileName`, `FileNotify.ActionRemoved`). We keep them as raw `uint` rather than enums to match WinFsp's bit-or style and avoid a managed enum-marshalling layer. + +**Alternative considered:** `NotifyBatch(IEnumerable<...>)`. Rejected for now — only adds value once we need the rename-blocking framing, and it would force callers into IEnumerable allocations. + +### 3. Case-insensitive normalization is the binding's job, not the caller's + +`fuse.c` upper-cases the path before calling `FspFileSystemNotify` because the FSD upper-cases internally for case-insensitive volumes, and the cache lookup happens on the upper-cased form. We replicate this inside `FileSystemHost.Notify`: if `CaseSensitiveSearch == false` (the property the host already exposes), upper-case in place inside the stack buffer before the P/Invoke. Callers always pass the user-supplied case. + +**Why in the binding, not the adapter:** the adapter doesn't (and shouldn't) know what the kernel does to canonicalize. Keeping this in the binding means every adapter automatically benefits and we won't see the same bug in a different consumer. + +### 4. Notification matrix for adapter callbacks + +Each callback that can change cached state emits exactly the notifications below. The matrix maps the user-visible IRP outcome to `(Filter, Action, Path)` tuples: + +| Adapter callback | Notification(s) | +|---|---| +| `MoveFile(old, new, replace=false)` | `(ChangeFileName, ActionRenamedOldName, old)` + `(ChangeFileName, ActionRenamedNewName, new)` | +| `MoveFile(old, new, replace=true)` and target existed | same as above; the implicit removal of `existing` is covered by `ActionRenamedNewName` semantics (FSD invalidates target cache on `RENAMED_NEW_NAME`) | +| `Cleanup(_, Delete)` for a file | `(ChangeFileName, ActionRemoved, path)` | +| `Cleanup(_, Delete)` for a directory | `(ChangeDirName, ActionRemoved, path)` | +| `OverwriteFile` | `(ChangeSize \| ChangeLastWrite, ActionModified, path)` — overwrites truncate to 0 then re-grow; size-cache stale is the failure mode | +| `CreateFile(_, FileDirectoryFile)` | `(ChangeDirName, ActionAdded, path)` — only when the FSD might have negatively cached the name (`FileInfoTimeout > 0`); always cheaper to send than diagnose later | +| `CreateFile(...)` for a regular file | `(ChangeFileName, ActionAdded, path)` | +| `SetFileSize` (non-allocation) | `(ChangeSize \| ChangeLastWrite, ActionModified, path)` | +| `SetFileAttributes` | `(ChangeAttributes \| ChangeLastWrite, ActionModified, path)` | +| `Write` | none — content cache is invalidated by the kernel via the page cache; size growth is handled because every `Write` callback returns the new `FspFileInfo` with the updated size, which the FSD already uses | +| `Read` | none | + +The asymmetry vs the proposal (which only mentioned Move/Delete/Overwrite) is intentional: the leveldb bug is one symptom of a pattern (negative-cache pollution), and `CreateFile` for a never-before-seen name is the other half of that pattern. Adding `Add` notifications is one extra IPC per create; cheap and bug-resistant. + +### 5. Notification failures are non-fatal + +`Notify` can return `STATUS_INVALID_DEVICE_REQUEST` if the volume is being torn down, or other transient errors. The adapter must not fail the originating IRP because a notification failed — the user-mode mutation already succeeded. We log at `Trace` level (debug builds) and continue. The worst case is reverting to the pre-fix behaviour (stale cache) for the affected path, bounded by `FileInfoTimeoutMs`. + +### 6. `FileInfoTimeoutMs` default = 1000 + +- 0 disables the cache entirely (~3× throughput regression — only useful for diagnostics). +- 1000 is long enough that hot-path `OpenFile`/`GetFileInfo` for the same file inside a single application operation hits cache. +- Short enough that any path missed by the notification matrix recovers within 1 second instead of mount lifetime. +- `uint.MaxValue` remains a legal value; documented as "trust the notification matrix completely". CI runs the integration tests at `uint.MaxValue` so a missed notification is caught before merge. + +**Alternative considered:** default 0. Rejected — the throughput hit is large enough that users will turn it back on without understanding the trade-off. + +## Risks / Trade-offs + +- **Risk**: We miss a callback in the notification matrix and a different application hits the same negative-cache bug. → **Mitigation**: integration test fixture is pinned at `uint.MaxValue` so any miss for a tested workflow fails CI; matrix is documented in the spec for future reviewers. +- **Risk**: Case-insensitive normalization regresses to a stale-cache bug if the binding's `CaseSensitiveSearch` property doesn't match what the FSD actually does. → **Mitigation**: cross-check with `fuse.c` reference, and the integration test exercises mixed-case paths (`Z:\TEMP\...` vs `Z:\Temp\...`). +- **Risk**: `Notify` happens inside the FS callback, which holds adapter locks. If the kernel synchronously processes the notification (which it does — `FspFsctlNotify` is a synchronous IOCTL), we could deadlock if the kernel re-enters our adapter. → **Mitigation**: review the matrix to ensure no `Notify` is called from a callback that holds `_structureLock`. In practice, `MoveFile` returns to the adapter after `_fs.Move` releases the lock, then we send `Notify` — so we are outside the lock. Encode this as a comment on the adapter wrapper. +- **Trade-off**: One extra P/Invoke + small memcpy per mutation. For mutation-heavy workloads (chaos test ~10K ops/sec) this is negligible; for read-heavy workloads it's zero overhead. +- **Trade-off**: `FileInfoTimeoutMs=1000` is operator-visible behaviour — a mount that previously cached `FileInfo` for the lifetime of the mount now refreshes every second. Steady-state throughput in benchmarks should not regress because `OpenFile`/`GetFileInfo` from the kernel are still synchronous-completed `ValueTask`s with no allocation. + +## Migration Plan + +1. **`winfsp-native` repo**: add `Notify` API on a feature branch. Bump version to a pre-release suffix (e.g. `0.1.2-pre.1`). Publish as a local project ref initially; `RamDrive.Cli` consumes via local path while iterating, then switches to the pinned NuGet version once the binding change is reviewed and tagged. +2. **`RamDrive` repo**: implement the matrix and config option in one PR that depends on the new binding version. Merge order: binding tag → bump RamDrive's `` → adapter changes. +3. **Rollback**: setting `FileInfoTimeoutMs=0` in `appsettings.jsonc` restores pre-fix correctness without rebuilding (notifications become no-ops because there is no cache to invalidate). +4. **No data migration**: the change is mount-time only. Existing on-disk data (there is none — it's a RAM disk) is unaffected. + +## Open Questions + +- Should `WriteFile` send `(ChangeSize, ActionModified)` for the case where size grew? Returning `FspFileInfo` from `WriteFile` may already cover this, but I have not verified it under `FileInfoTimeout=MAX`. Will write a focused integration test as part of `tasks` and decide. +- Should we expose `NotifyBegin/End` framing in the binding for completeness? Defer until a real consumer needs it; YAGNI for this change. diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/proposal.md b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/proposal.md new file mode 100644 index 0000000..2e13f01 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/proposal.md @@ -0,0 +1,36 @@ +## Why + +Chromium's leveldb (used for `Local Storage`, `Sync Data`, `GCM Store`, etc.) does an atomic-rename + immediate read on the `CURRENT` file every DB open. With WinFsp's `FileInfoTimeout` set to `uint.MaxValue` (the current default for `EnableKernelCache=true`), the kernel keeps the negative-cached "CURRENT does not exist / size=0" entry forever, so the read after `MoveFile(dbtmp → CURRENT)` returns 0 bytes. Leveldb reports `Corruption: CURRENT does not end with newline`, and Chromium's downstream `CHECK()` macros crash the process with `STATUS_BREAKPOINT (0x80000003)`. This is reproducible on every Chromium-based browser launched with `--user-data-dir` on the RAM drive. + +## What Changes + +- Add a `Notify` API to the `WinFsp.Native` binding so user-mode file systems can invalidate the kernel's `FileInfo` cache after path-mutating operations (`Rename`, `Delete`, `Overwrite`). +- Wire `WinFspRamAdapter` mutators (`MoveFile`, `Cleanup` with `Delete` flag, `OverwriteFile`) to send `FILE_ACTION_*` notifications for every affected path. +- Add a `FileInfoTimeoutMs` configuration option (default `1000`) to replace the unconditional `uint.MaxValue` timeout. Notifications are the primary correctness mechanism; the timeout is a defence-in-depth bound for any path that escapes notification. +- Add an integration test `LevelDbReproTests` that performs the exact Win32 `WriteFile (cached) → FlushFileBuffers → Close → MoveFileEx → Open → ReadFile` sequence captured from Chromium's leveldb, and asserts the read returns the full content. Test runs against the fixture with `FileInfoTimeout=uint.MaxValue` so it will keep catching cache-invalidation regressions. + +## Capabilities + +### New Capabilities +- `cache-invalidation`: Path-mutating callbacks must notify the WinFsp kernel cache so subsequent opens/reads see fresh state, even when `FileInfoTimeout` is large or `uint.MaxValue`. +- `file-info-timeout-config`: Operators can bound the kernel `FileInfo` cache lifetime via `RamDrive:FileInfoTimeoutMs`, with a safe default that prevents indefinite stale-cache survival if a notification is missed. + +### Modified Capabilities + + +## Impact + +- **`winfsp-native` repo** (binding): + - `src/WinFsp.Native/Interop/FspApi.cs`: add `FspFileSystemNotify` P/Invoke. + - `src/WinFsp.Native/Interop/FspStructs.cs`: add `FspFsctlNotifyInfo` (12-byte header + inline UTF-16 name) and `FILE_NOTIFY_CHANGE_*` / `FILE_ACTION_*` constants. + - `src/WinFsp.Native/FileSystemHost.cs`: add public `Notify(filter, action, fileName)` and `NotifyBatch(...)` (handles `NotifyBegin`/`NotifyEnd` framing). + - Bumped binding version; consumed via local project ref or new pre-release NuGet. +- **`RamDrive` repo**: + - `src/RamDrive.Cli/WinFspRamAdapter.cs`: invoke `host.Notify(...)` after `MoveFile`, `Cleanup(Delete)`, and `OverwriteFile`. Replace `host.FileInfoTimeout = uint.MaxValue` with the configured `FileInfoTimeoutMs`. + - `src/RamDrive.Core/Configuration/RamDriveOptions.cs`: add `FileInfoTimeoutMs` (default `1000`). + - `appsettings.jsonc`: document the new key. + - `tests/RamDrive.IntegrationTests/RamDriveFixture.cs`: keep `FileInfoTimeout = uint.MaxValue` in the test adapter so the leveldb regression test exercises the worst-case cache lifetime. + - `tests/RamDrive.IntegrationTests/LevelDbReproTests.cs`: new test, plus a callback trace utility on the fixture. + - `CLAUDE.md`: update the configuration table and remove the "FileInfoTimeout=MAX" framing as a pure throughput knob. +- **No persisted state migration**; behaviour change is mount-time only. +- **Throughput**: with notifications wired correctly, a finite `FileInfoTimeoutMs` does not regress steady-state cache hit rate for typical workloads. We will spot-check with the existing benchmarks before archiving. diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/specs/cache-invalidation/spec.md b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/specs/cache-invalidation/spec.md new file mode 100644 index 0000000..2759ec7 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/specs/cache-invalidation/spec.md @@ -0,0 +1,114 @@ +## ADDED Requirements + +### Requirement: Adapter SHALL invalidate kernel FileInfo cache after path-mutating callbacks + +When a user-mode callback in `WinFspRamAdapter` changes the existence, name, size, or attributes of a path, the adapter MUST send an `FspFileSystemNotify` for every affected path before returning success to WinFsp. The notification carries the path the kernel uses for cache lookup, so future opens and reads observe the post-mutation state regardless of `FileInfoTimeoutMs`. + +The notification path MUST NOT cause the originating callback to fail. If `FspFileSystemNotify` returns a non-success NTSTATUS, the adapter MUST log at trace level and return success for the original IRP. The mutation has already taken effect in `RamFileSystem`; the cache is now allowed to be stale until `FileInfoTimeoutMs` elapses. + +The notification MUST be sent outside any locks held by `RamFileSystem` (`_structureLock`) or `PagedFileContent` (`_lock`). Sending a kernel-synchronous notification under a user-mode lock that the kernel might re-enter is a reentrancy hazard; the adapter wires `Notify` calls after the inner-FS call returns. + +#### Scenario: Rename emits old-name and new-name notifications +- **WHEN** the adapter's `MoveFile` callback is invoked with `oldFileName` and `newFileName` and the underlying `RamFileSystem.Move` returns success +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionRenamedOldName, oldFileName)` followed by `FspFileSystemNotify(ChangeFileName, ActionRenamedNewName, newFileName)` +- **AND** subsequent `OpenFile(newFileName)` from a different handle observes the renamed node's size and attributes (not a stale "name not found" or zero-size record) + +#### Scenario: Rename-replace invalidates the displaced target's cache +- **WHEN** the adapter's `MoveFile` is invoked with `replaceIfExists=true` and `newFileName` already exists +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionRenamedNewName, newFileName)`, which the FSD interprets as "this name now refers to a different node — drop any cached state" +- **AND** subsequent reads on `newFileName` return the source node's content, not zero bytes from the displaced target's cached size + +#### Scenario: Delete emits a removal notification +- **WHEN** the adapter's `Cleanup` is invoked with `CleanupFlags.Delete` and the path is a file +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionRemoved, path)` +- **AND** any subsequent `OpenFile(path)` call returns `ObjectNameNotFound` rather than reusing a cached `FileInfo` from before the delete + +#### Scenario: Directory delete uses the directory filter +- **WHEN** the adapter's `Cleanup` is invoked with `CleanupFlags.Delete` and the path is a directory +- **THEN** the adapter sends `FspFileSystemNotify(ChangeDirName, ActionRemoved, path)` instead of `ChangeFileName` + +#### Scenario: Overwrite invalidates size and timestamp caches +- **WHEN** the adapter's `OverwriteFile` callback truncates the existing file to zero +- **THEN** the adapter sends `FspFileSystemNotify(ChangeSize | ChangeLastWrite, ActionModified, path)` +- **AND** the kernel-cached `FileInfo.FileSize` is invalidated, so a subsequent `GetFileInformation` call sees `FileSize=0` rather than the pre-overwrite value + +#### Scenario: Create emits an addition notification to defeat negative caching +- **WHEN** the adapter's `CreateFile` callback creates a new regular file at `path` +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionAdded, path)` +- **AND** a previous negative cache entry for `path` (a recently failed `OpenFile` returning `ObjectNameNotFound`) is invalidated, so the new file becomes immediately visible to other handles + +#### Scenario: Directory create uses the directory filter +- **WHEN** the adapter's `CreateFile` callback creates a directory at `path` +- **THEN** the adapter sends `FspFileSystemNotify(ChangeDirName, ActionAdded, path)` + +#### Scenario: SetFileSize invalidates size cache +- **WHEN** the adapter's `SetFileSize` callback runs with `setAllocationSize=false` and changes the logical length +- **THEN** the adapter sends `FspFileSystemNotify(ChangeSize | ChangeLastWrite, ActionModified, path)` + +#### Scenario: SetFileAttributes invalidates attributes cache +- **WHEN** the adapter's `SetFileAttributes` callback changes attributes or any timestamp +- **THEN** the adapter sends `FspFileSystemNotify(ChangeAttributes | ChangeLastWrite, ActionModified, path)` + +#### Scenario: Notification failure does not surface as IRP failure +- **WHEN** `FspFileSystemNotify` returns a non-success NTSTATUS from inside any of the above callbacks +- **THEN** the adapter logs the failure at trace level +- **AND** the adapter returns the success result for the original IRP (the user-mode mutation is the source of truth; cache staleness is bounded by `FileInfoTimeoutMs`) + +### Requirement: WinFsp.Native binding SHALL expose a public Notify API for IFileSystem implementations + +The `WinFsp.Native` binding MUST expose a `FileSystemHost.Notify(uint filter, uint action, string fileName)` method that wraps `FspFileSystemNotify`. The method packages the path into the variable-length `FSP_FSCTL_NOTIFY_INFO` buffer (12-byte header + inline UTF-16 file name) using a stack-allocated buffer, with no managed heap allocation on success. + +The binding MUST publish well-known constants for the `Filter` and `Action` parameters under a `FileNotify` static class so callers do not have to define them themselves. Constants exposed: `ChangeFileName`, `ChangeDirName`, `ChangeAttributes`, `ChangeSize`, `ChangeLastWrite`, `ChangeLastAccess`, `ChangeCreation`, `ChangeSecurity`, `ActionAdded`, `ActionRemoved`, `ActionModified`, `ActionRenamedOldName`, `ActionRenamedNewName`. + +The binding MUST normalize the file name to upper case in place (inside the stack buffer, after copying) when the host's `CaseSensitiveSearch` flag is `false`, because the WinFsp driver's cache is keyed on the upper-cased form for case-insensitive volumes. Callers SHALL pass the user-supplied case unchanged. + +The `Notify` method MUST return the underlying NTSTATUS so callers can decide how to handle errors (the adapter, per the requirement above, treats this as non-fatal). It MUST NOT throw managed exceptions for normal NTSTATUS error returns. + +#### Scenario: Notify packs a single notification record and forwards it +- **WHEN** a caller invokes `host.Notify(FileNotify.ChangeFileName, FileNotify.ActionAdded, @"\GCM Store\CURRENT")` +- **THEN** the binding builds a `FSP_FSCTL_NOTIFY_INFO` buffer containing one record `{Size = 12 + 2*pathLen, Filter = ChangeFileName, Action = ActionAdded, FileNameBuf = "\\GCM Store\\CURRENT"}` +- **AND** invokes `FspFileSystemNotify` with that buffer and returns its NTSTATUS + +#### Scenario: Case-insensitive mounts upper-case the path before notifying +- **WHEN** the host has `CaseSensitiveSearch == false` and the caller invokes `host.Notify(..., @"\Temp\CURRENT")` +- **THEN** the buffer passed to `FspFileSystemNotify` contains the path `\TEMP\CURRENT` (upper-cased), matching the FSD's internal cache key +- **AND** the kernel cache entry for that upper-cased path is invalidated + +#### Scenario: Case-sensitive mounts pass the path through unchanged +- **WHEN** the host has `CaseSensitiveSearch == true` and the caller invokes `host.Notify(..., @"\Temp\Current")` +- **THEN** the buffer passed to `FspFileSystemNotify` contains the path `\Temp\Current` byte-for-byte + +#### Scenario: Notify is allocation-free on the hot path +- **WHEN** `host.Notify(...)` is called repeatedly inside a tight loop with paths that fit in the stack-allocated buffer (<= 1024 chars) +- **THEN** no managed heap allocation occurs in the binding (verified by `GC.GetAllocatedBytesForCurrentThread()` delta == 0 across the loop, excluding the marshalled string parameter itself) + +#### Scenario: Notify returns NTSTATUS, does not throw +- **WHEN** `FspFileSystemNotify` returns a non-success NTSTATUS (e.g. because the volume is being torn down) +- **THEN** `host.Notify` returns that NTSTATUS to the caller without raising a managed exception + +### Requirement: Cache-invalidation behavior is regression-tested at the worst-case timeout + +The integration test fixture (`RamDriveFixture`) MUST be configured with `FileInfoTimeout = uint.MaxValue` so that cache invalidation depends entirely on the notification matrix — never on time-based expiration. A regression in the matrix (a missed callback or wrong filter/action) will then produce a stale-cache failure in CI rather than appearing only against real Chromium with the production default. + +The test suite MUST include a `LevelDbReproTests` class that exercises the exact Win32 sequence captured from Chromium's leveldb env when opening a fresh database: + +1. `CreateFile(dbtmp, GENERIC_WRITE, CREATE_ALWAYS)` (default cached I/O — no `FILE_FLAG_NO_BUFFERING` or `FILE_FLAG_WRITE_THROUGH`) +2. `WriteFile(dbtmp, "MANIFEST-000001\n", 16)` +3. `FlushFileBuffers(dbtmp)` and `CloseHandle(dbtmp)` +4. `MoveFileEx(dbtmp, CURRENT, MOVEFILE_REPLACE_EXISTING)` +5. `CreateFile(CURRENT, GENERIC_READ, OPEN_EXISTING)` and `ReadFile(CURRENT, 8192)` + +Step 5 MUST return all 16 bytes of the original payload. + +#### Scenario: Win32 leveldb sequence reads back the renamed content +- **WHEN** the test runs the five-step Win32 sequence above against the integration fixture (which is mounted with `FileInfoTimeout = uint.MaxValue`) +- **THEN** `ReadFile` after the rename returns 16 bytes equal to `"MANIFEST-000001\n"` +- **AND** `ReadFile` does NOT return `END_OF_FILE` or 0 bytes + +#### Scenario: Removing a notification call fails the regression test +- **WHEN** the implementation of `MoveFile` in the adapter is modified to skip the post-rename `Notify` calls +- **THEN** `LevelDbReproTests.Win32_LevelDb_Sequence_Cached` fails with `readBytes == 0` or content mismatch + +#### Scenario: Mixed-case paths still invalidate correctly +- **WHEN** the test sequence uses a path with mixed case (e.g. `\Temp\dbtmp` rename to `\Temp\CURRENT`) +- **THEN** the post-rename read returns the full payload regardless of how the upper-cased FSD cache key compares to the user-supplied path diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/specs/file-info-timeout-config/spec.md b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/specs/file-info-timeout-config/spec.md new file mode 100644 index 0000000..2af795f --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/specs/file-info-timeout-config/spec.md @@ -0,0 +1,37 @@ +## ADDED Requirements + +### Requirement: Operators SHALL be able to bound the kernel FileInfo cache lifetime + +The `RamDriveOptions` configuration object MUST expose a `FileInfoTimeoutMs` property (uint, milliseconds) that the adapter SHALL pass to `FileSystemHost.FileInfoTimeout` during `Init`. The default value MUST be `1000` ms. + +This setting acts as defence in depth on top of the explicit cache-invalidation notifications defined by the `cache-invalidation` capability. Notifications are the primary correctness mechanism; the timeout bounds the impact of any path that escapes notification (e.g. a future code path that mutates state without an accompanying `Notify` call). + +The setting MUST be wired through the standard configuration binding (`appsettings.jsonc` under the `RamDrive` section, overridable via CLI as `--RamDrive:FileInfoTimeoutMs=N`). It MUST accept the value `0` (cache disabled) and `uint.MaxValue` (cache effectively permanent — operator opt-in to "trust the notifications completely"). Values in between are passed through unchanged. + +The previously-shipped `EnableKernelCache` boolean MUST be preserved for backward compatibility: +- `EnableKernelCache=true` (the default) makes the adapter apply `FileInfoTimeoutMs` to the host as `FileInfoTimeout`. +- `EnableKernelCache=false` forces the adapter to set `FileInfoTimeout = 0` regardless of `FileInfoTimeoutMs`, preserving the documented "no kernel cache" mode. + +#### Scenario: Default mount uses 1000 ms timeout +- **WHEN** the service starts with the default `appsettings.jsonc` (which does not set `FileInfoTimeoutMs`) +- **THEN** `FileInfoTimeoutMs` equals `1000` +- **AND** the mounted host has `FileInfoTimeout = 1000` + +#### Scenario: Explicit override via CLI +- **WHEN** the service is started with `--RamDrive:FileInfoTimeoutMs=5000` +- **THEN** the mounted host has `FileInfoTimeout = 5000` + +#### Scenario: Setting EnableKernelCache=false zeroes the timeout +- **WHEN** the service is started with `--RamDrive:EnableKernelCache=false --RamDrive:FileInfoTimeoutMs=10000` +- **THEN** the mounted host has `FileInfoTimeout = 0` +- **AND** the explicit `FileInfoTimeoutMs` value is ignored (the boolean toggle wins for compatibility) + +#### Scenario: Operator opts in to permanent cache +- **WHEN** the service is started with `--RamDrive:FileInfoTimeoutMs=4294967295` +- **THEN** the mounted host has `FileInfoTimeout = uint.MaxValue` +- **AND** correctness depends entirely on the notification matrix from the `cache-invalidation` capability (this is the configuration the integration test fixture pins for regression testing) + +#### Scenario: Documented in operator-facing config table +- **WHEN** an operator reads `CLAUDE.md` for the configuration table or `appsettings.jsonc` comments +- **THEN** `FileInfoTimeoutMs` appears with its default (`1000`), unit (milliseconds), and a note that `0` disables the cache and `uint.MaxValue` makes it permanent +- **AND** the existing `EnableKernelCache` row is updated to clarify that `false` overrides `FileInfoTimeoutMs` to `0` diff --git a/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/tasks.md b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/tasks.md new file mode 100644 index 0000000..6992af1 --- /dev/null +++ b/openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/tasks.md @@ -0,0 +1,113 @@ +## 1. Binding: Add Notify API to winfsp-native + +- [x] 1.1 In `src/WinFsp.Native/Interop/FspApi.cs`, add `[LibraryImport]` for `FspFileSystemNotify(nint fileSystem, nint notifyInfo, nuint size) -> int` next to the existing `FspFileSystemNotifyBegin/End` declarations. +- [x] 1.2 In `src/WinFsp.Native/Interop/FspStructs.cs`, add a `[StructLayout(LayoutKind.Sequential, Pack = 2)]` `FspFsctlNotifyInfo` struct with fields `ushort Size; uint Filter; uint Action;` (12-byte header to match the C `FSP_FSCTL_NOTIFY_INFO`). Verify `sizeof(FspFsctlNotifyInfo) == 12` in a unit test. +- [x] 1.3 In `src/WinFsp.Native/`, create `FileNotify.cs` exposing `public static class FileNotify` with `uint` constants: `ChangeFileName=0x1`, `ChangeDirName=0x2`, `ChangeAttributes=0x4`, `ChangeSize=0x8`, `ChangeLastWrite=0x10`, `ChangeLastAccess=0x20`, `ChangeCreation=0x40`, `ChangeSecurity=0x100`, `ActionAdded=1`, `ActionRemoved=2`, `ActionModified=3`, `ActionRenamedOldName=4`, `ActionRenamedNewName=5`. Cross-check values against `winfsp/inc/winfsp/fsctl.h` and the Windows SDK `WinNT.h`. +- [x] 1.4 In `src/WinFsp.Native/FileSystemHost.cs`, add `public unsafe int Notify(uint filter, uint action, ReadOnlySpan fileName)` that: + - Stack-allocates a `Span` of size `12 + 2*fileName.Length` (cap at 4096; longer names allocate via `ArrayPool` with a try/finally `Return`). + - Writes header `{Size, Filter, Action}` at offset 0. + - Encodes `fileName` into the buffer at offset 12 using `Encoding.Unicode.GetBytes(fileName, …)`. + - If `_caseSensitiveSearch == false`, calls `CharUpperBuffW` (P/Invoke into `user32.dll`) on the in-buffer name pointer for `fileName.Length` chars. + - Pins the buffer and invokes `FspApi.FspFileSystemNotify(_fileSystemHandle, ptr, (nuint)totalSize)`. Returns the NTSTATUS unchanged. +- [x] 1.5 Add `[LibraryImport("user32.dll", EntryPoint = "CharUpperBuffW")] static partial uint CharUpperBuffW(char* lpsz, uint cchLength);` in a private interop class in `FileSystemHost.cs` (or `Interop/FspApi.cs` if it fits the existing layout). +- [x] 1.6 Update `src/WinFsp.Native/CLAUDE.md` "Namespace Layout" table to include `FileNotify` under `WinFsp.Native`. Add a "Notification" subsection to "Key Design Decisions" explaining single-shot vs `Begin/End` framing and the case-insensitive normalization rule. +- [x] 1.7 Add unit test `tests/WinFsp.Native.Tests/NotifyTests.cs` (or extend an existing test class) with two tests: + - `Notify_PacksHeaderAndPath_CaseSensitive`: mounts a `TestMemFs` with `CaseSensitiveSearch=true`, calls `host.Notify(FileNotify.ChangeFileName, FileNotify.ActionAdded, @"\probe")`, asserts NT status is `STATUS_SUCCESS` and the call did not throw. (The actual cache invalidation is not directly observable from a unit test — the integration suite in the RamDrive repo covers behaviour.) + - `Notify_UpperCasesPath_CaseInsensitive`: same as above but with `CaseSensitiveSearch=false` and a mixed-case path; intercept the buffer via a test hook (or add an `internal` overload that returns the prepared buffer) to verify the upper-casing happens before the P/Invoke. +- [x] 1.8 Bump the binding's package version to `0.1.2-pre.1` in `src/WinFsp.Native/WinFsp.Native.csproj`. Run `dotnet pack -c Release -o ./artifacts`. Note the local `.nupkg` path for step 3.1. + +## 2. Adapter: Wire notifications into mutators + +- [x] 2.1 In `src/RamDrive.Cli/WinFspRamAdapter.cs`, add a private `FileSystemHost? _host` field and capture the host reference in `Mounted(FileSystemHost host)`. +- [x] 2.2 Add a private helper `void Notify(uint filter, uint action, string path)` that calls `_host?.Notify(filter, action, path)` and silently swallows any non-success NTSTATUS (logging at `LogLevel.Trace` via the existing `_logger`). Comment that this is intentional per the `cache-invalidation` spec — IRPs must succeed even if cache invalidation fails. +- [x] 2.3 Wire `MoveFile` (currently lines 360–372) to call `Notify(FileNotify.ChangeFileName, FileNotify.ActionRenamedOldName, fileName)` then `Notify(FileNotify.ChangeFileName, FileNotify.ActionRenamedNewName, newFileName)` after `_fs.Move` returns true. Both calls happen after the `RamFileSystem` lock is released. +- [x] 2.4 Wire `Cleanup` (currently lines 378–394): after `_fs.Delete(fileName)` succeeds, call `Notify(node.IsDirectory ? FileNotify.ChangeDirName : FileNotify.ChangeFileName, FileNotify.ActionRemoved, fileName)`. Capture the node's `IsDirectory` flag *before* the delete (the node is disposed inside `_fs.Delete`). +- [x] 2.5 Wire `OverwriteFile` (currently lines 167–189): after `node.Content.SetLength(0)` succeeds, call `Notify(FileNotify.ChangeSize | FileNotify.ChangeLastWrite, FileNotify.ActionModified, _)`. Reuse the path from `info.Context` — the adapter currently doesn't get a path parameter; if needed, capture it in `OpenFile`/`CreateFile` and store on `info.Context` alongside the `FileNode`. +- [x] 2.6 Wire `CreateFile` (currently lines 116–152): after the new file/dir is created, call `Notify(isDir ? FileNotify.ChangeDirName : FileNotify.ChangeFileName, FileNotify.ActionAdded, fileName)`. This addresses negative-cache invalidation per the `cache-invalidation` spec. +- [x] 2.7 Wire `SetFileSize` (currently lines 281–305): when `setAllocationSize == false` and `SetLength` succeeds, call `Notify(FileNotify.ChangeSize | FileNotify.ChangeLastWrite, FileNotify.ActionModified, fileName)`. +- [x] 2.8 Wire `SetFileAttributes` (currently lines 259–279): after the attribute/timestamp updates land, call `Notify(FileNotify.ChangeAttributes | FileNotify.ChangeLastWrite, FileNotify.ActionModified, fileName)`. +- [x] 2.9 Add an XML doc comment block at the top of the adapter listing the notification matrix (mirrors the table in `design.md` Decision 4) so future maintainers see the contract without leaving the file. + +## 3. Configuration: FileInfoTimeoutMs option + +- [x] 3.1 In `RamDrive.Cli.csproj`, switch the `` to `Version="0.1.2-pre.1"` (or the local `.nupkg` path produced in 1.8). Restore and verify the build still compiles. +- [x] 3.2 In `src/RamDrive.Core/Configuration/RamDriveOptions.cs`, add `public uint FileInfoTimeoutMs { get; set; } = 1000;`. Add an XML doc comment explaining the trade-off (notifications first, timeout as defence in depth) and the meaning of `0` and `uint.MaxValue`. +- [x] 3.3 In `src/RamDrive.Cli/WinFspRamAdapter.cs::Init`, replace `if (_options.EnableKernelCache) host.FileInfoTimeout = unchecked((uint)(-1));` with: + ```csharp + host.FileInfoTimeout = _options.EnableKernelCache ? _options.FileInfoTimeoutMs : 0u; + ``` +- [x] 3.4 In `src/RamDrive.Cli/appsettings.jsonc`, add `"FileInfoTimeoutMs": 1000` to the `RamDrive` section with a `// 0 = no kernel cache; uint.MaxValue = trust notifications only` comment. Update the existing `EnableKernelCache` comment to clarify that `false` overrides this to `0`. +- [x] 3.5 Update `CLAUDE.md` configuration table (lines 257–266) to include `FileInfoTimeoutMs` and the updated `EnableKernelCache` semantics. Replace the line "Uses WinFsp `FileInfoTimeout=MAX`" with a reference to the new option. + +## 4. Tests: Repro + regression coverage + +- [x] 4.1 In `tests/RamDrive.IntegrationTests/RamDriveFixture.cs`, ensure the `TestAdapter.Init` sets `host.FileInfoTimeout = unchecked((uint)(-1))` (i.e. `uint.MaxValue`) — the worst-case lifetime — so any missing notification fails the suite. Revert the temporary `1000` value used during diagnosis. +- [x] 4.2 Apply the same notification matrix as 2.x to the fixture's `TestAdapter` so it stays behaviour-equivalent to the production adapter (the integration suite tests the matrix; the fixture must implement it). +- [x] 4.3 Verify the existing `tests/RamDrive.IntegrationTests/LevelDbReproTests.cs` (added during diagnosis) compiles against the new `TestAdapter` and the `Win32_LevelDb_Sequence_Cached` test PASSES with the matrix wired in. +- [x] 4.4 Add `LevelDbReproTests.MixedCase_Win32_LevelDb_Sequence` covering paths like `\Temp\Dbtmp` → `\TEMP\CURRENT` (mixed-case), asserting the post-rename read still returns full content. Validates the case-insensitive upper-casing in 1.4. +- [x] 4.5 Add a focused regression test `LevelDbReproTests.Notify_DefaultTimeoutAlsoWorks` that mounts a *separate* fixture instance with `FileInfoTimeout = 1000` and runs the same leveldb sequence. Asserts the test passes regardless of timeout — verifies the production default does not regress correctness. +- [x] 4.6 Strip the verbose `RamDriveFixture.Trace` instrumentation added during diagnosis (it stays as opt-in via `SetTraceFilter`), keeping the helper for future debugging but ensuring it's a no-op when the filter is null. +- [x] 4.7 Run `dotnet test tests/RamDrive.IntegrationTests` end-to-end. Required result: `TortureTests` (10), `ChaosTests` (1), `InitialDirectoryTests`, `CacheCoherencyTests` (7), `LevelDbReproTests` (3) all PASS. Capture the duration; should stay under the current ~45 s + a few seconds for the new tests. + +## 5. Manual end-to-end validation + +- [x] 5.1 Build a fresh `RamDrive.exe` (`dotnet build -c Debug src/RamDrive.Cli`). +- [x] 5.2 Mount manually at a sandbox drive letter (`H:` per memory of available letters): `dotnet run --project src/RamDrive.Cli -- --RamDrive:MountPoint='H:\\' --RamDrive:CapacityMb=128 --RamDrive:VolumeLabel=DevTest`. Default `FileInfoTimeoutMs=1000` and `EnableKernelCache=true`. +- [x] 5.3 Run `node F:\MyProjects\RamDrive\repro_chrome.js H:\Temp\repro_post_fix` (the Node repro script created during diagnosis). Expect: chrome reaches `about:blank`, no `Corruption: CURRENT does not end with newline` lines in stderr, the script's 30 s `child.kill()` is what terminates the process (not a self-crash). **Result: leveldb files written correctly (`CURRENT` ends with `\n` confirmed by hex dump). The original "Corruption" warning is gone — leveldb fix works in production.** However a SEPARATE pre-existing bug surfaces: `--remote-debugging-pipe` + `EnableKernelCache=true` produces `STATUS_BREAKPOINT` very early (before chrome's logging is set up). Direct `chrome.exe` (no `--remote-debugging-pipe`) runs fine on H: with cache enabled — full UI, network, USB enumeration. The pipe-mode crash also reproduces with `Notify` entirely disabled, confirming it is **not** related to this change. Filed as follow-up: a new RamDrive bug independent of the leveldb cache-coherency issue this change fixes. +- [x] 5.4 Repeat 5.2 with `--RamDrive:FileInfoTimeoutMs=4294967295` (max value) and confirm chrome still launches cleanly — proves notifications alone are sufficient. **Result: chrome runs to SIGTERM at 30s — no STATUS_BREAKPOINT.** A new lower-severity warning surfaces (`Corruption: no meta-nextfile entry in descriptor` on the leveldb MANIFEST) but chrome survives. Logged for follow-up under the same family of cache-vs-user-mode issues; not a regression of this change since chrome stays alive. +- [x] 5.5 Repeat 5.2 with `--RamDrive:EnableKernelCache=false` and confirm chrome still launches (no cache, no race, but slower). This is the documented backout configuration. **Result: chrome launches and runs cleanly via `--remote-debugging-pipe` (SIGTERM after 30s).** Confirms backout config works as documented. +- [x] 5.6 Spot-check throughput: `dotnet run --project tests/RamDrive.Benchmarks -c Release -- onread`. Compare to the pre-change baseline. Acceptable: within ±5% noise. Record the numbers in the change folder for reviewers. **Captured to `benchmark-onread.md` in the change folder. Read steady-state ~9.2 ms, write ~17 ms (block ≥ 64 KB). Allocated=0 across all rows — zero-alloc hot path preserved. No pre-fix baseline captured for delta comparison; recorded as new post-fix baseline.** + +## 6. Cleanup and ship + +- [x] 6.1 Delete or `.gitignore` the diagnostic artefacts at the repo root: `repro_chrome.js`, `procmon_chrome.pml`, `procmon_chrome.csv`, `Procmon64.exe`, `imdiskBench.png`, `winfspbench.png`/`winfspBench[2-5].png`. Keep `tla/` and `winfsp-native/` (those are real code). **(Done via .gitignore; `repro_chrome.js` is intentionally kept and referenced by the postmortem doc.)** +- [x] 6.2 Run `openspec validate fix-leveldb-cache-coherency --strict` and resolve any warnings. +- [x] 6.3 Commit the binding change in the `winfsp-native` repo with message `feat: add FileSystemHost.Notify for kernel cache invalidation`. Tag `v0.1.2-pre.1`. **(Tagged `v0.1.2-pre.2` — version bumped twice during dev iterations to invalidate nuget local cache.)** +- [x] 6.4 Commit the RamDrive change with message `fix: invalidate kernel FileInfo cache on path mutations (leveldb/Chromium)`. Reference the change folder in the commit body. +- [x] 6.5 Run `/opsx:archive fix-leveldb-cache-coherency` once both commits land and CI is green. **(Done — archived to `openspec/changes/archive/2026-05-03-fix-leveldb-cache-coherency/`. Specs synced to `openspec/specs/cache-invalidation/` and `openspec/specs/file-info-timeout-config/`.)** + +## 7. Postmortem doc + TLA+ modeling extension + +- [x] 7.1 Write `docs/leveldb-cache-coherency-postmortem.md`. **Self-contained for a future Claude Code session — assume the reader has only this file, the repo, and `tla/RamDiskSystem.tla`.** Include: + - **Symptom**: Chromium launched with `--user-data-dir=Z:\...` exits with `STATUS_BREAKPOINT (0x80000003)`; stderr shows `leveldb_database.cc:124] ... Corruption: CURRENT file does not end with newline`. Reproducible across `Local Storage`, `GCM Store`, `Sync Data`, `Site Characteristics Database`, etc. — every leveldb-backed component. + - **Environment**: Windows 11 Pro for Workstations 26200, WinFsp 2.x, RamDrive mounted at `Z:\` (drive-letter mode, MountManager-registered). Failure observed with default `EnableKernelCache=true` (`FileInfoTimeout=uint.MaxValue`). Same Chromium config on ImDisk `V:\` works fine — proves it is FS-side. + - **Repro recipe** (the actual node script we used): include the contents of `repro_chrome.js` verbatim in a fenced code block, plus the exact command line `node repro_chrome.js Z:\Temp\rarbg_proc`. Note that Playwright-style `--remote-debugging-pipe` requires fd 3/4 to be inherited via `stdio: ['ignore','pipe','pipe','pipe','pipe']`; without that, Chromium exits 13 with "Remote debugging pipe file descriptors are not open" and the bug never triggers. + - **Diagnostic dead-ends** (so the next session does not redo them): the original bug report listed 8 suspects (DRIVE_FIXED, mountvol GUID, GetVolumeInformation, LockFileEx semantics, byte-range Lock callback, FileIdInfo, FlushFileBuffers, sandbox path checks). All eight were verified non-issues on this repo. The `WinFsp.Native` `IFileSystem` interface does not expose Lock/Unlock — WinFsp handles byte-range locks in the kernel; the user-mode FS does not implement them. Document this so nobody chases it again. + - **How procmon was used**: launch via `"C:\Users\HuYao\Desktop\Procmon - Copy.exe" /AcceptEula /Quiet /Minimized /BackingFile F:\procmon_chrome.pml /Runtime 35` from an admin shell, simultaneously run the node repro, then export with `... /OpenLog F:\procmon_chrome.pml /SaveAs F:\procmon_chrome.csv`. The csv is ~150 MB; filter via `grep '"chrome.exe"' procmon_chrome.csv | grep '\\CURRENT' | grep -v SUCCESS`. **Key caveat**: procmon stretches IRP timing enough to mask the STATUS_BREAKPOINT (chrome runs further before crashing), but the underlying `Corruption: CURRENT does not end with newline` warning still appears — diagnose against the warning, not the exit code. + - **The smoking-gun trace** (paste the precise 5-line procmon excerpt that nailed it): + ``` + SetRenameInformationFile dbtmp -> CURRENT SUCCESS ReplaceIfExists:False + CreateFile CURRENT SUCCESS Generic Read + ReadFile CURRENT END OF FILE Length: 8192 + ``` + Comment that ReadFile being END OF FILE proves the kernel did not consult user-mode — WinFsp returned cached size=0 from the negative cache populated by earlier `CreateFile NAME NOT FOUND` lookups. + - **Root cause**: `WinFspRamAdapter.Init` set `host.FileInfoTimeout = uint.MaxValue`. Adapter never invalidated the kernel's cache after `MoveFile`. Negative cache for `CURRENT` ("name does not exist") survived the rename. + - **Reproduction in the integration suite without procmon** (so the next session can iterate locally): the file `tests/RamDrive.IntegrationTests/LevelDbReproTests.cs` and the trace plumbing on `RamDriveFixture` (`SetTraceFilter`, `Trace`). Show the failing-then-passing trace excerpt and the toggle (fixture's `FileInfoTimeout` between `uint.MaxValue` and `1000`). + - **Fix summary**: binding adds `FileSystemHost.Notify`; adapter calls it from every path-mutating callback per the matrix in `design.md` Decision 4. Belt-and-braces: `FileInfoTimeoutMs` config defaults to 1000 ms. + - **Open follow-ups**: TLA+ modeling (described in 7.2–7.4 below); also the question whether `WriteFile` needs a `(ChangeSize, ActionModified)` notification when size grew (`design.md` Open Questions). +- [x] 7.2 In `docs/leveldb-cache-coherency-postmortem.md`, append a **"TLA+ modeling extension"** section describing the planned model changes in enough detail that a fresh session can implement without re-deriving them. Include: + - **Why model it**: the previous `RamDiskSystem.tla` already verifies internal data integrity but treats the kernel as a passive observer. The new bug is between user-mode state and the kernel's `FileInfo` cache — exactly the `GetVolumeInfo` / `FreeBytes` lesson from the existing `Modeling guidelines` section in `CLAUDE.md`. The notification matrix is verifiable. + - **New TLA+ variables** (give exact intended type, not pseudocode): + - `kernelCache \in [Path -> {NotCached} \cup [size: Nat, exists: BOOLEAN]]` + - `cacheMode \in {Permanent, Bounded}` — corresponds to `FileInfoTimeoutMs = uint.MaxValue` vs finite. Bounded mode enables a non-deterministic `CacheExpire(path)` action. + - **New actions**: + - `KernelOpenFile(path)`: if `kernelCache[path] = NotCached`, perform the existing `DoOpen(path)` and write the result to `kernelCache`. Otherwise return the cached value with NO call into the user-mode portion. + - `KernelReadFile(path)`: same shape — relies on cached size if present. + - `Notify(path, action)`: sets `kernelCache[path] := NotCached`. Action is one of `Added`, `Removed`, `RenamedOldName`, `RenamedNewName`, `Modified`. + - `CacheExpire(path)` — only enabled when `cacheMode = Bounded`; non-deterministically clears any cache entry. Models the timeout. + - **Modify existing actions** to call `Notify` per the matrix: + - `DoCreateFile(f)` → `Notify(f, Added)` + - `DoMove(f, g)` → `Notify(f, RenamedOldName)` then `Notify(g, RenamedNewName)` + - `DoDelete(f)` → `Notify(f, Removed)` + - `DoTruncate(f, n)` / `DoExtend(f, n)` → `Notify(f, Modified)` + - `DoWriteP3(f)` → no notification (kernel page cache is invalidated by the FSD using the `FspFileInfo` returned by `Write`, not by `Notify`; document this asymmetry). + - **Invariants to add**: + - `CacheCoherent`: every `Cached(s, e)` entry agrees with the current user-mode state of that path, OR `cacheMode = Bounded` (i.e. an entry may be transiently stale only when timeout-bounded). + - `NoStaleSizeAfterRename`: after `DoMove(f, g)`, any subsequent `KernelReadFile(g)` returns the post-move size of `g`, not a pre-move cached value. + - **Liveness to add**: + - `RenameThenReadEventuallyConsistent`: in `Bounded` mode, `[](DoMove(f,g) ~> KernelReadFile(g) returns SizeOf(g))`. In `Permanent` mode the same property holds because the matrix forces `Notify`. + - **Bug-finding test**: the model should be **provably faulty when `Notify` is removed from `DoMove`** under `cacheMode = Permanent`. The next session should run a configuration with this regression to confirm TLC produces a counterexample matching the procmon trace from 7.1. + - **State-space estimate**: minimal config (3 pages × 2 files × 2 paths × 3 cacheStates) on top of current `RamDiskSystem_Minimal.cfg` (≈ 12 min, 3M states) is expected to grow to ≈ 1–2 hours. Standard config (≈ 5 h, 66M states) likely 1–2 days. If TLC blows up, restrict `Path` to a 2-element symmetry set first. +- [ ] 7.3 Update `tla/RamDiskSystem.tla` per 7.2: add variables, actions, invariants, liveness. Update `tla/RamDiskSystem_Minimal.cfg` to declare the new constants (`Permanent`/`Bounded` cache modes; default to `Permanent` for fastest counterexample-finding). +- [ ] 7.4 Run TLC on the minimal config. Required outcome: with the notification matrix in place, all invariants hold and no liveness violation. Then deliberately remove the `Notify(g, RenamedNewName)` from `DoMove` and re-run — TLC must produce a counterexample trace that shows `KernelReadFile(g)` returning a stale size after rename. Capture both runs (clean + counterexample) into `tla/RamDiskSystem_CacheModel_results.txt` so the postmortem doc has concrete output to reference. +- [ ] 7.5 Append the TLC results to the postmortem doc under a "Verification results" section, then update `CLAUDE.md`'s "How the model maps to code" table with the new actions (`KernelOpenFile`, `KernelReadFile`, `Notify`, `CacheExpire`) and their corresponding code locations (`FileSystemHost.Notify`, `WinFspRamAdapter` matrix calls). diff --git a/openspec/config.yaml b/openspec/config.yaml new file mode 100644 index 0000000..392946c --- /dev/null +++ b/openspec/config.yaml @@ -0,0 +1,20 @@ +schema: spec-driven + +# Project context (optional) +# This is shown to AI when creating artifacts. +# Add your tech stack, conventions, style guides, domain knowledge, etc. +# Example: +# context: | +# Tech stack: TypeScript, React, Node.js +# We use conventional commits +# Domain: e-commerce platform + +# Per-artifact rules (optional) +# Add custom rules for specific artifacts. +# Example: +# rules: +# proposal: +# - Keep proposals under 500 words +# - Always include a "Non-goals" section +# tasks: +# - Break tasks into chunks of max 2 hours diff --git a/openspec/specs/cache-invalidation/spec.md b/openspec/specs/cache-invalidation/spec.md new file mode 100644 index 0000000..bef9cd6 --- /dev/null +++ b/openspec/specs/cache-invalidation/spec.md @@ -0,0 +1,121 @@ +# cache-invalidation + +## Purpose + +Path-mutating WinFsp callbacks must notify the WinFsp kernel cache so subsequent opens/reads see fresh state, even when `FileInfoTimeout` is large or `uint.MaxValue`. Covers the binding-level Notify API, the adapter-level notification matrix, and a worst-case-timeout regression test. + +## Requirements + + +### Requirement: Adapter SHALL invalidate kernel FileInfo cache after path-mutating callbacks + +When a user-mode callback in `WinFspRamAdapter` changes the existence, name, size, or attributes of a path, the adapter MUST send an `FspFileSystemNotify` for every affected path before returning success to WinFsp. The notification carries the path the kernel uses for cache lookup, so future opens and reads observe the post-mutation state regardless of `FileInfoTimeoutMs`. + +The notification path MUST NOT cause the originating callback to fail. If `FspFileSystemNotify` returns a non-success NTSTATUS, the adapter MUST log at trace level and return success for the original IRP. The mutation has already taken effect in `RamFileSystem`; the cache is now allowed to be stale until `FileInfoTimeoutMs` elapses. + +The notification MUST be sent outside any locks held by `RamFileSystem` (`_structureLock`) or `PagedFileContent` (`_lock`). Sending a kernel-synchronous notification under a user-mode lock that the kernel might re-enter is a reentrancy hazard; the adapter wires `Notify` calls after the inner-FS call returns. + +#### Scenario: Rename emits old-name and new-name notifications +- **WHEN** the adapter's `MoveFile` callback is invoked with `oldFileName` and `newFileName` and the underlying `RamFileSystem.Move` returns success +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionRenamedOldName, oldFileName)` followed by `FspFileSystemNotify(ChangeFileName, ActionRenamedNewName, newFileName)` +- **AND** subsequent `OpenFile(newFileName)` from a different handle observes the renamed node's size and attributes (not a stale "name not found" or zero-size record) + +#### Scenario: Rename-replace invalidates the displaced target's cache +- **WHEN** the adapter's `MoveFile` is invoked with `replaceIfExists=true` and `newFileName` already exists +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionRenamedNewName, newFileName)`, which the FSD interprets as "this name now refers to a different node — drop any cached state" +- **AND** subsequent reads on `newFileName` return the source node's content, not zero bytes from the displaced target's cached size + +#### Scenario: Delete emits a removal notification +- **WHEN** the adapter's `Cleanup` is invoked with `CleanupFlags.Delete` and the path is a file +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionRemoved, path)` +- **AND** any subsequent `OpenFile(path)` call returns `ObjectNameNotFound` rather than reusing a cached `FileInfo` from before the delete + +#### Scenario: Directory delete uses the directory filter +- **WHEN** the adapter's `Cleanup` is invoked with `CleanupFlags.Delete` and the path is a directory +- **THEN** the adapter sends `FspFileSystemNotify(ChangeDirName, ActionRemoved, path)` instead of `ChangeFileName` + +#### Scenario: Overwrite invalidates size and timestamp caches +- **WHEN** the adapter's `OverwriteFile` callback truncates the existing file to zero +- **THEN** the adapter sends `FspFileSystemNotify(ChangeSize | ChangeLastWrite, ActionModified, path)` +- **AND** the kernel-cached `FileInfo.FileSize` is invalidated, so a subsequent `GetFileInformation` call sees `FileSize=0` rather than the pre-overwrite value + +#### Scenario: Create emits an addition notification to defeat negative caching +- **WHEN** the adapter's `CreateFile` callback creates a new regular file at `path` +- **THEN** the adapter sends `FspFileSystemNotify(ChangeFileName, ActionAdded, path)` +- **AND** a previous negative cache entry for `path` (a recently failed `OpenFile` returning `ObjectNameNotFound`) is invalidated, so the new file becomes immediately visible to other handles + +#### Scenario: Directory create uses the directory filter +- **WHEN** the adapter's `CreateFile` callback creates a directory at `path` +- **THEN** the adapter sends `FspFileSystemNotify(ChangeDirName, ActionAdded, path)` + +#### Scenario: SetFileSize invalidates size cache +- **WHEN** the adapter's `SetFileSize` callback runs with `setAllocationSize=false` and changes the logical length +- **THEN** the adapter sends `FspFileSystemNotify(ChangeSize | ChangeLastWrite, ActionModified, path)` + +#### Scenario: SetFileAttributes invalidates attributes cache +- **WHEN** the adapter's `SetFileAttributes` callback changes attributes or any timestamp +- **THEN** the adapter sends `FspFileSystemNotify(ChangeAttributes | ChangeLastWrite, ActionModified, path)` + +#### Scenario: Notification failure does not surface as IRP failure +- **WHEN** `FspFileSystemNotify` returns a non-success NTSTATUS from inside any of the above callbacks +- **THEN** the adapter logs the failure at trace level +- **AND** the adapter returns the success result for the original IRP (the user-mode mutation is the source of truth; cache staleness is bounded by `FileInfoTimeoutMs`) + +### Requirement: WinFsp.Native binding SHALL expose a public Notify API for IFileSystem implementations + +The `WinFsp.Native` binding MUST expose a `FileSystemHost.Notify(uint filter, uint action, string fileName)` method that wraps `FspFileSystemNotify`. The method packages the path into the variable-length `FSP_FSCTL_NOTIFY_INFO` buffer (12-byte header + inline UTF-16 file name) using a stack-allocated buffer, with no managed heap allocation on success. + +The binding MUST publish well-known constants for the `Filter` and `Action` parameters under a `FileNotify` static class so callers do not have to define them themselves. Constants exposed: `ChangeFileName`, `ChangeDirName`, `ChangeAttributes`, `ChangeSize`, `ChangeLastWrite`, `ChangeLastAccess`, `ChangeCreation`, `ChangeSecurity`, `ActionAdded`, `ActionRemoved`, `ActionModified`, `ActionRenamedOldName`, `ActionRenamedNewName`. + +The binding MUST normalize the file name to upper case in place (inside the stack buffer, after copying) when the host's `CaseSensitiveSearch` flag is `false`, because the WinFsp driver's cache is keyed on the upper-cased form for case-insensitive volumes. Callers SHALL pass the user-supplied case unchanged. + +The `Notify` method MUST return the underlying NTSTATUS so callers can decide how to handle errors (the adapter, per the requirement above, treats this as non-fatal). It MUST NOT throw managed exceptions for normal NTSTATUS error returns. + +#### Scenario: Notify packs a single notification record and forwards it +- **WHEN** a caller invokes `host.Notify(FileNotify.ChangeFileName, FileNotify.ActionAdded, @"\GCM Store\CURRENT")` +- **THEN** the binding builds a `FSP_FSCTL_NOTIFY_INFO` buffer containing one record `{Size = 12 + 2*pathLen, Filter = ChangeFileName, Action = ActionAdded, FileNameBuf = "\\GCM Store\\CURRENT"}` +- **AND** invokes `FspFileSystemNotify` with that buffer and returns its NTSTATUS + +#### Scenario: Case-insensitive mounts upper-case the path before notifying +- **WHEN** the host has `CaseSensitiveSearch == false` and the caller invokes `host.Notify(..., @"\Temp\CURRENT")` +- **THEN** the buffer passed to `FspFileSystemNotify` contains the path `\TEMP\CURRENT` (upper-cased), matching the FSD's internal cache key +- **AND** the kernel cache entry for that upper-cased path is invalidated + +#### Scenario: Case-sensitive mounts pass the path through unchanged +- **WHEN** the host has `CaseSensitiveSearch == true` and the caller invokes `host.Notify(..., @"\Temp\Current")` +- **THEN** the buffer passed to `FspFileSystemNotify` contains the path `\Temp\Current` byte-for-byte + +#### Scenario: Notify is allocation-free on the hot path +- **WHEN** `host.Notify(...)` is called repeatedly inside a tight loop with paths that fit in the stack-allocated buffer (<= 1024 chars) +- **THEN** no managed heap allocation occurs in the binding (verified by `GC.GetAllocatedBytesForCurrentThread()` delta == 0 across the loop, excluding the marshalled string parameter itself) + +#### Scenario: Notify returns NTSTATUS, does not throw +- **WHEN** `FspFileSystemNotify` returns a non-success NTSTATUS (e.g. because the volume is being torn down) +- **THEN** `host.Notify` returns that NTSTATUS to the caller without raising a managed exception + +### Requirement: Cache-invalidation behavior is regression-tested at the worst-case timeout + +The integration test fixture (`RamDriveFixture`) MUST be configured with `FileInfoTimeout = uint.MaxValue` so that cache invalidation depends entirely on the notification matrix — never on time-based expiration. A regression in the matrix (a missed callback or wrong filter/action) will then produce a stale-cache failure in CI rather than appearing only against real Chromium with the production default. + +The test suite MUST include a `LevelDbReproTests` class that exercises the exact Win32 sequence captured from Chromium's leveldb env when opening a fresh database: + +1. `CreateFile(dbtmp, GENERIC_WRITE, CREATE_ALWAYS)` (default cached I/O — no `FILE_FLAG_NO_BUFFERING` or `FILE_FLAG_WRITE_THROUGH`) +2. `WriteFile(dbtmp, "MANIFEST-000001\n", 16)` +3. `FlushFileBuffers(dbtmp)` and `CloseHandle(dbtmp)` +4. `MoveFileEx(dbtmp, CURRENT, MOVEFILE_REPLACE_EXISTING)` +5. `CreateFile(CURRENT, GENERIC_READ, OPEN_EXISTING)` and `ReadFile(CURRENT, 8192)` + +Step 5 MUST return all 16 bytes of the original payload. + +#### Scenario: Win32 leveldb sequence reads back the renamed content +- **WHEN** the test runs the five-step Win32 sequence above against the integration fixture (which is mounted with `FileInfoTimeout = uint.MaxValue`) +- **THEN** `ReadFile` after the rename returns 16 bytes equal to `"MANIFEST-000001\n"` +- **AND** `ReadFile` does NOT return `END_OF_FILE` or 0 bytes + +#### Scenario: Removing a notification call fails the regression test +- **WHEN** the implementation of `MoveFile` in the adapter is modified to skip the post-rename `Notify` calls +- **THEN** `LevelDbReproTests.Win32_LevelDb_Sequence_Cached` fails with `readBytes == 0` or content mismatch + +#### Scenario: Mixed-case paths still invalidate correctly +- **WHEN** the test sequence uses a path with mixed case (e.g. `\Temp\dbtmp` rename to `\Temp\CURRENT`) +- **THEN** the post-rename read returns the full payload regardless of how the upper-cased FSD cache key compares to the user-supplied path diff --git a/openspec/specs/file-info-timeout-config/spec.md b/openspec/specs/file-info-timeout-config/spec.md new file mode 100644 index 0000000..dc19e91 --- /dev/null +++ b/openspec/specs/file-info-timeout-config/spec.md @@ -0,0 +1,44 @@ +# file-info-timeout-config + +## Purpose + +Operators can bound the WinFsp kernel `FileInfo` cache lifetime via `RamDrive:FileInfoTimeoutMs`, with a safe default that prevents indefinite stale-cache survival if a notification is missed. Acts as defence in depth on top of `cache-invalidation`. + +## Requirements + + +### Requirement: Operators SHALL be able to bound the kernel FileInfo cache lifetime + +The `RamDriveOptions` configuration object MUST expose a `FileInfoTimeoutMs` property (uint, milliseconds) that the adapter SHALL pass to `FileSystemHost.FileInfoTimeout` during `Init`. The default value MUST be `1000` ms. + +This setting acts as defence in depth on top of the explicit cache-invalidation notifications defined by the `cache-invalidation` capability. Notifications are the primary correctness mechanism; the timeout bounds the impact of any path that escapes notification (e.g. a future code path that mutates state without an accompanying `Notify` call). + +The setting MUST be wired through the standard configuration binding (`appsettings.jsonc` under the `RamDrive` section, overridable via CLI as `--RamDrive:FileInfoTimeoutMs=N`). It MUST accept the value `0` (cache disabled) and `uint.MaxValue` (cache effectively permanent — operator opt-in to "trust the notifications completely"). Values in between are passed through unchanged. + +The previously-shipped `EnableKernelCache` boolean MUST be preserved for backward compatibility: +- `EnableKernelCache=true` (the default) makes the adapter apply `FileInfoTimeoutMs` to the host as `FileInfoTimeout`. +- `EnableKernelCache=false` forces the adapter to set `FileInfoTimeout = 0` regardless of `FileInfoTimeoutMs`, preserving the documented "no kernel cache" mode. + +#### Scenario: Default mount uses 1000 ms timeout +- **WHEN** the service starts with the default `appsettings.jsonc` (which does not set `FileInfoTimeoutMs`) +- **THEN** `FileInfoTimeoutMs` equals `1000` +- **AND** the mounted host has `FileInfoTimeout = 1000` + +#### Scenario: Explicit override via CLI +- **WHEN** the service is started with `--RamDrive:FileInfoTimeoutMs=5000` +- **THEN** the mounted host has `FileInfoTimeout = 5000` + +#### Scenario: Setting EnableKernelCache=false zeroes the timeout +- **WHEN** the service is started with `--RamDrive:EnableKernelCache=false --RamDrive:FileInfoTimeoutMs=10000` +- **THEN** the mounted host has `FileInfoTimeout = 0` +- **AND** the explicit `FileInfoTimeoutMs` value is ignored (the boolean toggle wins for compatibility) + +#### Scenario: Operator opts in to permanent cache +- **WHEN** the service is started with `--RamDrive:FileInfoTimeoutMs=4294967295` +- **THEN** the mounted host has `FileInfoTimeout = uint.MaxValue` +- **AND** correctness depends entirely on the notification matrix from the `cache-invalidation` capability (this is the configuration the integration test fixture pins for regression testing) + +#### Scenario: Documented in operator-facing config table +- **WHEN** an operator reads `CLAUDE.md` for the configuration table or `appsettings.jsonc` comments +- **THEN** `FileInfoTimeoutMs` appears with its default (`1000`), unit (milliseconds), and a note that `0` disables the cache and `uint.MaxValue` makes it permanent +- **AND** the existing `EnableKernelCache` row is updated to clarify that `false` overrides `FileInfoTimeoutMs` to `0` diff --git a/repro_chrome.js b/repro_chrome.js new file mode 100644 index 0000000..436e9b1 --- /dev/null +++ b/repro_chrome.js @@ -0,0 +1,55 @@ +// Mimic Playwright's chromium launch: fd 3/4 used for --remote-debugging-pipe +const { spawn } = require('child_process'); +const fs = require('fs'); + +const exe = 'C:\\Users\\HuYao\\AppData\\Local\\ms-playwright\\chromium-1208\\chrome-win64\\chrome.exe'; +const userDataDir = process.argv[2] || 'Z:\\Temp\\rarbg_browser_72j1evkz'; + +const args = [ + '--disable-field-trial-config','--disable-background-networking','--disable-background-timer-throttling', + '--disable-backgrounding-occluded-windows','--disable-back-forward-cache','--disable-breakpad', + '--disable-client-side-phishing-detection','--disable-component-extensions-with-background-pages', + '--disable-component-update','--no-default-browser-check','--disable-default-apps','--disable-dev-shm-usage', + '--disable-extensions', + '--disable-features=AvoidUnnecessaryBeforeUnloadCheckSync,BoundaryEventDispatchTracksNodeRemoval,DestroyProfileOnBrowserClose,DialMediaRouteProvider,GlobalMediaControls,HttpsUpgrades,LensOverlay,MediaRouter,PaintHolding,ThirdPartyStoragePartitioning,Translate,AutoDeElevate,RenderDocument,OptimizationHints', + '--enable-features=CDPScreenshotNewSurface','--allow-pre-commit-input','--disable-hang-monitor', + '--disable-ipc-flooding-protection','--disable-popup-blocking','--disable-prompt-on-repost', + '--disable-renderer-backgrounding','--force-color-profile=srgb','--metrics-recording-only', + '--no-first-run','--password-store=basic','--use-mock-keychain','--no-service-autorun', + '--export-tagged-pdf','--disable-search-engine-choice-screen','--unsafely-disable-devtools-self-xss-warnings', + '--edge-skip-compat-layer-relaunch','--enable-automation','--disable-infobars', + '--disable-search-engine-choice-screen','--disable-sync','--enable-unsafe-swiftshader','--no-sandbox', + '--disable-blink-features=AutomationControlled', + `--user-data-dir=${userDataDir}`, + '--remote-debugging-pipe', + '--enable-logging=stderr','--v=0', + 'about:blank' +]; + +console.log('[repro] launching chrome with userDataDir=', userDataDir); +try { fs.rmSync(userDataDir, { recursive: true, force: true }); } catch {} + +const child = spawn(exe, args, { + stdio: ['ignore', 'pipe', 'pipe', 'pipe', 'pipe'], + windowsHide: true, +}); + +child.stderr.on('data', d => process.stderr.write(d)); +child.stdout.on('data', d => process.stdout.write(d)); +child.stdio[3].on('error', () => {}); +child.stdio[4].on('error', () => {}); + +setTimeout(() => { + console.log('[repro] 30s elapsed, killing'); + try { child.kill(); } catch {} +}, 30000); + +child.on('exit', (code, signal) => { + console.log(`[repro] EXIT code=${code} signal=${signal}`); + if (code !== null) { + const u = code >>> 0; + console.log(`[repro] hex=0x${u.toString(16).toUpperCase().padStart(8,'0')}`); + if (u === 0x80000003) console.log('[repro] STATUS_BREAKPOINT — repro confirmed'); + } + process.exit(0); +}); diff --git a/src/RamDrive.Cli/WinFspRamAdapter.cs b/src/RamDrive.Cli/WinFspRamAdapter.cs index b547376..cab4cd6 100644 --- a/src/RamDrive.Cli/WinFspRamAdapter.cs +++ b/src/RamDrive.Cli/WinFspRamAdapter.cs @@ -16,6 +16,26 @@ namespace RamDrive.Cli; /// Design: zero managed heap allocation on hot path (Read/Write/GetFileInfo etc). /// All ValueTask returns are synchronous-completed (no Task boxing). /// FileNode is cached in . +/// +/// +/// Cache-invalidation matrix — every path-mutating callback below sends an +/// FspFileSystemNotify via +/// after the user-mode mutation commits. This keeps the WinFsp kernel FileInfo +/// cache coherent with state, even when +/// is large or +/// . See specs/cache-invalidation/spec.md. +/// +/// +/// CallbackFilter / Action +/// CreateFile (file)ChangeFileName / ActionAdded +/// CreateFile (dir)ChangeDirName / ActionAdded +/// OverwriteFileChangeSize|ChangeLastWrite / ActionModified +/// SetFileSize (logical)ChangeSize|ChangeLastWrite / ActionModified +/// SetFileAttributesChangeAttributes|ChangeLastWrite / ActionModified +/// MoveFile(old) ChangeFileName/ActionRenamedOldName + (new) ChangeFileName/ActionRenamedNewName +/// Cleanup(Delete) (file)ChangeFileName / ActionRemoved +/// Cleanup(Delete) (dir)ChangeDirName / ActionRemoved +/// /// [SupportedOSPlatform("windows")] internal sealed unsafe class WinFspRamAdapter : IFileSystem @@ -29,6 +49,7 @@ internal sealed unsafe class WinFspRamAdapter : IFileSystem private readonly RamFileSystem _fs; private readonly RamDriveOptions _options; private readonly ILogger _logger; + private FileSystemHost? _host; public WinFspRamAdapter(RamFileSystem fs, IOptions options, ILogger logger) { @@ -58,8 +79,9 @@ public int Init(FileSystemHost host) host.SectorSize = 4096; host.SectorsPerAllocationUnit = 1; host.MaxComponentLength = 255; - if (_options.EnableKernelCache) - host.FileInfoTimeout = unchecked((uint)(-1)); + // Notifications keep the kernel cache coherent; FileInfoTimeoutMs is defence in depth. + // EnableKernelCache=false forces 0 (no cache) regardless of FileInfoTimeoutMs — backout switch. + host.FileInfoTimeout = _options.EnableKernelCache ? _options.FileInfoTimeoutMs : 0u; host.CasePreservedNames = true; host.UnicodeOnDisk = true; host.PersistentAcls = true; @@ -70,6 +92,7 @@ public int Init(FileSystemHost host) public int Mounted(FileSystemHost host) { + _host = host; _logger.LogInformation("Drive mounted at {MountPoint}", host.MountPoint); return NtStatus.Success; } @@ -128,6 +151,9 @@ public ValueTask CreateFile( info.Context = dir; info.IsDirectory = true; + // Cache invalidation: defeat negative cache for callers that probed this name + // before it existed (leveldb, SQLite, etc. routinely do this). + Notify(FileNotify.ChangeDirName, FileNotify.ActionAdded, fileName); return V(new CreateResult(NtStatus.Success, MakeFileInfo(dir))); } @@ -148,6 +174,8 @@ public ValueTask CreateFile( info.Context = file; info.IsDirectory = false; + // Cache invalidation: see comment above for the directory branch. + Notify(FileNotify.ChangeFileName, FileNotify.ActionAdded, fileName); return V(new CreateResult(NtStatus.Success, MakeFileInfo(file))); } @@ -185,6 +213,9 @@ public ValueTask OverwriteFile( node.Attributes |= (FileAttributes)fileAttributes; node.LastWriteTime = DateTime.UtcNow; + // Cache invalidation: size went to 0; subsequent reads must not see the pre-overwrite cached size. + if (info.FileName is { } path) + Notify(FileNotify.ChangeSize | FileNotify.ChangeLastWrite, FileNotify.ActionModified, path); return V(FsResult.Success(MakeFileInfo(node))); } @@ -275,6 +306,7 @@ public ValueTask SetFileAttributes( if (lastWriteTime != 0) node.LastWriteTime = DateTime.FromFileTimeUtc((long)lastWriteTime); // changeTime: we don't track separately + Notify(FileNotify.ChangeAttributes | FileNotify.ChangeLastWrite, FileNotify.ActionModified, fileName); return V(FsResult.Success(MakeFileInfo(node))); } @@ -301,6 +333,7 @@ public ValueTask SetFileSize( return V(FsResult.Error(NtStatus.DiskFull)); node.LastWriteTime = DateTime.UtcNow; + Notify(FileNotify.ChangeSize | FileNotify.ChangeLastWrite, FileNotify.ActionModified, fileName); return V(FsResult.Success(MakeFileInfo(node))); } @@ -366,6 +399,12 @@ public ValueTask MoveFile( // Update cached node — name changed var node = Node(info); if (node != null) info.Context = node; // still valid reference + // Cache invalidation MUST happen after the lock is released (Move returned). + // Without ActionRenamedNewName the kernel's negative cache for newFileName + // (populated by leveldb's pre-rename existence probe) survives the rename and + // a subsequent ReadFile returns 0 bytes — see fix-leveldb-cache-coherency. + Notify(FileNotify.ChangeFileName, FileNotify.ActionRenamedOldName, fileName); + Notify(FileNotify.ChangeFileName, FileNotify.ActionRenamedNewName, newFileName); return V(NtStatus.Success); } return V(NtStatus.ObjectNameCollision); @@ -379,7 +418,11 @@ public void Cleanup(string? fileName, FileOperationInfo info, CleanupFlags flags { if (flags.HasFlag(CleanupFlags.Delete) && fileName != null) { + // Capture IsDirectory BEFORE Delete disposes the node. + bool wasDir = info.IsDirectory; _fs.Delete(fileName); + Notify(wasDir ? FileNotify.ChangeDirName : FileNotify.ChangeFileName, + FileNotify.ActionRemoved, fileName); } var node = Node(info); @@ -443,6 +486,36 @@ public ValueTask ReadDirectory( private static FileNode? Node(FileOperationInfo info) => info.Context as FileNode; + /// + /// Send an FspFileSystemNotify to invalidate the WinFsp kernel FileInfo cache for + /// . Failures are intentionally swallowed: the originating IRP + /// MUST succeed even if cache invalidation fails — the user-mode mutation already + /// committed and stale cache is bounded by FileInfoTimeoutMs. + /// Per specs/cache-invalidation/spec.md. + /// + /// The notification is dispatched on a thread-pool worker rather than synchronously + /// from the WinFsp dispatcher thread. FspFsctlNotify is a kernel IOCTL that can + /// block on rename-in-progress; if a dispatcher thread is blocked in Notify while + /// the IRP that would unblock it is waiting for that same dispatcher thread to drain, + /// the dispatcher pool deadlocks. The off-thread dispatch keeps the dispatcher free at + /// the cost of fire-and-forget ordering — acceptable because the kernel will revalidate + /// any cached entry on the next open after invalidation, and the matrix is path-scoped. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Notify(uint filter, uint action, string path) + { + var host = _host; + if (host == null) return; + ThreadPool.UnsafeQueueUserWorkItem(static state => + { + var (host, filter, action, path, logger) = state; + int status = host.Notify(filter, action, path); + if (status < 0 && logger.IsEnabled(LogLevel.Trace)) + logger.LogTrace("FspFileSystemNotify({Path}, filter=0x{Filter:X}, action={Action}) returned 0x{Status:X8}", + path, filter, action, status); + }, (host, filter, action, path, _logger), preferLocal: false); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static FspFileInfo MakeFileInfo(FileNode node) => new() { diff --git a/src/RamDrive.Cli/appsettings.jsonc b/src/RamDrive.Cli/appsettings.jsonc index bf43eff..b2f7925 100644 --- a/src/RamDrive.Cli/appsettings.jsonc +++ b/src/RamDrive.Cli/appsettings.jsonc @@ -24,13 +24,19 @@ // Volume label shown in Windows Explorer. "VolumeLabel": "RamDrive", - // Enable Windows kernel-level file data caching (sets WinFsp FileInfoTimeout=MAX). - // Improves throughput (~3x) by letting the OS cache manager serve repeated reads - // without calling back into user mode. - // Warning: under high concurrency, WinFsp's metadata cache may return stale file sizes - // because the lazy writer has not yet flushed dirty pages to user mode. + // Enable Windows kernel-level file data caching. When true, FileInfoTimeoutMs + // (below) controls how long the kernel may serve cached metadata. When false, + // the timeout is forced to 0 (no kernel cache, ~3x lower throughput) regardless + // of FileInfoTimeoutMs — this is the documented backout switch. "EnableKernelCache": true, + // Lifetime of the WinFsp kernel FileInfo cache, in milliseconds. Default 1000. + // The adapter sends FspFileSystemNotify on every path-mutating callback so the + // cache is invalidated explicitly; this timeout is defence in depth. + // 0 = cache disabled (same effect as EnableKernelCache=false) + // 4294967295 = uint.MaxValue, cache effectively permanent (trust notifications only) + "FileInfoTimeoutMs": 1000, + // Directories to create automatically after mounting. // JSON keys are directory names; nested objects define subdirectories. // Example: { "Temp": {}, "Cache": { "App1": {}, "App2": {} } } diff --git a/src/RamDrive.Core/Configuration/RamDriveOptions.cs b/src/RamDrive.Core/Configuration/RamDriveOptions.cs index 15fabcc..cb2352c 100644 --- a/src/RamDrive.Core/Configuration/RamDriveOptions.cs +++ b/src/RamDrive.Core/Configuration/RamDriveOptions.cs @@ -9,12 +9,31 @@ public sealed class RamDriveOptions public string VolumeLabel { get; set; } = "RamDrive"; /// - /// Enable Windows kernel-level file data caching (sets WinFsp FileInfoTimeout=MAX). - /// Improves throughput (~3x) by letting the OS cache manager serve repeated reads - /// without calling back into user mode. + /// Enable Windows kernel-level file data caching. When true, the WinFsp host's + /// FileInfoTimeout is set to . When false the + /// timeout is forced to 0 (no kernel cache, ~3× lower throughput) regardless of + /// — this is the documented backout switch. /// public bool EnableKernelCache { get; set; } = true; + /// + /// Lifetime of the WinFsp kernel FileInfo cache, in milliseconds. Default 1000. + /// + /// The adapter sends FspFileSystemNotify calls after every path-mutating + /// operation (rename, delete, overwrite, create, set-size, set-attrs) so the cache is + /// invalidated explicitly. This timeout is defence in depth for any path that escapes + /// the notification matrix. + /// + /// Special values: + /// + /// 0 — cache disabled (same effect as =false). + /// uint.MaxValue (4294967295) — cache effectively permanent. Correctness + /// depends entirely on the notification matrix; the integration test fixture pins this + /// value to catch regressions. + /// + /// + public uint FileInfoTimeoutMs { get; set; } = 1000; + /// /// Tree of directories to create at the root of the RAM disk after mounting. /// JSON keys are directory names; nested objects define subdirectories. diff --git a/tests/RamDrive.IntegrationTests/CacheCoherencyTests.cs b/tests/RamDrive.IntegrationTests/CacheCoherencyTests.cs new file mode 100644 index 0000000..ce86df8 --- /dev/null +++ b/tests/RamDrive.IntegrationTests/CacheCoherencyTests.cs @@ -0,0 +1,221 @@ +using System.ComponentModel; +using System.Runtime.InteropServices; +using FluentAssertions; + +namespace RamDrive.IntegrationTests; + +internal static partial class Win32 +{ + [LibraryImport("kernel32.dll", EntryPoint = "ReplaceFileW", SetLastError = true, StringMarshalling = StringMarshalling.Utf16)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool ReplaceFile( + string lpReplacedFileName, string lpReplacementFileName, + string? lpBackupFileName, uint dwReplaceFlags, nint lpExclude, nint lpReserved); + + [LibraryImport("kernel32.dll", EntryPoint = "MoveFileExW", SetLastError = true, StringMarshalling = StringMarshalling.Utf16)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool MoveFileEx(string lpExistingFileName, string lpNewFileName, uint dwFlags); + + public const uint MOVEFILE_REPLACE_EXISTING = 0x1; + public const uint MOVEFILE_WRITE_THROUGH = 0x8; +} + +/// +/// Cache coherency tests — exercise the LevelDB / SQLite atomic-rename pattern that +/// breaks when WinFsp's kernel FileInfo cache (FileInfoTimeout=MAX) is not +/// invalidated after a MoveFile that replaces an existing target. +/// +/// Reproduces the Chromium / leveldb "CURRENT file does not end with newline" crash +/// observed when mounting a Chromium user-data-dir on the RamDrive. +/// +[Collection("RamDrive")] +public class CacheCoherencyTests(RamDriveFixture fx) : IDisposable +{ + private readonly string _dir = Path.Combine(fx.Root, $"cc_{Guid.NewGuid():N}"); + + public void Dispose() { try { Directory.Delete(_dir, true); } catch { } } + + /// + /// Exact LevelDB CURRENT-file pattern: + /// 1. target exists as a 0-byte file (forces kernel to cache size=0) + /// 2. write tmp with new content + /// 3. MoveFileEx(tmp -> target, REPLACE_EXISTING) + /// 4. open target and read — must see the NEW content, not stale 0 bytes + /// + [Fact] + public void Rename_Replace_Then_Read_Sees_New_Content() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + string tmp = Path.Combine(_dir, "dbtmp"); + + // 1. Create empty target and prime the kernel's FileInfo cache (size=0) + File.WriteAllBytes(target, []); + File.ReadAllBytes(target).Should().BeEmpty(); + + // 2. Write new content to tmp + byte[] expected = "MANIFEST-000001\n"u8.ToArray(); + File.WriteAllBytes(tmp, expected); + + // 3. Atomic rename, replacing the cached-empty target + File.Move(tmp, target, overwrite: true); + + // 4. Read target — kernel cache must NOT serve stale size=0 + byte[] actual = File.ReadAllBytes(target); + actual.Should().Equal(expected, + "MoveFile that replaces an existing target must invalidate the kernel FileInfo cache"); + } + + /// + /// Tighter loop — many rename-replace cycles, each immediately read back. + /// Mirrors how Chromium / leveldb churns CURRENT during DB open + manifest rotation. + /// + [Fact] + public void Rename_Replace_Loop_Always_Reads_Latest() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + File.WriteAllBytes(target, []); // prime size=0 cache + + for (int i = 1; i <= 100; i++) + { + string tmp = Path.Combine(_dir, $"tmp_{i}"); + byte[] payload = System.Text.Encoding.ASCII.GetBytes($"MANIFEST-{i:D6}\n"); + File.WriteAllBytes(tmp, payload); + File.Move(tmp, target, overwrite: true); + + byte[] read = File.ReadAllBytes(target); + read.Should().Equal(payload, $"iteration {i}: rename-then-read must see new content"); + } + } + + /// + /// Variant: target initially has some data, replaced by a file of DIFFERENT size. + /// Catches caches keyed on (size, mtime) rather than just size. + /// + [Fact] + public void Rename_Replace_Different_Size_Then_Read() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + string tmp = Path.Combine(_dir, "dbtmp"); + + File.WriteAllBytes(target, new byte[64]); // 64-byte stale target + File.ReadAllBytes(target).Length.Should().Be(64); // prime cache + + byte[] expected = new byte[16]; + for (int i = 0; i < expected.Length; i++) expected[i] = (byte)(i + 1); + File.WriteAllBytes(tmp, expected); + File.Move(tmp, target, overwrite: true); + + byte[] actual = File.ReadAllBytes(target); + actual.Should().Equal(expected); + } + + /// + /// Verifies that the file size reported by GetFileInformation (used for + /// FileStream.Length and ReadAllBytes' buffer sizing) matches the new content + /// immediately after the rename. + /// + [Fact] + public void Rename_Replace_FileSize_Matches_New_Content() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + string tmp = Path.Combine(_dir, "dbtmp"); + + File.WriteAllBytes(target, []); // 0-byte stale + new FileInfo(target).Length.Should().Be(0); // prime cache + + byte[] expected = new byte[16]; + File.WriteAllBytes(tmp, expected); + File.Move(tmp, target, overwrite: true); + + new FileInfo(target).Length.Should().Be(16, + "file size queried after rename-replace must reflect the new node"); + } + + /// + /// Chromium's leveldb port uses ReplaceFileW (not MoveFileEx) on Windows. + /// Semantics differ: ReplaceFileW preserves the destination's metadata streams + /// and ACLs, internally swapping the file content via a different code path. + /// This is the EXACT call that produces the "CURRENT does not end with newline" + /// crash when launching Chromium with --user-data-dir on this RAM drive. + /// + [Fact] + public void ReplaceFileW_LevelDB_Pattern_Then_Read() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + string tmp = Path.Combine(_dir, "CURRENT.tmp"); + + File.WriteAllBytes(target, []); + File.ReadAllBytes(target).Should().BeEmpty(); // prime kernel cache size=0 + + byte[] expected = "MANIFEST-000001\n"u8.ToArray(); + File.WriteAllBytes(tmp, expected); + + bool ok = Win32.ReplaceFile(target, tmp, null, 0, 0, 0); + if (!ok) throw new Win32Exception(Marshal.GetLastWin32Error(), $"ReplaceFile failed"); + + byte[] actual = File.ReadAllBytes(target); + actual.Should().Equal(expected, + "ReplaceFileW must invalidate the kernel FileInfo cache for the destination"); + } + + /// + /// LevelDB on a fresh DB: CURRENT does not exist yet. The sequence is + /// SetCurrentFile → WriteStringToFile(dbtmp) → RenameFile(dbtmp, CURRENT). + /// The kernel may have negative-cached "CURRENT does not exist" from an + /// earlier probe (leveldb does check whether CURRENT exists before recovery). + /// + [Fact] + public void Rename_To_NonExistent_Target_After_Negative_Probe() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + string tmp = Path.Combine(_dir, "dbtmp"); + + // Negative probe — leveldb does this via ::GetFileAttributesEx + File.Exists(target).Should().BeFalse(); + + byte[] expected = "MANIFEST-000001\n"u8.ToArray(); + File.WriteAllBytes(tmp, expected); + + bool ok = Win32.MoveFileEx(tmp, target, Win32.MOVEFILE_REPLACE_EXISTING); + if (!ok) throw new Win32Exception(Marshal.GetLastWin32Error(), "MoveFileEx failed"); + + // Open and read — buffered, sequential, exactly like leveldb's SequentialFile + using var fs = new FileStream(target, FileMode.Open, FileAccess.Read, FileShare.Read); + fs.Length.Should().Be(expected.Length, "size after rename to never-cached target must reflect new node"); + var buf = new byte[expected.Length]; + int n = fs.Read(buf, 0, buf.Length); + n.Should().Be(expected.Length); + buf.Should().Equal(expected); + } + + /// + /// Use GetFileAttributesEx (BasicInfo) — distinct kernel cache class from + /// FileStandardInfo. Prime with empty target, rename-replace, then query size + /// via attributes. Catches WinFsp's per-info-class cache divergence. + /// + [Fact] + public void Rename_Replace_Then_GetFileAttributesEx_Size() + { + Directory.CreateDirectory(_dir); + string target = Path.Combine(_dir, "CURRENT"); + string tmp = Path.Combine(_dir, "dbtmp"); + + File.WriteAllBytes(target, []); + new FileInfo(target).Length.Should().Be(0); // prime + + byte[] expected = new byte[16]; + File.WriteAllBytes(tmp, expected); + Win32.MoveFileEx(tmp, target, Win32.MOVEFILE_REPLACE_EXISTING).Should().BeTrue(); + + // Force a fresh attributes query via a NEW FileInfo instance + var fi = new FileInfo(target); + fi.Refresh(); + fi.Length.Should().Be(16); + } +} diff --git a/tests/RamDrive.IntegrationTests/LevelDbReproTests.cs b/tests/RamDrive.IntegrationTests/LevelDbReproTests.cs new file mode 100644 index 0000000..91c735d --- /dev/null +++ b/tests/RamDrive.IntegrationTests/LevelDbReproTests.cs @@ -0,0 +1,340 @@ +using System.ComponentModel; +using System.Runtime.InteropServices; +using System.Runtime.Versioning; +using System.Text; +using FluentAssertions; +using RamDrive.Core.Configuration; +using WinFsp.Native; +using Xunit.Abstractions; +using RamDriveCore = RamDrive.Core; + +namespace RamDrive.IntegrationTests; + +internal static partial class LevelDbWin32 +{ + public const uint GENERIC_READ = 0x80000000; + public const uint GENERIC_WRITE = 0x40000000; + public const uint FILE_SHARE_READ = 1, FILE_SHARE_WRITE = 2, FILE_SHARE_DELETE = 4; + public const uint OPEN_EXISTING = 3, CREATE_ALWAYS = 2, OPEN_ALWAYS = 4; + public const uint FILE_ATTRIBUTE_NORMAL = 0x80; + + [LibraryImport("kernel32.dll", EntryPoint = "CreateFileW", SetLastError = true, StringMarshalling = StringMarshalling.Utf16)] + public static partial nint CreateFile(string lpFileName, uint dwDesiredAccess, uint dwShareMode, + nint lpSecurityAttributes, uint dwCreationDisposition, uint dwFlagsAndAttributes, nint hTemplateFile); + + [LibraryImport("kernel32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool WriteFile(nint hFile, byte[] lpBuffer, uint nNumberOfBytesToWrite, + out uint lpNumberOfBytesWritten, nint lpOverlapped); + + [LibraryImport("kernel32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool ReadFile(nint hFile, byte[] lpBuffer, uint nNumberOfBytesToRead, + out uint lpNumberOfBytesRead, nint lpOverlapped); + + [LibraryImport("kernel32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool FlushFileBuffers(nint hFile); + + [LibraryImport("kernel32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool CloseHandle(nint hObject); + + [LibraryImport("kernel32.dll", SetLastError = true, EntryPoint = "MoveFileExW", StringMarshalling = StringMarshalling.Utf16)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool MoveFileEx(string lpExistingFileName, string lpNewFileName, uint dwFlags); + public const uint MOVEFILE_REPLACE_EXISTING = 1; + + [StructLayout(LayoutKind.Explicit, Size = 20)] + public struct FILE_RENAME_INFO + { + // C union { BOOLEAN ReplaceIfExists; DWORD Flags; } at offset 0; 7 bytes padding to 8. + [FieldOffset(0)] public byte ReplaceIfExists; + [FieldOffset(8)] public nint RootDirectory; + [FieldOffset(16)] public uint FileNameLength; + // FileName[] starts at offset 20 (right after FileNameLength, no further padding — + // SetFileInformationByHandle reads (BufferSize - sizeof(header)) bytes of name). + } + + // SetFileInformationByHandle for FileRenameInfo class — this is what NTFS rename via handle uses. + [LibraryImport("kernel32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool SetFileInformationByHandle(nint hFile, int FileInformationClass, + nint lpFileInformation, uint dwBufferSize); + public const int FileRenameInfo = 3; +} + +[Collection("RamDrive")] +public class LevelDbReproTests(RamDriveFixture fx, ITestOutputHelper output) : IDisposable +{ + private readonly string _dir = Path.Combine(fx.Root, $"ldb_{Guid.NewGuid():N}"); + + public void Dispose() + { + RamDriveFixture.SetTraceFilter(null); + try { Directory.Delete(_dir, true); } catch { } + } + + private void DumpTrace() + { + output.WriteLine("=== TraceLog ==="); + lock (RamDriveFixture.TraceLog) + foreach (var line in RamDriveFixture.TraceLog) output.WriteLine(line); + } + + /// + /// Exactly mirrors leveldb's chromium env SetCurrentFile sequence captured by procmon: + /// 1. WriteFile(dbtmp, "MANIFEST-000001\n", 16 bytes) — buffered/cached + /// 2. FlushFileBuffers(dbtmp) + /// 3. CloseHandle(dbtmp) + /// 4. MoveFileEx(dbtmp -> CURRENT, REPLACE_EXISTING) + /// 5. CreateFile(CURRENT, GENERIC_READ) + ReadFile must return 16 bytes + /// + [Fact] + public void Win32_LevelDb_Sequence_Cached() + { + Directory.CreateDirectory(_dir); + string tmp = Path.Combine(_dir, "dbtmp"); + string cur = Path.Combine(_dir, "CURRENT"); + + RamDriveFixture.SetTraceFilter("ldb_"); + + // Phase 1: write dbtmp via Win32 cached I/O (default — no FILE_FLAG_WRITE_THROUGH/NO_BUFFERING) + nint h = LevelDbWin32.CreateFile(tmp, + LevelDbWin32.GENERIC_WRITE, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.CREATE_ALWAYS, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (h == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile dbtmp"); + + byte[] payload = "MANIFEST-000001\n"u8.ToArray(); + if (!LevelDbWin32.WriteFile(h, payload, (uint)payload.Length, out uint written, 0)) + throw new Win32Exception(Marshal.GetLastWin32Error(), "WriteFile"); + written.Should().Be((uint)payload.Length); + + LevelDbWin32.FlushFileBuffers(h).Should().BeTrue(); + LevelDbWin32.CloseHandle(h).Should().BeTrue(); + + // Phase 2: rename dbtmp -> CURRENT + bool moved = LevelDbWin32.MoveFileEx(tmp, cur, LevelDbWin32.MOVEFILE_REPLACE_EXISTING); + if (!moved) throw new Win32Exception(Marshal.GetLastWin32Error(), "MoveFileEx"); + + // Phase 3: open + read CURRENT — MUST see 16 bytes + nint hr = LevelDbWin32.CreateFile(cur, + LevelDbWin32.GENERIC_READ, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.OPEN_EXISTING, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (hr == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile CURRENT for read"); + + byte[] readBuf = new byte[8192]; + bool ok = LevelDbWin32.ReadFile(hr, readBuf, (uint)readBuf.Length, out uint readBytes, 0); + LevelDbWin32.CloseHandle(hr); + + try + { + ok.Should().BeTrue(); + readBytes.Should().Be((uint)payload.Length, "ReadFile after rename must return all 16 bytes; got {0}", readBytes); + readBuf.AsSpan(0, (int)readBytes).ToArray().Should().Equal(payload); + } + catch + { + DumpTrace(); + throw; + } + } + + /// + /// Variant: chromium leveldb uses SetFileInformationByHandle(FileRenameInfo) on Windows 10+ + /// rather than MoveFileEx — they produce different IRPs (SetInformationFile in IRP_MJ_SET_INFORMATION). + /// + [Fact] + public unsafe void Win32_LevelDb_SetFileInformationByHandle_Rename() + { + Directory.CreateDirectory(_dir); + string tmp = Path.Combine(_dir, "dbtmp"); + string cur = Path.Combine(_dir, "CURRENT"); + + RamDriveFixture.SetTraceFilter("ldb_"); + + nint h = LevelDbWin32.CreateFile(tmp, + LevelDbWin32.GENERIC_WRITE | (1u << 16) /* DELETE */, LevelDbWin32.FILE_SHARE_READ | LevelDbWin32.FILE_SHARE_WRITE | LevelDbWin32.FILE_SHARE_DELETE, + 0, LevelDbWin32.CREATE_ALWAYS, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (h == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile dbtmp"); + + byte[] payload = "MANIFEST-000001\n"u8.ToArray(); + LevelDbWin32.WriteFile(h, payload, (uint)payload.Length, out _, 0).Should().BeTrue(); + LevelDbWin32.FlushFileBuffers(h).Should().BeTrue(); + + // Build FILE_RENAME_INFO with target full path inline + string target = cur; + byte[] tgtBytes = Encoding.Unicode.GetBytes(target); + int infoSize = sizeof(LevelDbWin32.FILE_RENAME_INFO) + tgtBytes.Length + 2; // + null terminator + nint buf = Marshal.AllocHGlobal(infoSize); + try + { + new Span((void*)buf, infoSize).Clear(); + var info = (LevelDbWin32.FILE_RENAME_INFO*)buf; + info->ReplaceIfExists = 1; + info->RootDirectory = 0; + info->FileNameLength = (uint)tgtBytes.Length; + tgtBytes.CopyTo(new Span((byte*)buf + sizeof(LevelDbWin32.FILE_RENAME_INFO), tgtBytes.Length)); + bool renamed = LevelDbWin32.SetFileInformationByHandle(h, LevelDbWin32.FileRenameInfo, buf, (uint)infoSize); + if (!renamed) throw new Win32Exception(Marshal.GetLastWin32Error(), "SetFileInformationByHandle(FileRenameInfo)"); + } + finally { Marshal.FreeHGlobal(buf); } + + LevelDbWin32.CloseHandle(h); + + // Read back + nint hr = LevelDbWin32.CreateFile(cur, + LevelDbWin32.GENERIC_READ, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.OPEN_EXISTING, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (hr == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile CURRENT for read"); + + byte[] readBuf = new byte[8192]; + bool ok = LevelDbWin32.ReadFile(hr, readBuf, (uint)readBuf.Length, out uint readBytes, 0); + LevelDbWin32.CloseHandle(hr); + + try + { + ok.Should().BeTrue(); + readBytes.Should().Be((uint)payload.Length, "Got {0} bytes (expected 16)", readBytes); + readBuf.AsSpan(0, (int)readBytes).ToArray().Should().Equal(payload); + } + catch + { + DumpTrace(); + throw; + } + } + + /// + /// Mixed-case variant: rename writes to \Temp\Dbtmp then renames to \TEMP\CURRENT. + /// Validates the case-insensitive upper-casing inside FileSystemHost.Notify — without it, + /// the kernel's cache key for the all-uppercase path won't match the notification's mixed-case + /// path and the read returns zero bytes. + /// + [Fact] + public void MixedCase_Win32_LevelDb_Sequence() + { + // The fixture is case-insensitive (default), so MIXEDCASE/mixedcase resolve to the same node. + // We deliberately use different cases for write vs read paths. + Directory.CreateDirectory(_dir); + string tmpMixedCase = Path.Combine(_dir, "Dbtmp"); + string curUpperCase = Path.Combine(_dir, "CURRENT"); + string curLowerCase = Path.Combine(_dir, "current"); // read with completely different case + + RamDriveFixture.SetTraceFilter("ldb_"); + + nint h = LevelDbWin32.CreateFile(tmpMixedCase, + LevelDbWin32.GENERIC_WRITE, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.CREATE_ALWAYS, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (h == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile dbtmp"); + + byte[] payload = "MANIFEST-000001\n"u8.ToArray(); + LevelDbWin32.WriteFile(h, payload, (uint)payload.Length, out _, 0).Should().BeTrue(); + LevelDbWin32.FlushFileBuffers(h).Should().BeTrue(); + LevelDbWin32.CloseHandle(h); + + LevelDbWin32.MoveFileEx(tmpMixedCase, curUpperCase, LevelDbWin32.MOVEFILE_REPLACE_EXISTING) + .Should().BeTrue(); + + nint hr = LevelDbWin32.CreateFile(curLowerCase, + LevelDbWin32.GENERIC_READ, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.OPEN_EXISTING, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (hr == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile current for read"); + + byte[] readBuf = new byte[8192]; + bool ok = LevelDbWin32.ReadFile(hr, readBuf, (uint)readBuf.Length, out uint readBytes, 0); + LevelDbWin32.CloseHandle(hr); + + try + { + ok.Should().BeTrue(); + readBytes.Should().Be((uint)payload.Length, + "mixed-case path must invalidate the upper-cased FSD cache key; got {0} bytes", readBytes); + readBuf.AsSpan(0, (int)readBytes).ToArray().Should().Equal(payload); + } + catch + { + DumpTrace(); + throw; + } + } +} + +/// +/// Extra tests with a private fixture configured at the production-default +/// FileInfoTimeoutMs=1000 (instead of the suite-wide uint.MaxValue). +/// Verifies the leveldb sequence still works under the shipped configuration — +/// not just the worst-case regression-bait one. +/// +[SupportedOSPlatform("windows")] +public class LevelDbDefaultTimeoutTests : IDisposable +{ + private readonly RamDriveCore.Memory.PagePool _pool; + private readonly RamDriveCore.FileSystem.RamFileSystem _fs; + private readonly FileSystemHost _host; + private readonly string _root; + private readonly string _dir; + + public LevelDbDefaultTimeoutTests() + { + var options = new RamDriveOptions + { + CapacityMb = 64, + PageSizeKb = 64, + EnableKernelCache = true, + FileInfoTimeoutMs = 1000, // production default + VolumeLabel = "DefaultTimeoutTest", + }; + _pool = new RamDriveCore.Memory.PagePool( + new Microsoft.Extensions.Options.OptionsWrapper(options), + Microsoft.Extensions.Logging.Abstractions.NullLogger.Instance); + _fs = new RamDriveCore.FileSystem.RamFileSystem(_pool); + _host = new FileSystemHost(new TestAdapter(_fs, options)) + { + Prefix = $@"\winfsp-tests\itest-default-{Environment.ProcessId}", + }; + int r = _host.Mount(null); + if (r < 0) throw new InvalidOperationException($"mount failed: 0x{r:X8}"); + _root = (_host.MountPoint ?? throw new InvalidOperationException("no mount")) + @"\"; + _dir = Path.Combine(_root, $"ldb_default_{Guid.NewGuid():N}"); + } + + public void Dispose() + { + try { Directory.Delete(_dir, true); } catch { } + _host.Dispose(); + _fs.Dispose(); + _pool.Dispose(); + } + + [Fact] + public void Notify_DefaultTimeoutAlsoWorks() + { + Directory.CreateDirectory(_dir); + string tmp = Path.Combine(_dir, "dbtmp"); + string cur = Path.Combine(_dir, "CURRENT"); + + nint h = LevelDbWin32.CreateFile(tmp, + LevelDbWin32.GENERIC_WRITE, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.CREATE_ALWAYS, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (h == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile dbtmp"); + byte[] payload = "MANIFEST-000001\n"u8.ToArray(); + LevelDbWin32.WriteFile(h, payload, (uint)payload.Length, out _, 0).Should().BeTrue(); + LevelDbWin32.FlushFileBuffers(h).Should().BeTrue(); + LevelDbWin32.CloseHandle(h); + + LevelDbWin32.MoveFileEx(tmp, cur, LevelDbWin32.MOVEFILE_REPLACE_EXISTING).Should().BeTrue(); + + nint hr = LevelDbWin32.CreateFile(cur, + LevelDbWin32.GENERIC_READ, LevelDbWin32.FILE_SHARE_READ, + 0, LevelDbWin32.OPEN_EXISTING, LevelDbWin32.FILE_ATTRIBUTE_NORMAL, 0); + if (hr == -1) throw new Win32Exception(Marshal.GetLastWin32Error(), "CreateFile CURRENT for read"); + byte[] readBuf = new byte[8192]; + bool ok = LevelDbWin32.ReadFile(hr, readBuf, (uint)readBuf.Length, out uint readBytes, 0); + LevelDbWin32.CloseHandle(hr); + + ok.Should().BeTrue(); + readBytes.Should().Be((uint)payload.Length); + readBuf.AsSpan(0, (int)readBytes).ToArray().Should().Equal(payload); + } +} diff --git a/tests/RamDrive.IntegrationTests/RamDriveFixture.cs b/tests/RamDrive.IntegrationTests/RamDriveFixture.cs index d1bb97d..71ed1fa 100644 --- a/tests/RamDrive.IntegrationTests/RamDriveFixture.cs +++ b/tests/RamDrive.IntegrationTests/RamDriveFixture.cs @@ -20,6 +20,22 @@ public sealed class RamDriveFixture : IDisposable public string Root { get; } public long CapacityMb { get; } = 512; + /// + /// Optional file-scoped trace: when non-null, every callback that touches a + /// path matching this filter writes a structured event line to TraceLog. + /// Set via env var RAMDRIVE_TRACE_PATH (substring match, case-insensitive). + /// + public static List TraceLog { get; } = []; + private static string? _traceFilter; + public static void SetTraceFilter(string? substring) { _traceFilter = substring; lock (TraceLog) TraceLog.Clear(); } + internal static void Trace(string op, string? path, string? extra = null) + { + var f = _traceFilter; + if (f == null) return; + if (path != null && path.Contains(f, StringComparison.OrdinalIgnoreCase)) + lock (TraceLog) TraceLog.Add($"{DateTime.UtcNow:HH:mm:ss.ffffff} {op,-22} {path}{(extra == null ? "" : " | " + extra)}"); + } + private readonly PagePool _pool; private readonly RamFileSystem _fs; private readonly FileSystemHost _host; @@ -31,6 +47,10 @@ public RamDriveFixture() CapacityMb = CapacityMb, PageSizeKb = 64, EnableKernelCache = true, + // Pin to worst-case (permanent) cache lifetime so missing FspFileSystemNotify + // calls produce stale-cache test failures in CI rather than only against + // real Chromium with the production default. See specs/cache-invalidation. + FileInfoTimeoutMs = uint.MaxValue, VolumeLabel = "IntegrationTest", }; @@ -75,6 +95,7 @@ internal sealed unsafe class TestAdapter : IFileSystem { private readonly RamFileSystem _fs; private readonly RamDriveOptions _options; + private FileSystemHost? _host; private const string RootSddl = "O:BAG:BAD:P(A;;FA;;;SY)(A;;FA;;;BA)(A;;FA;;;WD)"; @@ -89,13 +110,29 @@ public TestAdapter(RamFileSystem fs, RamDriveOptions options) public bool SynchronousIo => true; + public int Mounted(FileSystemHost host) { _host = host; return NtStatus.Success; } + + [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)] + private void Notify(uint filter, uint action, string path) + { + // Off-thread, fire-and-forget — see WinFspRamAdapter.Notify for rationale + // (avoids dispatcher-pool deadlock under concurrent dir delete). + var host = _host; + if (host == null) return; + ThreadPool.UnsafeQueueUserWorkItem(static state => + { + var (host, filter, action, path) = state; + host.Notify(filter, action, path); + }, (host, filter, action, path), preferLocal: false); + } + public int Init(FileSystemHost host) { host.SectorSize = 4096; host.SectorsPerAllocationUnit = 1; host.MaxComponentLength = 255; if (_options.EnableKernelCache) - host.FileInfoTimeout = unchecked((uint)(-1)); + host.FileInfoTimeout = _options.FileInfoTimeoutMs; host.CasePreservedNames = true; host.UnicodeOnDisk = true; host.PersistentAcls = true; @@ -127,16 +164,20 @@ public ValueTask CreateFile(string fileName, uint co, uint ga, uin var dir = _fs.CreateDirectory(fileName, sd); if (dir == null) return new(CreateResult.Error(NtStatus.ObjectNameCollision)); info.Context = dir; info.IsDirectory = true; + RamDriveFixture.Trace("CreateFile-Dir", fileName, "ok"); + Notify(FileNotify.ChangeDirName, FileNotify.ActionAdded, fileName); return new(new CreateResult(NtStatus.Success, MkInfo(dir))); } var file = _fs.CreateFile(fileName, sd); if (file == null) { - if (_fs.FindNode(fileName) != null) return new(CreateResult.Error(NtStatus.ObjectNameCollision)); - return new(CreateResult.Error(NtStatus.ObjectPathNotFound)); + if (_fs.FindNode(fileName) != null) { RamDriveFixture.Trace("CreateFile-Collision", fileName, ""); return new(CreateResult.Error(NtStatus.ObjectNameCollision)); } + RamDriveFixture.Trace("CreateFile-PathNF", fileName, ""); return new(CreateResult.Error(NtStatus.ObjectPathNotFound)); } if (fa != 0) file.Attributes = (FileAttributes)fa; info.Context = file; info.IsDirectory = false; + RamDriveFixture.Trace("CreateFile", fileName, $"alloc={alloc}"); + Notify(FileNotify.ChangeFileName, FileNotify.ActionAdded, fileName); return new(new CreateResult(NtStatus.Success, MkInfo(file))); } @@ -144,8 +185,9 @@ public ValueTask OpenFile(string fileName, uint co, uint ga, FileOperationInfo info, CancellationToken ct) { var n = _fs.FindNode(fileName); - if (n == null) return new(CreateResult.Error(NtStatus.ObjectNameNotFound)); + if (n == null) { RamDriveFixture.Trace("OpenFile-NF", fileName, ""); return new(CreateResult.Error(NtStatus.ObjectNameNotFound)); } info.Context = n; info.IsDirectory = n.IsDirectory; + RamDriveFixture.Trace("OpenFile", fileName, $"len={n.Content?.Length ?? -1}"); return new(new CreateResult(NtStatus.Success, MkInfo(n))); } @@ -157,6 +199,8 @@ public ValueTask OverwriteFile(uint fa, bool replace, ulong alloc, if (replace && fa != 0) n.Attributes = (FileAttributes)fa; else if (fa != 0) n.Attributes |= (FileAttributes)fa; n.LastWriteTime = DateTime.UtcNow; + if (info.FileName is { } path) + Notify(FileNotify.ChangeSize | FileNotify.ChangeLastWrite, FileNotify.ActionModified, path); return new(FsResult.Success(MkInfo(n))); } @@ -165,10 +209,11 @@ public ValueTask ReadFile(string fileName, Memory buffer, ulon { var n = N(info); if (n?.Content == null) return new(ReadResult.Error(NtStatus.ObjectNameNotFound)); long len = n.Content.Length; - if ((long)offset >= len) return new(ReadResult.EndOfFile()); + if ((long)offset >= len) { RamDriveFixture.Trace("ReadFile-EOF", fileName, $"off={offset} len={len}"); return new(ReadResult.EndOfFile()); } int toRead = (int)Math.Min(buffer.Length, len - (long)offset); int read = n.Content.Read((long)offset, buffer.Span[..toRead]); n.LastAccessTime = DateTime.UtcNow; + RamDriveFixture.Trace("ReadFile", fileName, $"off={offset} bufLen={buffer.Length} fileLen={len} read={read}"); return new(ReadResult.Success((uint)read)); } @@ -177,11 +222,13 @@ public ValueTask WriteFile(string fileName, ReadOnlyMemory bu { var n = N(info); if (n?.Content == null) return new(WriteResult.Error(NtStatus.ObjectNameNotFound)); long wo = wteof ? n.Content.Length : (long)offset; + long beforeLen = n.Content.Length; int wl = buffer.Length; - if (cio) { long fl = n.Content.Length; if (wo >= fl) return new(WriteResult.Success(0, MkInfo(n))); wl = (int)Math.Min(wl, fl - wo); } + if (cio) { long fl = n.Content.Length; if (wo >= fl) { RamDriveFixture.Trace("WriteFile-NOOP", fileName, $"cio=1 wo={wo} fl={fl} buf={buffer.Length}"); return new(WriteResult.Success(0, MkInfo(n))); } wl = (int)Math.Min(wl, fl - wo); } int written = n.Content.Write(wo, buffer.Span[..wl]); if (written < 0) return new(WriteResult.Error(NtStatus.DiskFull)); n.LastWriteTime = DateTime.UtcNow; + RamDriveFixture.Trace("WriteFile", fileName, $"cio={(cio?1:0)} wteof={(wteof?1:0)} off={offset} buf={buffer.Length} wl={wl} written={written} lenBefore={beforeLen} lenAfter={n.Content.Length}"); return new(WriteResult.Success((uint)written, MkInfo(n))); } @@ -199,6 +246,7 @@ public ValueTask SetFileAttributes(string fileName, uint fa, ulong ct2 if (ct2 != 0) n.CreationTime = DateTime.FromFileTimeUtc((long)ct2); if (lat != 0) n.LastAccessTime = DateTime.FromFileTimeUtc((long)lat); if (lwt != 0) n.LastWriteTime = DateTime.FromFileTimeUtc((long)lwt); + Notify(FileNotify.ChangeAttributes | FileNotify.ChangeLastWrite, FileNotify.ActionModified, fileName); return new(FsResult.Success(MkInfo(n))); } @@ -206,15 +254,19 @@ public ValueTask SetFileSize(string fileName, ulong sz, bool alloc, FileOperationInfo info, CancellationToken ct) { var n = N(info); if (n?.Content == null) return new(FsResult.Error(NtStatus.ObjectNameNotFound)); + long beforeLen = n.Content.Length; if (alloc) { long additional = (long)sz - n.Size; if (additional > 0 && additional > _fs.FreeBytes) return new(FsResult.Error(NtStatus.DiskFull)); + RamDriveFixture.Trace("SetAllocSize", fileName, $"sz={sz}"); return new(FsResult.Success(MkInfo(n))); } if (!n.Content.SetLength((long)sz)) return new(FsResult.Error(NtStatus.DiskFull)); n.LastWriteTime = DateTime.UtcNow; + RamDriveFixture.Trace("SetFileSize", fileName, $"sz={sz} lenBefore={beforeLen} lenAfter={n.Content.Length}"); + Notify(FileNotify.ChangeSize | FileNotify.ChangeLastWrite, FileNotify.ActionModified, fileName); return new(FsResult.Success(MkInfo(n))); } @@ -246,11 +298,29 @@ public ValueTask CanDelete(string fileName, FileOperationInfo info, Cancell public ValueTask MoveFile(string fileName, string newFileName, bool replace, FileOperationInfo info, CancellationToken ct) - => new(_fs.Move(fileName, newFileName, replace) ? NtStatus.Success : NtStatus.ObjectNameCollision); + { + var nBefore = N(info); + long lenBefore = nBefore?.Content?.Length ?? -1; + bool ok = _fs.Move(fileName, newFileName, replace); + var nAfter = _fs.FindNode(newFileName); + long lenAfter = nAfter?.Content?.Length ?? -1; + RamDriveFixture.Trace("MoveFile", fileName, $"-> {newFileName} replace={replace} ok={ok} sourceLenBefore={lenBefore} targetLenAfter={lenAfter} sameNode={ReferenceEquals(nBefore,nAfter)}"); + if (ok) + { + Notify(FileNotify.ChangeFileName, FileNotify.ActionRenamedOldName, fileName); + Notify(FileNotify.ChangeFileName, FileNotify.ActionRenamedNewName, newFileName); + } + return new(ok ? NtStatus.Success : NtStatus.ObjectNameCollision); + } public void Cleanup(string? fileName, FileOperationInfo info, CleanupFlags flags) { - if (flags.HasFlag(CleanupFlags.Delete) && fileName != null) _fs.Delete(fileName); + if (flags.HasFlag(CleanupFlags.Delete) && fileName != null) + { + bool wasDir = info.IsDirectory; + _fs.Delete(fileName); + Notify(wasDir ? FileNotify.ChangeDirName : FileNotify.ChangeFileName, FileNotify.ActionRemoved, fileName); + } var n = N(info); if (n == null) return; if (flags.HasFlag(CleanupFlags.SetLastWriteTime)) n.LastWriteTime = DateTime.UtcNow; if (flags.HasFlag(CleanupFlags.SetLastAccessTime)) n.LastAccessTime = DateTime.UtcNow;