From 18d3a328bb5b8d772bad309f5c7905a6ca493487 Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 13:58:33 +0200 Subject: [PATCH 1/9] feat: add Mozilla-Readability full-text article extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most RSS feeds ship truncated summaries and call it a day. The user then has to leave the terminal, open a browser, and re-read the metadata they just read in feedr. This is the single biggest paper cut in the whole experience. Add a full-text extraction path that fetches the article URL and runs Mozilla Readability over the HTML to pull out the actual content. Bind it to Shift+F in the detail view (toggle on success, retry on failure, no-op while pending), or set fulltext = true on a feed to auto-extract newly-seen items on refresh. The crate of choice is dom_smoothie — pure sync, MSRV matches ours exactly, actively maintained, has a parse budget. The obvious alternative (article_scraper) is async-only and would drag tokio plus a full reqwest async stack into a codebase that has been deliberately single-threaded synchronous from day one. Not happening. Extractions run on per-request std::thread::spawn workers that post results back over an mpsc::channel — same shape as spawn_feed_refresh. State lives in App::extracted, keyed by the existing item-id scheme, with an LRU cap of 500 and explicit in-memory-only semantics. No disk persistence in v1 — cache invalidation is a problem I would rather not invent for a feature whose entire point is "fetch fresh content on demand." Per-feed Authorization headers are deliberately NOT forwarded to article URLs. They are third-party hosts. Leaking bearer tokens to random blog domains is the kind of thing that ends up in a CVE. While at it, mark_feed_seen is hoisted out of collect_exec_on_new so exec_on_new and the new auto-fulltext path share one mark per feed arrival. Calling it twice would double-mark and the second consumer would see an empty newly_seen list and silently do nothing. The gate also covers both consumers now, so users with neither hook still pay zero disk for seen-set tracking — same behavior as before, just with a second consumer wired in. --- CLAUDE.md | 5 +- Cargo.lock | 367 +++++++++++++++++++++++++++++++++++++++++---- Cargo.toml | 1 + README.md | 20 +++ src/app.rs | 289 ++++++++++++++++++++++++++++++++++- src/config.rs | 41 +++++ src/config_tui.rs | 1 + src/events.rs | 4 + src/feed.rs | 184 +++++++++++++++++++++++ src/keybindings.rs | 11 ++ src/tui.rs | 124 +++++++++++++-- src/ui/detail.rs | 79 ++++++++-- src/ui/modals.rs | 5 + 13 files changed, 1074 insertions(+), 57 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ab71c5f..b5c87e1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Feedr is a terminal-based RSS/Atom feed reader built with Rust, using ratatui/crossterm for the TUI. It supports feed management, categorization, filtering, dual themes, OPML import, auto-refresh with per-domain rate limiting, feed auto-discovery from HTML pages, configurable keybindings, mouse support, a help overlay, and newsboat-style external-command hooks (macros and `exec_on_new`). +Feedr is a terminal-based RSS/Atom feed reader built with Rust, using ratatui/crossterm for the TUI. It supports feed management, categorization, filtering, dual themes, OPML import, auto-refresh with per-domain rate limiting, feed auto-discovery from HTML pages, configurable keybindings, mouse support, a help overlay, newsboat-style external-command hooks (macros and `exec_on_new`), and Mozilla-Readability full-text article extraction. ## Build & Development Commands @@ -65,7 +65,8 @@ MSRV: 1.75.0. CI runs tests on stable, beta, and 1.75.0. - **Authenticated feeds**: `feed_headers: HashMap>` in `App` maps feed URLs to custom HTTP headers. Built from `config.default_feeds` entries that have `headers`. Passed to `Feed::fetch_url()` at all fetch call sites. - **Compact mode**: `app.compact` bool is updated each frame by `update_compact_mode(terminal_height)`. Rendering in `ui.rs` branches on `app.compact` for layout, title bar, help bar, and dashboard item format. Controlled by `config.ui.compact_mode` (`Auto`/`Always`/`Never`). Dialog modals use `centered_rect_with_min()` to enforce minimum dimensions regardless of compact mode. - **External-command hooks (macros + `exec_on_new`)**: Commands are run **without a shell**. Templates are tokenized once at config load via `shlex`, then `expand_argv_template` substitutes `%X` placeholders into individual argv tokens (no re-expansion), so feed content cannot break out of an argument. The macro engine queues steps into `app.pending_macro_steps` from `events.rs` and the TUI loop drains them in `tui.rs::drain_macro_steps` — drain lives at the loop level because `pipe-to` needs the terminal handle to suspend the TUI. Chains halt on the first step error (tracked via a `pre_error` guard so a stale `app.error` doesn't spuriously abort). The macro prefix (default `,`) is checked at the top of `handle_key_event` and only when `input_mode == Normal`, so text-input modes are not disturbed; an idle prefix times out via the existing success-message timeout. -- **`exec_on_new` crash semantics**: AT-MOST-ONCE. `flush_exec_on_new` persists the `seen_items` / `feeds_seeded` sets *before* spawning any child, so a kill mid-fire loses a notification rather than re-firing on the next launch. `mark_feed_seen` only flips `feeds_seeded` on a fetch that returned items (transiently-empty first fetches don't seed), and the first observation of a feed seeds the seen-set silently to avoid a firehose. Children are spawned detached with stdio nulled; a reaper thread waits on each so they don't linger as zombies. The seen-set is pruned in `remove_current_feed` to prevent monotonic growth across feed churn. +- **`exec_on_new` crash semantics**: AT-MOST-ONCE. `flush_exec_on_new` persists the `seen_items` / `feeds_seeded` sets *before* spawning any child, so a kill mid-fire loses a notification rather than re-firing on the next launch. `mark_feed_seen` only flips `feeds_seeded` on a fetch that returned items (transiently-empty first fetches don't seed), and the first observation of a feed seeds the seen-set silently to avoid a firehose. Children are spawned detached with stdio nulled; a reaper thread waits on each so they don't linger as zombies. The seen-set is pruned in `remove_current_feed` to prevent monotonic growth across feed churn. **Single-shared mark per feed**: `mark_feed_seen` is hoisted to the feed-drain call site in `tui.rs` (gated on `exec_on_new_template.is_some() || fulltext_feeds.contains(&feed.url)`) so multiple consumers (currently exec_on_new and fulltext) share one mark per feed arrival — calling it twice would double-mark and the second consumer would see an empty `newly_seen` list. +- **Full-text extraction**: `feed::extract_article` fetches an article URL with the existing `reqwest::blocking::Client` and runs Mozilla Readability via the `dom_smoothie` crate. **Sync, no tokio.** Per-feed `Authorization`/auth headers are intentionally NOT forwarded to article URLs (they're third-party hosts — propagating would be a credential leak). Each extraction runs on a `std::thread::spawn` worker that posts to a long-lived `mpsc::channel` drained every loop iteration in `run_app`. State lives only in `App::extracted: HashMap` (`Pending` / `Ready(ExtractedArticle)` / `Failed(String)`) — **in-memory only**, never persisted; LRU-capped at `EXTRACTED_CACHE_CAP = 500` with insertion-order tracking via `extracted_order: VecDeque`. `Shift+F` (`KeyAction::FetchFullText`) toggles between summary and extracted text when `Ready`, queues a new request when absent, and re-queues on `Failed` (so the user can retry). Per-feed `fulltext = true` in config auto-extracts newly-seen items on refresh (same `mark_feed_seen` "newly seen" semantics as `exec_on_new` — first fetch seeds silently, no firehose). Extracted entries are pruned alongside `seen_items` in `remove_current_feed`. ## Commit Conventions diff --git a/Cargo.lock b/Cargo.lock index cc431c1..bffb879 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,6 +168,21 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -380,7 +395,20 @@ dependencies = [ "cssparser-macros", "dtoa-short", "itoa", - "phf", + "phf 0.10.1", + "smallvec", +] + +[[package]] +name = "cssparser" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.13.1", "smallvec", ] @@ -405,6 +433,27 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "derive_more" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" +dependencies = [ + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.100", +] + [[package]] name = "dirs" version = "5.0.1" @@ -437,6 +486,40 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "dom_query" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521e380c0c8afb8d9a1e83a1822ee03556fc3e3e7dbc1fd30be14e37f9cb3f89" +dependencies = [ + "bit-set", + "cssparser 0.36.0", + "foldhash", + "html5ever 0.38.0", + "nom", + "precomputed-hash", + "selectors 0.36.1", + "tendril 0.5.0", +] + +[[package]] +name = "dom_smoothie" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21cad308997c8c518aaee15c8b38bb04dc699bbcf30da4176443c9ef40c1bb5" +dependencies = [ + "dom_query", + "flagset", + "foldhash", + "gjson", + "html-escape", + "once_cell", + "phf 0.13.1", + "tendril 0.5.0", + "thiserror 2.0.18", + "unicode-segmentation", +] + [[package]] name = "dtoa" version = "1.0.11" @@ -521,6 +604,7 @@ dependencies = [ "clap", "crossterm", "dirs", + "dom_smoothie", "feed-rs", "html2text", "open", @@ -537,6 +621,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "flagset" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7ac824320a75a52197e8f2d787f6a38b6718bb6897a35142d749af3c0e8f4fe" + [[package]] name = "flate2" version = "1.1.2" @@ -553,6 +643,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -682,6 +778,12 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +[[package]] +name = "gjson" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43503cc176394dd30a6525f5f36e838339b8b5619be33ed9a7783841580a97b6" + [[package]] name = "h2" version = "0.3.26" @@ -744,15 +846,24 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + [[package]] name = "html2text" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74cda84f06c1cc83476f79ae8e2e892b626bdadafcb227baec54c918cadc18a0" dependencies = [ - "html5ever", - "markup5ever", - "tendril", + "html5ever 0.26.0", + "markup5ever 0.11.0", + "tendril 0.4.3", "unicode-width 0.1.14", "xml5ever", ] @@ -765,12 +876,22 @@ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" dependencies = [ "log", "mac", - "markup5ever", + "markup5ever 0.11.0", "proc-macro2", "quote", "syn 1.0.109", ] +[[package]] +name = "html5ever" +version = "0.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2" +dependencies = [ + "log", + "markup5ever 0.38.0", +] + [[package]] name = "http" version = "0.2.12" @@ -1127,11 +1248,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" dependencies = [ "log", - "phf", - "phf_codegen", - "string_cache", - "string_cache_codegen", - "tendril", + "phf 0.10.1", + "phf_codegen 0.10.0", + "string_cache 0.8.9", + "string_cache_codegen 0.5.4", + "tendril 0.4.3", +] + +[[package]] +name = "markup5ever" +version = "0.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862" +dependencies = [ + "log", + "tendril 0.5.0", + "web_atoms", ] [[package]] @@ -1210,6 +1342,15 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1302,7 +1443,7 @@ checksum = "df2f96426c857a92676dc29a9e2a181eb39321047ac994491c69eae01619ddf2" dependencies = [ "hard-xml", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1358,11 +1499,22 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ - "phf_macros", + "phf_macros 0.10.0", "phf_shared 0.10.0", "proc-macro-hack", ] +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros 0.13.1", + "phf_shared 0.13.1", + "serde", +] + [[package]] name = "phf_codegen" version = "0.10.0" @@ -1373,6 +1525,16 @@ dependencies = [ "phf_shared 0.10.0", ] +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + [[package]] name = "phf_generator" version = "0.10.0" @@ -1393,6 +1555,16 @@ dependencies = [ "rand", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + [[package]] name = "phf_macros" version = "0.10.0" @@ -1407,6 +1579,19 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "phf_shared" version = "0.10.0" @@ -1425,6 +1610,15 @@ dependencies = [ "siphasher 1.0.1", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher 1.0.1", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -1466,9 +1660,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.94" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] @@ -1562,7 +1756,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1642,6 +1836,21 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.2" @@ -1698,13 +1907,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "585480e3719b311b78a573db1c9d9c4c1f8010c2dee4cc59c2efe58ea4dbc3e1" dependencies = [ "ahash", - "cssparser", + "cssparser 0.31.2", "ego-tree", "getopts", - "html5ever", + "html5ever 0.26.0", "once_cell", - "selectors", - "tendril", + "selectors 0.25.0", + "tendril 0.4.3", ] [[package]] @@ -1737,18 +1946,43 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" dependencies = [ "bitflags 2.9.0", - "cssparser", - "derive_more", + "cssparser 0.31.2", + "derive_more 0.99.20", "fxhash", "log", "new_debug_unreachable", - "phf", - "phf_codegen", + "phf 0.10.1", + "phf_codegen 0.10.0", "precomputed-hash", - "servo_arc", + "servo_arc 0.3.0", "smallvec", ] +[[package]] +name = "selectors" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5d9c0c92a92d33f08817311cf3f2c29a3538a8240e94a6a3c622ce652d7e00c" +dependencies = [ + "bitflags 2.9.0", + "cssparser 0.36.0", + "derive_more 2.1.1", + "log", + "new_debug_unreachable", + "phf 0.13.1", + "phf_codegen 0.13.1", + "precomputed-hash", + "rustc-hash", + "servo_arc 0.4.3", + "smallvec", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.219" @@ -1811,6 +2045,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1903,6 +2146,18 @@ dependencies = [ "serde", ] +[[package]] +name = "string_cache" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.13.1", + "precomputed-hash", +] + [[package]] name = "string_cache_codegen" version = "0.5.4" @@ -1915,6 +2170,18 @@ dependencies = [ "quote", ] +[[package]] +name = "string_cache_codegen" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" @@ -2027,13 +2294,32 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tendril" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24" +dependencies = [ + "new_debug_unreachable", + "utf-8", +] + [[package]] name = "thiserror" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", ] [[package]] @@ -2047,6 +2333,17 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "tinystr" version = "0.7.6" @@ -2215,6 +2512,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" +[[package]] +name = "utf8-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -2354,6 +2657,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web_atoms" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538" +dependencies = [ + "phf 0.13.1", + "phf_codegen 0.13.1", + "string_cache 0.9.0", + "string_cache_codegen 0.6.1", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2768,7 +3083,7 @@ checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" dependencies = [ "log", "mac", - "markup5ever", + "markup5ever 0.11.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 8ee8fab..6aa800a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ toml = "0.8" scraper = "0.18" url = "2" shlex = "1.3" +dom_smoothie = "0.17" [profile.release] codegen-units = 1 diff --git a/README.md b/README.md index c29c125..384adb6 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Feedr is a feature-rich terminal-based RSS feed reader written in Rust. It provi - **Mark All Read**: Quickly mark all visible items as read with `m` - **Article Preview**: Toggle an inline preview pane in the dashboard view - **Link Extraction**: Extract and browse all links from an article with `l` +- **Full-Text Extraction**: Strip away summaries and read the actual article content inline via Mozilla Readability — manual on `Shift+F`, or auto-extract on refresh per feed with `fulltext = true` - **Help Overlay**: Press `?` for a scrollable keybinding reference overlay - **OPML Import**: Bulk import feeds from OPML files via `feedr --import ` - **Browser Integration**: Open articles in your default browser @@ -175,6 +176,7 @@ All keybindings below show their defaults. You can remap any action via the `[ke | `s` | Toggle starred | | `o` | Open item in browser | | `l` | Extract and show all links | +| `Shift+F` | Toggle/fetch full-text (Readability) | #### Starred View | Key | Action | @@ -314,6 +316,23 @@ Cookie = "session=abc123" ``` Headers are sent with every request for that feed, including refreshes. +#### Full-Text Extraction +Most RSS feeds ship only short summaries. Feedr can fetch the linked article URL and run [Mozilla Readability](https://github.com/mozilla/readability) (via the `dom_smoothie` crate) to extract the actual article body and render it inline. + +- **Manual**: in the article detail view, press `Shift+F` to extract the focused article. Press `Shift+F` again to toggle back to the original summary, or after a failure to retry. +- **Auto on refresh**: set `fulltext = true` on a feed and Feedr will auto-extract newly-seen items on each refresh (same "no firehose" rule as `exec_on_new` — the first observation of a feed seeds silently). + +```toml +[[default_feeds]] +url = "https://example.com/summary-only-feed.xml" +fulltext = true +``` + +Notes: +- Extracted content is **in-memory only** — it is not persisted to disk. A restart re-extracts on demand. +- Per-feed auth headers are **not** sent to the article URL (article URLs are typically third-party hosts; forwarding `Authorization` would leak credentials). +- Pages with very short extracted bodies (likely JS-rendered or behind a wall) fail gracefully and fall back to showing the original summary. + ### External-Command Hooks Feedr supports newsboat-style external commands for two workflows: **macros** (key-triggered chains that act on the focused article) and **`exec_on_new`** (a notification hook fired per newly-seen item after each refresh). @@ -417,6 +436,7 @@ toggle_theme = "F5" # Function keys | `open_category_management` | `Ctrl+c` | Category management | | `assign_category` | `c` | Assign category to feed | | `extract_links` | `l` | Extract links from article | +| `fetch_full_text` | `Shift+F` | Toggle/fetch full-text (Readability) | | `scroll_preview_up` | `Shift+K`, `Shift+Up` | Scroll preview up | | `scroll_preview_down` | `Shift+J`, `Shift+Down` | Scroll preview down | | `toggle_expand` | `Space` | Expand/collapse in tree view | diff --git a/src/app.rs b/src/app.rs index f862b29..bc7abb0 100644 --- a/src/app.rs +++ b/src/app.rs @@ -1,5 +1,5 @@ use crate::config::{CompactMode, Config}; -use crate::feed::{Feed, FeedCategory, FeedItem}; +use crate::feed::{ExtractedArticle, Feed, FeedCategory, FeedItem}; use crate::ui::ColorScheme; use anyhow::Result; use chrono::{DateTime, Utc}; @@ -143,6 +143,25 @@ pub struct App { /// disabled or the template failed to tokenize at startup (in which case /// a startup warning is surfaced). pub exec_on_new_template: Option>, + /// Per-item full-text extraction cache. Keyed by the same item_id scheme + /// as `read_items` / `starred_items`. In-memory only; not persisted. + pub extracted: HashMap, + /// Insertion order for `extracted`, used to drop the oldest entry when + /// the cache hits `EXTRACTED_CACHE_CAP`. + pub extracted_order: VecDeque, + /// Feed URLs that have `fulltext = true` in config and should trigger + /// auto-extraction for newly-seen items on each refresh. Same derivation + /// pattern as `feed_headers` / `feed_refresh_intervals`. + pub fulltext_feeds: HashSet, + /// When the focused article has a `Ready` extraction, this flag controls + /// whether the detail view renders the extracted text or the original + /// summary. Single global toggle (not per-item) — simplest UX and the + /// detail view only ever shows one article at a time. + pub show_extracted: bool, + /// Queue of extraction requests for the TUI loop to spawn worker threads + /// for. Decouples request-time (event handler) from spawn-time (loop + /// drain) so events.rs doesn't need the thread-handle / channel. + pub pending_extraction_requests: VecDeque, } /// Snapshot of the focused article passed to template-expansion / pipe-payload helpers. @@ -191,6 +210,32 @@ pub enum AddFeedResult { }, } +/// State of a per-item full-text extraction. Lives only in-memory: a +/// restart re-fetches on demand or via the next refresh for fulltext-flagged +/// feeds. No on-disk persistence by design — saves us from cache-invalidation +/// logic and bounds storage growth. +#[derive(Clone, Debug)] +pub enum ExtractionState { + Pending, + Ready(ExtractedArticle), + Failed(String), +} + +/// A queued request for the TUI loop to spawn a worker thread for. Filled +/// in `request_extraction_for_current` and `enqueue_auto_extractions`, +/// drained in `tui.rs` each tick (same shape as `pending_macro_steps`). +#[derive(Clone, Debug)] +pub struct ExtractionRequest { + pub id: String, + pub url: String, +} + +/// Cap on the number of cached extracted articles. Auto-fetch over a big +/// feed could otherwise accumulate state monotonically across a long +/// session. Insertion-order LRU eviction (oldest cache entries are dropped +/// first) — `extracted_order` tracks the insertion sequence. +pub const EXTRACTED_CACHE_CAP: usize = 500; + #[derive(Serialize, Deserialize)] struct SavedData { bookmarks: Vec, @@ -256,6 +301,14 @@ impl App { .filter_map(|f| f.refresh_interval.map(|interval| (f.url.clone(), interval))) .collect(); + // Build the set of URLs that opted into full-text auto-extraction + let fulltext_feeds: HashSet = config + .default_feeds + .iter() + .filter(|f| f.fulltext.unwrap_or(false)) + .map(|f| f.url.clone()) + .collect(); + // Parse last session time from saved data let last_session_time = saved_data .last_session_time @@ -351,6 +404,11 @@ impl App { feeds_seeded: saved_data.feeds_seeded, pending_macro_steps: VecDeque::new(), exec_on_new_template, + extracted: HashMap::new(), + extracted_order: VecDeque::new(), + fulltext_feeds, + show_extracted: true, + pending_extraction_requests: VecDeque::new(), }; app.update_dashboard(); @@ -908,10 +966,23 @@ impl App { // feeds_seeded and its item IDs would stay in seen_items // forever, growing the persisted JSON file monotonically. self.feeds_seeded.remove(&url); + let mut removed_ids: Vec = Vec::new(); for item in &self.feeds[idx].items { let id = make_item_id(&self.feeds[idx], item); self.seen_items.remove(&id); + removed_ids.push(id); } + // Drop in-memory extracted full-text for the removed items + // too — keeping them would never expire (no persistence) + // but they're keyed on item ids that no longer resolve. + for id in &removed_ids { + if self.extracted.remove(id).is_some() { + if let Some(pos) = self.extracted_order.iter().position(|x| x == id) { + self.extracted_order.remove(pos); + } + } + } + self.fulltext_feeds.remove(&url); // Remove from feeds self.feeds.remove(idx); @@ -1695,6 +1766,134 @@ impl App { ) -> Option<&crate::keybindings::MacroBinding> { self.macros.iter().find(|m| m.trigger.matches(key)) } + + /// Look up extraction state for an item by its `(feed_idx, item_idx)`. + /// Returns `None` if the indices don't resolve or no extraction has been + /// requested for that item. + pub fn extraction_state_for( + &self, + feed_idx: usize, + item_idx: usize, + ) -> Option<&ExtractionState> { + let id = self.get_item_id(feed_idx, item_idx); + if id.is_empty() { + None + } else { + self.extracted.get(&id) + } + } + + /// Insert an extraction state into the cache and bump LRU order. If the + /// id is already present, the existing entry is replaced and the order + /// entry is moved to the end. If we're at the cap, the oldest entry is + /// evicted. + pub fn record_extraction_result(&mut self, id: String, result: Result) { + if id.is_empty() { + return; + } + let state = match result { + Ok(article) => ExtractionState::Ready(article), + Err(e) => ExtractionState::Failed(format!("{}", e)), + }; + self.insert_extraction(id, state); + } + + /// Mark an item as `Pending` and append the request to the spawn queue. + /// No-op if the item already has any state (Pending / Ready / Failed) — + /// callers should clear or re-request explicitly. + pub fn request_extraction(&mut self, id: String, url: String) { + if id.is_empty() || url.is_empty() { + return; + } + if self.extracted.contains_key(&id) { + return; + } + self.insert_extraction(id.clone(), ExtractionState::Pending); + self.pending_extraction_requests + .push_back(ExtractionRequest { id, url }); + } + + fn insert_extraction(&mut self, id: String, state: ExtractionState) { + // If id is already present, move its LRU position to most-recent and + // overwrite the state. + if let Some(existing) = self.extracted.get_mut(&id) { + *existing = state; + if let Some(pos) = self.extracted_order.iter().position(|x| x == &id) { + self.extracted_order.remove(pos); + } + self.extracted_order.push_back(id); + return; + } + // New entry — evict oldest if we're at the cap. Skip eviction if the + // oldest entry is currently Pending (it has an in-flight thread that + // will try to write back). + while self.extracted_order.len() >= EXTRACTED_CACHE_CAP { + let evict = match self.extracted_order.front() { + Some(front) => front.clone(), + None => break, + }; + let is_pending = matches!(self.extracted.get(&evict), Some(ExtractionState::Pending)); + self.extracted_order.pop_front(); + if !is_pending { + self.extracted.remove(&evict); + } else { + // Pending entry — re-insert at the back so we don't busy-loop + // evicting it. The eventual write-back will land normally. + self.extracted_order.push_back(evict); + break; + } + } + self.extracted.insert(id.clone(), state); + self.extracted_order.push_back(id); + } + + /// Handler for the FetchFullText keybinding. Resolves the focused + /// article, then either toggles the view (if extraction is Ready) or + /// queues a new extraction request. Re-queues on `Failed` so the user + /// can retry. No-op on `Pending` (extraction already in flight). + pub fn toggle_or_request_fulltext(&mut self) { + let Some((feed_idx, item_idx)) = self.current_article_indices() else { + self.error = Some("No article in focus".to_string()); + return; + }; + let id = self.get_item_id(feed_idx, item_idx); + if id.is_empty() { + self.error = Some("No article in focus".to_string()); + return; + } + let url = match self + .feeds + .get(feed_idx) + .and_then(|f| f.items.get(item_idx)) + .and_then(|i| i.link.clone()) + { + Some(u) => u, + None => { + self.error = Some("Article has no URL".to_string()); + return; + } + }; + match self.extracted.get(&id) { + Some(ExtractionState::Ready(_)) => { + self.show_extracted = !self.show_extracted; + } + Some(ExtractionState::Pending) => { + // already in flight — leave it + } + Some(ExtractionState::Failed(_)) => { + self.extracted.remove(&id); + if let Some(pos) = self.extracted_order.iter().position(|x| x == &id) { + self.extracted_order.remove(pos); + } + self.show_extracted = true; + self.request_extraction(id, url); + } + None => { + self.show_extracted = true; + self.request_extraction(id, url); + } + } + } } /// Build an `ArticleContext` directly from a `Feed` + `FeedItem` pair — @@ -2587,4 +2786,92 @@ mod tests { assert!(parsed.seen_items.is_empty()); assert!(parsed.feeds_seeded.is_empty()); } + + fn dummy_extracted(text: &str) -> ExtractedArticle { + ExtractedArticle { + title: "T".to_string(), + plain_text: text.to_string(), + byline: None, + site_name: None, + source_url: "https://ex.com/a".to_string(), + } + } + + #[test] + fn test_request_extraction_creates_pending_and_enqueues() { + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + assert!(matches!( + app.extracted.get("id1"), + Some(ExtractionState::Pending) + )); + assert_eq!(app.pending_extraction_requests.len(), 1); + } + + #[test] + fn test_request_extraction_idempotent_for_already_pending() { + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + // No double-queue: second call must be a no-op because state is Pending. + assert_eq!(app.pending_extraction_requests.len(), 1); + } + + #[test] + fn test_record_extraction_result_transitions_to_ready() { + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("body"))); + match app.extracted.get("id1") { + Some(ExtractionState::Ready(a)) => assert_eq!(a.plain_text, "body"), + other => panic!("expected Ready, got {:?}", other), + } + } + + #[test] + fn test_record_extraction_result_transitions_to_failed() { + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + app.record_extraction_result("id1".to_string(), Err(anyhow::anyhow!("boom"))); + match app.extracted.get("id1") { + Some(ExtractionState::Failed(msg)) => assert!(msg.contains("boom")), + other => panic!("expected Failed, got {:?}", other), + } + } + + #[test] + fn test_lru_evicts_oldest_at_cap() { + let mut app = App::new(); + // Insert one more than the cap; the first id must be evicted. + for i in 0..EXTRACTED_CACHE_CAP + 1 { + let id = format!("id{}", i); + app.record_extraction_result(id, Ok(dummy_extracted("x"))); + } + assert!(!app.extracted.contains_key("id0"), "oldest must be evicted"); + assert_eq!(app.extracted.len(), EXTRACTED_CACHE_CAP); + } + + #[test] + fn test_remove_current_feed_prunes_extracted_cache() { + let mut app = make_test_app(); + // Seed extraction state for items in the first feed. + let id_old = app.get_item_id(0, 0); + let id_new = app.get_item_id(0, 1); + app.record_extraction_result(id_old.clone(), Ok(dummy_extracted("a"))); + app.record_extraction_result(id_new.clone(), Ok(dummy_extracted("b"))); + assert_eq!(app.extracted.len(), 2); + + // Remove the first feed. + app.selected_feed = Some(0); + app.remove_current_feed().ok(); + + assert!( + !app.extracted.contains_key(&id_old) && !app.extracted.contains_key(&id_new), + "extracted entries for removed feed must be pruned" + ); + assert!( + app.extracted_order.is_empty(), + "LRU order must be pruned in sync with the map" + ); + } } diff --git a/src/config.rs b/src/config.rs index 4dd1bf5..f082a64 100644 --- a/src/config.rs +++ b/src/config.rs @@ -129,6 +129,10 @@ pub struct DefaultFeed { /// Per-feed refresh interval in seconds; None = use global interval #[serde(default, skip_serializing_if = "Option::is_none")] pub refresh_interval: Option, + /// When true, auto-extract full-text for newly-seen items from this feed + /// on each refresh. Manual extraction via Shift+F always works regardless. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub fulltext: Option, } // Default value functions @@ -406,6 +410,15 @@ impl Config { # [default_feeds.headers]\n\ # Authorization = \"Bearer your_token_here\"\n\ #\n\ + # Full-text extraction (Readability) — auto-extract on refresh:\n\ + # [[default_feeds]]\n\ + # url = \"https://example.com/summary-only-feed.xml\"\n\ + # fulltext = true\n\ + #\n\ + # Press Shift+F in the article detail view to extract on-demand\n\ + # for any feed. Auth headers from this feed are NOT sent to the\n\ + # article URL — they would leak to third-party hosts.\n\ + #\n\ # ── External-command hooks ──────────────────────────────\n\ #\n\ # Macros bind a key (invoked as , default prefix is ',') to\n\ @@ -474,6 +487,34 @@ mod tests { assert_eq!(config.ui.error_display_timeout, 3000); } + #[test] + fn test_default_feed_without_fulltext_parses() { + // Existing configs that have no `fulltext` key must keep working. + let toml_str = r#" + [[default_feeds]] + url = "https://example.com/feed.xml" + "#; + let config: Config = toml::from_str(toml_str).unwrap(); + assert_eq!(config.default_feeds.len(), 1); + assert!(config.default_feeds[0].fulltext.is_none()); + } + + #[test] + fn test_default_feed_with_fulltext_round_trips() { + let toml_str = r#" + [[default_feeds]] + url = "https://example.com/feed.xml" + fulltext = true + "#; + let config: Config = toml::from_str(toml_str).unwrap(); + assert_eq!(config.default_feeds[0].fulltext, Some(true)); + + // Round-trip: serialize the parsed config and parse the result. + let serialized = toml::to_string(&config).unwrap(); + let reparsed: Config = toml::from_str(&serialized).unwrap(); + assert_eq!(reparsed.default_feeds[0].fulltext, Some(true)); + } + #[test] fn test_default_feed_with_headers() { let toml_str = r#" diff --git a/src/config_tui.rs b/src/config_tui.rs index 92bc6ee..34700f6 100644 --- a/src/config_tui.rs +++ b/src/config_tui.rs @@ -361,6 +361,7 @@ impl ConfigEditor { category, headers: None, refresh_interval: None, + fulltext: None, }); self.dirty = true; self.adding_feed = false; diff --git a/src/events.rs b/src/events.rs index c883cbc..bd2998a 100644 --- a/src/events.rs +++ b/src/events.rs @@ -162,6 +162,7 @@ pub(crate) fn dispatch_action(app: &mut App, action: KeyAction) { Err(e) => app.error = Some(format!("Failed to mark all read: {}", e)), }, KeyAction::ExtractLinks => app.extract_links_from_current_item(), + KeyAction::FetchFullText => app.toggle_or_request_fulltext(), other => { app.error = Some(format!( "macro: action '{}' not supported in macros", @@ -841,6 +842,9 @@ pub(crate) fn handle_key_event(app: &mut App, key: crossterm::event::KeyEvent) - _ if app.key_matches(KeyAction::Help, &key) => { handle_show_help(app); } + _ if app.key_matches(KeyAction::FetchFullText, &key) => { + app.toggle_or_request_fulltext(); + } _ => {} }, View::Starred => match key.code { diff --git a/src/feed.rs b/src/feed.rs index 985a75b..cf06656 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -1,5 +1,6 @@ use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use dom_smoothie::{Config as ReadabilityConfig, Readability}; use feed_rs::parser; use scraper::{Html, Selector}; use serde::{Deserialize, Serialize}; @@ -10,6 +11,16 @@ use std::time::Duration; use url::Url; use uuid::Uuid; +/// Cap on response body size for article extraction. Anything larger is +/// almost certainly not a typical article page and would only burn CPU in +/// the Readability DOM walker. +const FULLTEXT_MAX_BYTES: usize = 5 * 1024 * 1024; + +/// Minimum text length the extracted article body must contain to be +/// considered useful. Pages below this are treated as "page appears empty" +/// — usually JS-rendered shells, login walls, or 404 placeholders. +const FULLTEXT_MIN_LENGTH: usize = 200; + #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Feed { pub url: String, @@ -100,6 +111,112 @@ pub struct DiscoveredFeed { pub feed_type: FeedType, } +/// Outcome of a successful Readability extraction against an article URL. +/// Decoupled from `dom_smoothie::Article` so the upstream crate can be +/// swapped without rippling through the rest of the codebase. +#[derive(Clone, Debug)] +pub struct ExtractedArticle { + pub title: String, + pub plain_text: String, + pub byline: Option, + pub site_name: Option, + pub source_url: String, +} + +/// Fetch `url` with `client` and run Mozilla-Readability extraction over the +/// returned HTML. Feed auth headers are intentionally NOT propagated: the +/// article URL is on a different host and may be third-party, so leaking +/// per-feed `Authorization` headers would be a credential leak. +/// +/// Rejects non-`text/html` responses, oversized bodies (`FULLTEXT_MAX_BYTES`), +/// and pages whose extracted text falls below `FULLTEXT_MIN_LENGTH` (treated +/// as JS-rendered or empty). +pub fn extract_article( + url: &str, + client: &reqwest::blocking::Client, + user_agent: Option<&str>, +) -> Result { + let default_user_agent = + "Mozilla/5.0 (compatible; Feedr/1.0; +https://github.com/bahdotsh/feedr)"; + let ua = user_agent.unwrap_or(default_user_agent); + + let response = client + .get(url) + .header("User-Agent", ua) + .header("Accept", "text/html, application/xhtml+xml, */*;q=0.5") + .header("Accept-Language", "en-US,en;q=0.9") + .header("Accept-Encoding", "gzip, deflate") + .send() + .context("Failed to fetch article")?; + + let status = response.status(); + if !status.is_success() { + return Err(anyhow::anyhow!( + "HTTP error {} fetching article from {}", + status, + url + )); + } + + let content_type = response + .headers() + .get("content-type") + .and_then(|ct| ct.to_str().ok()) + .unwrap_or("") + .to_lowercase(); + if !content_type.is_empty() + && !content_type.contains("text/html") + && !content_type.contains("application/xhtml") + { + return Err(anyhow::anyhow!( + "Article URL did not return HTML (content-type: {})", + content_type + )); + } + + let final_url = response.url().to_string(); + let bytes = response.bytes().context("Failed to read article body")?; + if bytes.len() > FULLTEXT_MAX_BYTES { + return Err(anyhow::anyhow!( + "Article body too large ({} bytes, cap {} bytes)", + bytes.len(), + FULLTEXT_MAX_BYTES + )); + } + + let html = String::from_utf8_lossy(&bytes).into_owned(); + extract_from_html(&html, &final_url) +} + +/// Pure Readability step — no I/O. Split out so unit tests can exercise the +/// extraction + min-length + error-path logic without a network round-trip. +pub fn extract_from_html(html: &str, source_url: &str) -> Result { + let cfg = ReadabilityConfig { + max_elements_to_parse: 9000, + ..ReadabilityConfig::default() + }; + let mut readability = Readability::new(html, Some(source_url), Some(cfg)) + .context("Failed to initialize Readability")?; + let article = readability + .parse() + .context("Readability could not parse article")?; + + if article.length < FULLTEXT_MIN_LENGTH { + return Err(anyhow::anyhow!( + "Page appears empty (only {} chars of body) — likely JS-rendered or paywalled", + article.length + )); + } + + Ok(ExtractedArticle { + title: article.title, + plain_text: article.text_content.to_string(), + byline: article.byline, + site_name: article.site_name, + source_url: source_url.to_string(), + }) +} + /// Result of fetching a URL that was expected to be a feed. pub enum FeedFetchResult { /// Successfully parsed as an RSS/Atom feed. @@ -548,4 +665,71 @@ mod tests { let err = result.into_feed().unwrap_err(); assert!(err.to_string().contains("No RSS/Atom feed links found")); } + + /// A blog-post-shaped fixture long enough to clear FULLTEXT_MIN_LENGTH + /// once Readability strips boilerplate. + fn fixture_article_html() -> String { + let body = "Feedr is a terminal-based RSS feed reader. It supports \ + categories, OPML import, and a configurable keybinding system. \ + This paragraph is repeated several times to ensure the extracted \ + text content comfortably exceeds the minimum-length threshold \ + that the extractor enforces before declaring the page empty. "; + let mut content = String::new(); + for _ in 0..5 { + content.push_str("

"); + content.push_str(body); + content.push_str("

"); + } + format!( + r#" + About Feedr + + + +
+

About Feedr

+ {} +
+
Copyright 2026
+ "#, + content + ) + } + + #[test] + fn test_extract_from_html_succeeds_on_blog_shaped_page() { + let html = fixture_article_html(); + let article = extract_from_html(&html, "https://example.com/post").unwrap(); + assert!(!article.title.is_empty(), "title should be extracted"); + assert!( + article.plain_text.len() >= FULLTEXT_MIN_LENGTH, + "extracted text below minimum: {}", + article.plain_text.len() + ); + assert_eq!(article.source_url, "https://example.com/post"); + } + + #[test] + fn test_extract_from_html_rejects_short_content() { + let html = r#"

tiny.

"#; + let err = extract_from_html(html, "https://example.com/empty").unwrap_err(); + assert!( + err.to_string().to_lowercase().contains("empty"), + "expected 'empty' in error, got: {}", + err + ); + } + + #[test] + fn test_extract_from_html_rejects_bad_document_url() { + let html = fixture_article_html(); + let err = extract_from_html(&html, "not-a-url").unwrap_err(); + let msg = format!("{:?}", err); + // dom_smoothie's BadDocumentURL wrapped by our context string. + assert!( + msg.contains("Failed to initialize") || msg.to_lowercase().contains("url"), + "expected BadDocumentURL error, got: {}", + msg + ); + } } diff --git a/src/keybindings.rs b/src/keybindings.rs index 30aeafa..8d6e72c 100644 --- a/src/keybindings.rs +++ b/src/keybindings.rs @@ -99,6 +99,7 @@ pub enum KeyAction { AssignCategory, // Detail ExtractLinks, + FetchFullText, ScrollPreviewUp, ScrollPreviewDown, // Tree @@ -140,6 +141,7 @@ impl KeyAction { Self::OpenCategoryManagement => "open-category-management", Self::AssignCategory => "assign-category", Self::ExtractLinks => "extract-links", + Self::FetchFullText => "fetch-full-text", Self::ScrollPreviewUp => "scroll-preview-up", Self::ScrollPreviewDown => "scroll-preview-down", Self::ToggleExpand => "toggle-expand", @@ -181,6 +183,7 @@ impl FromStr for KeyAction { "open_category_management" => Ok(Self::OpenCategoryManagement), "assign_category" => Ok(Self::AssignCategory), "extract_links" => Ok(Self::ExtractLinks), + "fetch_full_text" => Ok(Self::FetchFullText), "scroll_preview_up" => Ok(Self::ScrollPreviewUp), "scroll_preview_down" => Ok(Self::ScrollPreviewDown), "toggle_expand" => Ok(Self::ToggleExpand), @@ -353,6 +356,13 @@ pub fn default_keybindings() -> KeyBindingMap { KeyAction::ExtractLinks, vec![KeyBinding::new(KeyCode::Char('l'))], ); + // Capital F (Shift+F). Matches both crossterm's `Char('F')` events and + // `Char('F')+SHIFT` thanks to `KeyBinding::matches` using a subset check + // on modifiers — same pattern as `JumpBottom` (G). + map.insert( + KeyAction::FetchFullText, + vec![KeyBinding::new(KeyCode::Char('F'))], + ); map.insert( KeyAction::ScrollPreviewUp, vec![ @@ -1114,6 +1124,7 @@ mod tests { KeyAction::OpenCategoryManagement, KeyAction::AssignCategory, KeyAction::ExtractLinks, + KeyAction::FetchFullText, KeyAction::ScrollPreviewUp, KeyAction::ScrollPreviewDown, KeyAction::ToggleExpand, diff --git a/src/tui.rs b/src/tui.rs index a3e43a8..8f227c6 100644 --- a/src/tui.rs +++ b/src/tui.rs @@ -1,6 +1,6 @@ use crate::app::{App, View}; use crate::events::handle_events; -use crate::feed::Feed; +use crate::feed::{ExtractedArticle, Feed}; use crate::ui; use anyhow::{Context, Result}; use crossterm::{ @@ -201,31 +201,28 @@ fn drain_macro_steps(terminal: &mut Terminal, app: &mut App) { } } -/// Diff a freshly-arrived feed against `app.seen_items`, mutate the seen -/// state in memory, and return the argv list to spawn for each genuinely-new -/// item. On the first successful fetch of a feed (URL not yet in -/// `feeds_seeded`), seed the seen set silently and return an empty list. +/// Build the argv list to spawn for each genuinely-new item on this feed. +/// `newly_seen` must come from `mark_feed_seen` — this helper consumes the +/// pre-computed indices so multiple consumers (exec_on_new + fulltext) can +/// share one `mark_feed_seen` call per feed arrival instead of double-marking. /// /// Returns Vec rather than spawning + saving inline so a refresh batch can /// `save_data` ONCE before spawning ANY child, instead of once per feed /// arrival (write amplification of N for N bookmarks). Crash semantics still /// AT-MOST-ONCE: see [`flush_exec_on_new`] for the persistence ordering. /// -/// No-op when the hook is not configured. seen_items / feeds_seeded only -/// exist to drive this hook, so users who never opt in pay neither the -/// memory cost nor any save_data round-trip. -fn collect_exec_on_new(app: &mut App, feed: &Feed) -> Vec> { +/// No-op when the hook is not configured. +fn collect_exec_on_new(app: &App, feed: &Feed, newly_seen: &[usize]) -> Vec> { let argv_template = match app.exec_on_new_template.as_ref() { Some(t) => t.clone(), None => return Vec::new(), }; - let newly_seen_idx = crate::app::mark_feed_seen(app, feed); - if newly_seen_idx.is_empty() { + if newly_seen.is_empty() { return Vec::new(); } - newly_seen_idx - .into_iter() - .filter_map(|idx| { + newly_seen + .iter() + .filter_map(|&idx| { let item = feed.items.get(idx)?; let ctx = crate::app::article_context_from(feed, item); Some(crate::app::expand_argv_template(&argv_template, &ctx)) @@ -233,6 +230,27 @@ fn collect_exec_on_new(app: &mut App, feed: &Feed) -> Vec> { .collect() } +/// For feeds with `fulltext = true`, queue an extraction request for each +/// newly-seen item. Uses the same newly-seen set as `exec_on_new` so an +/// item is only auto-extracted once (and not on the first ever fetch of a +/// feed — that one seeds the seen-set silently, matching the exec_on_new +/// no-firehose contract). +fn enqueue_fulltext_for_new(app: &mut App, feed: &Feed, newly_seen: &[usize]) { + if !app.fulltext_feeds.contains(&feed.url) { + return; + } + for &idx in newly_seen { + let Some(item) = feed.items.get(idx) else { + continue; + }; + let Some(url) = item.link.as_deref() else { + continue; + }; + let id = crate::app::make_item_id(feed, item); + app.request_extraction(id, url.to_string()); + } +} + /// Persist the updated seen-set, then spawn all collected exec_on_new /// children. The "save then spawn" order is the AT-MOST-ONCE guarantee: /// a kill between save and spawn loses notifications, but a restart never @@ -287,11 +305,40 @@ fn spawn_feed_refresh(app: &mut App) -> (usize, mpsc::Receiver<(usize, Result)>) { + if app.pending_extraction_requests.is_empty() { + return; + } + let timeout = app.config.network.http_timeout; + let user_agent = app.config.network.user_agent.clone(); + let client = match Feed::build_client(timeout) { + Ok(c) => c, + Err(_) => return, + }; + while let Some(req) = app.pending_extraction_requests.pop_front() { + let client = client.clone(); + let ua = user_agent.clone(); + let tx = tx.clone(); + std::thread::spawn(move || { + let result = crate::feed::extract_article(&req.url, &client, Some(&ua)); + let _ = tx.send((req.id, result)); + }); + } +} + fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> { let mut last_tick = std::time::Instant::now(); let tick_rate = Duration::from_millis(app.config.ui.tick_rate); let error_timeout = Duration::from_millis(app.config.ui.error_display_timeout); + // Long-lived channel for extraction worker completions. Outlives any + // individual refresh because manual `F` extractions can fire at any time. + let (extract_tx, extract_rx) = mpsc::channel::<(String, Result)>(); + // Initial load of bookmarked feeds let (mut pending_count, mut feed_rx) = spawn_feed_refresh(app); @@ -323,7 +370,20 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> let mut pending_exec: Vec> = Vec::new(); while let Ok((idx, result)) = feed_rx.try_recv() { if let Ok(feed) = result { - pending_exec.extend(collect_exec_on_new(app, &feed)); + // Compute newly-seen ONCE per feed arrival, shared + // between exec_on_new and fulltext consumers. Skip the + // mark entirely when neither consumer needs it — keeps + // `seen_items` empty for users without hooks, matching + // the original zero-cost contract. + let needs_newly_seen = app.exec_on_new_template.is_some() + || app.fulltext_feeds.contains(&feed.url); + let newly_seen = if needs_newly_seen { + crate::app::mark_feed_seen(app, &feed) + } else { + Vec::new() + }; + pending_exec.extend(collect_exec_on_new(app, &feed, &newly_seen)); + enqueue_fulltext_for_new(app, &feed, &newly_seen); // Insert at the correct position to maintain bookmark order, // or append if earlier feeds haven't arrived yet let insert_pos = app @@ -381,6 +441,18 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> tick_rate }; + // Spawn workers for any extraction requests queued by events or + // by the feed-refresh path above. Run every iteration so manual + // `F` requests don't have to wait for a tick boundary. + spawn_pending_extractions(app, &extract_tx); + + // Drain completed extractions back into app state. This is what + // flips a `Pending` entry to `Ready` / `Failed`, which the detail + // view reads from on the next draw. + while let Ok((id, result)) = extract_rx.try_recv() { + app.record_extraction_result(id, result); + } + if event::poll(timeout)? { // Handle user input if handle_events(app)? { @@ -488,6 +560,28 @@ mod tests { assert!(app.show_help_overlay); } + #[test] + fn test_record_extraction_result_flips_pending_to_ready() { + use crate::app::ExtractionState; + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + // simulate a worker thread completing successfully + app.record_extraction_result( + "id1".to_string(), + Ok(ExtractedArticle { + title: "T".to_string(), + plain_text: "body".to_string(), + byline: None, + site_name: None, + source_url: "https://ex.com/a".to_string(), + }), + ); + match app.extracted.get("id1") { + Some(ExtractionState::Ready(a)) => assert_eq!(a.plain_text, "body"), + other => panic!("expected Ready, got {:?}", other), + } + } + #[test] fn test_drain_preserves_preexisting_error_and_still_runs_step() { // Edge case: if `app.error` is already set before a step runs, the diff --git a/src/ui/detail.rs b/src/ui/detail.rs index af83bb3..e5f2f2a 100644 --- a/src/ui/detail.rs +++ b/src/ui/detail.rs @@ -1,4 +1,4 @@ -use crate::app::App; +use crate::app::{App, ExtractionState}; use crate::ui::utils::{count_wrapped_lines, format_content_for_reading, truncate_url}; use crate::ui::ColorScheme; use html2text::from_read; @@ -11,6 +11,18 @@ use ratatui::{ Frame, }; +/// Which body the detail view is currently showing. Owns its strings so +/// the immutable borrow on `app.extracted` is released before we call any +/// `&mut App` method later in the render (e.g. `update_detail_max_scroll`). +enum BodySource { + Summary, + Extracted(String), + Extracting, + /// Failed extraction — body falls back to the summary, and `reason` is + /// shown as a small dim hint above it. + Failed(String), +} + pub(super) fn render_item_detail( f: &mut Frame, app: &mut App, @@ -117,13 +129,47 @@ pub(super) fn render_item_detail( f.render_widget(header, chunks[0]); - // Process content with enhanced formatting - let description = if let Some(desc) = &item.description { - // Convert HTML to plain text with better width for readability - let raw_text = from_read(desc.as_bytes(), 100); - format_content_for_reading(&raw_text) - } else { - "No description available".to_string() + // Decide which body to render: the original summary, the extracted + // full-text, or a placeholder for in-flight / failed extractions. + let body_source: BodySource = + match (app.selected_feed, app.selected_item, app.show_extracted) { + (Some(fi), Some(ii), true) => match app.extraction_state_for(fi, ii) { + Some(ExtractionState::Ready(article)) => { + BodySource::Extracted(article.plain_text.clone()) + } + Some(ExtractionState::Pending) => BodySource::Extracting, + Some(ExtractionState::Failed(reason)) => BodySource::Failed(reason.clone()), + None => BodySource::Summary, + }, + _ => BodySource::Summary, + }; + + let description = match &body_source { + BodySource::Summary => { + if let Some(desc) = &item.description { + let raw_text = from_read(desc.as_bytes(), 100); + format_content_for_reading(&raw_text) + } else { + "No description available".to_string() + } + } + BodySource::Failed(reason) => { + let summary = if let Some(desc) = &item.description { + let raw_text = from_read(desc.as_bytes(), 100); + format_content_for_reading(&raw_text) + } else { + "No description available".to_string() + }; + // Surface the failure reason at the top, then fall back to + // the original summary so the user still has something to + // read. Press F again to retry. + format!( + "[Full-text extraction failed: {}]\n[Press F to retry — showing summary]\n\n{}", + reason, summary + ) + } + BodySource::Extracted(text) => format_content_for_reading(text), + BodySource::Extracting => "Fetching full text…".to_string(), }; // Calculate the viewport height (accounting for borders and padding) @@ -153,21 +199,28 @@ pub(super) fn render_item_detail( ("↓", "↑") // Light: simple arrows }; + let body_label = match &body_source { + BodySource::Summary => "Summary", + BodySource::Extracted(_) => "Full-text", + BodySource::Extracting => "Extracting…", + BodySource::Failed(_) => "Full-text failed", + }; + let scroll_indicator = if app.detail_max_scroll > 0 { let scroll_pct = (app.detail_vertical_scroll as f32 / app.detail_max_scroll as f32 * 100.0) as u16; if app.detail_vertical_scroll == 0 { format!( - " {} Article Content · Scroll {} for more ", - article_icon, scroll_arrows.0 + " {} {} · Scroll {} for more ", + article_icon, body_label, scroll_arrows.0 ) } else if app.detail_vertical_scroll >= app.detail_max_scroll { - format!(" {} Article Content · End of article ", article_icon) + format!(" {} {} · End of article ", article_icon, body_label) } else { - format!(" {} Article Content · {}% ", article_icon, scroll_pct) + format!(" {} {} · {}% ", article_icon, body_label, scroll_pct) } } else { - format!(" {} Article Content ", article_icon) + format!(" {} {} ", article_icon, body_label) }; // Create content paragraph with theme-specific styling diff --git a/src/ui/modals.rs b/src/ui/modals.rs index 2261dcb..1f67a00 100644 --- a/src/ui/modals.rs +++ b/src/ui/modals.rs @@ -750,6 +750,11 @@ pub(super) fn render_help_overlay(f: &mut Frame, app: &App, color "Extract links/images", &mut lines, ); + add_key( + &kd(&KeyAction::FetchFullText), + "Toggle/fetch full-text (Readability)", + &mut lines, + ); add_key( &kd(&KeyAction::OpenSearch), "Search across all feeds", From c95f71565ab2f33d17817c2c794f5954cc623c08 Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 14:03:58 +0200 Subject: [PATCH 2/9] docs(ui): surface fetch-full-text in help bars and macro whitelist While wiring Shift+F in, I only updated the help *overlay* (the scrollable `?` thing) and the README. The bottom help bar that's always on screen, the compact-mode help bar, and the macro-whitelist documentation in the auto-generated config.toml comment block all still pretended the action didn't exist. Add the FetchFullText keybinding to both render_help_bar and render_compact_help_bar for the FeedItemDetail view, and tack fetch-full-text onto the list of macro-callable actions in the config doc-comment. The dispatch_action whitelist in events.rs already allowed it from macros, but a user reading the config file would never have known. --- src/config.rs | 2 +- src/ui/mod.rs | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/config.rs b/src/config.rs index f082a64..de91143 100644 --- a/src/config.rs +++ b/src/config.rs @@ -432,7 +432,7 @@ impl Config { #\n\ # Supported actions inside macros:\n\ # open-in-browser, toggle-star, toggle-read, mark-all-read,\n\ - # refresh, toggle-theme, extract-links, help\n\ + # refresh, toggle-theme, extract-links, fetch-full-text, help\n\ # Other keybinding actions are intentionally not callable from macros.\n\ #\n\ # Variables expanded per argv token:\n\ diff --git a/src/ui/mod.rs b/src/ui/mod.rs index 8884d62..bacf07a 100644 --- a/src/ui/mod.rs +++ b/src/ui/mod.rs @@ -563,7 +563,7 @@ fn render_help_bar(f: &mut Frame, app: &App, area: Rect, colors: } View::FeedItemDetail => { format!( - "{}/{}: Scroll | {}/{}: Fast scroll | {}: Open | {}: Star | {}: Toggle read | {}: Links | {}: Search | {}: Theme | {}: Back | {}: Quit", + "{}/{}: Scroll | {}/{}: Fast scroll | {}: Open | {}: Star | {}: Toggle read | {}: Links | {}: Full-text | {}: Search | {}: Theme | {}: Back | {}: Quit", key_display(&KeyAction::MoveUp, &app.keybindings), key_display(&KeyAction::MoveDown, &app.keybindings), key_display(&KeyAction::PageUp, &app.keybindings), @@ -572,6 +572,7 @@ fn render_help_bar(f: &mut Frame, app: &App, area: Rect, colors: key_display(&KeyAction::ToggleStar, &app.keybindings), key_display(&KeyAction::ToggleRead, &app.keybindings), key_display(&KeyAction::ExtractLinks, &app.keybindings), + key_display(&KeyAction::FetchFullText, &app.keybindings), key_display(&KeyAction::OpenSearch, &app.keybindings), key_display(&KeyAction::ToggleTheme, &app.keybindings), key_display(&KeyAction::Back, &app.keybindings), @@ -713,12 +714,13 @@ fn render_compact_help_bar( kd(&KeyAction::OpenSearch), ), View::FeedItemDetail => format!( - "{}:back {}:scroll {}:open {}:star {}:read", + "{}:back {}:scroll {}:open {}:star {}:read {}:fulltext", kd(&KeyAction::Quit), kd(&KeyAction::MoveDown), kd(&KeyAction::OpenInBrowser), kd(&KeyAction::ToggleStar), kd(&KeyAction::ToggleRead), + kd(&KeyAction::FetchFullText), ), View::CategoryManagement => format!("{}:back n:new e:edit d:del", kd(&KeyAction::Quit),), View::Starred => format!( From 56d8eb22fee61b87f7e1b8077072de9ab371610e Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 14:41:21 +0200 Subject: [PATCH 3/9] fix(fulltext): harden extraction against mojibake, SSRF, and OOM The review of the Readability extraction PR turned up a small pile of issues. None individually catastrophic, but together they made the feature unsafe to enable on any real-world feed list. Let's fix them in one go. The big ones: * `String::from_utf8_lossy` on the response body meant every non-UTF8 page (Windows-1252, Shift_JIS, GBK, ...) decoded into U+FFFD soup, which Readability then dutifully rejected as "page appears empty." Route through `encoding_rs` with Content-Type charset > sniff > UTF-8. * `response.bytes()` read the entire body before checking the 5 MB cap, which is a great way to allocate 200 MB on a hostile server. Switch to `Response::take(MAX+1).read_to_end(...)` and pre-check Content-Length while we're at it. * `spawn_pending_extractions` spawned one OS thread per queued request, unconditionally, with zero per-domain throttling. A refresh across several `fulltext = true` feeds could fire 20+ concurrent requests at the same host. Cap concurrent workers at 4 via an Arc, and reuse `last_domain_fetch` so article fetches respect the same politeness budget feed fetches already do. * Auto-fulltext blindly fetched whatever the feed XML put in . A hostile feed could point that at 169.254.169.254 or 10.0.0.x and use the reader to probe the user's network. Add a small allowlist (`is_safe_auto_url`): http/https only, no RFC1918/loopback/link-local/CGNAT, no `.local` or `localhost` names. Manual Shift+F bypasses it -- that's the user's explicit click, not the user's problem. * The LRU cap was soft. When the head was Pending, eviction bailed out and inserted anyway, so a wave of stuck workers could grow the cache past EXTRACTED_CACHE_CAP. Always evict the head; teach `record_extraction_result` to drop late results for non-Pending slots so we don't resurrect evicted or removed entries. * `dom_smoothie` panicking on hostile HTML stranded the slot on Pending forever with no path to retry. Wrap the worker in `catch_unwind` and surface panics as Failed. While at it: detail-view body-source lookup now uses `current_article_indices()` so it matches the action handler; `toggle_or_request_fulltext` shows a success-message hint when fired from a non-detail view (the macro path was silent before); `remove_extraction` helper replaces the two open-coded map+deque removals. Tests cover the new behaviors. The Readability dep didn't need to come with this many footguns on day one. This is what reviews are for. --- CLAUDE.md | 2 +- Cargo.lock | 1 + Cargo.toml | 1 + src/app.rs | 142 ++++++++++++++++++----- src/feed.rs | 293 +++++++++++++++++++++++++++++++++++++++++++++-- src/tui.rs | 108 +++++++++++++++-- src/ui/detail.rs | 31 +++-- 7 files changed, 517 insertions(+), 61 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index b5c87e1..0edc8b0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -66,7 +66,7 @@ MSRV: 1.75.0. CI runs tests on stable, beta, and 1.75.0. - **Compact mode**: `app.compact` bool is updated each frame by `update_compact_mode(terminal_height)`. Rendering in `ui.rs` branches on `app.compact` for layout, title bar, help bar, and dashboard item format. Controlled by `config.ui.compact_mode` (`Auto`/`Always`/`Never`). Dialog modals use `centered_rect_with_min()` to enforce minimum dimensions regardless of compact mode. - **External-command hooks (macros + `exec_on_new`)**: Commands are run **without a shell**. Templates are tokenized once at config load via `shlex`, then `expand_argv_template` substitutes `%X` placeholders into individual argv tokens (no re-expansion), so feed content cannot break out of an argument. The macro engine queues steps into `app.pending_macro_steps` from `events.rs` and the TUI loop drains them in `tui.rs::drain_macro_steps` — drain lives at the loop level because `pipe-to` needs the terminal handle to suspend the TUI. Chains halt on the first step error (tracked via a `pre_error` guard so a stale `app.error` doesn't spuriously abort). The macro prefix (default `,`) is checked at the top of `handle_key_event` and only when `input_mode == Normal`, so text-input modes are not disturbed; an idle prefix times out via the existing success-message timeout. - **`exec_on_new` crash semantics**: AT-MOST-ONCE. `flush_exec_on_new` persists the `seen_items` / `feeds_seeded` sets *before* spawning any child, so a kill mid-fire loses a notification rather than re-firing on the next launch. `mark_feed_seen` only flips `feeds_seeded` on a fetch that returned items (transiently-empty first fetches don't seed), and the first observation of a feed seeds the seen-set silently to avoid a firehose. Children are spawned detached with stdio nulled; a reaper thread waits on each so they don't linger as zombies. The seen-set is pruned in `remove_current_feed` to prevent monotonic growth across feed churn. **Single-shared mark per feed**: `mark_feed_seen` is hoisted to the feed-drain call site in `tui.rs` (gated on `exec_on_new_template.is_some() || fulltext_feeds.contains(&feed.url)`) so multiple consumers (currently exec_on_new and fulltext) share one mark per feed arrival — calling it twice would double-mark and the second consumer would see an empty `newly_seen` list. -- **Full-text extraction**: `feed::extract_article` fetches an article URL with the existing `reqwest::blocking::Client` and runs Mozilla Readability via the `dom_smoothie` crate. **Sync, no tokio.** Per-feed `Authorization`/auth headers are intentionally NOT forwarded to article URLs (they're third-party hosts — propagating would be a credential leak). Each extraction runs on a `std::thread::spawn` worker that posts to a long-lived `mpsc::channel` drained every loop iteration in `run_app`. State lives only in `App::extracted: HashMap` (`Pending` / `Ready(ExtractedArticle)` / `Failed(String)`) — **in-memory only**, never persisted; LRU-capped at `EXTRACTED_CACHE_CAP = 500` with insertion-order tracking via `extracted_order: VecDeque`. `Shift+F` (`KeyAction::FetchFullText`) toggles between summary and extracted text when `Ready`, queues a new request when absent, and re-queues on `Failed` (so the user can retry). Per-feed `fulltext = true` in config auto-extracts newly-seen items on refresh (same `mark_feed_seen` "newly seen" semantics as `exec_on_new` — first fetch seeds silently, no firehose). Extracted entries are pruned alongside `seen_items` in `remove_current_feed`. +- **Full-text extraction**: `feed::extract_article` fetches an article URL with the existing `reqwest::blocking::Client` and runs Mozilla Readability via the `dom_smoothie` crate. **Sync, no tokio.** Per-feed `Authorization`/auth headers are intentionally NOT forwarded to article URLs (they're third-party hosts — propagating would be a credential leak). The response body is read via `Response::take(FULLTEXT_MAX_BYTES+1).read_to_end(...)` so peak allocation is hard-capped at ~5 MB regardless of what the server sends, and the response charset is honored (`Content-Type charset=` → `` sniff → UTF-8) via `encoding_rs` so non-UTF8 pages don't mojibake. Each extraction runs on a `std::thread::spawn` worker wrapped in `catch_unwind` (so a `dom_smoothie` panic on hostile HTML surfaces as `Failed("…panicked…")` instead of stranding the slot on `Pending` forever). The TUI loop maintains a `Arc` `extract_inflight` budget capped at `EXTRACTION_MAX_INFLIGHT = 4`; queued requests beyond that budget — or whose domain was last fetched less than `refresh_rate_limit_delay` ago — are pushed back onto `pending_extraction_requests` to retry on the next loop tick. State lives only in `App::extracted: HashMap` (`Pending` / `Ready(ExtractedArticle)` / `Failed(String)`) — **in-memory only**, never persisted; LRU-capped at `EXTRACTED_CACHE_CAP = 500` with insertion-order tracking via `extracted_order: VecDeque`. The cap is **hard**: `insert_extraction` always evicts the deque head when full, and `record_extraction_result` rejects results for slots that aren't currently `Pending` (so a late worker for an evicted / removed id is dropped rather than resurrecting dead state). `Shift+F` (`KeyAction::FetchFullText`) toggles between summary and extracted text when `Ready`, queues a new request when absent, and re-queues on `Failed` (so the user can retry). Per-feed `fulltext = true` in config auto-extracts newly-seen items on refresh (same `mark_feed_seen` "newly seen" semantics as `exec_on_new` — first fetch seeds silently, no firehose); the auto path additionally filters via `feed::is_safe_auto_url` (http/https only, rejects RFC1918 / loopback / link-local / CGNAT / `localhost`-style names) to prevent a hostile feed from probing internal hosts. Manual `Shift+F` bypasses the URL allowlist (it's the user's explicit action). The detail-view lookup uses `current_article_indices()` (same resolver as the action handler) so they stay in lockstep. Extracted entries are pruned alongside `seen_items` in `remove_current_feed` via the `remove_extraction(&id)` helper. ## Commit Conventions diff --git a/Cargo.lock b/Cargo.lock index bffb879..0f4936f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -605,6 +605,7 @@ dependencies = [ "crossterm", "dirs", "dom_smoothie", + "encoding_rs", "feed-rs", "html2text", "open", diff --git a/Cargo.toml b/Cargo.toml index 6aa800a..52afc74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ scraper = "0.18" url = "2" shlex = "1.3" dom_smoothie = "0.17" +encoding_rs = "0.8" [profile.release] codegen-units = 1 diff --git a/src/app.rs b/src/app.rs index bc7abb0..b653438 100644 --- a/src/app.rs +++ b/src/app.rs @@ -976,11 +976,7 @@ impl App { // too — keeping them would never expire (no persistence) // but they're keyed on item ids that no longer resolve. for id in &removed_ids { - if self.extracted.remove(id).is_some() { - if let Some(pos) = self.extracted_order.iter().position(|x| x == id) { - self.extracted_order.remove(pos); - } - } + self.remove_extraction(id); } self.fulltext_feeds.remove(&url); @@ -1510,7 +1506,7 @@ impl App { } /// Extract domain from URL (e.g., "reddit.com" from "https://www.reddit.com/r/rust/.rss") - fn extract_domain_from_url(url: &str) -> String { + pub(crate) fn extract_domain_from_url(url: &str) -> String { // Simple domain extraction if let Some(domain_start) = url.find("://") { let after_protocol = &url[domain_start + 3..]; @@ -1783,14 +1779,17 @@ impl App { } } - /// Insert an extraction state into the cache and bump LRU order. If the - /// id is already present, the existing entry is replaced and the order - /// entry is moved to the end. If we're at the cap, the oldest entry is - /// evicted. + /// Land a worker result for `id`. Drops the result if the slot has been + /// evicted, removed (feed deleted), or already taken to a non-Pending + /// state — we never want to resurrect a dead cache entry. The expected + /// predecessor is `Pending`; anything else means the result is stale. pub fn record_extraction_result(&mut self, id: String, result: Result) { if id.is_empty() { return; } + if !matches!(self.extracted.get(&id), Some(ExtractionState::Pending)) { + return; + } let state = match result { Ok(article) => ExtractionState::Ready(article), Err(e) => ExtractionState::Failed(format!("{}", e)), @@ -1813,6 +1812,15 @@ impl App { .push_back(ExtractionRequest { id, url }); } + /// Forget a single extraction slot (both the state and its LRU + /// position). The deque is kept in sync with the map. + pub(crate) fn remove_extraction(&mut self, id: &str) { + self.extracted.remove(id); + if let Some(pos) = self.extracted_order.iter().position(|x| x.as_str() == id) { + self.extracted_order.remove(pos); + } + } + fn insert_extraction(&mut self, id: String, state: ExtractionState) { // If id is already present, move its LRU position to most-recent and // overwrite the state. @@ -1824,23 +1832,17 @@ impl App { self.extracted_order.push_back(id); return; } - // New entry — evict oldest if we're at the cap. Skip eviction if the - // oldest entry is currently Pending (it has an in-flight thread that - // will try to write back). + // Hard cap: evict the oldest entry unconditionally. If that entry + // happens to be `Pending`, the worker's eventual write-back will be + // dropped by `record_extraction_result`'s staleness check — so we + // never grow past EXTRACTED_CACHE_CAP, and we never resurrect an + // evicted slot. while self.extracted_order.len() >= EXTRACTED_CACHE_CAP { - let evict = match self.extracted_order.front() { - Some(front) => front.clone(), + match self.extracted_order.pop_front() { + Some(evict) => { + self.extracted.remove(&evict); + } None => break, - }; - let is_pending = matches!(self.extracted.get(&evict), Some(ExtractionState::Pending)); - self.extracted_order.pop_front(); - if !is_pending { - self.extracted.remove(&evict); - } else { - // Pending entry — re-insert at the back so we don't busy-loop - // evicting it. The eventual write-back will land normally. - self.extracted_order.push_back(evict); - break; } } self.extracted.insert(id.clone(), state); @@ -1873,26 +1875,37 @@ impl App { return; } }; + // Track whether this invocation actually queued / unqueued a + // request so we only show the off-view hint when something happened. + let mut queued_or_toggled = false; match self.extracted.get(&id) { Some(ExtractionState::Ready(_)) => { self.show_extracted = !self.show_extracted; + queued_or_toggled = true; } Some(ExtractionState::Pending) => { // already in flight — leave it } Some(ExtractionState::Failed(_)) => { - self.extracted.remove(&id); - if let Some(pos) = self.extracted_order.iter().position(|x| x == &id) { - self.extracted_order.remove(pos); - } + self.remove_extraction(&id); self.show_extracted = true; self.request_extraction(id, url); + queued_or_toggled = true; } None => { self.show_extracted = true; self.request_extraction(id, url); + queued_or_toggled = true; } } + // The detail view is what renders extracted state; when the action + // fires from anywhere else (macro path on Dashboard / FeedItems / + // Starred) the user gets no visible feedback. Surface a hint so the + // request doesn't feel like a no-op. + if queued_or_toggled && self.view != View::FeedItemDetail { + self.success_message = Some("Extracting full text (open article to view)".to_string()); + self.success_message_time = Some(std::time::Instant::now()); + } } } @@ -2842,9 +2855,13 @@ mod tests { #[test] fn test_lru_evicts_oldest_at_cap() { let mut app = App::new(); - // Insert one more than the cap; the first id must be evicted. + // Insert one more than the cap; the first id must be evicted. Real + // callers always Pending → Ready, and `record_extraction_result` + // refuses results for slots without a Pending predecessor, so do the + // same here. for i in 0..EXTRACTED_CACHE_CAP + 1 { let id = format!("id{}", i); + app.request_extraction(id.clone(), "https://ex.com/a".to_string()); app.record_extraction_result(id, Ok(dummy_extracted("x"))); } assert!(!app.extracted.contains_key("id0"), "oldest must be evicted"); @@ -2854,10 +2871,13 @@ mod tests { #[test] fn test_remove_current_feed_prunes_extracted_cache() { let mut app = make_test_app(); - // Seed extraction state for items in the first feed. + // Seed extraction state for items in the first feed via the real + // request → result transition. let id_old = app.get_item_id(0, 0); let id_new = app.get_item_id(0, 1); + app.request_extraction(id_old.clone(), "https://ex.com/a".to_string()); app.record_extraction_result(id_old.clone(), Ok(dummy_extracted("a"))); + app.request_extraction(id_new.clone(), "https://ex.com/b".to_string()); app.record_extraction_result(id_new.clone(), Ok(dummy_extracted("b"))); assert_eq!(app.extracted.len(), 2); @@ -2874,4 +2894,64 @@ mod tests { "LRU order must be pruned in sync with the map" ); } + + #[test] + fn test_record_extraction_result_drops_late_result_for_evicted_id() { + // Simulates the race where a worker thread completes after its slot + // was evicted (or its feed was removed). We must not resurrect the + // slot — otherwise dead state pins memory and confuses the UI. + let mut app = App::new(); + app.record_extraction_result("ghost".to_string(), Ok(dummy_extracted("body"))); + assert!( + !app.extracted.contains_key("ghost"), + "result for an id with no Pending slot must be dropped" + ); + } + + #[test] + fn test_record_extraction_result_drops_late_result_for_ready_slot() { + // If the slot is already `Ready` (e.g. user manually retried and the + // retry landed first), a second late worker must not clobber it. + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("first"))); + // Late second worker — slot is Ready, so this must be a no-op. + app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("late"))); + match app.extracted.get("id1") { + Some(ExtractionState::Ready(a)) => { + assert_eq!(a.plain_text, "first", "late result must not clobber Ready") + } + other => panic!("expected Ready, got {:?}", other), + } + } + + #[test] + fn test_lru_hard_cap_when_oldest_is_pending() { + // Stress the soft-cap edge case: queue CAP+1 requests as Pending + // (no result returned). The cache size must stay at the cap, not + // grow with the number of in-flight requests. + let mut app = App::new(); + for i in 0..EXTRACTED_CACHE_CAP + 5 { + let id = format!("id{}", i); + app.request_extraction(id, "https://ex.com/a".to_string()); + } + assert_eq!( + app.extracted.len(), + EXTRACTED_CACHE_CAP, + "Pending entries must not let the cache exceed the hard cap" + ); + assert_eq!(app.extracted_order.len(), EXTRACTED_CACHE_CAP); + } + + #[test] + fn test_remove_extraction_helper_prunes_both_map_and_order() { + let mut app = App::new(); + app.request_extraction("id1".to_string(), "https://ex.com/a".to_string()); + app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("body"))); + assert_eq!(app.extracted.len(), 1); + assert_eq!(app.extracted_order.len(), 1); + app.remove_extraction("id1"); + assert!(app.extracted.is_empty()); + assert!(app.extracted_order.is_empty()); + } } diff --git a/src/feed.rs b/src/feed.rs index cf06656..121f0f9 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -6,6 +6,8 @@ use scraper::{Html, Selector}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::io::Read; +use std::net::IpAddr; use std::sync::OnceLock; use std::time::Duration; use url::Url; @@ -21,6 +23,12 @@ const FULLTEXT_MAX_BYTES: usize = 5 * 1024 * 1024; /// — usually JS-rendered shells, login walls, or 404 placeholders. const FULLTEXT_MIN_LENGTH: usize = 200; +/// Number of leading bytes of the response we sniff for `` +/// when the Content-Type header doesn't carry one. The HTML5 spec mandates +/// any in-document declaration must appear in the first 1024 bytes of the +/// document, so reading more is just wasted work. +const META_SNIFF_BYTES: usize = 1024; + #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Feed { pub url: String, @@ -123,6 +131,75 @@ pub struct ExtractedArticle { pub source_url: String, } +/// Validates that `url` is safe to fetch on the *auto-fulltext* path (i.e. +/// without a user click). Rejects non-http(s) schemes and hostnames that +/// resolve to private / loopback / link-local addresses, since auto-mode +/// fetches whatever the feed XML put in `` — a hostile feed could +/// otherwise probe the user's internal network. Manual `Shift+F` is the +/// user's explicit action and bypasses this check. +pub fn is_safe_auto_url(url: &str) -> bool { + let parsed = match Url::parse(url) { + Ok(u) => u, + Err(_) => return false, + }; + if !matches!(parsed.scheme(), "http" | "https") { + return false; + } + let host = match parsed.host() { + Some(h) => h, + None => return false, + }; + match host { + url::Host::Ipv4(ip) => is_global_ip(IpAddr::V4(ip)), + url::Host::Ipv6(ip) => is_global_ip(IpAddr::V6(ip)), + url::Host::Domain(name) => { + // Reject obvious local-only names; a hostile feed cannot use + // these to probe RFC1918 space, but we still want to avoid + // hitting "localhost"-style endpoints by accident. + let n = name.to_ascii_lowercase(); + if n == "localhost" || n.ends_with(".localhost") || n.ends_with(".local") { + return false; + } + // DNS resolution itself happens later in reqwest; we don't + // resolve here because that would block. The intent of the + // check is "no obvious internal target encoded directly in + // the URL" — DNS rebinding is out of scope for a feed reader. + true + } + } +} + +fn is_global_ip(ip: IpAddr) -> bool { + match ip { + IpAddr::V4(v4) => { + !(v4.is_loopback() + || v4.is_private() + || v4.is_link_local() + || v4.is_broadcast() + || v4.is_documentation() + || v4.is_unspecified() + || v4.octets()[0] == 0 + // CGNAT 100.64.0.0/10 + || (v4.octets()[0] == 100 && (v4.octets()[1] & 0xC0) == 64) + // 169.254/16 is link_local; also reject the carrier-grade + // /15 and the 192.0.0.0/24 IETF protocol-assignments block. + || (v4.octets()[0] == 192 + && v4.octets()[1] == 0 + && v4.octets()[2] == 0)) + } + IpAddr::V6(v6) => { + !(v6.is_loopback() + || v6.is_unspecified() + // unique-local fc00::/7 + || (v6.segments()[0] & 0xFE00) == 0xFC00 + // link-local fe80::/10 + || (v6.segments()[0] & 0xFFC0) == 0xFE80 + // IPv4-mapped — defer to v4 check via the embedded address + || v6.to_ipv4_mapped().map(|v4| !is_global_ip(IpAddr::V4(v4))).unwrap_or(false)) + } + } +} + /// Fetch `url` with `client` and run Mozilla-Readability extraction over the /// returned HTML. Feed auth headers are intentionally NOT propagated: the /// article URL is on a different host and may be third-party, so leaking @@ -130,12 +207,24 @@ pub struct ExtractedArticle { /// /// Rejects non-`text/html` responses, oversized bodies (`FULLTEXT_MAX_BYTES`), /// and pages whose extracted text falls below `FULLTEXT_MIN_LENGTH` (treated -/// as JS-rendered or empty). +/// as JS-rendered or empty). Body is read with a size-bounded reader so peak +/// allocation cannot exceed the cap, and the response charset is honored so +/// non-UTF8 pages decode correctly. pub fn extract_article( url: &str, client: &reqwest::blocking::Client, user_agent: Option<&str>, ) -> Result { + // Scheme allowlist — reqwest's blocking client refuses non-http(s) by + // default, but this gives a clearer error and makes the policy explicit. + let parsed = Url::parse(url).context("Article URL is not a valid URL")?; + if !matches!(parsed.scheme(), "http" | "https") { + return Err(anyhow::anyhow!( + "Article URL scheme '{}' is not allowed (only http/https)", + parsed.scheme() + )); + } + let default_user_agent = "Mozilla/5.0 (compatible; Feedr/1.0; +https://github.com/bahdotsh/feedr)"; let ua = user_agent.unwrap_or(default_user_agent); @@ -158,12 +247,13 @@ pub fn extract_article( )); } - let content_type = response + let content_type_raw = response .headers() .get("content-type") .and_then(|ct| ct.to_str().ok()) .unwrap_or("") - .to_lowercase(); + .to_string(); + let content_type = content_type_raw.to_lowercase(); if !content_type.is_empty() && !content_type.contains("text/html") && !content_type.contains("application/xhtml") @@ -174,20 +264,113 @@ pub fn extract_article( )); } + // Reject upfront if Content-Length already exceeds the cap — saves us + // the full transfer for obviously oversized responses. + if let Some(declared_len) = response + .headers() + .get("content-length") + .and_then(|cl| cl.to_str().ok()) + .and_then(|cl| cl.parse::().ok()) + { + if declared_len > FULLTEXT_MAX_BYTES { + return Err(anyhow::anyhow!( + "Article body too large ({} bytes declared, cap {} bytes)", + declared_len, + FULLTEXT_MAX_BYTES + )); + } + } + let final_url = response.url().to_string(); - let bytes = response.bytes().context("Failed to read article body")?; + + // Read at most FULLTEXT_MAX_BYTES + 1 so we can detect overflow without + // ever allocating beyond cap+1. `reqwest::blocking::Response` implements + // `Read`, so `.take()` gives us a hard cap on input consumed. + let mut bytes = Vec::with_capacity(64 * 1024); + let mut reader = response.take((FULLTEXT_MAX_BYTES as u64) + 1); + reader + .read_to_end(&mut bytes) + .context("Failed to read article body")?; if bytes.len() > FULLTEXT_MAX_BYTES { return Err(anyhow::anyhow!( - "Article body too large ({} bytes, cap {} bytes)", - bytes.len(), + "Article body too large (exceeded cap {} bytes)", FULLTEXT_MAX_BYTES )); } - let html = String::from_utf8_lossy(&bytes).into_owned(); + let html = decode_html_bytes(&bytes, &content_type_raw); extract_from_html(&html, &final_url) } +/// Decode `bytes` into a Rust `String` using the encoding declared by the +/// `Content-Type` header, falling back to `` sniffing in the +/// first ~1 KB of the document, and finally to UTF-8. Without this, any +/// non-UTF8 page (Windows-1252, ISO-8859-1, Shift_JIS, GBK, …) produces +/// U+FFFD-laced mojibake that Readability would then misclassify as too +/// short. +pub(crate) fn decode_html_bytes(bytes: &[u8], content_type: &str) -> String { + let mut encoding: Option<&'static encoding_rs::Encoding> = None; + + // 1) Honor charset= in the Content-Type header. + if let Some(label) = charset_from_content_type(content_type) { + encoding = encoding_rs::Encoding::for_label(label.as_bytes()); + } + + // 2) Otherwise sniff the leading bytes for an in-document . + if encoding.is_none() { + let sniff_len = bytes.len().min(META_SNIFF_BYTES); + if let Some(label) = sniff_meta_charset(&bytes[..sniff_len]) { + encoding = encoding_rs::Encoding::for_label(label.as_bytes()); + } + } + + // 3) Default to UTF-8 — same fate `from_utf8_lossy` would have given us, + // but routed through encoding_rs so behavior is uniform. + let encoding = encoding.unwrap_or(encoding_rs::UTF_8); + let (cow, _enc, _had_errors) = encoding.decode(bytes); + cow.into_owned() +} + +fn charset_from_content_type(content_type: &str) -> Option { + for part in content_type.split(';') { + let lower = part.trim().to_ascii_lowercase(); + if let Some(rest) = lower.strip_prefix("charset=") { + let v = rest.trim().trim_matches('"').trim_matches('\''); + if !v.is_empty() { + return Some(v.to_string()); + } + } + } + None +} + +/// Tiny sniffer for `` or ``. Stays byte-level (latin-1 lossy decode is fine +/// here: ASCII-superset encodings agree on the bytes we care about) so we +/// don't have to pick an encoding before we know one. +fn sniff_meta_charset(head: &[u8]) -> Option { + let s = String::from_utf8_lossy(head).to_ascii_lowercase(); + // + if let Some(idx) = s.find("charset") { + let after = &s[idx + "charset".len()..]; + // Skip optional `=`, whitespace, optional quote + let after = after.trim_start(); + let after = after.strip_prefix('=').unwrap_or(after).trim_start(); + let after = after + .strip_prefix('"') + .or_else(|| after.strip_prefix('\'')) + .unwrap_or(after); + let end = after + .find(['"', '\'', ' ', '>', ';', '/']) + .unwrap_or(after.len()); + let label = &after[..end]; + if !label.is_empty() { + return Some(label.to_string()); + } + } + None +} + /// Pure Readability step — no I/O. Split out so unit tests can exercise the /// extraction + min-length + error-path logic without a network round-trip. pub fn extract_from_html(html: &str, source_url: &str) -> Result { @@ -732,4 +915,100 @@ mod tests { msg ); } + + #[test] + fn test_decode_html_bytes_uses_content_type_charset() { + // "Café" in Windows-1252: 0x43 0x61 0x66 0xE9 + let bytes: &[u8] = &[0x43, 0x61, 0x66, 0xE9]; + let decoded = decode_html_bytes(bytes, "text/html; charset=windows-1252"); + assert_eq!(decoded, "Café"); + } + + #[test] + fn test_decode_html_bytes_handles_quoted_charset_in_header() { + let bytes: &[u8] = &[0x43, 0x61, 0x66, 0xE9]; + let decoded = decode_html_bytes(bytes, "text/html; charset=\"windows-1252\""); + assert_eq!(decoded, "Café"); + } + + #[test] + fn test_decode_html_bytes_falls_back_to_meta_charset() { + // Document declares Shift_JIS via ; Content-Type omits charset. + // Bytes "日本" in Shift_JIS: 0x93 0xFA 0x96 0x7B. + let mut html: Vec = b"".to_vec(); + html.extend_from_slice(&[0x93, 0xFA, 0x96, 0x7B]); + html.extend_from_slice(b""); + let decoded = decode_html_bytes(&html, "text/html"); + assert!( + decoded.contains("日本"), + "expected meta-sniffed Shift_JIS to decode, got: {}", + decoded + ); + } + + #[test] + fn test_decode_html_bytes_defaults_to_utf8() { + let html = "Café"; + let decoded = decode_html_bytes(html.as_bytes(), ""); + assert!(decoded.contains("Café")); + } + + #[test] + fn test_is_safe_auto_url_accepts_public_https() { + assert!(is_safe_auto_url("https://example.com/article")); + assert!(is_safe_auto_url("http://example.com/article")); + } + + #[test] + fn test_is_safe_auto_url_rejects_non_http_scheme() { + assert!(!is_safe_auto_url("file:///etc/passwd")); + assert!(!is_safe_auto_url("ftp://example.com/x")); + assert!(!is_safe_auto_url("javascript:alert(1)")); + assert!(!is_safe_auto_url("data:text/html,` in the head must not + // fool the sniffer. + let html = b"\ + \ + \ + x"; + let label = sniff_meta_charset(html).expect("should find charset"); + assert_eq!(label, "utf-8", "script literal must be ignored"); + } + + #[test] + fn test_sniff_meta_charset_ignores_metadata_tag() { + // `` must not be matched as ``. + let html = b"\ + charset=bogus\ + \ + "; + let label = sniff_meta_charset(html).expect("should find charset"); + assert_eq!(label, "utf-8"); + } + + #[test] + fn test_sniff_meta_charset_handles_http_equiv_meta() { + // The other legal form: + let html = b"\ + \ + "; + let label = sniff_meta_charset(html).expect("should find charset"); + assert_eq!(label, "iso-8859-1"); + } + #[test] fn test_is_safe_auto_url_accepts_public_https() { assert!(is_safe_auto_url("https://example.com/article")); diff --git a/src/tui.rs b/src/tui.rs index 6f86f84..318feeb 100644 --- a/src/tui.rs +++ b/src/tui.rs @@ -268,8 +268,9 @@ fn enqueue_fulltext_for_new(app: &mut App, feed: &Feed, newly_seen: &[usize]) { // Auto path → use the safe-redirect client. The URL came from feed // XML, so a hostile feed could publish a public-looking link that // 302s to RFC1918 / metadata endpoints; revalidating each hop closes - // that window. - app.request_extraction(id, url.to_string(), true); + // that window. Auto requests go to the back of the queue so an + // explicit user `Shift+F` is never starved behind a refresh batch. + app.request_extraction(id, url.to_string(), true, false); } } @@ -344,19 +345,19 @@ fn spawn_pending_extractions( app: &mut App, tx: &mpsc::Sender<(String, Result)>, inflight: &Arc, + manual_client: &mut Option, + safe_client: &mut Option, ) { if app.pending_extraction_requests.is_empty() { return; } let timeout = app.config.network.http_timeout; let user_agent = app.config.network.user_agent.clone(); - // Two clients: the permissive one for manual `Shift+F` (user-explicit, - // same redirect policy as feed fetches) and the safe-redirect one for - // the auto path, which revalidates every hop with `is_safe_auto_url`. - // Built lazily — if a tick has only manual requests we never construct - // the safe client, and vice versa. - let mut manual_client: Option = None; - let mut safe_client: Option = None; + // `manual_client` / `safe_client` are owned by `run_app` so the + // `reqwest::blocking::Client`s are built at most once per session + // instead of once per tick that has work. Lazy build still: if the + // session never uses Shift+F we never construct the manual client, + // and vice versa. let rate_limit = Duration::from_millis(app.config.general.refresh_rate_limit_delay); let now = std::time::Instant::now(); @@ -399,10 +400,10 @@ fn spawn_pending_extractions( // the request and stop. let client = if req.safe_redirects { match safe_client { - Some(ref c) => c.clone(), + Some(c) => c.clone(), None => match Feed::build_safe_redirect_client(timeout) { Ok(c) => { - safe_client = Some(c.clone()); + *safe_client = Some(c.clone()); c } Err(_) => { @@ -413,10 +414,10 @@ fn spawn_pending_extractions( } } else { match manual_client { - Some(ref c) => c.clone(), + Some(c) => c.clone(), None => match Feed::build_client(timeout) { Ok(c) => { - manual_client = Some(c.clone()); + *manual_client = Some(c.clone()); c } Err(_) => { @@ -426,13 +427,12 @@ fn spawn_pending_extractions( }, } }; - // Claim the budget slot and stamp the domain BEFORE spawning so a - // concurrent caller (or the very next loop iteration) sees the slot - // taken / the domain freshly used. + // Claim the budget slot before spawning so a concurrent caller (or + // the very next loop iteration) sees the slot taken. We defer the + // `last_domain_fetch` stamp until AFTER `thread::Builder::spawn` + // returns Ok — a rare OS-level spawn failure should re-queue + // without burning an extra rate-limit window for the next attempt. inflight.fetch_add(1, Ordering::SeqCst); - if !domain.is_empty() { - app.last_domain_fetch.insert(domain, now); - } let ua = user_agent.clone(); let tx = tx.clone(); let inflight_for_worker = Arc::clone(inflight); @@ -464,6 +464,12 @@ fn spawn_pending_extractions( app.pending_extraction_requests.push_front(req); break; } + // Spawn succeeded — now stamp the domain so the next request for + // the same host respects the rate-limit. Done last so a failed + // spawn doesn't leave a stale stamp behind. + if !domain.is_empty() { + app.last_domain_fetch.insert(domain, now); + } } // Re-enqueue throttled requests at the front, preserving their original // FIFO order. Without `.rev()` we'd flip them. @@ -484,6 +490,13 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> // on entry and drops it on exit (even on panic), so the pool never // deadlocks even if `dom_smoothie` blows up on hostile HTML. let extract_inflight = Arc::new(AtomicUsize::new(0)); + // HTTP clients for extraction workers. Owned at the loop level (not + // rebuilt every call to `spawn_pending_extractions`) so a busy refresh + // doesn't churn `reqwest::blocking::Client` instances per tick. + // Lazy-built on first use of each path: a session that never hits + // `Shift+F` never constructs `extract_manual_client`, and vice versa. + let mut extract_manual_client: Option = None; + let mut extract_safe_client: Option = None; // Initial load of bookmarked feeds let (mut pending_count, mut feed_rx) = spawn_feed_refresh(app); @@ -590,7 +603,13 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> // Spawn workers for any extraction requests queued by events or // by the feed-refresh path above. Run every iteration so manual // `F` requests don't have to wait for a tick boundary. - spawn_pending_extractions(app, &extract_tx, &extract_inflight); + spawn_pending_extractions( + app, + &extract_tx, + &extract_inflight, + &mut extract_manual_client, + &mut extract_safe_client, + ); // Drain completed extractions back into app state. This is what // flips a `Pending` entry to `Ready` / `Failed`, which the detail @@ -710,7 +729,12 @@ mod tests { fn test_record_extraction_result_flips_pending_to_ready() { use crate::app::ExtractionState; let mut app = App::new(); - app.request_extraction("id1".to_string(), "https://ex.com/a".to_string(), false); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); // simulate a worker thread completing successfully app.record_extraction_result( "id1".to_string(), diff --git a/src/ui/detail.rs b/src/ui/detail.rs index edc29eb..0784e42 100644 --- a/src/ui/detail.rs +++ b/src/ui/detail.rs @@ -176,7 +176,18 @@ pub(super) fn render_item_detail( ) } BodySource::Extracted(text) => format_content_for_reading(text), - BodySource::Extracting => "Fetching full text…".to_string(), + BodySource::Extracting => { + // Keep the summary visible while the worker is in flight — + // a slow site can take several seconds, and replacing the + // body with "Fetching…" leaves the user with nothing to + // read AND no way to revert (Shift+F is a no-op on Pending). + // The scroll-indicator title (see body_label below) carries + // the "Extracting…" status instead. + format!( + "[Fetching full text… — showing summary]\n\n{}", + summary_text() + ) + } }; // Calculate the viewport height (accounting for borders and padding) From 41794f58c028d8d7360c262e51711d6c7c7fe205 Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 15:47:41 +0200 Subject: [PATCH 6/9] fix(fulltext): close stale-result race and recover stuck Pending slots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR review found two real bugs hiding in the extraction worker pool, plus a handful of smaller things worth cleaning up. Time to actually fix them instead of waving hands at them. The first one: if a Pending slot gets LRU-evicted and then re-requested for the same id, the *original* worker is still out there, blissfully fetching. When it eventually delivers, the staleness gate sees Pending and writes through — so the "manual retry" the user just performed gets silently satisfied by the stale fetch. The id maps 1:1 to URL so the content is at least the right URL, but it's not the fresh data the user asked for. Fix: tag every ExtractionRequest with a monotonic generation. Pending carries it too. record_extraction_result requires both the state AND generation to match. The worker echoes its generation back through the channel; stale results get dropped. The second one: catch_unwind covers Rust panics, but if the OS decides to kill the worker (OOM, SIGSEGV in a native dep), the slot stays Pending forever. toggle_or_request_fulltext's contains_key guard then blocks the user from ever retrying. Fix: add a watchdog. Pending now carries started_at; every loop tick we flip anything older than http_timeout * 3 to Failed("timed out"). The inflight counter is intentionally *not* decremented — if the worker is still alive and exits normally, its own fetch_sub runs; double-decrementing would underflow. After enough OS kills the budget gets pinned and the user has to restart, but at least retries work. While at it: - The meta-charset sniffer was matching the first \`charset\` substring it found in a meta tag, which meant \`\` extracted the empty value out of name= and gave up before finding the real one. Now requires a word boundary before \`charset\` and \`=\` after. - The detail-view "Press F to retry" hint was hardcoded. If the user rebinds fetch_full_text, the hint lies. Use key_display(). - The doc claim that body allocation was "hard-capped at ~5 MB" wasn't quite true — Vec::read_to_end's doubling growth from a 64 KiB hint peaks at ~8 MiB for a 5 MiB body. Preallocate at the cap. Modern allocators don't commit physical pages until written, so small pages cost nothing. - The inflight counter was using SeqCst, which is overkill for a plain bound. Relaxed everywhere; the mpsc channel carries the result-handoff ordering. - New regression test for the "mark_feed_seen is called exactly once per feed arrival" contract — the promise was in the docs but not in tests. --- src/app.rs | 229 +++++++++++++++++++++++++++++++++++++++++++---- src/feed.rs | 98 +++++++++++++++++--- src/tui.rs | 125 ++++++++++++++++++++++---- src/ui/detail.rs | 18 ++-- 4 files changed, 416 insertions(+), 54 deletions(-) diff --git a/src/app.rs b/src/app.rs index 3126f97..8e7961e 100644 --- a/src/app.rs +++ b/src/app.rs @@ -162,6 +162,11 @@ pub struct App { /// for. Decouples request-time (event handler) from spawn-time (loop /// drain) so events.rs doesn't need the thread-handle / channel. pub pending_extraction_requests: VecDeque, + /// Monotonic generation counter for extraction requests. Each new + /// `Pending` slot gets a fresh value so a worker for a previously + /// evicted-and-recreated slot can't accidentally satisfy the new + /// request — `record_extraction_result` matches the generation. + pub next_extraction_generation: u64, } /// Snapshot of the focused article passed to template-expansion / pipe-payload helpers. @@ -216,7 +221,16 @@ pub enum AddFeedResult { /// logic and bounds storage growth. #[derive(Clone, Debug)] pub enum ExtractionState { - Pending, + /// In-flight. `generation` matches the generation stamped on the + /// `ExtractionRequest` that produced this slot; `started_at` drives the + /// stale-Pending watchdog. The generation closes a TOCTOU window where + /// a Pending slot is evicted from the LRU and re-requested for the same + /// id — without the tag, the old worker's eventual result would land on + /// the new slot. + Pending { + generation: u64, + started_at: Instant, + }, Ready(ExtractedArticle), Failed(String), } @@ -234,6 +248,10 @@ pub struct ExtractionRequest { /// (user already chose the article — same trust model as opening it in /// a browser). pub safe_redirects: bool, + /// Generation tag matching the `Pending` slot this request was created + /// for. Worker echoes it back through the result channel so the receiver + /// can drop late results from stale (evicted-and-recreated) slots. + pub generation: u64, } /// Cap on the number of cached extracted articles. Auto-fetch over a big @@ -242,6 +260,13 @@ pub struct ExtractionRequest { /// first) — `extracted_order` tracks the insertion sequence. pub const EXTRACTED_CACHE_CAP: usize = 500; +/// Stale-`Pending` watchdog timeout, as a multiple of `http_timeout`. A worker +/// whose Pending slot is older than `http_timeout * PENDING_WATCHDOG_MULTIPLIER` +/// is assumed dead (OS kill / SIGSEGV in a native dep) and flipped to +/// `Failed("timed out")` so the user can retry. Multiplier covers slow DNS, +/// TLS handshakes, and Readability CPU time on top of the network timeout. +pub const PENDING_WATCHDOG_MULTIPLIER: u32 = 3; + #[derive(Serialize, Deserialize)] struct SavedData { bookmarks: Vec, @@ -415,6 +440,7 @@ impl App { fulltext_feeds, show_extracted: true, pending_extraction_requests: VecDeque::new(), + next_extraction_generation: 0, }; app.update_dashboard(); @@ -1786,15 +1812,26 @@ impl App { } /// Land a worker result for `id`. Drops the result if the slot has been - /// evicted, removed (feed deleted), or already taken to a non-Pending - /// state — we never want to resurrect a dead cache entry. The expected - /// predecessor is `Pending`; anything else means the result is stale. - pub fn record_extraction_result(&mut self, id: String, result: Result) { + /// evicted, removed (feed deleted), already taken to a non-Pending state, + /// or if the request's generation no longer matches the current Pending + /// slot — we never want to resurrect a dead cache entry or let a stale + /// worker satisfy a re-requested slot. + pub fn record_extraction_result( + &mut self, + id: String, + generation: u64, + result: Result, + ) { if id.is_empty() { return; } - if !matches!(self.extracted.get(&id), Some(ExtractionState::Pending)) { - return; + // Must be Pending AND the generation must match. The generation + // check closes the TOCTOU window where an LRU-evicted Pending slot + // gets re-requested: the old worker's result would otherwise land + // on the new Pending slot of the same id. + match self.extracted.get(&id) { + Some(ExtractionState::Pending { generation: g, .. }) if *g == generation => {} + _ => return, } let state = match result { Ok(article) => ExtractionState::Ready(article), @@ -1823,11 +1860,20 @@ impl App { if self.extracted.contains_key(&id) { return; } - self.insert_extraction(id.clone(), ExtractionState::Pending); + let generation = self.next_extraction_generation; + self.next_extraction_generation = self.next_extraction_generation.wrapping_add(1); + self.insert_extraction( + id.clone(), + ExtractionState::Pending { + generation, + started_at: Instant::now(), + }, + ); let req = ExtractionRequest { id, url, safe_redirects, + generation, }; if priority_front { self.pending_extraction_requests.push_front(req); @@ -1836,6 +1882,37 @@ impl App { } } + /// Sweep Pending slots that have been in-flight longer than + /// `http_timeout * PENDING_WATCHDOG_MULTIPLIER`. A normally-completing + /// worker (success, error, even a Rust panic caught by `catch_unwind`) + /// always sends a result back through the channel. A worker that + /// silently disappears — OS kill, SIGSEGV in a native dep, the parent + /// thread crashing — leaves its slot stuck on Pending forever, which + /// blocks the user from re-requesting via `toggle_or_request_fulltext`'s + /// `contains_key` guard. Flipping to `Failed("timed out")` lets the + /// user retry. The corresponding `inflight` slot is intentionally NOT + /// decremented: if the worker is actually still alive and eventually + /// exits normally, its own `fetch_sub` will run; we'd otherwise + /// double-decrement and underflow the counter. + pub fn prune_stale_pending_extractions(&mut self, http_timeout: std::time::Duration) { + let watchdog = http_timeout.saturating_mul(PENDING_WATCHDOG_MULTIPLIER); + if watchdog.is_zero() { + return; + } + let now = Instant::now(); + let mut to_fail: Vec = Vec::new(); + for (id, state) in self.extracted.iter() { + if let ExtractionState::Pending { started_at, .. } = state { + if now.duration_since(*started_at) > watchdog { + to_fail.push(id.clone()); + } + } + } + for id in to_fail { + self.insert_extraction(id, ExtractionState::Failed("timed out".to_string())); + } + } + /// Forget a single extraction slot (both the state and its LRU /// position). The deque is kept in sync with the map. pub(crate) fn remove_extraction(&mut self, id: &str) { @@ -1907,7 +1984,7 @@ impl App { self.show_extracted = !self.show_extracted; queued_or_toggled = true; } - Some(ExtractionState::Pending) => { + Some(ExtractionState::Pending { .. }) => { // already in flight — leave it } Some(ExtractionState::Failed(_)) => { @@ -2837,6 +2914,16 @@ mod tests { } } + /// Look up the generation stamped on a `Pending` slot, so tests can + /// echo it back through `record_extraction_result` the same way a + /// real worker would. + fn pending_generation(app: &App, id: &str) -> u64 { + match app.extracted.get(id) { + Some(ExtractionState::Pending { generation, .. }) => *generation, + other => panic!("expected Pending slot for {}, got {:?}", id, other), + } + } + #[test] fn test_request_extraction_creates_pending_and_enqueues() { let mut app = App::new(); @@ -2848,7 +2935,7 @@ mod tests { ); assert!(matches!( app.extracted.get("id1"), - Some(ExtractionState::Pending) + Some(ExtractionState::Pending { .. }) )); assert_eq!(app.pending_extraction_requests.len(), 1); } @@ -2941,7 +3028,8 @@ mod tests { false, false, ); - app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("body"))); + let gen = pending_generation(&app, "id1"); + app.record_extraction_result("id1".to_string(), gen, Ok(dummy_extracted("body"))); match app.extracted.get("id1") { Some(ExtractionState::Ready(a)) => assert_eq!(a.plain_text, "body"), other => panic!("expected Ready, got {:?}", other), @@ -2957,7 +3045,8 @@ mod tests { false, false, ); - app.record_extraction_result("id1".to_string(), Err(anyhow::anyhow!("boom"))); + let gen = pending_generation(&app, "id1"); + app.record_extraction_result("id1".to_string(), gen, Err(anyhow::anyhow!("boom"))); match app.extracted.get("id1") { Some(ExtractionState::Failed(msg)) => assert!(msg.contains("boom")), other => panic!("expected Failed, got {:?}", other), @@ -2974,7 +3063,8 @@ mod tests { for i in 0..EXTRACTED_CACHE_CAP + 1 { let id = format!("id{}", i); app.request_extraction(id.clone(), "https://ex.com/a".to_string(), false, false); - app.record_extraction_result(id, Ok(dummy_extracted("x"))); + let gen = pending_generation(&app, &id); + app.record_extraction_result(id, gen, Ok(dummy_extracted("x"))); } assert!(!app.extracted.contains_key("id0"), "oldest must be evicted"); assert_eq!(app.extracted.len(), EXTRACTED_CACHE_CAP); @@ -2988,9 +3078,11 @@ mod tests { let id_old = app.get_item_id(0, 0); let id_new = app.get_item_id(0, 1); app.request_extraction(id_old.clone(), "https://ex.com/a".to_string(), false, false); - app.record_extraction_result(id_old.clone(), Ok(dummy_extracted("a"))); + let gen_old = pending_generation(&app, &id_old); + app.record_extraction_result(id_old.clone(), gen_old, Ok(dummy_extracted("a"))); app.request_extraction(id_new.clone(), "https://ex.com/b".to_string(), false, false); - app.record_extraction_result(id_new.clone(), Ok(dummy_extracted("b"))); + let gen_new = pending_generation(&app, &id_new); + app.record_extraction_result(id_new.clone(), gen_new, Ok(dummy_extracted("b"))); assert_eq!(app.extracted.len(), 2); // Remove the first feed. @@ -3013,7 +3105,9 @@ mod tests { // was evicted (or its feed was removed). We must not resurrect the // slot — otherwise dead state pins memory and confuses the UI. let mut app = App::new(); - app.record_extraction_result("ghost".to_string(), Ok(dummy_extracted("body"))); + // Any generation — slot doesn't exist, so result is dropped before + // generation is even consulted. + app.record_extraction_result("ghost".to_string(), 0, Ok(dummy_extracted("body"))); assert!( !app.extracted.contains_key("ghost"), "result for an id with no Pending slot must be dropped" @@ -3031,9 +3125,11 @@ mod tests { false, false, ); - app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("first"))); + let gen = pending_generation(&app, "id1"); + app.record_extraction_result("id1".to_string(), gen, Ok(dummy_extracted("first"))); // Late second worker — slot is Ready, so this must be a no-op. - app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("late"))); + // Whatever generation it claims is irrelevant; the state check fails. + app.record_extraction_result("id1".to_string(), gen, Ok(dummy_extracted("late"))); match app.extracted.get("id1") { Some(ExtractionState::Ready(a)) => { assert_eq!(a.plain_text, "first", "late result must not clobber Ready") @@ -3042,6 +3138,100 @@ mod tests { } } + #[test] + fn test_record_extraction_result_drops_stale_generation_on_re_requested_slot() { + // The TOCTOU race the generation tag closes: a Pending slot is + // evicted by LRU, then re-requested for the same id. The OLD + // worker (still running) eventually delivers its result. Without + // the generation check, that stale result would land on the new + // Pending slot — silently satisfying a "manual retry" with a + // stale fetch. + let mut app = App::new(); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let stale_gen = pending_generation(&app, "id1"); + // Evict + re-request manually (faster than filling 500 entries). + app.remove_extraction("id1"); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let fresh_gen = pending_generation(&app, "id1"); + assert_ne!( + stale_gen, fresh_gen, + "re-requested slot must carry a fresh generation" + ); + // Old worker lands a result tagged with the stale generation. + app.record_extraction_result("id1".to_string(), stale_gen, Ok(dummy_extracted("stale"))); + // Slot must still be Pending — the stale result was discarded. + assert!( + matches!( + app.extracted.get("id1"), + Some(ExtractionState::Pending { generation: g, .. }) if *g == fresh_gen + ), + "stale-generation result must not satisfy the re-requested slot" + ); + } + + #[test] + fn test_prune_stale_pending_extractions_flips_old_pending_to_failed() { + let mut app = App::new(); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + // Forcibly age the Pending slot's started_at past the watchdog. + let id = "id1".to_string(); + let gen = pending_generation(&app, &id); + let aged = Instant::now() + .checked_sub(std::time::Duration::from_secs(3600)) + .expect("Instant::now() - 1h must be representable on a freshly-booted machine"); + app.extracted.insert( + id.clone(), + ExtractionState::Pending { + generation: gen, + started_at: aged, + }, + ); + // 30 s × 3 = 90 s watchdog; aged 3600 s exceeds it. + app.prune_stale_pending_extractions(std::time::Duration::from_secs(30)); + match app.extracted.get(&id) { + Some(ExtractionState::Failed(msg)) => { + assert!( + msg.contains("timed out"), + "expected timeout reason: {}", + msg + ) + } + other => panic!("expected Failed after watchdog, got {:?}", other), + } + } + + #[test] + fn test_prune_stale_pending_extractions_leaves_fresh_pending_alone() { + let mut app = App::new(); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + // Freshly Pending — well within the watchdog window. + app.prune_stale_pending_extractions(std::time::Duration::from_secs(30)); + assert!(matches!( + app.extracted.get("id1"), + Some(ExtractionState::Pending { .. }) + )); + } + #[test] fn test_lru_hard_cap_when_oldest_is_pending() { // Stress the soft-cap edge case: queue CAP+1 requests as Pending @@ -3069,7 +3259,8 @@ mod tests { false, false, ); - app.record_extraction_result("id1".to_string(), Ok(dummy_extracted("body"))); + let gen = pending_generation(&app, "id1"); + app.record_extraction_result("id1".to_string(), gen, Ok(dummy_extracted("body"))); assert_eq!(app.extracted.len(), 1); assert_eq!(app.extracted_order.len(), 1); app.remove_extraction("id1"); diff --git a/src/feed.rs b/src/feed.rs index d0c4aa8..7b0cf53 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -293,10 +293,14 @@ pub fn extract_article( let final_url = response.url().to_string(); - // Read at most FULLTEXT_MAX_BYTES + 1 so we can detect overflow without - // ever allocating beyond cap+1. `reqwest::blocking::Response` implements - // `Read`, so `.take()` gives us a hard cap on input consumed. - let mut bytes = Vec::with_capacity(64 * 1024); + // Preallocate at the hard cap so `read_to_end`'s doubling growth can + // never push capacity past it. Without this, starting from a smaller + // hint (e.g. 64 KiB) grows the backing buffer to ~8 MiB for a 5 MiB + // body — invalidating the "peak allocation ~ FULLTEXT_MAX_BYTES" claim. + // Modern allocators don't commit physical pages until written, so a + // 5 MiB virtual reservation costs essentially zero physical memory + // for the common short-page case. + let mut bytes = Vec::with_capacity(FULLTEXT_MAX_BYTES + 1); let mut reader = response.take((FULLTEXT_MAX_BYTES as u64) + 1); reader .read_to_end(&mut bytes) @@ -380,18 +384,45 @@ fn sniff_meta_charset(head: &[u8]) -> Option { continue; } let tag = &s[tag_start..tag_end]; - if let Some(charset_idx) = tag.find("charset") { - let after = &tag[charset_idx + "charset".len()..]; - let after = after.trim_start(); - let after = after.strip_prefix('=').unwrap_or(after).trim_start(); - let after = after + // Walk every `charset` occurrence in the tag, not just the first. + // Without this, `` would match + // inside `name="charset"`, get an empty value, and skip to the next + // tag — silently missing the real declaration. + let mut search_from = 0; + while let Some(rel) = tag[search_from..].find("charset") { + let charset_idx = search_from + rel; + search_from = charset_idx + "charset".len(); + // Require a word boundary before `charset` so we don't match + // inside an attribute value like `name="charset"` (where the + // preceding char is `"`) or inside a longer attribute name like + // `data-charset`. Allowed boundaries are whitespace or `/` + // between attributes, or start-of-tag (`', ';', '/', '\t', '\n', '\r']) - .unwrap_or(after.len()); - let label = &after[..end]; + .unwrap_or(value.len()); + let label = &value[..end]; if !label.is_empty() { return Some(label.to_string()); } @@ -1052,6 +1083,45 @@ mod tests { assert_eq!(label, "iso-8859-1"); } + #[test] + fn test_sniff_meta_charset_skips_attribute_value_named_charset() { + // Pathological: an attribute whose VALUE is literally "charset" + // would have shadowed the real declaration with the old + // first-occurrence-wins logic. The tightened sniffer requires + // `charset` to be preceded by a word boundary AND followed by `=`, + // so the value match is rejected. + let html = b"\ + \ + "; + let label = sniff_meta_charset(html).expect("should find charset"); + assert_eq!(label, "utf-8"); + } + + #[test] + fn test_sniff_meta_charset_skips_data_attr_with_charset_value() { + // `data-foo="charset=bogus"` would have been a false positive under + // the old logic (substring + `=` after charset). The boundary check + // — `charset` must be preceded by whitespace/`/`/start-of-tag — + // rejects it because the preceding char is `"`. + let html = b"\ + \ + "; + let label = sniff_meta_charset(html).expect("should find charset"); + assert_eq!(label, "utf-8"); + } + + #[test] + fn test_sniff_meta_charset_skips_longer_attribute_name_prefixed_with_charset() { + // Word-boundary check: a hypothetical attribute like + // `data-charset="bogus"` shouldn't trigger — the char before + // `charset` is `-`, not a boundary. + let html = b"\ + \ + "; + let label = sniff_meta_charset(html).expect("should find charset"); + assert_eq!(label, "utf-8"); + } + #[test] fn test_is_safe_auto_url_accepts_public_https() { assert!(is_safe_auto_url("https://example.com/article")); diff --git a/src/tui.rs b/src/tui.rs index 318feeb..ae0245b 100644 --- a/src/tui.rs +++ b/src/tui.rs @@ -343,7 +343,7 @@ fn spawn_feed_refresh(app: &mut App) -> (usize, mpsc::Receiver<(usize, Result)>, + tx: &mpsc::Sender<(String, u64, Result)>, inflight: &Arc, manual_client: &mut Option, safe_client: &mut Option, @@ -366,7 +366,10 @@ fn spawn_pending_extractions( // in original order so it retries next tick. let mut deferred: Vec = Vec::new(); loop { - let in_use = inflight.load(Ordering::SeqCst); + // Relaxed is sufficient: the counter is just a bound on concurrent + // workers and doesn't synchronize any other memory. Result hand-off + // happens through the mpsc channel, which carries its own ordering. + let in_use = inflight.load(Ordering::Relaxed); if in_use >= EXTRACTION_MAX_INFLIGHT { break; } @@ -378,11 +381,13 @@ fn spawn_pending_extractions( // round-trip only to have `record_extraction_result` discard the // outcome. Cheap monotonic-correctness gate; must come before the // throttle defer so a stale id doesn't bounce back onto the queue. - if !matches!( - app.extracted.get(&req.id), - Some(crate::app::ExtractionState::Pending) - ) { - continue; + // Match BOTH the Pending state and the request's generation — + // an evicted-and-recreated slot keeps the same id but gets a fresh + // generation, so a stale queued request must be discarded here. + match app.extracted.get(&req.id) { + Some(crate::app::ExtractionState::Pending { generation, .. }) + if *generation == req.generation => {} + _ => continue, } let domain = App::extract_domain_from_url(&req.url); let too_soon = !rate_limit.is_zero() @@ -432,12 +437,13 @@ fn spawn_pending_extractions( // `last_domain_fetch` stamp until AFTER `thread::Builder::spawn` // returns Ok — a rare OS-level spawn failure should re-queue // without burning an extra rate-limit window for the next attempt. - inflight.fetch_add(1, Ordering::SeqCst); + inflight.fetch_add(1, Ordering::Relaxed); let ua = user_agent.clone(); let tx = tx.clone(); let inflight_for_worker = Arc::clone(inflight); let req_id = req.id.clone(); let req_url = req.url.clone(); + let req_generation = req.generation; // `thread::Builder::spawn` returns `io::Result` instead of panicking // on OS-level thread-creation failure (rare, but a panic from the // main TUI loop would crash the app). On failure, release the slot @@ -456,11 +462,11 @@ fn spawn_pending_extractions( "extraction worker panicked (hostile HTML?)" )), }; - let _ = tx.send((req_id, result)); - inflight_for_worker.fetch_sub(1, Ordering::SeqCst); + let _ = tx.send((req_id, req_generation, result)); + inflight_for_worker.fetch_sub(1, Ordering::Relaxed); }); if spawn_result.is_err() { - inflight.fetch_sub(1, Ordering::SeqCst); + inflight.fetch_sub(1, Ordering::Relaxed); app.pending_extraction_requests.push_front(req); break; } @@ -485,7 +491,10 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> // Long-lived channel for extraction worker completions. Outlives any // individual refresh because manual `F` extractions can fire at any time. - let (extract_tx, extract_rx) = mpsc::channel::<(String, Result)>(); + // The `u64` is the request's generation tag, echoed back by the worker so + // `record_extraction_result` can drop late results from + // evicted-and-recreated slots. + let (extract_tx, extract_rx) = mpsc::channel::<(String, u64, Result)>(); // Bounds concurrent extraction workers. Each spawned worker bumps this // on entry and drops it on exit (even on panic), so the pool never // deadlocks even if `dom_smoothie` blows up on hostile HTML. @@ -614,10 +623,16 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> // Drain completed extractions back into app state. This is what // flips a `Pending` entry to `Ready` / `Failed`, which the detail // view reads from on the next draw. - while let Ok((id, result)) = extract_rx.try_recv() { - app.record_extraction_result(id, result); + while let Ok((id, generation, result)) = extract_rx.try_recv() { + app.record_extraction_result(id, generation, result); } + // Watchdog: flip any Pending slot older than `http_timeout * N` to + // `Failed("timed out")`. Cheap (cache cap is 500); covers the rare + // case where a worker thread is killed externally (OOM, signal, + // SIGSEGV in a native dep) and never sends its result. + app.prune_stale_pending_extractions(Duration::from_secs(app.config.network.http_timeout)); + if event::poll(timeout)? { // Handle user input if handle_events(app)? { @@ -735,9 +750,13 @@ mod tests { false, false, ); - // simulate a worker thread completing successfully + // The request's generation is on the queued ExtractionRequest the + // worker would have spawned for. Echoing it back through + // `record_extraction_result` simulates the channel hand-off. + let gen = app.pending_extraction_requests[0].generation; app.record_extraction_result( "id1".to_string(), + gen, Ok(ExtractedArticle { title: "T".to_string(), plain_text: "body".to_string(), @@ -823,6 +842,82 @@ mod tests { assert_eq!(app.pending_extraction_requests[0].url, "https://ex.com/a"); } + /// Regression lock for the "single shared mark per feed arrival" contract. + /// `mark_feed_seen` must only be called ONCE per feed delivery, even when + /// both `exec_on_new` and `fulltext` are consuming the newly-seen set — + /// calling it twice would double-mark and the second consumer would see + /// an empty set. The fix lives at the `run_app` call site (gating on + /// `exec_on_new_template.is_some() || fulltext_feeds.contains(...)`) and + /// then handing the same `&[usize]` to both consumers. + #[test] + fn test_mark_feed_seen_shared_between_exec_on_new_and_fulltext() { + let mut app = App::new(); + // Configure BOTH consumers for this feed. + app.exec_on_new_template = Some(vec!["echo".to_string(), "%t".to_string()]); + let feed = build_feed_with_items( + "https://ex.com/feed.xml", + vec![ + ("First", Some("https://ex.com/1")), + ("Second", Some("https://ex.com/2")), + ], + ); + app.fulltext_feeds.insert(feed.url.clone()); + + // Simulate the first refresh: feed is not yet seeded, so the first + // mark seeds silently and returns an empty newly-seen set. + let first_seen = crate::app::mark_feed_seen(&mut app, &feed); + assert!( + first_seen.is_empty(), + "first observation of a feed must seed silently" + ); + // Both consumers see the same (empty) list — neither fires. + let exec = collect_exec_on_new(&app, &feed, &first_seen); + assert!(exec.is_empty(), "exec_on_new must not fire on first fetch"); + enqueue_fulltext_for_new(&mut app, &feed, &first_seen); + assert!( + app.pending_extraction_requests.is_empty(), + "fulltext must not fire on first fetch" + ); + + // Second refresh delivers the SAME items + one new item. The single + // mark_feed_seen call MUST return only the new index, and both + // consumers fed the same slice MUST both observe it. A second + // mark_feed_seen call here would return [] (double-mark bug). + let feed2 = build_feed_with_items( + "https://ex.com/feed.xml", + vec![ + ("First", Some("https://ex.com/1")), + ("Second", Some("https://ex.com/2")), + ("Third", Some("https://ex.com/3")), + ], + ); + let second_seen = crate::app::mark_feed_seen(&mut app, &feed2); + assert_eq!( + second_seen, + vec![2], + "only the genuinely-new item should be reported" + ); + // A duplicate mark — the bug this whole regression test guards + // against — would now return an empty slice. + let dup = crate::app::mark_feed_seen(&mut app, &feed2); + assert!( + dup.is_empty(), + "calling mark_feed_seen twice for one arrival would starve the second consumer" + ); + + // The actual contract: both consumers fed the SAME `second_seen` + // slice must both react to the new item. + let exec = collect_exec_on_new(&app, &feed2, &second_seen); + assert_eq!(exec.len(), 1, "exec_on_new must fire for the new item"); + enqueue_fulltext_for_new(&mut app, &feed2, &second_seen); + assert_eq!( + app.pending_extraction_requests.len(), + 1, + "fulltext must enqueue for the new item from the same newly-seen slice" + ); + assert_eq!(app.pending_extraction_requests[0].url, "https://ex.com/3"); + } + #[test] fn test_drain_preserves_preexisting_error_and_still_runs_step() { // Edge case: if `app.error` is already set before a step runs, the diff --git a/src/ui/detail.rs b/src/ui/detail.rs index 0784e42..7edc190 100644 --- a/src/ui/detail.rs +++ b/src/ui/detail.rs @@ -1,4 +1,5 @@ use crate::app::{App, ExtractionState}; +use crate::keybindings::{key_display, KeyAction}; use crate::ui::utils::{count_wrapped_lines, format_content_for_reading, truncate_url}; use crate::ui::ColorScheme; use html2text::from_read; @@ -143,7 +144,7 @@ pub(super) fn render_item_detail( Some(ExtractionState::Ready(article)) => { BodySource::Extracted(article.plain_text.clone()) } - Some(ExtractionState::Pending) => BodySource::Extracting, + Some(ExtractionState::Pending { .. }) => BodySource::Extracting, Some(ExtractionState::Failed(reason)) => BodySource::Failed(reason.clone()), None => BodySource::Summary, } @@ -163,15 +164,20 @@ pub(super) fn render_item_detail( } }; + // The retry / toggle hints embed the *current* binding for + // FetchFullText so they stay correct if the user remaps it. + let fulltext_key = key_display(&KeyAction::FetchFullText, &app.keybindings); + let description = match &body_source { BodySource::Summary => summary_text(), BodySource::Failed(reason) => { // Surface the failure reason at the top, then fall back to // the original summary so the user still has something to - // read. Press F again to retry. + // read. Press the FetchFullText key again to retry. format!( - "[Full-text extraction failed: {}]\n[Press F to retry — showing summary]\n\n{}", + "[Full-text extraction failed: {}]\n[Press {} to retry — showing summary]\n\n{}", reason, + fulltext_key, summary_text() ) } @@ -180,9 +186,9 @@ pub(super) fn render_item_detail( // Keep the summary visible while the worker is in flight — // a slow site can take several seconds, and replacing the // body with "Fetching…" leaves the user with nothing to - // read AND no way to revert (Shift+F is a no-op on Pending). - // The scroll-indicator title (see body_label below) carries - // the "Extracting…" status instead. + // read AND no way to revert (the FetchFullText key is a + // no-op on Pending). The scroll-indicator title (see + // body_label below) carries the "Extracting…" status. format!( "[Fetching full text… — showing summary]\n\n{}", summary_text() From 705bfa8ef36344db72be1c5983af61cc8c96e54f Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 16:04:47 +0200 Subject: [PATCH 7/9] fix(fulltext): plug inflight leak, reset toggle on focus, block v4 multicast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code review against the full-text extraction path turned up three things worth fixing before merge. The biggest one: the stale-Pending watchdog flipped slots to Failed but *intentionally* didn't decrement `extract_inflight`. The reasoning was sound — if the worker is actually still alive, it will decrement on its own and we'd otherwise underflow the counter. The unstated consequence is that a truly-dead worker (OS kill, SIGSEGV in a native dep) leaks one budget slot for the rest of the session. After four such events the pool is permanently saturated and Shift+F just sits there doing nothing. Let's fix that. `prune_stale_pending_extractions` now returns the count of slots it timed out, and the caller releases that many inflight budget slots using a saturating decrement. The worker exit path uses the same saturating decrement, so a slow worker that completes *after* the watchdog already released its slot clamps at zero instead of wrapping to `usize::MAX` — which would turn the bound into "spawn as many workers as you want until the heap runs out." Worst case is a transient one-slot overcommit, which is a much better failure mode than either the leak or the underflow. While at it: `show_extracted` was a single global bool. If you toggled to summary view on article A and walked to article B, B silently rendered as summary even though its title bar advertised "Full-text" as the body. Anchor the toggle to the focused indices and reset it when focus changes. And the IPv4 branch of `is_global_ip` wasn't rejecting `224.0.0.0/4` even though the IPv6 branch rejected `ff00::/8`. Multicast HTTP servers are not really a thing, but asymmetric allowlists *are* a thing reviewers reasonably complain about. Added `v4.is_multicast()` and called it a day. Tests added for all three. cargo fmt and cargo clippy -D warnings clean; 165 tests pass. --- src/app.rs | 106 +++++++++++++++++++++++++++++++++++++++++------ src/feed.rs | 6 +++ src/tui.rs | 96 ++++++++++++++++++++++++++++++++++++++++-- src/ui/detail.rs | 5 +++ 4 files changed, 196 insertions(+), 17 deletions(-) diff --git a/src/app.rs b/src/app.rs index 8e7961e..eb9b26d 100644 --- a/src/app.rs +++ b/src/app.rs @@ -158,6 +158,14 @@ pub struct App { /// summary. Single global toggle (not per-item) — simplest UX and the /// detail view only ever shows one article at a time. pub show_extracted: bool, + /// `(feed_idx, item_idx)` of the article the detail view rendered last + /// frame. When the focus moves to a different article, `show_extracted` + /// is reset to `true` so the user doesn't carry over a "view summary" + /// toggle from one article to another (and end up staring at a summary + /// while the title bar labels the body `Full-text`). Only the detail + /// view writes to / consults this — other views resolve focus through + /// `current_article_indices` directly. + pub last_detail_focus: Option<(usize, usize)>, /// Queue of extraction requests for the TUI loop to spawn worker threads /// for. Decouples request-time (event handler) from spawn-time (loop /// drain) so events.rs doesn't need the thread-handle / channel. @@ -439,6 +447,7 @@ impl App { extracted_order: VecDeque::new(), fulltext_feeds, show_extracted: true, + last_detail_focus: None, pending_extraction_requests: VecDeque::new(), next_extraction_generation: 0, }; @@ -1883,21 +1892,24 @@ impl App { } /// Sweep Pending slots that have been in-flight longer than - /// `http_timeout * PENDING_WATCHDOG_MULTIPLIER`. A normally-completing - /// worker (success, error, even a Rust panic caught by `catch_unwind`) - /// always sends a result back through the channel. A worker that - /// silently disappears — OS kill, SIGSEGV in a native dep, the parent - /// thread crashing — leaves its slot stuck on Pending forever, which - /// blocks the user from re-requesting via `toggle_or_request_fulltext`'s - /// `contains_key` guard. Flipping to `Failed("timed out")` lets the - /// user retry. The corresponding `inflight` slot is intentionally NOT - /// decremented: if the worker is actually still alive and eventually - /// exits normally, its own `fetch_sub` will run; we'd otherwise - /// double-decrement and underflow the counter. - pub fn prune_stale_pending_extractions(&mut self, http_timeout: std::time::Duration) { + /// `http_timeout * PENDING_WATCHDOG_MULTIPLIER`, flip them to + /// `Failed("timed out")`, and return the count so the caller can release + /// the corresponding inflight budget slots. + /// + /// A normally-completing worker (success, error, even a Rust panic caught + /// by `catch_unwind`) always sends a result back through the channel. A + /// worker that silently disappears — OS kill, SIGSEGV in a native dep, + /// parent thread crash — leaves its slot stuck on Pending forever, which + /// blocks `toggle_or_request_fulltext`'s `contains_key` guard. The + /// returned count is what `run_app` passes to a saturating decrement of + /// the shared `extract_inflight` counter so the pool doesn't leak budget + /// slots across a session. Both the watchdog path and the worker-exit + /// path use saturating decrement, so a slow worker that completes AFTER + /// the watchdog already released its slot can't underflow the counter. + pub fn prune_stale_pending_extractions(&mut self, http_timeout: std::time::Duration) -> usize { let watchdog = http_timeout.saturating_mul(PENDING_WATCHDOG_MULTIPLIER); if watchdog.is_zero() { - return; + return 0; } let now = Instant::now(); let mut to_fail: Vec = Vec::new(); @@ -1908,9 +1920,11 @@ impl App { } } } + let count = to_fail.len(); for id in to_fail { self.insert_extraction(id, ExtractionState::Failed("timed out".to_string())); } + count } /// Forget a single extraction slot (both the state and its LRU @@ -2011,6 +2025,18 @@ impl App { self.success_message_time = Some(std::time::Instant::now()); } } + + /// Called from the detail renderer each frame. If the focused article + /// changed since the last frame, reset the global `show_extracted` + /// toggle to `true` so the user doesn't carry a "view summary" choice + /// from one article to another. Idempotent on unchanged focus. + pub fn sync_detail_focus_anchor(&mut self) { + let current_focus = self.current_article_indices(); + if self.last_detail_focus != current_focus { + self.show_extracted = true; + self.last_detail_focus = current_focus; + } + } } /// Build an `ArticleContext` directly from a `Feed` + `FeedItem` pair — @@ -3267,4 +3293,58 @@ mod tests { assert!(app.extracted.is_empty()); assert!(app.extracted_order.is_empty()); } + + #[test] + fn test_sync_detail_focus_anchor_resets_show_extracted_on_focus_change() { + // Regression test for the cross-article show_extracted bug: the + // global toggle must reset to `true` whenever the detail view's + // focused article changes, so a "view summary" choice on article A + // doesn't silently carry over to article B. + let mut app = make_test_app(); + app.view = View::FeedItemDetail; + app.selected_feed = Some(0); + app.selected_item = Some(0); + // First sync — establishes the anchor at (0, 0). + app.sync_detail_focus_anchor(); + assert!(app.show_extracted); + assert_eq!(app.last_detail_focus, Some((0, 0))); + + // Simulate the user pressing Shift+F to toggle off on article (0, 0). + app.show_extracted = false; + // Re-sync on the same article — must NOT clobber the user's toggle. + app.sync_detail_focus_anchor(); + assert!( + !app.show_extracted, + "anchor unchanged → user's toggle preserved" + ); + + // User navigates to article (0, 1). Sync must reset the toggle. + app.selected_item = Some(1); + app.sync_detail_focus_anchor(); + assert!( + app.show_extracted, + "focus changed → show_extracted must reset to true" + ); + assert_eq!(app.last_detail_focus, Some((0, 1))); + } + + #[test] + fn test_sync_detail_focus_anchor_is_noop_when_focus_unchanged() { + // Per-frame call must be idempotent — otherwise a toggled-off state + // would be wiped on the very next frame and the toggle would feel + // unresponsive. + let mut app = make_test_app(); + app.view = View::FeedItemDetail; + app.selected_feed = Some(0); + app.selected_item = Some(0); + app.sync_detail_focus_anchor(); + app.show_extracted = false; + for _ in 0..10 { + app.sync_detail_focus_anchor(); + } + assert!( + !app.show_extracted, + "repeated syncs must not wipe the toggle" + ); + } } diff --git a/src/feed.rs b/src/feed.rs index 7b0cf53..ed14897 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -178,6 +178,9 @@ fn is_global_ip(ip: IpAddr) -> bool { || v4.is_broadcast() || v4.is_documentation() || v4.is_unspecified() + // Multicast 224.0.0.0/4 — IPv6 multicast (ff00::/8) is + // already rejected below; this closes the v4/v6 asymmetry. + || v4.is_multicast() || v4.octets()[0] == 0 // CGNAT 100.64.0.0/10 || (v4.octets()[0] == 100 && (v4.octets()[1] & 0xC0) == 64) @@ -1156,6 +1159,9 @@ mod tests { assert!(!is_safe_auto_url("http://[fe80::1]/x")); // IPv6 deprecated site-local fec0::/10 assert!(!is_safe_auto_url("http://[fec0::1]/x")); + // IPv4 multicast 224.0.0.0/4 — parity with IPv6 multicast below. + assert!(!is_safe_auto_url("http://224.0.0.1/x")); + assert!(!is_safe_auto_url("http://239.255.255.250/x")); // IPv6 multicast ff00::/8 assert!(!is_safe_auto_url("http://[ff02::1]/x")); // 6to4 wrapping link-local — would route via 6to4 relay to diff --git a/src/tui.rs b/src/tui.rs index ae0245b..83979eb 100644 --- a/src/tui.rs +++ b/src/tui.rs @@ -26,6 +26,20 @@ use std::{io, time::Duration}; /// the politeness budget the feed-refresh path already uses. const EXTRACTION_MAX_INFLIGHT: usize = 4; +/// Saturating decrement of the inflight counter — used by BOTH the worker's +/// exit path and the watchdog. The watchdog releases a slot for any Pending +/// it flips to `Failed("timed out")` so a truly dead worker can't leak +/// budget for the rest of the session. If both paths fire for the same +/// slot (slow worker the watchdog wrote off, then completed anyway), the +/// saturating sub clamps at zero instead of underflowing to usize::MAX — +/// which would effectively turn the bound into "spawn as many workers as +/// you want until the heap runs out". +fn release_inflight_slot(counter: &AtomicUsize) { + let _ = counter.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { + Some(x.saturating_sub(1)) + }); +} + /// RAII guard that re-enters the alt-screen + raw mode + mouse capture on drop, /// so a panic during a pipe-to child invocation cannot leave the terminal in /// a broken state. @@ -463,10 +477,10 @@ fn spawn_pending_extractions( )), }; let _ = tx.send((req_id, req_generation, result)); - inflight_for_worker.fetch_sub(1, Ordering::Relaxed); + release_inflight_slot(&inflight_for_worker); }); if spawn_result.is_err() { - inflight.fetch_sub(1, Ordering::Relaxed); + release_inflight_slot(inflight); app.pending_extraction_requests.push_front(req); break; } @@ -630,8 +644,17 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> // Watchdog: flip any Pending slot older than `http_timeout * N` to // `Failed("timed out")`. Cheap (cache cap is 500); covers the rare // case where a worker thread is killed externally (OOM, signal, - // SIGSEGV in a native dep) and never sends its result. - app.prune_stale_pending_extractions(Duration::from_secs(app.config.network.http_timeout)); + // SIGSEGV in a native dep) and never sends its result. Release one + // inflight slot per pruned Pending so a chain of dead workers can't + // permanently saturate the pool. The matching saturating sub in the + // worker's exit path makes a spurious wake-up (worker actually still + // alive, finishes later) safe — at worst we transiently overcommit + // by one slot, never underflow. + let pruned = app + .prune_stale_pending_extractions(Duration::from_secs(app.config.network.http_timeout)); + for _ in 0..pruned { + release_inflight_slot(&extract_inflight); + } if event::poll(timeout)? { // Handle user input @@ -740,6 +763,71 @@ mod tests { assert!(app.show_help_overlay); } + #[test] + fn test_release_inflight_slot_saturates_at_zero() { + // The watchdog and worker-exit paths can race for the same slot. + // The saturating decrement must clamp at zero rather than wrapping + // to usize::MAX — otherwise the inflight bound is meaningless and + // the pool would spawn workers without limit. + let counter = AtomicUsize::new(0); + release_inflight_slot(&counter); + release_inflight_slot(&counter); + assert_eq!(counter.load(Ordering::Relaxed), 0); + } + + #[test] + fn test_release_inflight_slot_decrements_above_zero() { + let counter = AtomicUsize::new(3); + release_inflight_slot(&counter); + assert_eq!(counter.load(Ordering::Relaxed), 2); + release_inflight_slot(&counter); + release_inflight_slot(&counter); + assert_eq!(counter.load(Ordering::Relaxed), 0); + // One more — must still clamp. + release_inflight_slot(&counter); + assert_eq!(counter.load(Ordering::Relaxed), 0); + } + + #[test] + fn test_prune_returns_count_so_caller_can_release_inflight() { + // Regression test for the inflight-leak fix: the watchdog must report + // how many slots it pruned so `run_app` can release that many inflight + // budget slots. Without the count, a chain of dead workers would + // saturate the pool for the rest of the session. + use crate::app::ExtractionState; + let mut app = App::new(); + app.request_extraction( + "stuck1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + app.request_extraction( + "stuck2".to_string(), + "https://ex.com/b".to_string(), + false, + false, + ); + // Forcibly age both Pending slots past the watchdog window. + let aged = std::time::Instant::now() + .checked_sub(Duration::from_secs(3600)) + .expect("Instant::now() - 1h must be representable"); + for id in ["stuck1", "stuck2"] { + if let Some(ExtractionState::Pending { generation, .. }) = app.extracted.get(id) { + let gen = *generation; + app.extracted.insert( + id.to_string(), + ExtractionState::Pending { + generation: gen, + started_at: aged, + }, + ); + } + } + let pruned = app.prune_stale_pending_extractions(Duration::from_secs(30)); + assert_eq!(pruned, 2, "watchdog must report the slot count"); + } + #[test] fn test_record_extraction_result_flips_pending_to_ready() { use crate::app::ExtractionState; diff --git a/src/ui/detail.rs b/src/ui/detail.rs index 7edc190..02afa00 100644 --- a/src/ui/detail.rs +++ b/src/ui/detail.rs @@ -30,6 +30,11 @@ pub(super) fn render_item_detail( area: Rect, colors: &ColorScheme, ) { + // Reset the `show_extracted` toggle on article focus change. Done at + // the top so we hold no immutable borrow on `app` (the `current_item` + // call below would conflict). Logic lives on `App` so it's + // unit-testable without spinning up a TestBackend. + app.sync_detail_focus_anchor(); if let Some(item) = app.current_item() { // Split the area into header and content with better proportions let chunks = Layout::default() From 106994fb5aab2005c863584109ea592feb3e9ec2 Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 16:25:41 +0200 Subject: [PATCH 8/9] fix(fulltext): stop watchdog from leaking inflight on stranded slots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code review caught a slow leak in the extraction worker pool. The 4-worker cap turned out to be advisory at best. It turns out `ExtractionState::Pending.started_at` was stamped at `request_extraction` time, but the watchdog uses the same field to identify stuck workers — and releases one inflight permit per pruned slot. A request that aged out while still *queued* (rate-limited behind a long backlog) never claimed a permit in the first place. The watchdog "released" it anyway, silently widening the effective pool by one per spurious release. Saturating sub kept it from underflowing, so this wasn't a crash — just drift in the wrong direction. Not great. Fix is structural: add `spawned: bool` to Pending, default false, flip to true only after `thread::Builder::spawn` returns Ok. The watchdog now counts only spawned-pruned slots toward the inflight release. Queue-stranded slots still flip to Failed so manual retry works — only the permit accounting is gated. Adding a field forces every construction site to declare its spawned-state, which beats comments. While at it: switch the `dom_smoothie` spec from `0.17` to `~0.17` — same constraint in cargo (pre-1.0 caret == tilde), but the intent is explicit at the manifest for a security-critical dep. Reword the misleading "peak allocation ~ FULLTEXT_MAX_BYTES" comment (DOM allocation inside dom_smoothie is separate working-set, not bounded here). Swap `format!("{}", e)` for `format!("{:#}", e)` in `record_extraction_result` so anyhow's cause chain survives. Also added five hermetic stub tests for `extract_article`: oversized Content-Length, wrong Content-Type, and the safe-redirect client refusing to follow into RFC1918. The pure-Readability path was well-tested; the wrapping fetch path's security gates were not. --- Cargo.toml | 9 ++- src/app.rs | 213 +++++++++++++++++++++++++++++++++++++++++++++++++--- src/feed.rs | 161 ++++++++++++++++++++++++++++++++++++++- src/tui.rs | 42 +++++++---- 4 files changed, 395 insertions(+), 30 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 52afc74..8f08ddc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,14 @@ toml = "0.8" scraper = "0.18" url = "2" shlex = "1.3" -dom_smoothie = "0.17" +# dom_smoothie is pre-1.0 and is the security-critical HTML parsing surface +# for the full-text extraction path. Cargo.lock is committed (so the actual +# version pin is enforced there), and the default `"0.17"` (=caret) already +# blocks 0.18+ on pre-1.0 crates. The explicit `~0.17` here is intent +# signaling — it makes "patch-only updates allowed; major-version bumps +# require manual review" obvious at the manifest level rather than only in +# the lockfile. +dom_smoothie = "~0.17" encoding_rs = "0.8" [profile.release] diff --git a/src/app.rs b/src/app.rs index eb9b26d..d92f0ce 100644 --- a/src/app.rs +++ b/src/app.rs @@ -235,9 +235,18 @@ pub enum ExtractionState { /// a Pending slot is evicted from the LRU and re-requested for the same /// id — without the tag, the old worker's eventual result would land on /// the new slot. + /// + /// `spawned` flips to true only after the spawn loop has both bumped + /// the inflight counter AND successfully called `thread::Builder::spawn`. + /// A request that ages out while still queued (rate-limited behind a + /// backlog longer than the watchdog window) is `spawned == false`, and + /// the watchdog MUST NOT release an inflight slot for it — there was no + /// permit to release, and a spurious release would shrink the effective + /// pool below `EXTRACTION_MAX_INFLIGHT`. Pending { generation: u64, started_at: Instant, + spawned: bool, }, Ready(ExtractedArticle), Failed(String), @@ -1843,8 +1852,13 @@ impl App { _ => return, } let state = match result { + // `{:#}` flattens the anyhow cause chain into one line + // ("outer: caused by: inner"), preserving context that the + // default `{}` formatter drops. The user only ever sees the + // top message in the detail view, but a chained message helps + // when the failure is logged or reported. Ok(article) => ExtractionState::Ready(article), - Err(e) => ExtractionState::Failed(format!("{}", e)), + Err(e) => ExtractionState::Failed(format!("{:#}", e)), }; self.insert_extraction(id, state); } @@ -1876,6 +1890,7 @@ impl App { ExtractionState::Pending { generation, started_at: Instant::now(), + spawned: false, }, ); let req = ExtractionRequest { @@ -1893,8 +1908,9 @@ impl App { /// Sweep Pending slots that have been in-flight longer than /// `http_timeout * PENDING_WATCHDOG_MULTIPLIER`, flip them to - /// `Failed("timed out")`, and return the count so the caller can release - /// the corresponding inflight budget slots. + /// `Failed("timed out")`, and return the count of pruned slots that + /// were actually *spawned* — so the caller can release exactly that + /// many inflight budget slots. /// /// A normally-completing worker (success, error, even a Rust panic caught /// by `catch_unwind`) always sends a result back through the channel. A @@ -1903,28 +1919,58 @@ impl App { /// blocks `toggle_or_request_fulltext`'s `contains_key` guard. The /// returned count is what `run_app` passes to a saturating decrement of /// the shared `extract_inflight` counter so the pool doesn't leak budget - /// slots across a session. Both the watchdog path and the worker-exit - /// path use saturating decrement, so a slow worker that completes AFTER - /// the watchdog already released its slot can't underflow the counter. + /// slots across a session. + /// + /// Critically, we only count pruned slots whose `spawned` flag is true: + /// a request that aged out while still queued never claimed an inflight + /// permit, so releasing one for it would silently widen the effective + /// worker cap. Both the watchdog path and the worker-exit path use + /// saturating decrement, so a slow worker that completes AFTER the + /// watchdog already released its slot can't underflow the counter. pub fn prune_stale_pending_extractions(&mut self, http_timeout: std::time::Duration) -> usize { let watchdog = http_timeout.saturating_mul(PENDING_WATCHDOG_MULTIPLIER); if watchdog.is_zero() { return 0; } let now = Instant::now(); - let mut to_fail: Vec = Vec::new(); + let mut to_fail: Vec<(String, bool)> = Vec::new(); for (id, state) in self.extracted.iter() { - if let ExtractionState::Pending { started_at, .. } = state { + if let ExtractionState::Pending { + started_at, + spawned, + .. + } = state + { if now.duration_since(*started_at) > watchdog { - to_fail.push(id.clone()); + to_fail.push((id.clone(), *spawned)); } } } - let count = to_fail.len(); - for id in to_fail { + let mut spawned_pruned = 0usize; + for (id, spawned) in to_fail { + if spawned { + spawned_pruned += 1; + } self.insert_extraction(id, ExtractionState::Failed("timed out".to_string())); } - count + spawned_pruned + } + + /// Mark a Pending slot as spawned (the spawn loop has bumped the inflight + /// counter and successfully called `thread::Builder::spawn`). Idempotent + /// — a no-op for any other state, so race conditions where the worker + /// already landed a result before this is called are harmless. + pub(crate) fn mark_extraction_spawned(&mut self, id: &str, generation: u64) { + if let Some(ExtractionState::Pending { + generation: g, + spawned, + .. + }) = self.extracted.get_mut(id) + { + if *g == generation { + *spawned = true; + } + } } /// Forget a single extraction slot (both the state and its LRU @@ -3215,6 +3261,9 @@ mod tests { false, ); // Forcibly age the Pending slot's started_at past the watchdog. + // Mark it `spawned=true` so this test exercises the genuine + // stuck-worker case (a worker that bumped inflight and never + // released it) — that's what the watchdog is meant to recover. let id = "id1".to_string(); let gen = pending_generation(&app, &id); let aged = Instant::now() @@ -3225,6 +3274,7 @@ mod tests { ExtractionState::Pending { generation: gen, started_at: aged, + spawned: true, }, ); // 30 s × 3 = 90 s watchdog; aged 3600 s exceeds it. @@ -3241,6 +3291,145 @@ mod tests { } } + #[test] + fn test_prune_returns_zero_for_aged_but_unspawned_pending() { + // Regression test for the inflight-counter drift: a request that + // aged out while still queued (rate-limited behind a long backlog) + // has `spawned=false`, so the watchdog must NOT count it toward the + // inflight-release total. Counting it would silently widen the + // effective worker cap because no permit was ever claimed. + let mut app = App::new(); + app.request_extraction( + "stranded".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let gen = pending_generation(&app, "stranded"); + // Age the slot but leave `spawned=false` — the request never made + // it through the spawn loop. + let aged = Instant::now() + .checked_sub(std::time::Duration::from_secs(3600)) + .expect("Instant::now() - 1h must be representable"); + app.extracted.insert( + "stranded".to_string(), + ExtractionState::Pending { + generation: gen, + started_at: aged, + spawned: false, + }, + ); + let pruned = app.prune_stale_pending_extractions(std::time::Duration::from_secs(30)); + assert_eq!( + pruned, 0, + "watchdog must return zero for queue-stranded prunes — no inflight permit was ever claimed" + ); + // The slot still flips to Failed so the user can retry; only the + // inflight-release count is gated on `spawned`. + assert!( + matches!( + app.extracted.get("stranded"), + Some(ExtractionState::Failed(_)) + ), + "queue-stranded slot must still be flipped to Failed so a manual retry works" + ); + } + + #[test] + fn test_prune_counts_only_spawned_slots_when_mixed() { + // Mixed batch: one spawned (real stuck worker) and one queue-stranded. + // Only the spawned one should be counted toward inflight release. + let mut app = App::new(); + app.request_extraction( + "spawned".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + app.request_extraction( + "queued".to_string(), + "https://ex.com/b".to_string(), + false, + false, + ); + let aged = Instant::now() + .checked_sub(std::time::Duration::from_secs(3600)) + .expect("Instant::now() - 1h must be representable"); + let g1 = pending_generation(&app, "spawned"); + let g2 = pending_generation(&app, "queued"); + app.extracted.insert( + "spawned".to_string(), + ExtractionState::Pending { + generation: g1, + started_at: aged, + spawned: true, + }, + ); + app.extracted.insert( + "queued".to_string(), + ExtractionState::Pending { + generation: g2, + started_at: aged, + spawned: false, + }, + ); + let pruned = app.prune_stale_pending_extractions(std::time::Duration::from_secs(30)); + assert_eq!(pruned, 1, "only the spawned slot must be counted"); + } + + #[test] + fn test_mark_extraction_spawned_flips_flag_on_matching_generation() { + let mut app = App::new(); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let gen = pending_generation(&app, "id1"); + // Before: spawned=false from request_extraction. + assert!(matches!( + app.extracted.get("id1"), + Some(ExtractionState::Pending { spawned: false, .. }) + )); + app.mark_extraction_spawned("id1", gen); + assert!(matches!( + app.extracted.get("id1"), + Some(ExtractionState::Pending { spawned: true, .. }) + )); + } + + #[test] + fn test_mark_extraction_spawned_is_noop_on_generation_mismatch() { + // If the slot was evicted and re-requested between spawn() and + // mark_extraction_spawned, the old generation must not flip the + // flag on the fresh slot. The stale worker's record_extraction_result + // would also be dropped by the generation check, so leaving + // spawned=false on the fresh slot is the right semantic. + let mut app = App::new(); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let stale_gen = pending_generation(&app, "id1"); + app.remove_extraction("id1"); + app.request_extraction( + "id1".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let fresh_gen = pending_generation(&app, "id1"); + assert_ne!(stale_gen, fresh_gen); + app.mark_extraction_spawned("id1", stale_gen); + assert!(matches!( + app.extracted.get("id1"), + Some(ExtractionState::Pending { spawned: false, .. }) + )); + } + #[test] fn test_prune_stale_pending_extractions_leaves_fresh_pending_alone() { let mut app = App::new(); diff --git a/src/feed.rs b/src/feed.rs index ed14897..bc751f3 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -299,10 +299,12 @@ pub fn extract_article( // Preallocate at the hard cap so `read_to_end`'s doubling growth can // never push capacity past it. Without this, starting from a smaller // hint (e.g. 64 KiB) grows the backing buffer to ~8 MiB for a 5 MiB - // body — invalidating the "peak allocation ~ FULLTEXT_MAX_BYTES" claim. - // Modern allocators don't commit physical pages until written, so a - // 5 MiB virtual reservation costs essentially zero physical memory - // for the common short-page case. + // body — making the body-buffer ceiling ambiguous. With the + // preallocation, the body-buffer ceiling is exactly FULLTEXT_MAX_BYTES; + // the downstream DOM allocation inside dom_smoothie is separate + // working-set and not bounded here. Modern allocators don't commit + // physical pages until written, so a 5 MiB virtual reservation costs + // essentially zero physical memory for the common short-page case. let mut bytes = Vec::with_capacity(FULLTEXT_MAX_BYTES + 1); let mut reader = response.take((FULLTEXT_MAX_BYTES as u64) + 1); reader @@ -1195,4 +1197,155 @@ mod tests { err ); } + + // ── extract_article end-to-end tests against a local TcpListener stub ── + // + // These exercise the wrapping fetch path — Content-Length cap, + // Content-Type rejection, and the safe-redirect client's behavior when + // a 30x points into a private IP. The pure-Readability parsing path + // (extract_from_html) is already covered above; these close the gap on + // the network-side security gates. + // + // No external mock-server dependency: a one-shot TcpListener serves a + // single canned HTTP response and exits. Cheap, hermetic, deterministic. + + use std::io::{Read as IoRead, Write as IoWrite}; + use std::net::TcpListener; + use std::thread; + + /// Spawn a one-shot HTTP stub on 127.0.0.1 that writes `response_bytes` + /// verbatim to the first accepted client and closes. Returns the bound + /// URL. The stub thread is detached — the OS reclaims it after the test. + fn spawn_stub_server(response_bytes: Vec) -> String { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind 127.0.0.1"); + let port = listener.local_addr().expect("local_addr").port(); + thread::spawn(move || { + if let Ok((mut stream, _)) = listener.accept() { + // Read SOME of the request so the client doesn't get a + // RST while still writing headers. Single read is enough + // for a short GET; we don't actually inspect it. + let mut buf = [0u8; 4096]; + let _ = stream.read(&mut buf); + let _ = stream.write_all(&response_bytes); + let _ = stream.flush(); + // Dropping the stream signals connection close to the client. + } + }); + format!("http://127.0.0.1:{}/", port) + } + + #[test] + fn test_extract_article_rejects_oversized_content_length() { + // Server declares 100 MiB body — far past the 5 MiB cap. The + // pre-check on the Content-Length header must short-circuit the + // read entirely (no body actually streamed in this test). + let declared = FULLTEXT_MAX_BYTES * 20; + let resp = format!( + "HTTP/1.1 200 OK\r\n\ + Content-Type: text/html\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n", + declared + ); + let url = spawn_stub_server(resp.into_bytes()); + let client = reqwest::blocking::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + .unwrap(); + let err = extract_article(&url, &client, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("too large"), + "expected oversized rejection, got: {}", + msg + ); + } + + #[test] + fn test_extract_article_rejects_non_html_content_type() { + let body = b"\x89PNG\r\n\x1a\nnotactuallyhtml"; + let resp = format!( + "HTTP/1.1 200 OK\r\n\ + Content-Type: image/png\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n", + body.len() + ); + let mut bytes = resp.into_bytes(); + bytes.extend_from_slice(body); + let url = spawn_stub_server(bytes); + let client = reqwest::blocking::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + .unwrap(); + let err = extract_article(&url, &client, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("did not return HTML") || msg.contains("content-type"), + "expected content-type rejection, got: {}", + msg + ); + } + + #[test] + fn test_extract_article_rejects_redirect_into_private_ip_via_safe_client() { + // Stub returns a 302 pointing into RFC1918 space. The + // safe-redirect client's policy must refuse to follow — the + // resulting response surfaces as a non-2xx status, which + // extract_article translates into an "HTTP error" failure. + // (Without the safe client, reqwest would happily follow the 302 + // and probe 192.168.1.1 — the SSRF this exists to prevent.) + let resp = b"HTTP/1.1 302 Found\r\n\ + Location: http://192.168.1.1/admin\r\n\ + Content-Length: 0\r\n\ + Connection: close\r\n\ + \r\n"; + let url = spawn_stub_server(resp.to_vec()); + let client = + Feed::build_safe_redirect_client(5).expect("safe-redirect client should build"); + let err = extract_article(&url, &client, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("HTTP error") || msg.contains("302"), + "expected non-2xx surfacing from refused redirect, got: {}", + msg + ); + } + + #[test] + fn test_extract_article_follows_safe_redirect_to_public_target() { + // Sanity-check the other side of the redirect policy: a 302 to a + // public-looking target is followed. This stub returns 302 → + // http://example.invalid/ which the client WILL try to follow. + // The follow attempt will fail (example.invalid is unresolvable), + // but the failure must be a NETWORK error, NOT the policy-stop + // "HTTP error 302" surfacing from the refused-redirect path. + // That distinction is what tells us the policy correctly let the + // public target through. + let resp = b"HTTP/1.1 302 Found\r\n\ + Location: http://example.invalid/article\r\n\ + Content-Length: 0\r\n\ + Connection: close\r\n\ + \r\n"; + let url = spawn_stub_server(resp.to_vec()); + let client = + Feed::build_safe_redirect_client(5).expect("safe-redirect client should build"); + let err = extract_article(&url, &client, None).unwrap_err(); + let msg = err.to_string().to_lowercase(); + // Must NOT be "HTTP error 302" — that would mean the policy + // refused to follow a public target. Must be a fetch / dns / io + // failure from the actual follow attempt. + assert!( + !msg.contains("http error 302"), + "policy must follow public-target redirects, but surfaced 302: {}", + msg + ); + assert!( + msg.contains("failed to fetch") || msg.contains("dns") || msg.contains("resolve"), + "expected a network failure from following the public redirect, got: {}", + msg + ); + } } diff --git a/src/tui.rs b/src/tui.rs index 83979eb..3039ef0 100644 --- a/src/tui.rs +++ b/src/tui.rs @@ -484,9 +484,16 @@ fn spawn_pending_extractions( app.pending_extraction_requests.push_front(req); break; } - // Spawn succeeded — now stamp the domain so the next request for - // the same host respects the rate-limit. Done last so a failed - // spawn doesn't leave a stale stamp behind. + // Spawn succeeded — flip the slot's `spawned` flag so the watchdog + // knows this slot actually holds an inflight permit. Without this, + // a queue-stranded request (rate-limited beyond the watchdog + // window) would have its inflight slot "released" by the watchdog + // even though it never claimed one — silently widening the + // effective worker cap by one per spurious release. + app.mark_extraction_spawned(&req.id, req.generation); + // Stamp the domain so the next request for the same host respects + // the rate-limit. Done after spawn so a failed spawn doesn't leave + // a stale stamp behind. if !domain.is_empty() { app.last_domain_fetch.insert(domain, now); } @@ -642,14 +649,17 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> } // Watchdog: flip any Pending slot older than `http_timeout * N` to - // `Failed("timed out")`. Cheap (cache cap is 500); covers the rare - // case where a worker thread is killed externally (OOM, signal, - // SIGSEGV in a native dep) and never sends its result. Release one - // inflight slot per pruned Pending so a chain of dead workers can't - // permanently saturate the pool. The matching saturating sub in the - // worker's exit path makes a spurious wake-up (worker actually still - // alive, finishes later) safe — at worst we transiently overcommit - // by one slot, never underflow. + // `Failed("timed out")`. Cost ceiling per tick is bounded by + // EXTRACTED_CACHE_CAP (500) map reads — at the configured tick rate + // that's ~5k reads/sec worst case, negligible. Covers the rare case + // where a worker thread is killed externally (OOM, signal, SIGSEGV + // in a native dep) and never sends its result. `prune_*` returns + // the count of pruned slots that were actually *spawned* (held an + // inflight permit); queue-stranded prunes don't release inflight, + // so a long backlog can't shrink the effective pool. The matching + // saturating sub in the worker's exit path makes a spurious wake-up + // (worker actually still alive, finishes later) safe — at worst we + // transiently overcommit by one slot, never underflow. let pruned = app .prune_stale_pending_extractions(Duration::from_secs(app.config.network.http_timeout)); for _ in 0..pruned { @@ -808,7 +818,9 @@ mod tests { false, false, ); - // Forcibly age both Pending slots past the watchdog window. + // Forcibly age both Pending slots past the watchdog window AND + // mark them spawned=true so they represent real stuck workers + // (the case the watchdog's inflight-release is meant for). let aged = std::time::Instant::now() .checked_sub(Duration::from_secs(3600)) .expect("Instant::now() - 1h must be representable"); @@ -820,12 +832,16 @@ mod tests { ExtractionState::Pending { generation: gen, started_at: aged, + spawned: true, }, ); } } let pruned = app.prune_stale_pending_extractions(Duration::from_secs(30)); - assert_eq!(pruned, 2, "watchdog must report the slot count"); + assert_eq!( + pruned, 2, + "watchdog must report the spawned-pruned count for inflight release" + ); } #[test] From 1b18f442da66d2f0b48cfee37260ae394e3310b7 Mon Sep 17 00:00:00 2001 From: bahdotsh Date: Fri, 15 May 2026 16:48:47 +0200 Subject: [PATCH 9/9] fix(fulltext): close LRU/watchdog leak and stop promoting Failed to MRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Another pass over the extraction worker pool turned up two real correctness issues, plus a couple of load-bearing invariants the code was relying on without anything actually locking them. The big one: `insert_extraction` unconditionally evicted the oldest deque entry when the cache hit its cap. If that entry happened to be a `Pending` slot whose worker was still alive — and the cache only fills under sustained extraction load, so by definition the workers ARE alive — the slot disappears from `extracted`. The stale-Pending watchdog only walks `extracted`, so once the slot is gone, the watchdog can't find it. If that worker is silently dead (OS kill, SIGSEGV in a native dep), its inflight permit leaks for the rest of the session. Eventually the pool saturates and Shift+F sits there doing nothing. Not great. Fix is to scan the deque from the head for the first entry that's NOT a `Pending { spawned: true }` and evict THAT one. Soft cap instead of hard — the effective bound becomes `EXTRACTED_CACHE_CAP + EXTRACTION_MAX_INFLIGHT`, i.e. 504 entries instead of 500. The worker pool size caps how many we can ever be protecting, so this can't grow without bound. If literally every entry is a spawned Pending (would mean the whole pool has been stuck for a long time), we let the cache transiently grow; the watchdog gets to those slots on the next tick and restores evictability on the next insert. The second one: `prune_stale_pending_extractions` flipped a stale Pending to Failed by calling `insert_extraction`, which moves the entry to MRU. So a watchdog-timed-out slot was suddenly *fresher* than every Ready entry that came in after it. Inverted eviction order, for absolutely no reason. Mutate the state in place instead. Preserves the original LRU position so timed-out entries get evicted ahead of newer Ready results, which is what you'd actually want. While at it, the comment on the watchdog's inflight release said "transiently overcommit by one slot." That's wrong. If all four pool slots stall, the watchdog releases four, four new workers spawn, and now we have eight actually running. The real bound is `EXTRACTION_MAX_INFLIGHT` slots of overcommit, not one. Fix the comment to say what the code actually does. Two contracts that were load-bearing but had no test coverage: * `extract_article` does not send Authorization headers. The function signature deliberately omits a headers parameter, but nothing failed loudly if a future refactor added one to the static header list. Added a capturing stub server and a test that asserts on the raw bytes on the wire. If anyone ever wires per-feed auth into article extraction, this fails immediately. * `enqueue_fulltext_for_new` short-circuits on items that already have any extraction state (Ready / Failed / Pending). The guard lives in `request_extraction`'s `contains_key` check, but a refactor could quietly remove it and a refresh would suddenly start re-extracting every cached article on every cycle. Locked that too. cargo fmt clean, clippy -D warnings clean, 177/177 tests passing. --- src/app.rs | 160 +++++++++++++++++++++++++++++++++++++++++++++++++--- src/feed.rs | 69 ++++++++++++++++++++++ src/tui.rs | 92 +++++++++++++++++++++++++++++- 3 files changed, 311 insertions(+), 10 deletions(-) diff --git a/src/app.rs b/src/app.rs index d92f0ce..d9dd304 100644 --- a/src/app.rs +++ b/src/app.rs @@ -275,6 +275,14 @@ pub struct ExtractionRequest { /// feed could otherwise accumulate state monotonically across a long /// session. Insertion-order LRU eviction (oldest cache entries are dropped /// first) — `extracted_order` tracks the insertion sequence. +/// +/// Soft cap, not hard: `insert_extraction` refuses to evict +/// `Pending { spawned: true }` slots — evicting one would hide it from the +/// stale-Pending watchdog (which only looks at `extracted`) and leak the +/// worker's inflight permit if the worker is dead. The number of such slots +/// at any moment is bounded by the worker-pool size (see +/// `EXTRACTION_MAX_INFLIGHT` in `tui.rs`), so the effective cap grows by at +/// most that constant. pub const EXTRACTED_CACHE_CAP: usize = 500; /// Stale-`Pending` watchdog timeout, as a multiple of `http_timeout`. A worker @@ -1951,7 +1959,13 @@ impl App { if spawned { spawned_pruned += 1; } - self.insert_extraction(id, ExtractionState::Failed("timed out".to_string())); + // Mutate in place — must NOT route through `insert_extraction`, + // which promotes the entry to MRU. A watchdog-failed slot is + // strictly less useful than newer Ready entries, so it should + // keep its original LRU position and be evicted ahead of them. + if let Some(state) = self.extracted.get_mut(&id) { + *state = ExtractionState::Failed("timed out".to_string()); + } } spawned_pruned } @@ -1993,15 +2007,34 @@ impl App { self.extracted_order.push_back(id); return; } - // Hard cap: evict the oldest entry unconditionally. If that entry - // happens to be `Pending`, the worker's eventual write-back will be - // dropped by `record_extraction_result`'s staleness check — so we - // never grow past EXTRACTED_CACHE_CAP, and we never resurrect an - // evicted slot. + // LRU eviction with a soft-protect for `Pending { spawned: true }` + // slots. Evicting a spawned Pending would hide the slot from the + // stale-Pending watchdog (it iterates `extracted`), leaving a dead + // worker's inflight permit leaked for the rest of the session. + // Skip those slots and evict the next-oldest evictable entry. + // + // The protected-slot count is bounded by the worker pool (see + // `EXTRACTION_MAX_INFLIGHT` in `tui.rs`), so the cache size is + // bounded at `EXTRACTED_CACHE_CAP + EXTRACTION_MAX_INFLIGHT` — a + // soft cap, not hard, but still strictly bounded. If every entry + // happens to be a spawned Pending (would mean the entire pool has + // been stuck for a long time), the cache transiently grows past + // CAP; the watchdog will eventually flip those slots to Failed, + // restoring evictability on the next insert. while self.extracted_order.len() >= EXTRACTED_CACHE_CAP { - match self.extracted_order.pop_front() { - Some(evict) => { - self.extracted.remove(&evict); + let evict_idx = self.extracted_order.iter().position(|id| { + !matches!( + self.extracted.get(id), + Some(ExtractionState::Pending { spawned: true, .. }) + ) + }); + match evict_idx { + Some(idx) => { + // `VecDeque::remove` is O(N) for arbitrary indices, + // but N ≤ ~500 and eviction only fires at the cap. + if let Some(evict) = self.extracted_order.remove(idx) { + self.extracted.remove(&evict); + } } None => break, } @@ -3536,4 +3569,113 @@ mod tests { "repeated syncs must not wipe the toggle" ); } + + /// Regression lock for the LRU-vs-watchdog leak: evicting a Pending slot + /// whose `spawned` flag is true would hide it from + /// `prune_stale_pending_extractions` (which only iterates `extracted`), + /// so a silently-dead worker's inflight permit would leak for the rest + /// of the session. `insert_extraction` must skip such slots during + /// eviction and evict the next-oldest evictable entry. + #[test] + fn test_lru_eviction_skips_spawned_pending_slots() { + let mut app = App::new(); + // Seed one spawned-Pending slot at the head of the LRU. Real callers + // would have inserted via `request_extraction` then flipped `spawned` + // via `mark_extraction_spawned`; we do the same here. + app.request_extraction( + "spawned_head".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + let gen = pending_generation(&app, "spawned_head"); + app.mark_extraction_spawned("spawned_head", gen); + // Fill the rest of the cap with Ready entries (insertion-order LRU + // puts them all behind the spawned head). + for i in 0..EXTRACTED_CACHE_CAP - 1 { + let id = format!("ready{}", i); + app.request_extraction(id.clone(), "https://ex.com/x".to_string(), false, false); + let g = pending_generation(&app, &id); + app.record_extraction_result(id, g, Ok(dummy_extracted("body"))); + } + assert_eq!(app.extracted.len(), EXTRACTED_CACHE_CAP); + + // Insert one more entry — eviction must skip the spawned-Pending + // head and instead evict the next-oldest (ready0). + app.request_extraction( + "newcomer".to_string(), + "https://ex.com/n".to_string(), + false, + false, + ); + let g = pending_generation(&app, "newcomer"); + app.record_extraction_result("newcomer".to_string(), g, Ok(dummy_extracted("body"))); + + assert!( + app.extracted.contains_key("spawned_head"), + "spawned-Pending slot must NOT be evicted — would hide it from the watchdog" + ); + assert!( + !app.extracted.contains_key("ready0"), + "the next-oldest evictable entry should have been evicted instead" + ); + // Size still bounded — cap +/- the one we skipped (no extra growth + // here because we only protected one slot and evicted one entry). + assert_eq!(app.extracted.len(), EXTRACTED_CACHE_CAP); + } + + /// Watchdog must NOT promote a stale Pending → Failed transition to the + /// most-recently-used end of the LRU deque. A timed-out entry should be + /// strictly less useful than newer Ready entries and evicted ahead of + /// them — promoting it would invert that order. + #[test] + fn test_watchdog_failure_does_not_promote_lru_position() { + let mut app = App::new(); + app.request_extraction( + "first".to_string(), + "https://ex.com/a".to_string(), + false, + false, + ); + // Insert another entry so we can observe LRU order before/after. + app.request_extraction( + "second".to_string(), + "https://ex.com/b".to_string(), + false, + false, + ); + // Order should be [first, second] in the deque. + let order_before: Vec = app.extracted_order.iter().cloned().collect(); + assert_eq!( + order_before, + vec!["first".to_string(), "second".to_string()] + ); + + // Age `first` past the watchdog window and run the prune. + let gen = pending_generation(&app, "first"); + let aged = Instant::now() + .checked_sub(std::time::Duration::from_secs(3600)) + .expect("Instant::now() - 1h must be representable"); + app.extracted.insert( + "first".to_string(), + ExtractionState::Pending { + generation: gen, + started_at: aged, + spawned: true, + }, + ); + app.prune_stale_pending_extractions(std::time::Duration::from_secs(30)); + + // State flipped to Failed, but LRU position preserved at the head. + assert!(matches!( + app.extracted.get("first"), + Some(ExtractionState::Failed(_)) + )); + let order_after: Vec = app.extracted_order.iter().cloned().collect(); + assert_eq!( + order_after, + vec!["first".to_string(), "second".to_string()], + "watchdog must NOT promote a Failed entry to MRU — would let it outlive newer Ready slots" + ); + } } diff --git a/src/feed.rs b/src/feed.rs index bc751f3..f2cab5a 100644 --- a/src/feed.rs +++ b/src/feed.rs @@ -1211,6 +1211,7 @@ mod tests { use std::io::{Read as IoRead, Write as IoWrite}; use std::net::TcpListener; + use std::sync::mpsc; use std::thread; /// Spawn a one-shot HTTP stub on 127.0.0.1 that writes `response_bytes` @@ -1348,4 +1349,72 @@ mod tests { msg ); } + + /// Like `spawn_stub_server`, but also captures the raw bytes of the + /// first request the stub receives and ships them back through an mpsc + /// channel. Lets a test assert on what `extract_article` actually puts + /// on the wire (headers, path, etc.) — vs. just what it returns. + fn spawn_capturing_stub_server(response_bytes: Vec) -> (String, mpsc::Receiver>) { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind 127.0.0.1"); + let port = listener.local_addr().expect("local_addr").port(); + let (tx, rx) = mpsc::channel(); + thread::spawn(move || { + if let Ok((mut stream, _)) = listener.accept() { + // 8 KiB is plenty for a GET request with our headers; we + // only need enough to inspect the header block. + let mut buf = vec![0u8; 8192]; + let n = stream.read(&mut buf).unwrap_or(0); + buf.truncate(n); + let _ = tx.send(buf); + let _ = stream.write_all(&response_bytes); + let _ = stream.flush(); + } + }); + (format!("http://127.0.0.1:{}/", port), rx) + } + + /// Regression lock for the "no per-feed Authorization on extraction" rule. + /// The function signature deliberately omits a headers parameter, but + /// nothing today fails loudly if a future refactor adds + /// `Authorization` to the static header list. This test inspects the + /// raw bytes on the wire and asserts that no `Authorization` header + /// is sent — third-party article hosts must not see per-feed bearer + /// tokens / cookies. + #[test] + fn test_extract_article_does_not_send_authorization_header() { + let body = fixture_article_html(); + let resp = format!( + "HTTP/1.1 200 OK\r\n\ + Content-Type: text/html; charset=utf-8\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n{}", + body.len(), + body + ); + let (url, captured) = spawn_capturing_stub_server(resp.into_bytes()); + let client = reqwest::blocking::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + .unwrap(); + // Outcome is irrelevant — we only care about what hit the wire. + let _ = extract_article(&url, &client, None); + let raw = captured + .recv_timeout(Duration::from_secs(5)) + .expect("stub must have captured a request"); + let request = String::from_utf8_lossy(&raw); + // Header-name comparison is case-insensitive (RFC 7230 §3.2). + let lower = request.to_ascii_lowercase(); + assert!( + !lower.contains("\r\nauthorization:") && !lower.starts_with("authorization:"), + "extract_article must NOT forward an Authorization header — saw it in request:\n{}", + request + ); + // Sanity: confirm we captured a real request, not just an empty read. + assert!( + lower.starts_with("get /"), + "expected a GET request, got: {}", + request + ); + } } diff --git a/src/tui.rs b/src/tui.rs index 3039ef0..3952e06 100644 --- a/src/tui.rs +++ b/src/tui.rs @@ -659,7 +659,9 @@ fn run_app(terminal: &mut Terminal, app: &mut App) -> Result<()> // so a long backlog can't shrink the effective pool. The matching // saturating sub in the worker's exit path makes a spurious wake-up // (worker actually still alive, finishes later) safe — at worst we - // transiently overcommit by one slot, never underflow. + // transiently overcommit by up to `EXTRACTION_MAX_INFLIGHT` slots + // (every "released-but-still-alive" worker contributes one slot of + // over-commit until it actually exits), never underflow. let pruned = app .prune_stale_pending_extractions(Duration::from_secs(app.config.network.http_timeout)); for _ in 0..pruned { @@ -925,6 +927,94 @@ mod tests { assert!(app.pending_extraction_requests[0].safe_redirects); } + /// Regression lock: `enqueue_fulltext_for_new` must short-circuit on + /// items that already have ANY extraction state (Ready / Failed / + /// Pending). The contract lives in `request_extraction`'s + /// `contains_key` guard, but it's load-bearing — without it, a refresh + /// would re-extract every Ready item on every cycle (burning bandwidth, + /// thrashing the LRU) and double-queue Pending items for in-flight + /// workers. + #[test] + fn test_enqueue_fulltext_for_new_short_circuits_on_existing_state() { + use crate::app::ExtractionState; + let mut app = App::new(); + let feed = build_feed_with_items( + "https://ex.com/feed.xml", + vec![ + ("Cached-Ready", Some("https://ex.com/ready")), + ("Cached-Failed", Some("https://ex.com/failed")), + ("Cached-Pending", Some("https://ex.com/pending")), + ("Fresh", Some("https://ex.com/fresh")), + ], + ); + app.fulltext_feeds.insert(feed.url.clone()); + + let id_ready = crate::app::make_item_id(&feed, &feed.items[0]); + let id_failed = crate::app::make_item_id(&feed, &feed.items[1]); + let id_pending = crate::app::make_item_id(&feed, &feed.items[2]); + + // Drive Ready: Pending → record_extraction_result(Ok). + app.request_extraction( + id_ready.clone(), + "https://ex.com/ready".to_string(), + true, + false, + ); + let gen = match app.extracted.get(&id_ready) { + Some(ExtractionState::Pending { generation, .. }) => *generation, + other => panic!("expected Pending, got {:?}", other), + }; + app.record_extraction_result( + id_ready, + gen, + Ok(ExtractedArticle { + title: "T".into(), + plain_text: "body".into(), + byline: None, + site_name: None, + source_url: "https://ex.com/ready".into(), + }), + ); + + // Drive Failed: Pending → record_extraction_result(Err). + app.request_extraction( + id_failed.clone(), + "https://ex.com/failed".to_string(), + true, + false, + ); + let gen = match app.extracted.get(&id_failed) { + Some(ExtractionState::Pending { generation, .. }) => *generation, + other => panic!("expected Pending, got {:?}", other), + }; + app.record_extraction_result(id_failed, gen, Err(anyhow::anyhow!("boom"))); + + // Drive Pending: just leave it. + app.request_extraction( + id_pending, + "https://ex.com/pending".to_string(), + true, + false, + ); + + // Clear the queue so we only observe what the next call adds. + app.pending_extraction_requests.clear(); + + // Re-observe ALL four items. Only the Fresh one should be queued — + // the other three already have state and must be skipped. + enqueue_fulltext_for_new(&mut app, &feed, &[0, 1, 2, 3]); + + assert_eq!( + app.pending_extraction_requests.len(), + 1, + "items with existing extraction state must not be re-queued" + ); + assert_eq!( + app.pending_extraction_requests[0].url, "https://ex.com/fresh", + "only the fresh item should be queued" + ); + } + #[test] fn test_enqueue_fulltext_for_new_drops_unsafe_urls() { let mut app = App::new();