fix(discourse): handle modern Discourse API response shapes

blightbow · claude · blightbow · commit 6f971a873899 · 2026-04-09T21:34:10.000-04:00
Three Discourse-specific fixes prompted by a 1.0.0 user report:

1. tags shape drift (Discourse PR #36678, merged 2026-02-02 in
   topic_tags_mixin.rb): the `tags` field globally flipped from
   list[str] to list[{id, name, slug}] across all topic-bearing
   serializers. The "expected str instance, dict found" crash from
   ", ".join(tags) is fixed by extracting `name` per item, and we
   accept both shapes for older instances.

2. search title was always wrong (latent since day one):
   SearchPostSerializer has never emitted topic_title — we silently
   fell back to post.get('name'), which is the user's display name,
   so search results were headlined with the poster's name instead
   of the topic title. Look up the title from the parallel topics[]
   array via topic_id (the topic_map already exists for this).

3. mega-topic detection: topics with >=10000 posts omit
   post_stream.stream and emit isMegaTopic+lastId instead. We
   silently surfaced only the first 20 posts. Detect the flag and
   prepend a clear truncation note so callers know the rest is
   unavailable via the topic endpoint.

Also drop a dead read of topics[].views in search formatting —
SearchTopicListItemSerializer never emits this field, so it was
always None and silently hidden by the if-guard.

Tests: rebuild SAMPLE_SEARCH_RESPONSE to mirror Discourse's actual
shape (no topic_title on posts, no views on topics, name field is
the user display name). Add seven regressions: tags as legacy
strings, tags as modern dicts, missing tags, mega-topic note,
no note on normal topics, title-from-topics-array, and
unknown-topic-id fallback. 850 pass, ty clean.

Bump to 1.0.1.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/manifest.json b/manifest.json
@@ -2,7 +2,7 @@
   "manifest_version": "0.4",
   "name": "parkour-mcp",
   "display_name": "Parkour",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "A content exploration toolkit that helps LLMs surface high signal, unsummarized web content.",
   "long_description": "MCP server providing a content exploration and research synthesis pipeline. Uses clean first-party APIs to surface and explore web content without summarization. Integrates Kagi, Semantic Scholar, arXiv, deps.dev, IETF, GitHub, MediaWiki, Reddit, Discourse, and DOI resolution APIs into a unified tool suite.\n\n**Note:** The web_fetch_js tool (for JavaScript-rendered pages) requires a one-time browser install. Run `playwright install webkit` from any terminal. The other 11 tools work without this step.",
   "homepage": "https://github.com/blightbow/parkour-mcp",
diff --git a/parkour_mcp/discourse.py b/parkour_mcp/discourse.py
@@ -325,11 +325,28 @@ def _format_topic(data: dict, all_posts: list[dict]) -> tuple[str, str]:
         f"{views} views",
         _format_timestamp(created),
     ]
-    tags = data.get("tags") or []
-    if tags:
-        meta.append("tags: " + ", ".join(tags))
+    # Discourse's `tags` field is a list of {id, name, slug} dicts on modern
+    # instances; older instances returned bare strings. Accept both.
+    tags_raw = data.get("tags") or []
+    tag_names = [
+        t if isinstance(t, str) else t.get("name", "")
+        for t in tags_raw
+    ]
+    tag_names = [n for n in tag_names if n]
+    if tag_names:
+        meta.append("tags: " + ", ".join(tag_names))
     parts.append(" | ".join(meta) + "\n")
 
+    # Mega-topic warning: topics with >=10000 posts omit `post_stream.stream`
+    # entirely and emit `isMegaTopic: true, lastId: <int>` instead. We can't
+    # batch-fetch the remaining posts, so only the inline ~20 are included.
+    if data.get("post_stream", {}).get("isMegaTopic"):
+        parts.append(
+            f"> **Note:** This is a mega-topic ({posts_count} posts total). "
+            f"Only the first {len(all_posts)} posts are shown — Discourse "
+            f"does not expose the full post stream for very large topics.\n"
+        )
+
     # Posts
     for post in all_posts:
         post_num = post.get("post_number", 0)
@@ -465,25 +482,22 @@ def _format_search_results(data: dict, base_url: str, limit: int = 10) -> str:
     parts: list[str] = []
 
     if posts:
-        # Build topic_id → topic info map for enrichment
+        # Build topic_id → topic info map for enrichment.  SearchPostSerializer
+        # does NOT emit topic_title — the title must come from the parallel
+        # topics[] array.
         topic_map = {t["id"]: t for t in topics}
 
         for i, post in enumerate(posts, 1):
             topic_id = post.get("topic_id", 0)
-            topic_title = post.get("topic_title", post.get("name", "Untitled"))
+            topic_info = topic_map.get(topic_id, {})
+            topic_title = topic_info.get("title", "Untitled")
             username = post.get("username", "unknown")
             post_num = post.get("post_number", 1)
             blurb = post.get("blurb", "")
-
-            topic_info = topic_map.get(topic_id, {})
             reply_count = topic_info.get("reply_count", 0)
-            views = topic_info.get("views")
 
             parts.append(f"{i}. **{topic_title}**")
-            stats = f"{reply_count} replies"
-            if views:
-                stats += f", {views} views"
-            parts.append(f"   @{username} (post #{post_num}) | {stats}")
+            parts.append(f"   @{username} (post #{post_num}) | {reply_count} replies")
             parts.append(f"   {base_url}/t/{topic_id}/{post_num}")
             if blurb:
                 parts.append(f"   {blurb[:200]}")
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "parkour-mcp"
-version = "1.0.0"
+version = "1.0.1"
 description = "A content exploration toolkit that helps LLMs surface high signal, unsummarized web content."
 readme = "README.md"
 license = "MIT"
diff --git a/tests/test_discourse.py b/tests/test_discourse.py
@@ -60,7 +60,10 @@ def _make_post(
     "reply_count": 1,
     "like_count": 5,
     "category_id": 7,
-    "tags": ["test", "meta"],
+    "tags": [
+        {"id": 1, "name": "test", "slug": "test"},
+        {"id": 2, "name": "meta", "slug": "meta"},
+    ],
     "created_at": "2026-04-01T10:00:00.000Z",
     "slug": "test-topic-title",
     "chunk_size": 20,
@@ -71,27 +74,31 @@ def _make_post(
 }
 
 SAMPLE_SEARCH_RESPONSE = {
+    # Mirrors meta.discourse.org's actual /search.json shape: SearchPostSerializer
+    # does NOT emit topic_title (the post's `name` is the user's display name),
+    # and SearchTopicListItemSerializer does NOT emit `views`. Title and reply
+    # counts must come from the parallel topics[] array.
     "posts": [
         {
             "id": 2001,
             "topic_id": 100,
-            "topic_title": "How to install Discourse",
             "username": "admin",
+            "name": "Admin User",
             "post_number": 1,
             "blurb": "Follow these steps to install Discourse on your server...",
         },
         {
             "id": 2002,
             "topic_id": 200,
-            "topic_title": "Discourse plugin development",
             "username": "dev",
+            "name": "Dev User",
             "post_number": 3,
             "blurb": "Creating plugins requires understanding the Ember frontend...",
         },
     ],
     "topics": [
-        {"id": 100, "title": "How to install Discourse", "reply_count": 5, "views": 100},
-        {"id": 200, "title": "Discourse plugin development", "reply_count": 12, "views": 250},
+        {"id": 100, "title": "How to install Discourse", "reply_count": 5},
+        {"id": 200, "title": "Discourse plugin development", "reply_count": 12},
     ],
 }
 
@@ -232,6 +239,51 @@ def test_post_content_cleaned(self):
         assert "upload://" not in md
         assert "[image]" in md
 
+    def test_tags_legacy_string_shape(self):
+        """Older Discourse instances return tags as a list of bare strings."""
+        topic = {**SAMPLE_TOPIC_RESPONSE, "tags": ["alpha", "beta"]}
+        _, md = _format_topic(topic, SAMPLE_POSTS)
+        assert "tags: alpha, beta" in md
+
+    def test_tags_modern_dict_shape(self):
+        """Modern Discourse returns tags as {id, name, slug} dicts."""
+        topic = {
+            **SAMPLE_TOPIC_RESPONSE,
+            "tags": [
+                {"id": 10, "name": "alpha", "slug": "alpha"},
+                {"id": 20, "name": "beta", "slug": "beta"},
+            ],
+        }
+        _, md = _format_topic(topic, SAMPLE_POSTS)
+        assert "tags: alpha, beta" in md
+
+    def test_tags_missing(self):
+        topic = {k: v for k, v in SAMPLE_TOPIC_RESPONSE.items() if k != "tags"}
+        _, md = _format_topic(topic, SAMPLE_POSTS)
+        assert "tags:" not in md
+
+    def test_mega_topic_emits_truncation_note(self):
+        """Topics with >=10k posts omit post_stream.stream and set
+        isMegaTopic: true. We can only surface the inline posts and must
+        clearly tell the caller the rest is unavailable."""
+        topic = {
+            **SAMPLE_TOPIC_RESPONSE,
+            "posts_count": 12345,
+            "post_stream": {
+                "isMegaTopic": True,
+                "lastId": 99999,
+                "posts": SAMPLE_POSTS,
+            },
+        }
+        _, md = _format_topic(topic, SAMPLE_POSTS)
+        assert "mega-topic" in md
+        assert "12345 posts total" in md
+        assert f"first {len(SAMPLE_POSTS)} posts" in md
+
+    def test_normal_topic_no_mega_note(self):
+        _, md = _format_topic(SAMPLE_TOPIC_RESPONSE, SAMPLE_POSTS)
+        assert "mega-topic" not in md
+
 
 # ---------------------------------------------------------------------------
 # _split_by_posts
@@ -315,6 +367,31 @@ def test_limit(self):
         assert "How to install Discourse" in result
         assert "Discourse plugin development" not in result
 
+    def test_title_comes_from_topics_array(self):
+        """Regression: SearchPostSerializer never emits topic_title.
+
+        Earlier code fell back to post.get('name'), which is the user's
+        display name (e.g. 'Sam Saffron'), not the topic title. The title
+        must be looked up from the parallel topics[] array via topic_id.
+        """
+        result = _format_search_results(SAMPLE_SEARCH_RESPONSE, BASE_URL)
+        # Real titles from topics[] are present
+        assert "How to install Discourse" in result
+        assert "Discourse plugin development" in result
+        # User display names from posts[].name must NOT leak into headlines
+        assert "**Admin User**" not in result
+        assert "**Dev User**" not in result
+
+    def test_unknown_topic_id_falls_back_to_untitled(self):
+        """If a search post references a topic_id missing from topics[],
+        the title falls back to 'Untitled' rather than crashing."""
+        data = {
+            "posts": [{"id": 1, "topic_id": 999, "username": "u", "post_number": 1, "blurb": ""}],
+            "topics": [],
+        }
+        result = _format_search_results(data, BASE_URL)
+        assert "Untitled" in result
+
 
 # ---------------------------------------------------------------------------
 # _format_latest