diff --git a/qiskit-docs-mcp-server/README.md b/qiskit-docs-mcp-server/README.md index 8fef1ba..e358052 100644 --- a/qiskit-docs-mcp-server/README.md +++ b/qiskit-docs-mcp-server/README.md @@ -12,12 +12,13 @@ The Qiskit Documentation MCP Server provides AI assistants and agents with seaml ### Key Features -- **📚 Complete Documentation Access**: Query all 16 Qiskit SDK modules, 6 addon packages, and 30+ implementation guides +- **📚 Complete Documentation Access**: Query all Qiskit SDK modules, addon packages, API references, guides, and tutorials +- **🔄 Dynamic Content Discovery**: Automatically discovers available documentation from the live sitemap — no manual updates needed when new content is published - **📖 Implementation Guides**: Access best practices for optimization, error mitigation, dynamic circuits, and more - **🔍 Smart Search**: Search across the entire Qiskit documentation with fuzzy matching - **🎯 No Authentication Required**: Public documentation access without API tokens - **📝 Markdown Output**: Clean, formatted documentation ready for AI consumption -- **⚡ Fast Retrieval**: Efficient HTTP-based documentation fetching with configurable timeouts +- **⚡ Fast Retrieval**: Efficient HTTP-based documentation fetching with TTL caching and configurable timeouts ## Components @@ -33,15 +34,25 @@ The server implements three tools for documentation access: ### Resources -The server provides four resources for listing available documentation: +The server provides six resources for listing available documentation. Content lists for modules, addons, guides, tutorials, and API packages are **dynamically discovered** from the documentation sitemap and cached, with hardcoded fallback values used when the sitemap is unreachable. 
| Resource URI | Description | |--------------|-------------| -| `qiskit-docs://modules` | List of all Qiskit SDK modules with descriptions | -| `qiskit-docs://addons` | List of Qiskit addon modules and tutorials | +| `qiskit-docs://modules` | List of all Qiskit SDK modules with URL paths | +| `qiskit-docs://addons` | List of Qiskit addon packages with URL paths | | `qiskit-docs://guides` | List of implementation guides and best practices | +| `qiskit-docs://tutorials` | List of Qiskit tutorials with URL paths | +| `qiskit-docs://api-packages` | List of API packages (runtime, transpiler, REST APIs, etc.) | | `qiskit-docs://error-codes` | List of Qiskit error code categories | +### Resource Templates + +| Resource URI | Description | +|--------------|-------------| +| `qiskit-docs://modules/{module_name}` | Documentation for a specific SDK module | +| `qiskit-docs://guides/{guide_name}` | A specific implementation guide | +| `qiskit-docs://addons/{addon_name}` | Documentation for a specific addon package | + ## Prerequisites - Python 3.10 or higher @@ -302,12 +313,40 @@ All responses include rich metadata: } ``` +### Dynamic Sitemap Discovery + +Resource lists (modules, addons, guides, tutorials, API packages) are automatically discovered from the live documentation sitemap at startup. This means the server adapts to new content without code changes. If the sitemap is unreachable, the server falls back to hardcoded values in `constants.py`. + +To update the hardcoded fallback values from the live sitemap: + +```bash +cd qiskit-docs-mcp-server +uv run python scripts/update_fallback_constants.py +``` + +This prints updated constant lists that can be copied into `constants.py`. + ### HTML to Markdown Conversion Documentation is automatically converted from HTML to clean Markdown format, optimized for AI consumption and human readability. 
## Development +### Project Structure + +``` +src/qiskit_docs_mcp_server/ +├── server.py # MCP server definition (tools, resources, prompts) +├── data_fetcher.py # Business logic for fetching and processing documentation +├── http.py # HTTP infrastructure: client management, caching, retries +├── sitemap.py # Dynamic sitemap discovery and page classification +├── html_processing.py # HTML content extraction and markdown conversion +└── constants.py # Configuration constants and hardcoded fallback values + +scripts/ +└── update_fallback_constants.py # Regenerate fallback values from live sitemap +``` + ### Running Tests ```bash diff --git a/qiskit-docs-mcp-server/pyproject.toml b/qiskit-docs-mcp-server/pyproject.toml index 02da98a..c26c0cc 100644 --- a/qiskit-docs-mcp-server/pyproject.toml +++ b/qiskit-docs-mcp-server/pyproject.toml @@ -6,6 +6,7 @@ readme = "README.md" requires-python = ">=3.10,<3.15" dependencies = [ "beautifulsoup4>=4.12.0", + "defusedxml>=0.7.1", "fastmcp>=2.8.1,<3", "html2text>=2020.1.16", "httpx>=0.28.1", @@ -54,6 +55,7 @@ dev = [ "mypy>=1.15.0", "pre-commit>=4.1.0", "ruff>=0.9.4", + "types-defusedxml>=0.7.0", ] test = [ "pytest>=7.4.0", diff --git a/qiskit-docs-mcp-server/scripts/update_fallback_constants.py b/qiskit-docs-mcp-server/scripts/update_fallback_constants.py new file mode 100755 index 0000000..73af818 --- /dev/null +++ b/qiskit-docs-mcp-server/scripts/update_fallback_constants.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# This code is part of Qiskit. +# +# (C) Copyright IBM 2026. +# +# This code is licensed under the Apache License, Version 2.0. You may +# obtain a copy of this license in the LICENSE.txt file in the root directory +# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +# +# Any modifications or derivative works of this code must retain this +# copyright notice, and modified files need to carry a notice indicating +# that they have been altered from the originals. 
+ +"""Fetch the live sitemap and print updated fallback constant lists. + +Run this script periodically to keep the hardcoded fallback values in +``constants.py`` in sync with the live Qiskit documentation sitemap. + +Usage: + cd qiskit-docs-mcp-server + uv run python scripts/update_fallback_constants.py +""" + +from __future__ import annotations + +import sys + +import httpx + + +sys.path.insert(0, "src") +from qiskit_docs_mcp_server.constants import SITEMAP_URL +from qiskit_docs_mcp_server.sitemap import _parse_sitemap_xml + + +def _format_list(name: str, values: list[str]) -> str: + """Format a Python list constant.""" + items = ",\n".join(f' "{v}"' for v in values) + return f"{name}: list[str] = [\n{items},\n]" + + +def main() -> None: + print(f"Fetching sitemap from {SITEMAP_URL} ...") + response = httpx.get(SITEMAP_URL, follow_redirects=True, timeout=30.0) + response.raise_for_status() + xml_text = response.text + + pages = _parse_sitemap_xml(xml_text) + + print("\nDiscovered:") + for category, items in pages.items(): + print(f" {category}: {len(items)} entries") + + print("\n" + "=" * 72) + print("Copy the following into constants.py (fallback lists section):") + print("=" * 72 + "\n") + + mapping = { + "AVAILABLE_MODULES": "modules", + "AVAILABLE_ADDONS": "addons", + "AVAILABLE_API_PACKAGES": "api_packages", + "AVAILABLE_GUIDES": "guides", + "AVAILABLE_TUTORIALS": "tutorials", + } + + for const_name, key in mapping.items(): + print(_format_list(const_name, pages[key])) + print() + + +if __name__ == "__main__": + main() diff --git a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/constants.py b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/constants.py index f23ed17..b06ec26 100644 --- a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/constants.py +++ b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/constants.py @@ -42,6 +42,12 @@ def _get_env_float(name: str, default: float) -> float: QISKIT_DOCS_BASE = os.getenv("QISKIT_DOCS_BASE", 
"https://quantum.cloud.ibm.com/docs/") BASE_URL = os.getenv("QISKIT_SEARCH_BASE_URL", "https://quantum.cloud.ibm.com/") +# Sitemap URL for dynamic page discovery +SITEMAP_URL = os.getenv( + "QISKIT_DOCS_SITEMAP_URL", + "https://quantum.cloud.ibm.com/docs/sitemap-0.xml", +) + # Error code registry ERROR_CODE_CATEGORIES = { "1XXX": "Validation, transpilation, backend availability, authorization, and job management", @@ -60,84 +66,283 @@ def _get_env_float(name: str, default: float) -> float: CACHE_TTL = _get_env_float("QISKIT_DOCS_CACHE_TTL", 3600.0) SEARCH_CACHE_TTL = _get_env_float("QISKIT_SEARCH_CACHE_TTL", 300.0) # 5 min default -# Qiskit modules and their documentation paths -AVAILABLE_MODULES = { +# --------------------------------------------------------------------------- +# Fallback lists — used when sitemap discovery is unavailable +# --------------------------------------------------------------------------- + +# Qiskit SDK modules and their documentation paths +AVAILABLE_MODULES: list[str] = [ # Circuit construction - "circuit": "Quantum circuit construction and manipulation (QuantumCircuit, gates, registers)", + "circuit", + "circuit_annotation", + "circuit_classical", + "circuit_library", + "circuit_random", + "circuit_singleton", # Quantum information - "quantum_info": "Quantum information utilities (states, operators, channels, measures)", + "quantum_info", # Transpilation - "transpiler": "Circuit transpilation and optimization for target hardware", - "synthesis": "Circuit synthesis algorithms (unitary, Clifford, linear functions)", - "dagcircuit": "Directed acyclic graph (DAG) representation of quantum circuits", - "passmanager": "Transpiler pass manager framework for custom transpilation pipelines", - "converters": "Circuit format converters and interoperability utilities", - "compiler": "High-level compilation routines (transpile shortcut)", + "transpiler", + "transpiler_passes", + "transpiler_plugins", + "transpiler_preset", + 
"transpiler_synthesis_plugins", + "synthesis", + "dagcircuit", + "passmanager", + "converters", + "compiler", # Primitives and providers - "primitives": "Sampler and Estimator primitives for quantum execution", - "providers": "Backend providers and job management interfaces", + "primitives", + "providers", + "providers_basic_provider", + "providers_fake_provider", # Results and visualization - "result": "Quantum job result handling and analysis", - "visualization": "Circuit and result visualization tools", + "result", + "visualization", # Serialization - "qasm2": "OpenQASM 2.0 parsing and generation", - "qasm3": "OpenQASM 3.0 parsing and generation", - "qpy": "Qiskit Python serialization format (QPY) for circuit persistence", + "qasm2", + "qasm3", + "qpy", # Utilities - "utils": "General utility functions and helpers", - "exceptions": "Qiskit exception classes and error hierarchy", -} + "utils", + "exceptions", +] -AVAILABLE_ADDONS = { - "aqc-tensor": "Approximate Quantum Compiler with tensor network techniques", - "cutting": "Circuit cutting to run large circuits on smaller devices", - "mpf": "Multi-product formulas for Hamiltonian simulation", - "obp": "Operator backpropagation for expectation value estimation", - "sqd": "Sample-based Quantum Diagonalization for chemistry and optimization", - "utils": "Shared utilities for Qiskit addon packages", -} +AVAILABLE_ADDONS: list[str] = [ + "aqc-tensor", + "cutting", + "mpf", + "obp", + "sqd", + "utils", +] -AVAILABLE_GUIDES = { - # Getting started - "quick-start": "Get started with Qiskit — create and run your first circuit", - # Circuit building - "construct-circuits": "Build and manipulate quantum circuits", - # Transpilation - "transpile": "Transpile circuits for target backends", - "transpiler-stages": "Understand the six stages of the transpiler pipeline", - "transpile-with-pass-managers": "Use custom pass managers for transpilation", - "defaults-and-configuration-options": "Transpiler defaults and configuration 
options", - "circuit-transpilation-settings": "Circuit-level transpilation settings", - "qiskit-transpiler-service": "Use the Qiskit Transpiler cloud service", - # Error mitigation and suppression - "error-mitigation-and-suppression-techniques": "Overview of error mitigation and suppression techniques", - "configure-error-mitigation": "Configure error mitigation for Qiskit primitives", - "configure-error-suppression": "Configure error suppression techniques", - # Execution - "primitives": "Use Sampler and Estimator primitives for quantum execution", - "execution-modes": "Job, session, and batch execution modes", - "runtime-options-overview": "Overview of Qiskit Runtime configuration options", - "directed-execution-model": "Use the directed execution model", - # Dynamic circuits - "dynamic-circuits": "Mid-circuit measurements and classical control flow", - # Post-processing addons - "qiskit-addons-sqd": "Use Sample-based Quantum Diagonalization (SQD)", - # Qiskit Functions - circuit functions - "functions": "Overview of Qiskit Functions", - "ibm-circuit-function": "IBM Circuit Function for optimized execution", - "algorithmiq-tem": "Algorithmiq Tensor Error Mitigation (TEM)", - "qedma-qesem": "Qedma Quantum Error Suppression and Error Mitigation (QESEM)", - "q-ctrl-performance-management": "Q-CTRL Performance Management for optimized circuits", - # Qiskit Functions - application functions - "colibritd-pde": "ColibrITD PDE solver function", - "global-data-quantum-optimizer": "Global Data Quantum Optimizer function", - "qunova-chemistry": "Qunova Chemistry solver function", - "kipu-optimization": "Kipu Optimization solver function", - "q-ctrl-optimization-solver": "Q-CTRL Optimization Solver function", - "multiverse-computing-singularity": "Multiverse Computing Singularity function", - # Security and support - "secure-data": "Data security and privacy on IBM Quantum", - "support": "Getting support and help with Qiskit and IBM Quantum", -} +# Additional API packages 
beyond the core SDK and addons +AVAILABLE_API_PACKAGES: list[str] = [ + "qiskit-ibm-runtime", + "qiskit-ibm-transpiler", + "qiskit-c", + "qiskit-runtime-rest", + "quantum-system-rest", + "functions", +] + +AVAILABLE_GUIDES: list[str] = [ + "DAG-representation", + "access-groups", + "access-instances-platform-apis", + "add-job-tags", + "addons", + "ai-transpiler-passes", + "algorithmiq-tem", + "allocation-limits", + "bit-ordering", + "build-noise-models", + "c-extension-for-python", + "calibration-jobs", + "choose-execution-mode", + "circuit-library", + "circuit-transpilation-settings", + "classical-feedforward-and-control-flow", + "cloud-account-structure", + "cloud-setup", + "cloud-setup-invited", + "cloud-setup-rest-api", + "cloud-setup-untrusted", + "code-of-conduct", + "colibritd-pde", + "common-parameters", + "composer", + "compute-services", + "configure-error-mitigation", + "configure-error-suppression", + "configure-qiskit-local", + "considerations-set-up-runtime", + "construct-circuits", + "context-based-restrictions", + "create-a-provider", + "create-transpiler-plugin", + "custom-backend", + "custom-roles", + "custom-transpiler-pass", + "debug-qiskit-runtime-jobs", + "debugging-tools", + "defaults-and-configuration-options", + "directed-execution-model", + "dynamical-decoupling-pass-manager", + "error-mitigation-and-suppression-techniques", + "error-mitigation-overview", + "estimate-job-run-time", + "execute-dynamic-circuits", + "execution-modes", + "execution-modes-faq", + "execution-modes-rest-api", + "fair-share-scheduler", + "faq", + "fractional-gates", + "function-template-chemistry-workflow", + "function-template-hamiltonian-simulation", + "functions", + "get-started-with-primitives", + "global-data-quantum-optimizer", + "ha-dr", + "hello-world", + "ibm-circuit-function", + "initialize-account", + "install-c-api", + "install-qiskit", + "install-qiskit-runtime", + "install-qiskit-runtime-source", + "install-qiskit-source", + "instances", + 
"interoperate-qiskit-qasm2", + "interoperate-qiskit-qasm3", + "intro-to-patterns", + "introduction-to-qasm", + "invite-and-manage-users", + "job-limits", + "kipu-optimization", + "latest-updates", + "local-simulators", + "local-testing-mode", + "logging", + "manage-appid", + "manage-cloud-users", + "manage-cost", + "max-execution-time", + "measure-qubits", + "metapackage-migration", + "minimize-time", + "monitor-job", + "multiverse-computing-singularity", + "noise-learning", + "observability-quantum-system", + "observability-runtime-rest", + "online-lab-environments", + "open-source", + "operator-class", + "operators-overview", + "plans-overview", + "plot-quantum-states", + "primitive-input-output", + "primitives", + "primitives-examples", + "primitives-rest-api", + "processor-types", + "pulse-migration", + "q-ctrl-optimization-solver", + "q-ctrl-performance-management", + "qasm-feature-table", + "qedma-qesem", + "qiskit-1.0", + "qiskit-1.0-features", + "qiskit-1.0-installation", + "qiskit-2.0", + "qiskit-addons-sqd", + "qiskit-addons-sqd-get-started", + "qiskit-backendv1-to-v2", + "qiskit-code-assistant", + "qiskit-code-assistant-jupyterlab", + "qiskit-code-assistant-local", + "qiskit-code-assistant-openai-api", + "qiskit-code-assistant-vscode", + "qiskit-function-templates", + "qiskit-mcp-servers", + "qiskit-runtime-circuit-timing", + "qiskit-runtime-primitives", + "qiskit-sdk-version-strategy", + "qiskit-transpiler-service", + "qpu-information", + "qrmi", + "quick-start", + "quickstart-steps-org", + "qunova-chemistry", + "repetition-rate-execution", + "represent-quantum-computers", + "responsibilities", + "retired-qpus", + "run-jobs-batch", + "run-jobs-session", + "runtime-options-overview", + "save-circuits", + "save-credentials", + "save-jobs", + "secure-data", + "serverless", + "serverless-first-program", + "serverless-manage-resources", + "serverless-port-code", + "serverless-run-first-workload", + "set-optimization", + "simulate-stabilizer-circuits", + 
"simulate-with-qiskit-aer", + "simulate-with-qiskit-sdk-primitives", + "slurm-hpc-ux", + "slurm-plugin", + "specify-observables-pauli", + "specify-runtime-options", + "stretch", + "support", + "synthesize-unitary-operators", + "tools-intro", + "transpile", + "transpile-with-pass-managers", + "transpiler-plugins", + "transpiler-stages", + "upgrade-from-open", + "v2-primitives", + "view-cost", + "virtual-private-endpoints", + "visualize-circuit-timing", + "visualize-circuits", + "visualize-results", +] + +AVAILABLE_TUTORIALS: list[str] = [ + "advanced-techniques-for-qaoa", + "ai-transpiler-introduction", + "approximate-quantum-compilation-for-time-evolution", + "chsh-inequality", + "colibritd-pde", + "combine-error-mitigation-techniques", + "compilation-methods-for-hamiltonian-simulation-circuits", + "dc-hex-ising", + "depth-reduction-with-circuit-cutting", + "edc-cut-bell-pair-benchmarking", + "error-mitigation-with-qiskit-functions", + "fractional-gates", + "ghz-spacetime-codes", + "global-data-quantum-optimizer", + "grovers-algorithm", + "krylov-quantum-diagonalization", + "long-range-entanglement", + "multi-product-formula", + "nishimori-phase-transition", + "operator-back-propagation", + "pauli-correlation-encoding-for-qaoa", + "periodic-boundary-conditions-with-circuit-cutting", + "probabilistic-error-amplification", + "projected-quantum-kernels", + "qedma-2d-ising-with-qesem", + "quantum-approximate-optimization-algorithm", + "quantum-kernel-training", + "quantum-phase-estimation-qctrl", + "qunova-hivqe", + "readout-error-mitigation-sampler", + "real-time-benchmarking-for-qubit-selection", + "repetition-codes", + "sample-based-krylov-quantum-diagonalization", + "sample-based-quantum-diagonalization", + "shors-algorithm", + "simulate-kicked-ising-tem", + "sml-classification", + "solve-higher-order-binary-optimization-problems-with-q-ctrls-optimization-solver", + "solve-market-split-problem-with-iskay-quantum-optimizer", + "spin-chain-vqe", + 
"transpilation-optimizations-with-sabre", + "transverse-field-ising-model", + "wire-cutting", +] SEARCH_PATH = "endpoints-docs-learning/api/search" diff --git a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/data_fetcher.py b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/data_fetcher.py index 4692cba..9f1c72f 100644 --- a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/data_fetcher.py +++ b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/data_fetcher.py @@ -10,181 +10,39 @@ # copyright notice, and modified files need to carry a notice indicating # that they have been altered from the originals. -import asyncio +"""Business logic for fetching and processing Qiskit documentation.""" + import logging import re -import time -from collections import OrderedDict from datetime import datetime, timezone from typing import Any from urllib.parse import quote, urlparse -import html2text -import httpx from bs4 import BeautifulSoup from qiskit_docs_mcp_server.constants import ( AVAILABLE_ADDONS, + AVAILABLE_API_PACKAGES, AVAILABLE_GUIDES, AVAILABLE_MODULES, + AVAILABLE_TUTORIALS, BASE_URL, - CACHE_TTL, ERROR_CODE_CATEGORIES, - HTTP_TIMEOUT, QISKIT_DOCS_BASE, - SEARCH_CACHE_TTL, SEARCH_PATH, ) +from qiskit_docs_mcp_server.html_processing import ( + _strip_html_tags, + convert_html_to_markdown, +) +from qiskit_docs_mcp_server.http import fetch_text, fetch_text_json +from qiskit_docs_mcp_server.sitemap import get_sitemap_pages logger = logging.getLogger(__name__) -# Allowed hostname for URL validation (derived from configurable QISKIT_DOCS_BASE) _ALLOWED_HOST = urlparse(QISKIT_DOCS_BASE).netloc -# Retry configuration for transient HTTP failures -_MAX_RETRIES = 2 # Total attempts (1 initial + 1 retry) -_RETRY_DELAY = 1.0 # Seconds between retries - - -class _TTLCache: - """Simple in-memory cache with TTL and LRU eviction.""" - - def __init__(self, ttl: float = 3600.0, max_size: int = 128): - self._ttl = ttl - self._max_size = max_size - self._cache: OrderedDict[str, 
tuple[float, Any]] = OrderedDict() - - def get(self, key: str) -> Any | None: - if key in self._cache: - timestamp, value = self._cache[key] - if time.monotonic() - timestamp < self._ttl: - self._cache.move_to_end(key) # LRU touch - return value - del self._cache[key] - return None - - def set(self, key: str, value: Any) -> None: - if key in self._cache: - del self._cache[key] - elif len(self._cache) >= self._max_size: - self._cache.popitem(last=False) # Evict LRU — O(1) - self._cache[key] = (time.monotonic(), value) - - def clear(self) -> None: - self._cache.clear() - - -_text_cache = _TTLCache(ttl=CACHE_TTL) -_json_cache = _TTLCache(ttl=SEARCH_CACHE_TTL) - -_client_holder: dict[str, httpx.AsyncClient] = {} - - -def set_http_client(client: httpx.AsyncClient) -> None: - """Set the shared HTTP client (called by server lifespan).""" - _client_holder["client"] = client - - -def clear_http_client() -> None: - """Clear the shared HTTP client (called on server shutdown).""" - _client_holder.clear() - - -def _get_http_client() -> httpx.AsyncClient: - """Get or create a shared HTTP client.""" - client = _client_holder.get("client") - if client is None or client.is_closed: - client = httpx.AsyncClient(timeout=HTTP_TIMEOUT, follow_redirects=True) - _client_holder["client"] = client - return client - - -def _strip_html_tags(text: str) -> str: - """Strip HTML tags from a string. - - Args: - text: String potentially containing HTML tags - - Returns: - String with all HTML tags removed - """ - return re.sub(r"<[^>]+>", "", text) - - -def extract_main_content(html: str) -> str: - """Extract main content from HTML, removing navigation chrome. - - Strips nav, header, footer, aside elements and ARIA-role navigation, - then returns the
<main>, <article>, or role='main' content. Falls back - to <body> (with chrome removed) if no semantic main content is found. - - Args: - html: Full HTML page content - - Returns: - HTML string with only the main content - """ - soup = BeautifulSoup(html, "html.parser") - - # Remove structural chrome elements - for tag_name in ["nav", "header", "footer", "aside"]: - for element in soup.find_all(tag_name): - element.decompose() - - # Remove ARIA-role navigation elements - for role in ["navigation", "banner", "contentinfo", "complementary"]: - for element in soup.find_all(attrs={"role": role}): - element.decompose() - - # Remove skip-to-content links - for element in soup.find_all("a", class_=lambda c: c and "skip" in c.lower()): - element.decompose() - for element in soup.find_all( - "a", - string=lambda s: s and "skip to" in s.lower(), # type: ignore[call-overload] - ): - element.decompose() - - # Return the best semantic container - main_content = soup.find("main") - if main_content: - return str(main_content) - - article = soup.find("article") - if article: - return str(article) - - main_role = soup.find(attrs={"role": "main"}) - if main_role: - return str(main_role) - - body = soup.find("body") - if body: - return str(body) - - return str(soup) - - -def convert_html_to_markdown(html: str) -> str: - """Convert HTML content to Markdown format. - - Strips navigation chrome (header, footer, nav, aside) before conversion - to produce cleaner markdown output. - - Args: - html: HTML content string - - Returns: - Markdown formatted content - """ - content_html = extract_main_content(html) - h = html2text.HTML2Text() - h.ignore_links = False - h.body_width = 0 - h.ignore_images = False - return h.handle(content_html) - def _truncate_content(content: str, max_length: int = 20000, offset: int = 0) -> dict[str, Any]: """Truncate content with pagination metadata. 
@@ -275,103 +133,6 @@ def _resolve_url(url: str) -> str: return f"{base}/{path}" -async def _fetch_with_retry(url: str) -> httpx.Response | None: - """Fetch a URL with retry for transient errors (5xx, timeouts). - - Args: - url: The URL to fetch - - Returns: - The httpx Response on success, or None if all attempts fail - """ - client = _get_http_client() - last_error: Exception | None = None - for attempt in range(_MAX_RETRIES): - try: - response = await client.get(url, follow_redirects=True) - response.raise_for_status() - return response - except httpx.TimeoutException as e: # noqa: PERF203 - last_error = e - if attempt < _MAX_RETRIES - 1: - logger.warning( - "Timeout fetching %s (attempt %d), retrying...", - url, - attempt + 1, - ) - await asyncio.sleep(_RETRY_DELAY) - continue - except httpx.HTTPStatusError as e: - last_error = e - if attempt < _MAX_RETRIES - 1 and e.response.status_code >= 500: - logger.warning( - "Server error %d fetching %s (attempt %d), retrying...", - e.response.status_code, - url, - attempt + 1, - ) - await asyncio.sleep(_RETRY_DELAY) - continue - break # 4xx errors — don't retry - except httpx.HTTPError as e: - logger.error("Failed to fetch %s: %s", url, e) - return None - except Exception as e: - logger.error("Unexpected error fetching %s: %s", url, e) - return None - - logger.error("Failed to fetch %s after %d attempts: %s", url, _MAX_RETRIES, last_error) - return None - - -async def fetch_text(url: str) -> str | None: - """Fetch text content from a URL using httpx. - - Retries on transient errors (5xx status codes and timeouts). 
- - Args: - url: The URL to fetch - - Returns: - The text content of the page, or None if fetch fails - """ - cached: str | None = _text_cache.get(url) - if cached is not None: - return cached - - response = await _fetch_with_retry(url) - if response is None: - return None - - result = response.text - _text_cache.set(url, result) - return result - - -async def fetch_text_json(url: str) -> list[dict[str, Any]] | None: - """Fetch JSON content from a URL using httpx. - - Retries on transient errors (5xx status codes and timeouts). - - Args: - url: The URL to fetch - - Returns: - The JSON content as a list of dicts, or None if fetch fails - """ - cached: list[dict[str, Any]] | None = _json_cache.get(url) - if cached is not None: - return cached - - response = await _fetch_with_retry(url) - if response is None: - return None - - result: list[dict[str, Any]] = response.json() - _json_cache.set(url, result) - return result - - async def get_page_docs(url: str, max_length: int = 20000, offset: int = 0) -> dict[str, Any]: """Fetch any Qiskit documentation page and return as markdown. @@ -583,53 +344,113 @@ async def lookup_error_code(code: str) -> dict[str, Any]: } -def get_list_of_modules() -> dict[str, Any]: - """Get list of all Qiskit SDK modules with descriptions and URL paths.""" +async def get_list_of_modules() -> dict[str, Any]: + """Get list of all Qiskit SDK modules with URL paths. + + Tries dynamic sitemap discovery first, falls back to hardcoded constants. 
+ """ base = QISKIT_DOCS_BASE.rstrip("/") + sitemap = get_sitemap_pages() + names = sitemap["modules"] if sitemap else AVAILABLE_MODULES return { "status": "success", + "source": "sitemap" if sitemap else "fallback", "modules": [ { "name": name, - "description": desc, "url_path": f"api/qiskit/{name}", "full_url": f"{base}/api/qiskit/{name}", } - for name, desc in AVAILABLE_MODULES.items() + for name in names ], } -def get_list_of_addons() -> dict[str, Any]: - """Get list of all Qiskit addon modules with descriptions and URL paths.""" +async def get_list_of_addons() -> dict[str, Any]: + """Get list of all Qiskit addon packages with URL paths. + + Tries dynamic sitemap discovery first, falls back to hardcoded constants. + """ base = QISKIT_DOCS_BASE.rstrip("/") + sitemap = get_sitemap_pages() + names = sitemap["addons"] if sitemap else AVAILABLE_ADDONS return { "status": "success", + "source": "sitemap" if sitemap else "fallback", "addons": [ { "name": name, - "description": desc, "url_path": f"api/qiskit-addon-{name}", "full_url": f"{base}/api/qiskit-addon-{name}", } - for name, desc in AVAILABLE_ADDONS.items() + for name in names ], } -def get_list_of_guides() -> dict[str, Any]: - """Get list of Qiskit guides and best practices with descriptions and URL paths.""" +async def get_list_of_guides() -> dict[str, Any]: + """Get list of Qiskit guides with URL paths. + + Tries dynamic sitemap discovery first, falls back to hardcoded constants. + """ base = QISKIT_DOCS_BASE.rstrip("/") + sitemap = get_sitemap_pages() + names = sitemap["guides"] if sitemap else AVAILABLE_GUIDES return { "status": "success", + "source": "sitemap" if sitemap else "fallback", "guides": [ { "name": name, - "description": desc, "url_path": f"guides/{name}", "full_url": f"{base}/guides/{name}", } - for name, desc in AVAILABLE_GUIDES.items() + for name in names + ], + } + + +async def get_list_of_tutorials() -> dict[str, Any]: + """Get list of Qiskit tutorials with URL paths. 
+ + Tries dynamic sitemap discovery first, falls back to hardcoded constants. + """ + base = QISKIT_DOCS_BASE.rstrip("/") + sitemap = get_sitemap_pages() + names = sitemap["tutorials"] if sitemap else AVAILABLE_TUTORIALS + return { + "status": "success", + "source": "sitemap" if sitemap else "fallback", + "tutorials": [ + { + "name": name, + "url_path": f"tutorials/{name}", + "full_url": f"{base}/tutorials/{name}", + } + for name in names + ], + } + + +async def get_list_of_api_packages() -> dict[str, Any]: + """Get list of all API packages (beyond SDK modules and addons) with URL paths. + + Includes qiskit-ibm-runtime, qiskit-ibm-transpiler, REST APIs, etc. + Tries dynamic sitemap discovery first, falls back to hardcoded constants. + """ + base = QISKIT_DOCS_BASE.rstrip("/") + sitemap = get_sitemap_pages() + names = sitemap["api_packages"] if sitemap else AVAILABLE_API_PACKAGES + return { + "status": "success", + "source": "sitemap" if sitemap else "fallback", + "api_packages": [ + { + "name": name, + "url_path": f"api/{name}", + "full_url": f"{base}/api/{name}", + } + for name in names ], } diff --git a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/html_processing.py b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/html_processing.py new file mode 100644 index 0000000..92470b4 --- /dev/null +++ b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/html_processing.py @@ -0,0 +1,106 @@ +# This code is part of Qiskit. +# +# (C) Copyright IBM 2026. +# +# This code is licensed under the Apache License, Version 2.0. You may +# obtain a copy of this license in the LICENSE.txt file in the root directory +# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +# +# Any modifications or derivative works of this code must retain this +# copyright notice, and modified files need to carry a notice indicating +# that they have been altered from the originals. 
+ +"""HTML content extraction and markdown conversion.""" + +import re + +import html2text +from bs4 import BeautifulSoup + + +def _strip_html_tags(text: str) -> str: + """Strip HTML tags from a string. + + Args: + text: String potentially containing HTML tags + + Returns: + String with all HTML tags removed + """ + return re.sub(r"<[^>]+>", "", text) + + +def extract_main_content(html: str) -> str: + """Extract main content from HTML, removing navigation chrome. + + Strips nav, header, footer, aside elements and ARIA-role navigation, + then returns the
<main>, <article>
, or role='main' content. Falls back + to (with chrome removed) if no semantic main content is found. + + Args: + html: Full HTML page content + + Returns: + HTML string with only the main content + """ + soup = BeautifulSoup(html, "html.parser") + + # Remove structural chrome elements + for tag_name in ["nav", "header", "footer", "aside"]: + for element in soup.find_all(tag_name): + element.decompose() + + # Remove ARIA-role navigation elements + for role in ["navigation", "banner", "contentinfo", "complementary"]: + for element in soup.find_all(attrs={"role": role}): + element.decompose() + + # Remove skip-to-content links + for element in soup.find_all("a", class_=lambda c: c and "skip" in c.lower()): + element.decompose() + for element in soup.find_all( + "a", + string=lambda s: s and "skip to" in s.lower(), # type: ignore[call-overload] + ): + element.decompose() + + # Return the best semantic container + main_content = soup.find("main") + if main_content: + return str(main_content) + + article = soup.find("article") + if article: + return str(article) + + main_role = soup.find(attrs={"role": "main"}) + if main_role: + return str(main_role) + + body = soup.find("body") + if body: + return str(body) + + return str(soup) + + +_html2text_converter = html2text.HTML2Text() +_html2text_converter.ignore_links = False +_html2text_converter.body_width = 0 +_html2text_converter.ignore_images = False + + +def convert_html_to_markdown(html: str) -> str: + """Convert HTML content to Markdown format. + + Strips navigation chrome (header, footer, nav, aside) before conversion + to produce cleaner markdown output. 
+ + Args: + html: HTML content string + + Returns: + Markdown formatted content + """ + content_html = extract_main_content(html) + return _html2text_converter.handle(content_html) diff --git a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/http.py b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/http.py new file mode 100644 index 0000000..848eccc --- /dev/null +++ b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/http.py @@ -0,0 +1,183 @@ +# This code is part of Qiskit. +# +# (C) Copyright IBM 2026. +# +# This code is licensed under the Apache License, Version 2.0. You may +# obtain a copy of this license in the LICENSE.txt file in the root directory +# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +# +# Any modifications or derivative works of this code must retain this +# copyright notice, and modified files need to carry a notice indicating +# that they have been altered from the originals. + +"""HTTP client management, caching, and low-level fetch operations.""" + +import asyncio +import logging +import time +from collections import OrderedDict +from typing import Any + +import httpx + +from qiskit_docs_mcp_server.constants import ( + CACHE_TTL, + HTTP_TIMEOUT, + SEARCH_CACHE_TTL, +) + + +logger = logging.getLogger(__name__) + +_MAX_RETRIES = 2 +_RETRY_DELAY = 1.0 + + +class _TTLCache: + """Simple in-memory cache with TTL and LRU eviction.""" + + def __init__(self, ttl: float = 3600.0, max_size: int = 128): + self._ttl = ttl + self._max_size = max_size + self._cache: OrderedDict[str, tuple[float, Any]] = OrderedDict() + + def get(self, key: str) -> Any | None: + if key in self._cache: + timestamp, value = self._cache[key] + if time.monotonic() - timestamp < self._ttl: + self._cache.move_to_end(key) # LRU touch + return value + del self._cache[key] + return None + + def set(self, key: str, value: Any) -> None: + if key in self._cache: + del self._cache[key] + elif len(self._cache) >= self._max_size: + 
self._cache.popitem(last=False) # Evict LRU — O(1) + self._cache[key] = (time.monotonic(), value) + + def clear(self) -> None: + self._cache.clear() + + +_text_cache = _TTLCache(ttl=CACHE_TTL) +_json_cache = _TTLCache(ttl=SEARCH_CACHE_TTL) + +_client_holder: dict[str, httpx.AsyncClient] = {} + + +def set_http_client(client: httpx.AsyncClient) -> None: + """Set the shared HTTP client (called by server lifespan).""" + _client_holder["client"] = client + + +def clear_http_client() -> None: + """Clear the shared HTTP client (called on server shutdown).""" + _client_holder.clear() + + +def _get_http_client() -> httpx.AsyncClient: + """Get or create a shared HTTP client.""" + client = _client_holder.get("client") + if client is None or client.is_closed: + client = httpx.AsyncClient(timeout=HTTP_TIMEOUT, follow_redirects=True) + _client_holder["client"] = client + return client + + +async def _fetch_with_retry(url: str) -> httpx.Response | None: + """Fetch a URL with retry for transient errors (5xx, timeouts). 
+ + Args: + url: The URL to fetch + + Returns: + The httpx Response on success, or None if all attempts fail + """ + client = _get_http_client() + last_error: Exception | None = None + for attempt in range(_MAX_RETRIES): + try: + response = await client.get(url, follow_redirects=True) + response.raise_for_status() + return response + except httpx.TimeoutException as e: # noqa: PERF203 + last_error = e + if attempt < _MAX_RETRIES - 1: + logger.warning( + "Timeout fetching %s (attempt %d), retrying...", + url, + attempt + 1, + ) + await asyncio.sleep(_RETRY_DELAY) + continue + except httpx.HTTPStatusError as e: + last_error = e + if attempt < _MAX_RETRIES - 1 and e.response.status_code >= 500: + logger.warning( + "Server error %d fetching %s (attempt %d), retrying...", + e.response.status_code, + url, + attempt + 1, + ) + await asyncio.sleep(_RETRY_DELAY) + continue + break # 4xx errors — don't retry + except httpx.HTTPError as e: + logger.error("Failed to fetch %s: %s", url, e) + return None + except Exception as e: + logger.error("Unexpected error fetching %s: %s", url, e) + return None + + logger.error("Failed to fetch %s after %d attempts: %s", url, _MAX_RETRIES, last_error) + return None + + +async def fetch_text(url: str) -> str | None: + """Fetch text content from a URL using httpx. + + Retries on transient errors (5xx status codes and timeouts). + + Args: + url: The URL to fetch + + Returns: + The text content of the page, or None if fetch fails + """ + cached: str | None = _text_cache.get(url) + if cached is not None: + return cached + + response = await _fetch_with_retry(url) + if response is None: + return None + + result = response.text + _text_cache.set(url, result) + return result + + +async def fetch_text_json(url: str) -> list[dict[str, Any]] | None: + """Fetch JSON content from a URL using httpx. + + Retries on transient errors (5xx status codes and timeouts). 
+ + Args: + url: The URL to fetch + + Returns: + The JSON content as a list of dicts, or None if fetch fails + """ + cached: list[dict[str, Any]] | None = _json_cache.get(url) + if cached is not None: + return cached + + response = await _fetch_with_retry(url) + if response is None: + return None + + result: list[dict[str, Any]] = response.json() + _json_cache.set(url, result) + return result diff --git a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/server.py b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/server.py index 7550f02..b888905 100644 --- a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/server.py +++ b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/server.py @@ -27,16 +27,18 @@ from qiskit_docs_mcp_server.constants import HTTP_TIMEOUT from qiskit_docs_mcp_server.data_fetcher import ( - clear_http_client, get_list_of_addons, + get_list_of_api_packages, get_list_of_error_code_categories, get_list_of_guides, get_list_of_modules, + get_list_of_tutorials, get_page_docs, lookup_error_code, search_qiskit_docs, - set_http_client, ) +from qiskit_docs_mcp_server.http import clear_http_client, set_http_client +from qiskit_docs_mcp_server.sitemap import load_sitemap # Configure logging @@ -49,6 +51,7 @@ async def lifespan(server: FastMCP) -> AsyncIterator[None]: """Manage the httpx client lifecycle.""" async with httpx.AsyncClient(timeout=HTTP_TIMEOUT, follow_redirects=True) as client: set_http_client(client) + await load_sitemap() yield clear_http_client() @@ -217,35 +220,56 @@ def how_to(task: str) -> str: @mcp.resource("qiskit-docs://modules", mime_type="application/json") -def modules_resource() -> dict[str, Any]: +async def modules_resource() -> dict[str, Any]: """Get list of all Qiskit SDK modules with URL paths. - Returns curated list of common SDK modules. Use get_page_tool with - 'api/qiskit/{module}' to fetch documentation, or search_docs_tool - to discover any module page. + Dynamically discovered from the documentation sitemap. 
+ Use get_page_tool with 'api/qiskit/{module}' to fetch documentation, + or search_docs_tool to discover any module page. """ - return get_list_of_modules() + return await get_list_of_modules() @mcp.resource("qiskit-docs://addons", mime_type="application/json") -def addons_resource() -> dict[str, Any]: +async def addons_resource() -> dict[str, Any]: """Get list of Qiskit addon packages with URL paths. - Returns curated list of addon packages. Use get_page_tool with - 'api/qiskit-addon-{name}' to fetch documentation. + Dynamically discovered from the documentation sitemap. + Use get_page_tool with 'api/qiskit-addon-{name}' to fetch documentation. """ - return get_list_of_addons() + return await get_list_of_addons() @mcp.resource("qiskit-docs://guides", mime_type="application/json") -def guides_resource() -> dict[str, Any]: +async def guides_resource() -> dict[str, Any]: """Get list of Qiskit implementation guides with URL paths. - Returns curated list of common guides. Use get_page_tool with - 'guides/{name}' to fetch documentation, or search_docs_tool to - discover any guide. + Dynamically discovered from the documentation sitemap. + Use get_page_tool with 'guides/{name}' to fetch documentation, + or search_docs_tool to discover any guide. """ - return get_list_of_guides() + return await get_list_of_guides() + + +@mcp.resource("qiskit-docs://tutorials", mime_type="application/json") +async def tutorials_resource() -> dict[str, Any]: + """Get list of Qiskit tutorials with URL paths. + + Dynamically discovered from the documentation sitemap. + Use get_page_tool with 'tutorials/{name}' to fetch documentation. + """ + return await get_list_of_tutorials() + + +@mcp.resource("qiskit-docs://api-packages", mime_type="application/json") +async def api_packages_resource() -> dict[str, Any]: + """Get list of API packages beyond SDK modules and addons. + + Includes qiskit-ibm-runtime, qiskit-ibm-transpiler, REST API references, etc. 
+ Dynamically discovered from the documentation sitemap. + Use get_page_tool with 'api/{name}' to fetch documentation. + """ + return await get_list_of_api_packages() @mcp.resource("qiskit-docs://error-codes", mime_type="application/json") diff --git a/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/sitemap.py b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/sitemap.py new file mode 100644 index 0000000..87a612f --- /dev/null +++ b/qiskit-docs-mcp-server/src/qiskit_docs_mcp_server/sitemap.py @@ -0,0 +1,136 @@ +# This code is part of Qiskit. +# +# (C) Copyright IBM 2026. +# +# This code is licensed under the Apache License, Version 2.0. You may +# obtain a copy of this license in the LICENSE.txt file in the root directory +# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +# +# Any modifications or derivative works of this code must retain this +# copyright notice, and modified files need to carry a notice indicating +# that they have been altered from the originals. + +"""Dynamic sitemap discovery for automatic content adaptation.""" + +import logging +import re + +from defusedxml.ElementTree import fromstring as parse_xml + +from qiskit_docs_mcp_server.constants import SITEMAP_URL +from qiskit_docs_mcp_server.http import _get_http_client + + +logger = logging.getLogger(__name__) + +_SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}" + +_sitemap_data: dict[str, list[str]] | None = None + +_VERSION_SEGMENT_RE = re.compile(r"/(?:\d+\.\d+|dev)(?:/|$)") + + +def _classify_page(path: str, buckets: dict[str, set[str]]) -> None: + """Classify a single English doc path into the appropriate bucket. + + Args: + path: Path relative to ``/docs/en/`` (e.g. ``guides/transpile``). + buckets: Mutable dict of sets to add the slug to. 
+ """ + for prefix, key in (("guides/", "guides"), ("tutorials/", "tutorials")): + if path.startswith(prefix): + slug = path[len(prefix) :] + if slug and "/" not in slug: + buckets[key].add(slug) + return + + if not path.startswith("api/"): + return + + if path.startswith("api/qiskit-addon-"): + rest = path.removeprefix("api/qiskit-addon-") + if rest and "/" not in rest: + buckets["addons"].add(rest) + elif path.startswith("api/qiskit/"): + slug = path.removeprefix("api/qiskit/") + if ( + slug + and "/" not in slug + and not slug.startswith("qiskit.") + and slug not in {"release-notes", "root"} + ): + buckets["modules"].add(slug) + else: + slug = path.removeprefix("api/") + if slug and "/" not in slug and slug != "qiskit": + buckets["api_packages"].add(slug) + + +def _parse_sitemap_xml(xml_text: str) -> dict[str, list[str]]: + """Parse sitemap XML and categorize English page paths. + + Extracts ``/en/`` pages from the sitemap and groups them into: + modules, addons, api_packages, guides, and tutorials. + + Args: + xml_text: Raw XML string from the sitemap + + Returns: + Dict with keys 'modules', 'addons', 'api_packages', 'guides', + 'tutorials', each mapping to a sorted list of slug strings. 
+ """ + root = parse_xml(xml_text) + + buckets: dict[str, set[str]] = { + "modules": set(), + "addons": set(), + "api_packages": set(), + "guides": set(), + "tutorials": set(), + } + + en_marker = "/docs/en/" + for loc in root.iter(f"{_SITEMAP_NS}loc"): + url = loc.text + if url is None: + continue + idx = url.find(en_marker) + if idx == -1: + continue + path = url[idx + len(en_marker) :] + if _VERSION_SEGMENT_RE.search(path): + continue + _classify_page(path, buckets) + + return {key: sorted(values) for key, values in buckets.items()} + + +def get_sitemap_pages() -> dict[str, list[str]] | None: + """Return the sitemap data loaded at startup, or ``None`` if unavailable.""" + return _sitemap_data + + +async def load_sitemap() -> None: + """Fetch and parse the documentation sitemap at server startup. + + Stores the result in a module-level variable so that all subsequent + resource calls can read it synchronously. + """ + global _sitemap_data # noqa: PLW0603 + + try: + client = _get_http_client() + response = await client.get(SITEMAP_URL, follow_redirects=True) + response.raise_for_status() + _sitemap_data = _parse_sitemap_xml(response.text) + logger.info( + "Sitemap loaded: %d modules, %d addons, %d api_packages, %d guides, %d tutorials", + len(_sitemap_data["modules"]), + len(_sitemap_data["addons"]), + len(_sitemap_data["api_packages"]), + len(_sitemap_data["guides"]), + len(_sitemap_data["tutorials"]), + ) + except Exception as e: + logger.warning(f"Failed to fetch sitemap, using fallback constants: {e}") + _sitemap_data = None diff --git a/qiskit-docs-mcp-server/tests/test_data_fetcher.py b/qiskit-docs-mcp-server/tests/test_data_fetcher.py index 0f49b33..78683ae 100644 --- a/qiskit-docs-mcp-server/tests/test_data_fetcher.py +++ b/qiskit-docs-mcp-server/tests/test_data_fetcher.py @@ -18,34 +18,47 @@ import pytest from qiskit_docs_mcp_server.constants import ( AVAILABLE_ADDONS, + AVAILABLE_API_PACKAGES, AVAILABLE_GUIDES, AVAILABLE_MODULES, + AVAILABLE_TUTORIALS, 
CACHE_TTL, HTTP_TIMEOUT, SEARCH_CACHE_TTL, _get_env_float, ) from qiskit_docs_mcp_server.data_fetcher import ( - _client_holder, - _get_http_client, - _json_cache, _resolve_url, - _strip_html_tags, - _text_cache, _truncate_content, - _TTLCache, - convert_html_to_markdown, - extract_main_content, - fetch_text, - fetch_text_json, get_list_of_addons, + get_list_of_api_packages, get_list_of_error_code_categories, get_list_of_guides, get_list_of_modules, + get_list_of_tutorials, get_page_docs, lookup_error_code, search_qiskit_docs, ) +from qiskit_docs_mcp_server.html_processing import ( + _strip_html_tags, + convert_html_to_markdown, + extract_main_content, +) +from qiskit_docs_mcp_server.http import ( + _client_holder, + _get_http_client, + _json_cache, + _text_cache, + _TTLCache, + fetch_text, + fetch_text_json, +) +from qiskit_docs_mcp_server.sitemap import ( + _parse_sitemap_xml, + get_sitemap_pages, + load_sitemap, +) class TestFetchText: @@ -56,7 +69,7 @@ def setup_method(self): _text_cache.clear() _json_cache.clear() - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_success(self, mock_get_client): """Test successful text fetch.""" mock_response = MagicMock() @@ -69,7 +82,7 @@ async def test_fetch_text_success(self, mock_get_client): result = await fetch_text("https://example.com") assert result == "Sample documentation" - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_http_error(self, mock_get_client): """Test fetch_text with HTTP error.""" mock_client = AsyncMock() @@ -79,7 +92,7 @@ async def test_fetch_text_http_error(self, mock_get_client): result = await fetch_text("https://example.com") assert result is None - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def 
test_fetch_text_generic_exception(self, mock_get_client): """Test fetch_text with generic exception.""" mock_client = AsyncMock() @@ -89,8 +102,8 @@ async def test_fetch_text_generic_exception(self, mock_get_client): result = await fetch_text("https://example.com") assert result is None - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_timeout(self, mock_get_client, mock_sleep): """Test fetch_text with timeout.""" mock_client = AsyncMock() @@ -109,8 +122,8 @@ def setup_method(self): _text_cache.clear() _json_cache.clear() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_timeout_then_success(self, mock_get_client, mock_sleep): """Test that a timeout on first attempt succeeds on retry.""" mock_success = MagicMock() @@ -129,8 +142,8 @@ async def test_timeout_then_success(self, mock_get_client, mock_sleep): assert mock_client.get.call_count == 2 mock_sleep.assert_called_once() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_503_then_success(self, mock_get_client, mock_sleep): """Test that a 503 on first attempt succeeds on retry.""" mock_503_response = MagicMock() @@ -153,8 +166,8 @@ async def test_503_then_success(self, mock_get_client, mock_sleep): assert mock_client.get.call_count == 2 
mock_sleep.assert_called_once() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_404_not_retried(self, mock_get_client, mock_sleep): """Test that a 404 is NOT retried (4xx errors).""" mock_404_response = MagicMock() @@ -174,8 +187,8 @@ async def test_404_not_retried(self, mock_get_client, mock_sleep): assert mock_client.get.call_count == 1 mock_sleep.assert_not_called() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_retries_exhausted(self, mock_get_client, mock_sleep): """Test that retries are exhausted and return None.""" mock_client = AsyncMock() @@ -195,8 +208,8 @@ def setup_method(self): _text_cache.clear() _json_cache.clear() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_json_timeout_then_success(self, mock_get_client, mock_sleep): """Test that a timeout on first attempt succeeds on retry for JSON.""" mock_success = MagicMock() @@ -215,8 +228,8 @@ async def test_json_timeout_then_success(self, mock_get_client, mock_sleep): assert mock_client.get.call_count == 2 mock_sleep.assert_called_once() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", 
new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_json_503_then_success(self, mock_get_client, mock_sleep): """Test that a 503 on first attempt succeeds on retry for JSON.""" mock_503_response = MagicMock() @@ -239,8 +252,8 @@ async def test_json_503_then_success(self, mock_get_client, mock_sleep): assert mock_client.get.call_count == 2 mock_sleep.assert_called_once() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_json_404_not_retried(self, mock_get_client, mock_sleep): """Test that a 404 is NOT retried for JSON fetch.""" mock_404_response = MagicMock() @@ -260,8 +273,8 @@ async def test_json_404_not_retried(self, mock_get_client, mock_sleep): assert mock_client.get.call_count == 1 mock_sleep.assert_not_called() - @patch("qiskit_docs_mcp_server.data_fetcher.asyncio.sleep", new_callable=AsyncMock) - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http.asyncio.sleep", new_callable=AsyncMock) + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_json_retries_exhausted(self, mock_get_client, mock_sleep): """Test that retries are exhausted and return None for JSON.""" mock_client = AsyncMock() @@ -281,7 +294,7 @@ def setup_method(self): _text_cache.clear() _json_cache.clear() - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_json_success(self, mock_get_client): """Test successful JSON fetch.""" mock_response = MagicMock() @@ -294,7 +307,7 @@ async def test_fetch_text_json_success(self, mock_get_client): result = await fetch_text_json("https://example.com/api") assert result == [{"key": "value"}] - 
@patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_json_http_error(self, mock_get_client): """Test fetch_text_json with HTTP error.""" mock_client = AsyncMock() @@ -304,7 +317,7 @@ async def test_fetch_text_json_http_error(self, mock_get_client): result = await fetch_text_json("https://example.com/api") assert result is None - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_json_generic_exception(self, mock_get_client): """Test fetch_text_json with generic exception.""" mock_client = AsyncMock() @@ -314,7 +327,7 @@ async def test_fetch_text_json_generic_exception(self, mock_get_client): result = await fetch_text_json("https://example.com/api") assert result is None - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_json_returns_list(self, mock_get_client): """Test that fetch_text_json returns list.""" mock_response = MagicMock() @@ -903,51 +916,266 @@ def test_empty_html(self): assert result.strip() == "" +class TestParseSitemapXml: + """Test sitemap XML parsing.""" + + _SAMPLE_SITEMAP = """ + + https://quantum.cloud.ibm.com/docs/en/guides/quick-start + https://quantum.cloud.ibm.com/docs/en/guides/transpile + https://quantum.cloud.ibm.com/docs/en/tutorials/grovers-algorithm + https://quantum.cloud.ibm.com/docs/en/tutorials/shors-algorithm + https://quantum.cloud.ibm.com/docs/en/api/qiskit/circuit + https://quantum.cloud.ibm.com/docs/en/api/qiskit/transpiler + https://quantum.cloud.ibm.com/docs/en/api/qiskit/qiskit.circuit.QuantumCircuit + https://quantum.cloud.ibm.com/docs/en/api/qiskit/release-notes + https://quantum.cloud.ibm.com/docs/en/api/qiskit-addon-sqd + https://quantum.cloud.ibm.com/docs/en/api/qiskit-addon-sqd/submodule + 
https://quantum.cloud.ibm.com/docs/en/api/qiskit-ibm-runtime + https://quantum.cloud.ibm.com/docs/en/api/functions + https://quantum.cloud.ibm.com/docs/en/api/qiskit/1.0/circuit + https://quantum.cloud.ibm.com/docs/de/guides/quick-start + https://quantum.cloud.ibm.com/docs/en/errors + """ + + def test_parses_guides(self): + """Test that guides are correctly extracted.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + assert "quick-start" in result["guides"] + assert "transpile" in result["guides"] + + def test_parses_tutorials(self): + """Test that tutorials are correctly extracted.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + assert "grovers-algorithm" in result["tutorials"] + assert "shors-algorithm" in result["tutorials"] + + def test_parses_modules(self): + """Test that SDK modules are extracted (excluding class pages).""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + assert "circuit" in result["modules"] + assert "transpiler" in result["modules"] + + def test_excludes_class_pages_from_modules(self): + """Test that qiskit.* class pages are not included as modules.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + module_names = result["modules"] + assert not any(n.startswith("qiskit.") for n in module_names) + + def test_excludes_release_notes_from_modules(self): + """Test that release-notes is excluded from modules.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + assert "release-notes" not in result["modules"] + + def test_parses_addons(self): + """Test that addon packages are extracted (top-level only).""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + assert "sqd" in result["addons"] + # Submodule pages should not create separate addon entries + assert len(result["addons"]) == 1 + + def test_parses_api_packages(self): + """Test that non-SDK, non-addon API packages are extracted.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + assert "qiskit-ibm-runtime" in result["api_packages"] + assert "functions" 
in result["api_packages"] + + def test_excludes_versioned_paths(self): + """Test that versioned paths (e.g., /1.0/) are excluded.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + # The versioned /1.0/circuit should not add a duplicate 'circuit' + # but 'circuit' from the non-versioned path should be present + assert "circuit" in result["modules"] + + def test_excludes_non_english_pages(self): + """Test that non-English pages are excluded.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + # The German guide should not appear + all_items = ( + result["guides"] + + result["tutorials"] + + result["modules"] + + result["addons"] + + result["api_packages"] + ) + # quick-start appears once (English), not duplicated from German + assert all_items.count("quick-start") <= 1 + + def test_results_are_sorted(self): + """Test that all result lists are sorted.""" + result = _parse_sitemap_xml(self._SAMPLE_SITEMAP) + for key in ("guides", "tutorials", "modules", "addons", "api_packages"): + assert result[key] == sorted(result[key]) + + def test_empty_sitemap(self): + """Test parsing an empty sitemap.""" + xml = """ + + """ + result = _parse_sitemap_xml(xml) + assert result["guides"] == [] + assert result["tutorials"] == [] + assert result["modules"] == [] + assert result["addons"] == [] + assert result["api_packages"] == [] + + +class TestLoadSitemap: + """Test load_sitemap / get_sitemap_pages functions.""" + + def setup_method(self): + """Reset sitemap state before each test.""" + import qiskit_docs_mcp_server.sitemap as _mod + + _mod._sitemap_data = None + + @patch("qiskit_docs_mcp_server.sitemap._get_http_client") + async def test_returns_parsed_pages(self, mock_get_client): + """Test that load_sitemap populates sitemap data.""" + xml = """ + + https://quantum.cloud.ibm.com/docs/en/guides/transpile + https://quantum.cloud.ibm.com/docs/en/api/qiskit/circuit + """ + mock_response = MagicMock() + mock_response.text = xml + mock_response.raise_for_status = 
MagicMock() + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + mock_get_client.return_value = mock_client + + await load_sitemap() + result = get_sitemap_pages() + assert result is not None + assert "transpile" in result["guides"] + assert "circuit" in result["modules"] + + @patch("qiskit_docs_mcp_server.sitemap._get_http_client") + async def test_returns_none_on_failure(self, mock_get_client): + """Test that get_sitemap_pages returns None on HTTP error.""" + mock_client = AsyncMock() + mock_client.get.side_effect = httpx.HTTPError("Connection failed") + mock_get_client.return_value = mock_client + + await load_sitemap() + assert get_sitemap_pages() is None + + @patch("qiskit_docs_mcp_server.sitemap._get_http_client") + async def test_stores_result(self, mock_get_client): + """Test that sitemap data persists after load_sitemap.""" + xml = """ + + https://quantum.cloud.ibm.com/docs/en/guides/quick-start + """ + mock_response = MagicMock() + mock_response.text = xml + mock_response.raise_for_status = MagicMock() + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + mock_get_client.return_value = mock_client + + await load_sitemap() + result1 = get_sitemap_pages() + result2 = get_sitemap_pages() + assert result1 is result2 + assert mock_client.get.call_count == 1 + + class TestListHelpers: """Test list helper functions.""" - def test_get_list_of_modules(self): - """Test get_list_of_modules returns correct structure with url_path.""" - result = get_list_of_modules() + def setup_method(self): + """Reset sitemap state to force fallback.""" + import qiskit_docs_mcp_server.sitemap as _mod + + _mod._sitemap_data = None + + @patch("qiskit_docs_mcp_server.data_fetcher.get_sitemap_pages", return_value=None) + async def test_get_list_of_modules_fallback(self, _mock): + """Test get_list_of_modules falls back to constants.""" + result = await get_list_of_modules() assert result["status"] == "success" + assert result["source"] == 
"fallback" assert "modules" in result assert isinstance(result["modules"], list) assert len(result["modules"]) > 0 - # Check structure includes name, description, url_path first = result["modules"][0] assert "name" in first - assert "description" in first assert "url_path" in first assert first["url_path"].startswith("api/qiskit/") assert "full_url" in first assert first["full_url"].startswith("https://") - def test_get_list_of_addons(self): - """Test get_list_of_addons returns correct structure with url_path.""" - result = get_list_of_addons() + @patch("qiskit_docs_mcp_server.data_fetcher.get_sitemap_pages", return_value=None) + async def test_get_list_of_addons_fallback(self, _mock): + """Test get_list_of_addons falls back to constants.""" + result = await get_list_of_addons() assert result["status"] == "success" + assert result["source"] == "fallback" assert "addons" in result assert len(result["addons"]) > 0 first = result["addons"][0] assert "name" in first - assert "description" in first assert "url_path" in first assert "qiskit-addon-" in first["url_path"] assert "full_url" in first - def test_get_list_of_guides(self): - """Test get_list_of_guides returns correct structure with url_path.""" - result = get_list_of_guides() + @patch("qiskit_docs_mcp_server.data_fetcher.get_sitemap_pages", return_value=None) + async def test_get_list_of_guides_fallback(self, _mock): + """Test get_list_of_guides falls back to constants.""" + result = await get_list_of_guides() assert result["status"] == "success" + assert result["source"] == "fallback" assert "guides" in result assert len(result["guides"]) > 0 first = result["guides"][0] assert "name" in first - assert "description" in first assert "url_path" in first assert first["url_path"].startswith("guides/") assert "full_url" in first + @patch("qiskit_docs_mcp_server.data_fetcher.get_sitemap_pages", return_value=None) + async def test_get_list_of_tutorials_fallback(self, _mock): + """Test get_list_of_tutorials falls back 
to constants.""" + result = await get_list_of_tutorials() + assert result["status"] == "success" + assert result["source"] == "fallback" + assert "tutorials" in result + assert len(result["tutorials"]) > 0 + first = result["tutorials"][0] + assert "name" in first + assert "url_path" in first + assert first["url_path"].startswith("tutorials/") + + @patch("qiskit_docs_mcp_server.data_fetcher.get_sitemap_pages", return_value=None) + async def test_get_list_of_api_packages_fallback(self, _mock): + """Test get_list_of_api_packages falls back to constants.""" + result = await get_list_of_api_packages() + assert result["status"] == "success" + assert result["source"] == "fallback" + assert "api_packages" in result + assert len(result["api_packages"]) > 0 + first = result["api_packages"][0] + assert "name" in first + assert "url_path" in first + assert first["url_path"].startswith("api/") + + @patch("qiskit_docs_mcp_server.data_fetcher.get_sitemap_pages") + async def test_get_list_of_modules_from_sitemap(self, mock_sitemap): + """Test get_list_of_modules uses sitemap when available.""" + mock_sitemap.return_value = { + "modules": ["circuit", "transpiler"], + "addons": [], + "api_packages": [], + "guides": [], + "tutorials": [], + } + result = await get_list_of_modules() + assert result["status"] == "success" + assert result["source"] == "sitemap" + names = [m["name"] for m in result["modules"]] + assert names == ["circuit", "transpiler"] + def test_get_list_of_error_code_categories(self): """Test get_list_of_error_code_categories returns correct structure.""" result = get_list_of_error_code_categories() @@ -968,37 +1196,53 @@ def test_qiskit_modules_has_circuit(self): """Test that AVAILABLE_MODULES contains circuit.""" assert "circuit" in AVAILABLE_MODULES - def test_qiskit_modules_are_dict_with_descriptions(self): - """Test that AVAILABLE_MODULES values are description strings.""" - assert isinstance(AVAILABLE_MODULES, dict) - for key, value in AVAILABLE_MODULES.items(): 
- assert isinstance(key, str) - assert isinstance(value, str) - assert len(value) > 0 + def test_qiskit_modules_are_list_of_strings(self): + """Test that AVAILABLE_MODULES is a list of strings.""" + assert isinstance(AVAILABLE_MODULES, list) + for item in AVAILABLE_MODULES: + assert isinstance(item, str) + assert len(item) > 0 def test_qiskit_addon_modules_not_empty(self): """Test that AVAILABLE_ADDONS is not empty.""" assert len(AVAILABLE_ADDONS) > 0 - def test_qiskit_addons_are_dict_with_descriptions(self): - """Test that AVAILABLE_ADDONS values are description strings.""" - assert isinstance(AVAILABLE_ADDONS, dict) - for key, value in AVAILABLE_ADDONS.items(): - assert isinstance(key, str) - assert isinstance(value, str) - assert len(value) > 0 + def test_qiskit_addons_are_list_of_strings(self): + """Test that AVAILABLE_ADDONS is a list of strings.""" + assert isinstance(AVAILABLE_ADDONS, list) + for item in AVAILABLE_ADDONS: + assert isinstance(item, str) + assert len(item) > 0 def test_qiskit_guides_not_empty(self): """Test that AVAILABLE_GUIDES is not empty.""" assert len(AVAILABLE_GUIDES) > 0 - def test_qiskit_guides_are_dict_with_descriptions(self): - """Test that AVAILABLE_GUIDES values are description strings.""" - assert isinstance(AVAILABLE_GUIDES, dict) - for key, value in AVAILABLE_GUIDES.items(): - assert isinstance(key, str) - assert isinstance(value, str) - assert len(value) > 0 + def test_qiskit_guides_are_list_of_strings(self): + """Test that AVAILABLE_GUIDES is a list of strings.""" + assert isinstance(AVAILABLE_GUIDES, list) + for item in AVAILABLE_GUIDES: + assert isinstance(item, str) + assert len(item) > 0 + + def test_qiskit_tutorials_not_empty(self): + """Test that AVAILABLE_TUTORIALS is not empty.""" + assert len(AVAILABLE_TUTORIALS) > 0 + + def test_qiskit_tutorials_are_list_of_strings(self): + """Test that AVAILABLE_TUTORIALS is a list of strings.""" + assert isinstance(AVAILABLE_TUTORIALS, list) + for item in AVAILABLE_TUTORIALS: + 
assert isinstance(item, str) + assert len(item) > 0 + + def test_api_packages_not_empty(self): + """Test that AVAILABLE_API_PACKAGES is not empty.""" + assert len(AVAILABLE_API_PACKAGES) > 0 + + def test_api_packages_has_ibm_runtime(self): + """Test that AVAILABLE_API_PACKAGES contains qiskit-ibm-runtime.""" + assert "qiskit-ibm-runtime" in AVAILABLE_API_PACKAGES class TestEnvironmentConfiguration: @@ -1053,15 +1297,15 @@ def test_search_cache_ttl_default(self): """Test that SEARCH_CACHE_TTL defaults to 300.0 (5 minutes).""" assert SEARCH_CACHE_TTL == 300.0 - @patch("qiskit_docs_mcp_server.data_fetcher.httpx.AsyncClient") + @patch("qiskit_docs_mcp_server.http.httpx.AsyncClient") def test_fetch_text_uses_http_timeout(self, mock_client_class): """Test that _get_http_client creates client with HTTP_TIMEOUT.""" - import qiskit_docs_mcp_server.data_fetcher as df - from qiskit_docs_mcp_server.data_fetcher import _get_http_client + import qiskit_docs_mcp_server.http as http_mod + from qiskit_docs_mcp_server.http import _get_http_client # Force creation of a new client - original_holder = df._client_holder.copy() - df._client_holder.clear() + original_holder = http_mod._client_holder.copy() + http_mod._client_holder.clear() try: _get_http_client() mock_client_class.assert_called_once() @@ -1070,8 +1314,8 @@ def test_fetch_text_uses_http_timeout(self, mock_client_class): assert call_kwargs["timeout"] == HTTP_TIMEOUT assert call_kwargs["follow_redirects"] is True finally: - df._client_holder.clear() - df._client_holder.update(original_holder) + http_mod._client_holder.clear() + http_mod._client_holder.update(original_holder) class TestCaching: @@ -1082,7 +1326,7 @@ def setup_method(self): _text_cache.clear() _json_cache.clear() - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_caches_result(self, mock_get_client): """Test that fetch_text caches successful results.""" 
mock_response = MagicMock() @@ -1102,7 +1346,7 @@ async def test_fetch_text_caches_result(self, mock_get_client): assert result2 == "Cached content" assert mock_client.get.call_count == 1 # No additional network call - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_json_caches_result(self, mock_get_client): """Test that fetch_text_json caches successful results.""" mock_response = MagicMock() @@ -1120,7 +1364,7 @@ async def test_fetch_text_json_caches_result(self, mock_get_client): assert result2 == [{"key": "value"}] assert mock_client.get.call_count == 1 - @patch("qiskit_docs_mcp_server.data_fetcher._get_http_client") + @patch("qiskit_docs_mcp_server.http._get_http_client") async def test_fetch_text_does_not_cache_errors(self, mock_get_client): """Test that failed fetches are not cached.""" mock_client = AsyncMock() @@ -1156,7 +1400,7 @@ def test_cache_evicts_oldest_at_max_size(self): assert cache.get("b") == 2 assert cache.get("c") == 3 - @patch("qiskit_docs_mcp_server.data_fetcher.time") + @patch("qiskit_docs_mcp_server.http.time") def test_cache_entry_expires_after_ttl(self, mock_time): """Test that cache entries expire after TTL.""" mock_time.monotonic.return_value = 1000.0 diff --git a/qiskit-docs-mcp-server/tests/test_server.py b/qiskit-docs-mcp-server/tests/test_server.py index cdf136d..0e71044 100644 --- a/qiskit-docs-mcp-server/tests/test_server.py +++ b/qiskit-docs-mcp-server/tests/test_server.py @@ -54,6 +54,8 @@ def test_resources_registered(self): "qiskit-docs://modules", "qiskit-docs://addons", "qiskit-docs://guides", + "qiskit-docs://tutorials", + "qiskit-docs://api-packages", "qiskit-docs://error-codes", } assert expected_resources.issubset(resource_uris), ( @@ -67,8 +69,8 @@ def test_tool_count(self): assert len(mcp._tool_manager._tools) == 3 def test_resource_count(self): - """Test the expected number of static resources.""" - assert 
len(mcp._resource_manager._resources) == 4 + """Test the expected number of resources.""" + assert len(mcp._resource_manager._resources) == 6 def test_old_tools_removed(self): """Test that old category-specific tools are no longer registered.""" diff --git a/uv.lock b/uv.lock index 0cd0cae..c1074dd 100644 --- a/uv.lock +++ b/uv.lock @@ -4645,6 +4645,7 @@ version = "0.1.1" source = { editable = "qiskit-docs-mcp-server" } dependencies = [ { name = "beautifulsoup4" }, + { name = "defusedxml" }, { name = "fastmcp" }, { name = "html2text" }, { name = "httpx" }, @@ -4656,6 +4657,7 @@ dev = [ { name = "mypy" }, { name = "pre-commit" }, { name = "ruff" }, + { name = "types-defusedxml" }, ] examples = [ { name = "fastmcp" }, @@ -4678,6 +4680,7 @@ test = [ [package.metadata] requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.12.0" }, + { name = "defusedxml", specifier = ">=0.7.1" }, { name = "fastmcp", specifier = ">=2.8.1,<3" }, { name = "html2text", specifier = ">=2020.1.16" }, { name = "httpx", specifier = ">=0.28.1" }, @@ -4689,6 +4692,7 @@ dev = [ { name = "mypy", specifier = ">=1.15.0" }, { name = "pre-commit", specifier = ">=4.1.0" }, { name = "ruff", specifier = ">=0.9.4" }, + { name = "types-defusedxml", specifier = ">=0.7.0" }, ] examples = [ { name = "fastmcp", specifier = ">=2.14.0" }, @@ -6437,6 +6441,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, ] +[[package]] +name = "types-defusedxml" +version = "0.7.0.20260408" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/39/af/d324da5ffbf0af40477533a09ee6c902de335c445a8dcc88c58f62af6e5f/types_defusedxml-0.7.0.20260408.tar.gz", hash = 
"sha256:f35377d59344f98b57f9bf319cff2107aac35f9e4d42f9ed6cfeeafacffadb00", size = 10638, upload-time = "2026-04-08T04:26:12.239Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/68/7570cfb818d6a5b3ff964114527e28e360eccf18329b457f057a18596e64/types_defusedxml-0.7.0.20260408-py3-none-any.whl", hash = "sha256:2d68db82412170b91b3e490b7c118a4f4e5a27756a126e2453f629c8d514b106", size = 13435, upload-time = "2026-04-08T04:26:11.347Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0"