From 6afc68b7f1d9d3b71ee20449be93558f9b85ad32 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 10:29:23 +0800 Subject: [PATCH 01/15] reort: split parser layer to 2-layer: accessor and parser, so that we can reuse more code --- docs/design/parser-two-layer-refactor-plan.md | 355 ++++++++++ examples/ov.conf.example | 5 +- openviking/parse/__init__.py | 2 - openviking/parse/accessors/__init__.py | 33 + openviking/parse/accessors/base.py | 191 ++++++ openviking/parse/accessors/feishu.py | 646 ++++++++++++++++++ openviking/parse/accessors/git.py | 581 ++++++++++++++++ openviking/parse/accessors/http.py | 228 +++++++ openviking/parse/accessors/registry.py | 204 ++++++ openviking/parse/parsers/__init__.py | 6 +- openviking/parse/parsers/code/code.py | 111 +-- openviking/parse/parsers/directory.py | 63 ++ openviking/parse/parsers/html.py | 592 +--------------- openviking/parse/parsers/zip_parser.py | 264 +++---- openviking/parse/registry.py | 33 +- openviking/utils/media_processor.py | 212 +++--- openviking/utils/resource_processor.py | 10 +- openviking_cli/utils/config/parser_config.py | 2 +- tests/unit/test_accessors_base.py | 135 ++++ tests/unit/test_accessors_git.py | 84 +++ tests/unit/test_accessors_http.py | 116 ++++ tests/unit/test_accessors_registry.py | 155 +++++ 22 files changed, 3045 insertions(+), 983 deletions(-) create mode 100644 docs/design/parser-two-layer-refactor-plan.md create mode 100644 openviking/parse/accessors/__init__.py create mode 100644 openviking/parse/accessors/base.py create mode 100644 openviking/parse/accessors/feishu.py create mode 100644 openviking/parse/accessors/git.py create mode 100644 openviking/parse/accessors/http.py create mode 100644 openviking/parse/accessors/registry.py create mode 100644 tests/unit/test_accessors_base.py create mode 100644 tests/unit/test_accessors_git.py create mode 100644 tests/unit/test_accessors_http.py create mode 100644 tests/unit/test_accessors_registry.py diff --git a/docs/design/parser-two-layer-refactor-plan.md b/docs/design/parser-two-layer-refactor-plan.md new file mode 100644 index 000000000..1d99a7cc7 --- /dev/null +++ b/docs/design/parser-two-layer-refactor-plan.md @@ -0,0 +1,355 @@ +# OpenViking 解析器两层架构重构计划 + +| 项目 | 信息 | +|-----|------| +| 状态 | `规划中` | +| 创建日期 | 2026-04-13 | +| 目标版本 | v6.0 | + +--- + +## 一、问题分析 + +### 当前架构的问题 + +1. **平铺式注册**:所有 Parser 在同一层级,职责不清晰 +2. **后缀冲突**:`.zip` 可被 `CodeRepositoryParser` 和 `ZipParser` 同时处理 +3. **URL 处理逻辑分散**:`UnifiedResourceProcessor._process_url()` 和 `ParserRegistry.parse()` 都有 URL 检测 +4. **职责混合**:部分 Parser 既负责下载又负责解析(如 `CodeRepositoryParser`, `HTMLParser`, `FeishuParser`) + +--- + +## 二、新架构设计 + +### 核心概念 + +| 层级 | 抽象接口 | 职责 | 示例 | +|-----|---------|------|------| +| **L1** | `DataAccessor` | 获取数据:将远程 URL / 特殊路径 → 本地文件/目录 | GitAccessor, HTTPAccessor, FeishuAccessor | +| **L2** | `DataParser` | 解析数据:本地文件/目录 → `ParseResult` | MarkdownParser, PDFParser, DirectoryParser | +| **混合** | `HybridParser` | 同时实现两个接口(简化插件开发) | (按需使用) | + +### 新的调用流程 + +``` +add_resource(path) + ↓ +ResourceProcessor.process_resource() + ↓ +UnifiedResourceProcessor.process() [重构] + ↓ +【第一阶段:数据访问】 +AccessorRegistry.route(source) + ├─→ 是 URL/远程资源? + │ ├─→ GitAccessor (git@, git://, github.com, ...) + │ ├─→ FeishuAccessor (.feishu.cn, .larksuite.com) + │ ├─→ HTTPAccessor (http://, https://) + │ └─→ 其他 → 下一步 + └─→ 返回: LocalResource (本地路径 + 元数据) + ↓ +【第二阶段:数据解析】 +ParserRegistry.route(local_resource) + ├─→ 是目录? → DirectoryParser + ├─→ 是文件? → 按扩展名匹配 + │ ├─→ .md → MarkdownParser + │ ├─→ .pdf → PDFParser + │ ├─→ .zip → ZipParser + │ └─→ ... + └─→ 返回: ParseResult + ↓ +TreeBuilder + SemanticQueue (保持不变) +``` + +--- + +## 三、详细设计 + +### 3.1 核心抽象接口 + +#### 文件 1: `openviking/parse/accessors/base.py` + +```python +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional, Dict, Any, Union +from dataclasses import dataclass + +@dataclass +class LocalResource: + """数据访问层的输出:表示可访问的本地资源""" + path: Path # 本地文件/目录路径 + source_type: str # 原始来源类型: "git", "http", "feishu", "local", ... + original_source: str # 原始 source 字符串 + meta: Dict[str, Any] # 元数据(repo_name, branch, 等) + is_temporary: bool = True # 是否为临时文件,解析后可清理 + +class DataAccessor(ABC): + """数据访问器:负责获取数据到本地""" + + @abstractmethod + def can_handle(self, source: Union[str, Path]) -> bool: + """判断是否能处理该来源""" + pass + + @abstractmethod + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + 获取数据到本地 + + 返回: LocalResource,包含本地路径和元数据 + """ + pass + + @property + @abstractmethod + def priority(self) -> int: + """ + 优先级,数字越大优先级越高 + 用于解决冲突:多个 Accessor 都 can_handle 时,选优先级高的 + """ + pass + + def cleanup(self, resource: LocalResource) -> None: + """ + 清理临时资源(可选) + 默认:如果是临时资源,删除本地文件/目录 + """ + if resource.is_temporary: + # 实现清理逻辑 + pass +``` + +#### 文件 2: `openviking/parse/parsers/base_parser.py` (重构) + +```python +# 保持现有 BaseParser 接口,但明确其职责为 DataParser +# 可以考虑重命名为 DataParser 或保留 BaseParser 作为别名 +``` + +#### 文件 3: `openviking/parse/accessors/registry.py` + +```python +from typing import Union, Path, Optional, List +from .base import DataAccessor, LocalResource + +class AccessorRegistry: + """数据访问器注册表""" + + def __init__(self): + self._accessors: List[DataAccessor] = [] + + def register(self, accessor: DataAccessor) -> None: + """注册访问器(按优先级插入)""" + # 按 priority 降序插入 + idx = 0 + for i, a in enumerate(self._accessors): + if accessor.priority > a.priority: + idx = i + break + else: + idx = len(self._accessors) + self._accessors.insert(idx, accessor) + + def get_accessor(self, source: Union[str, Path]) -> Optional[DataAccessor]: + """获取能处理该 source 的访问器(按优先级)""" + for accessor in self._accessors: + if accessor.can_handle(source): + return accessor + return None + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + 路由到合适的访问器获取数据 + + 如果没有访问器能处理,视为本地文件返回 + """ + accessor = self.get_accessor(source) + if accessor: + return await accessor.access(source, **kwargs) + + # 默认:视为本地文件 + path = Path(source) + return LocalResource( + path=path, + source_type="local", + original_source=str(source), + meta={}, + is_temporary=False + ) +``` + +### 3.2 现有 Parser 拆分方案 + +| 当前 Parser | 拆分后 | 说明 | +|-----------|--------|------| +| `CodeRepositoryParser` | `GitAccessor` + `DirectoryParser` | Git clone 逻辑移到 Accessor | +| `HTMLParser` | `HTTPAccessor` + `HTMLParser` | HTTP 下载移到 Accessor | +| `FeishuParser` | `FeishuAccessor` + (新) `FeishuDocumentParser` | 飞书 API 调用移到 Accessor | +| `ZipParser` | 保持为 `DataParser` | 只处理本地 .zip 文件 | +| `DirectoryParser` | 保持为 `DataParser` | 只处理本地目录 | +| 其他 (Markdown, PDF, ...) | 保持为 `DataParser` | 无需变动 | + +### 3.3 Accessor 实现示例 + +#### `GitAccessor` (`openviking/parse/accessors/git.py`) + +```python +# 从 CodeRepositoryParser 提取 git clone / GitHub ZIP 下载逻辑 +``` + +#### `HTTPAccessor` (`openviking/parse/accessors/http.py`) + +```python +# 从 HTMLParser 提取下载逻辑 +# 支持内容类型检测,下载到临时文件 +``` + +#### `FeishuAccessor` (`openviking/parse/accessors/feishu.py`) + +```python +# 从 FeishuParser 提取 API 调用逻辑 +``` + +### 3.4 HybridParser(简化插件开发) + +对于简单的插件,允许同时实现两个接口: + +```python +class HybridParser(DataAccessor, DataParser): + """ + 混合解析器:同时实现访问和解析 + 适用于简单的自定义解析器,不需要拆分 + """ + # 同时实现 DataAccessor 和 DataParser 的接口 + pass +``` + +--- + +## 四、实施步骤 + +### Phase 1: 基础设施(核心接口) + +**目标**:建立新架构的基础框架,不影响现有功能 + +- [ ] 创建 `openviking/parse/accessors/` 目录结构 +- [ ] 实现 `DataAccessor` 抽象基类和 `LocalResource` 数据类 +- [ ] 实现 `AccessorRegistry` +- [ ] 编写单元测试 +- [ ] 更新 `ParserRegistry`,保持向后兼容 + +### Phase 2: 重构 CodeRepositoryParser + +**目标**:将 CodeRepositoryParser 拆分为 GitAccessor + DirectoryParser + +- [ ] 实现 `GitAccessor`(从 `CodeRepositoryParser` 提取 clone/download 逻辑) +- [ ] 更新 `CodeRepositoryParser` 为混合模式(向后兼容)或标记为 deprecated +- [ ] 在 `AccessorRegistry` 中注册 `GitAccessor` +- [ ] 更新 `UnifiedResourceProcessor.process()` 使用新流程 +- [ ] 测试:Git URL, GitHub URL, 本地 .zip 等场景 + +### Phase 3: 重构 HTMLParser 和 FeishuParser + +- [ ] 实现 `HTTPAccessor` +- [ ] 实现 `FeishuAccessor` +- [ ] 重构 `HTMLParser` 只负责解析本地 HTML 文件 +- [ ] 重构 `FeishuParser` 只负责解析下载后的内容 + +### Phase 4: 重构 UnifiedResourceProcessor + +**目标**:简化 `UnifiedResourceProcessor`,使用新的两层架构 + +- [ ] 重构 `UnifiedResourceProcessor.process()`: + ```python + async def process(self, source, ...): + # 第一阶段:获取数据 + local_resource = await accessor_registry.access(source, **kwargs) + + # 第二阶段:解析数据 + parse_result = await parser_registry.parse(local_resource, **kwargs) + + # 清理(如果需要) + # ... + + return parse_result + ``` +- [ ] 移除 `_process_url()`, `_process_file()`, `_process_directory()` 中的重复逻辑 + +### Phase 5: 扩展和优化 + +- [ ] 实现优先级机制解决冲突 +- [ ] 添加 `HybridParser` 支持 +- [ ] 编写迁移文档 +- [ ] 性能优化和测试覆盖 + +--- + +## 五、兼容性策略 + +### 5.1 向后兼容 + +1. **保持现有 API 不变**: + - `ParserRegistry` 接口保持不变 + - `registry.parse()` 仍然可以工作 + - 自定义 Parser 注册方式不变 + +2. **渐进式迁移**: + - 现有 Parser 可以继续使用 + - 新 Parser 鼓励使用新架构 + - 提供迁移指南 + +### 5.2 废弃策略 + +- 标记旧的 `CodeRepositoryParser` 等为 `@deprecated` +- 在 v6.0 或未来版本移除 + +--- + +## 六、文件结构变更 + +``` +openviking/parse/ +├── accessors/ [新增] +│ ├── __init__.py +│ ├── base.py # DataAccessor, LocalResource +│ ├── registry.py # AccessorRegistry +│ ├── git.py # GitAccessor +│ ├── http.py # HTTPAccessor +│ └── feishu.py # FeishuAccessor +├── parsers/ +│ ├── base_parser.py # 明确为 DataParser +│ ├── code/ # 保留但简化 +│ ├── markdown.py +│ ├── pdf.py +│ └── ... +├── registry.py # ParserRegistry (重构) +└── ... +``` + +--- + +## 七、测试计划 + +| 测试类型 | 测试内容 | +|---------|---------| +| 单元测试 | AccessorRegistry, 各 Accessor, 各 Parser | +| 集成测试 | 完整流程:add_resource → Accessor → Parser → TreeBuilder | +| 回归测试 | 确保现有功能不被破坏 | +| 冲突测试 | 测试优先级机制解决 .zip, URL 等冲突场景 | + +--- + +## 八、风险与应对 + +| 风险 | 影响 | 概率 | 应对措施 | +|-----|------|------|---------| +| 重构范围过大 | 高 | 中 | 分阶段实施,每阶段可独立发布 | +| 性能回退 | 中 | 低 | 保持缓存机制,性能基准测试 | +| 自定义插件受影响 | 高 | 低 | 保持向后兼容,提供迁移工具 | + +--- + +## 九、相关文档 + +- [解析系统 README](../../openviking/parse/parsers/README.md) +- [OpenViking 整体架构](../zh/concepts/01-architecture.md) diff --git a/examples/ov.conf.example b/examples/ov.conf.example index 456c995d5..b5227d734 100644 --- a/examples/ov.conf.example +++ b/examples/ov.conf.example @@ -157,10 +157,7 @@ "extract_text_only": false, "preserve_structure": true, "clean_html": true, - "extract_metadata": true, - "code_hosting_domains": ["github.com", "gitlab.com"], - "github_domains": ["github.com", "www.github.com"], - "gitlab_domains": ["gitlab.com", "www.gitlab.com"] + "extract_metadata": true }, "text": { "detect_language": true, diff --git a/openviking/parse/__init__.py b/openviking/parse/__init__.py index b266a1dae..e447710c1 100644 --- a/openviking/parse/__init__.py +++ b/openviking/parse/__init__.py @@ -13,7 +13,6 @@ scan_directory, ) from openviking.parse.parsers.base_parser import BaseParser -from openviking.parse.parsers.code import CodeRepositoryParser from openviking.parse.parsers.html import HTMLParser from openviking.parse.parsers.markdown import MarkdownParser from openviking.parse.parsers.pdf import PDFParser @@ -34,7 +33,6 @@ "MarkdownParser", "PDFParser", "HTMLParser", - "CodeRepositoryParser", "DocumentConverter", # Custom parser support "CustomParserProtocol", diff --git a/openviking/parse/accessors/__init__.py b/openviking/parse/accessors/__init__.py new file mode 100644 index 000000000..1836f30b5 --- /dev/null +++ b/openviking/parse/accessors/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +Data Accessors for OpenViking. + +This module provides the two-layer architecture for resource processing: +- DataAccessor: Fetches data from remote sources or special paths +- DataParser: Parses local files/directories (existing Parser system) +""" + +from .base import DataAccessor, LocalResource +from .feishu import FeishuAccessor +from .git import GitAccessor +from .http import HTTPAccessor +from .registry import ( + AccessorRegistry, + access, + get_accessor_registry, +) + +__all__ = [ + # Base classes + "DataAccessor", + "LocalResource", + # Registry + "AccessorRegistry", + "get_accessor_registry", + "access", + # Accessors + "GitAccessor", + "HTTPAccessor", + "FeishuAccessor", +] diff --git a/openviking/parse/accessors/base.py b/openviking/parse/accessors/base.py new file mode 100644 index 000000000..8d5b4c90e --- /dev/null +++ b/openviking/parse/accessors/base.py @@ -0,0 +1,191 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +Base classes for Data Accessors. + +Data Accessors are responsible for fetching data from remote sources +or special paths and making them available as local files/directories. +""" + +import os +import shutil +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Union + + +class SourceType: + """ + Enumeration of valid source types for LocalResource. + + Provides type safety and consistency across the system. + """ + + LOCAL = "local" + """Local file system resource.""" + + GIT = "git" + """Git repository (from GitAccessor).""" + + HTTP = "http" + """HTTP/HTTPS resource (from HTTPAccessor).""" + + FEISHU = "feishu" + """Feishu/Lark document (from FeishuAccessor).""" + + +@dataclass +class LocalResource: + """ + Represents a locally accessible resource. + + This is the output of the DataAccessor layer, containing the local + path to the resource along with metadata about its origin. + """ + + path: Path + """Local file/directory path to the resource.""" + + source_type: str + """Original source type: one of SourceType constants.""" + + original_source: str + """Original source string (URL, path, etc.).""" + + meta: Dict[str, Any] = field(default_factory=dict) + """Additional metadata (repo_name, branch, content_type, etc.).""" + + is_temporary: bool = True + """Whether this is a temporary resource that can be cleaned up after parsing.""" + + def cleanup(self) -> None: + """ + Clean up the local resource if it's temporary. + + Removes the file/directory from the local filesystem. + """ + if not self.is_temporary: + return + + if not self.path.exists(): + return + + try: + if self.path.is_dir(): + shutil.rmtree(self.path, ignore_errors=True) + else: + self.path.unlink(missing_ok=True) + except Exception as e: + from openviking_cli.utils.logger import get_logger + + logger = get_logger(__name__) + logger.warning(f"[LocalResource] Failed to cleanup resource {self.path}: {e}") + + def __enter__(self) -> "LocalResource": + """Support context manager protocol.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Support context manager protocol - cleanup on exit.""" + self.cleanup() + + +class DataAccessor(ABC): + """ + Abstract base class for data accessors. + + Data Accessors are responsible for: + - Detecting if they can handle a given source + - Fetching the data from the source to a local path + - Providing metadata about the source + - Cleaning up temporary resources when done + """ + + @abstractmethod + def can_handle(self, source: Union[str, Path]) -> bool: + """ + Check if this accessor can handle the given source. + + Args: + source: Source string (URL, path, etc.) or Path object + + Returns: + True if this accessor can handle the source + """ + pass + + @abstractmethod + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + Fetch the source and make it available locally. + + Args: + source: Source string (URL, path, etc.) or Path object + **kwargs: Additional accessor-specific arguments + + Returns: + LocalResource pointing to the locally available data + """ + pass + + @property + @abstractmethod + def priority(self) -> int: + """ + Priority of this accessor. + + Higher numbers mean higher priority. When multiple accessors + can handle the same source, the one with the highest priority wins. + + Standard priority levels: + - 100: Specific service (Feishu, etc.) + - 80: Version control (Git, etc.) + - 50: Generic protocols (HTTP, etc.) + - 10: Fallback accessors + """ + pass + + def cleanup(self, resource: LocalResource) -> None: + """ + Clean up the local resource. + + Default implementation calls resource.cleanup(). + Subclasses can override for custom cleanup logic. + + Args: + resource: The LocalResource to clean up + """ + resource.cleanup() + + def _create_temp_dir(self, prefix: str = "ov_accessor_") -> Path: + """ + Create a temporary directory for this accessor. + + Args: + prefix: Prefix for the temporary directory name + + Returns: + Path to the created temporary directory + """ + import tempfile + + temp_dir = tempfile.mkdtemp(prefix=prefix) + return Path(temp_dir) + + def _create_temp_file(self, suffix: str = "", prefix: str = "ov_accessor_") -> Path: + """ + Create a temporary file for this accessor. + + Args: + suffix: Suffix for the temporary file name + prefix: Prefix for the temporary file name + + Returns: + Path to the created temporary file + """ + import tempfile + + fd, temp_path = tempfile.mkstemp(suffix=suffix, prefix=prefix) + os.close(fd) + return Path(temp_path) diff --git a/openviking/parse/accessors/feishu.py b/openviking/parse/accessors/feishu.py new file mode 100644 index 000000000..a1cc6e0fa --- /dev/null +++ b/openviking/parse/accessors/feishu.py @@ -0,0 +1,646 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +Feishu/Lark Accessor. + +Fetches Feishu/Lark cloud documents using the lark-oapi SDK. +This is the DataAccessor layer extracted from FeishuParser. + +Note: This accessor requires the `lark-oapi` package. +Install with: pip install 'openviking[bot-feishu]' +""" + +import os +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Optional, Tuple, Union +from urllib.parse import urlparse + +from openviking_cli.utils.logger import get_logger + +from .base import DataAccessor, LocalResource, SourceType + +logger = get_logger(__name__) + + +def _getattr_safe(obj, key: str, default=None): + """Get attribute from SDK object or dict, with safe fallback.""" + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +@dataclass +class FeishuDocument: + """Result from fetching a Feishu document.""" + + doc_type: str # "docx" only for now + token: str + markdown_content: str + title: str + meta: Dict[str, Any] + + +class FeishuAccessor(DataAccessor): + """ + Accessor for Feishu/Lark cloud documents. + + Supports: + - Documents: https://*.feishu.cn/docx/{document_id} + - Wiki pages: https://*.feishu.cn/wiki/{token} (resolves to docx) + + Requires: + - lark-oapi package + - FEISHU_APP_ID and FEISHU_APP_SECRET environment variables, + or configuration in ov.conf + """ + + PRIORITY = 100 # Higher than Git/HTTP, very specific + + # Wiki obj_type normalization (API returns short names) + _WIKI_TYPE_MAP = {"doc": "docx", "sheet": "sheets", "bitable": "base"} + + # Attributes that skip processing (structural containers or metadata) + _SKIP_ATTRS = {"page", "table_cell", "quote_container", "grid", "grid_column"} + + # Attribute → special handler method (non-text blocks) + _SPECIAL_BLOCK_HANDLERS = { + "divider": "_handle_divider", + "image": "_handle_image", + "table": "_table_block_to_markdown", + } + + # Attribute → markdown prefix template for text-bearing blocks. + # "{text}" is replaced with extracted text content. + # Headings are handled dynamically (heading1-heading9 → # through #########). + _TEXT_FORMAT = { + "bullet": "- {text}", + "quote": "> {text}", + } + + # Known block_type integer → SDK attribute name mapping. + # Primary dispatch mechanism for reliable block detection. + # Source: Feishu OpenAPI documentation + lark-oapi SDK Block class. + _BLOCK_TYPE_TO_ATTR = { + 1: "page", + 2: "text", + 3: "heading1", + 4: "heading2", + 5: "heading3", + 6: "heading4", + 7: "heading5", + 8: "heading6", + 9: "heading7", + 10: "heading8", + 11: "heading9", + 12: "bullet", + 13: "ordered", + 14: "code", + 15: "quote", + 17: "todo", + 19: "callout", + 22: "divider", + 27: "image", + 31: "table", + 32: "table_cell", + 34: "quote_container", + } + + # All known content attribute names on SDK Block objects (for fallback detection). + _KNOWN_CONTENT_ATTRS = frozenset( + { + "page", + "text", + "heading1", + "heading2", + "heading3", + "heading4", + "heading5", + "heading6", + "heading7", + "heading8", + "heading9", + "bullet", + "ordered", + "code", + "quote", + "todo", + "callout", + "divider", + "image", + "table", + "table_cell", + "quote_container", + "equation", + "task", + "grid", + "grid_column", + } + ) + + def __init__(self): + """Initialize Feishu accessor.""" + self._client = None + self._config = None + + @property + def priority(self) -> int: + return self.PRIORITY + + def can_handle(self, source: Union[str, Path]) -> bool: + """ + Check if this accessor can handle the source. + + Handles Feishu/Lark cloud document URLs. + """ + source_str = str(source) + + # Only handle http/https URLs + if not source_str.startswith(("http://", "https://")): + return False + + return self._is_feishu_url(source_str) + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + Fetch a Feishu document and save to a temporary Markdown file. + + Args: + source: Feishu document URL + **kwargs: Additional arguments + + Returns: + LocalResource pointing to the temporary Markdown file + """ + source_str = str(source) + + try: + # Fetch the document and convert to Markdown + doc = await self._fetch_document(source_str) + + # Create temporary file + temp_file = tempfile.NamedTemporaryFile( + mode="w", + suffix=".md", + prefix="ov_feishu_", + delete=False, + ) + temp_file.write(doc.markdown_content) + temp_file.close() + + # Build metadata + meta = { + "feishu_doc_type": doc.doc_type, + "feishu_token": doc.token, + "feishu_title": doc.title, + **doc.meta, + } + + return LocalResource( + path=Path(temp_file.name), + source_type=SourceType.FEISHU, + original_source=source_str, + meta=meta, + is_temporary=True, + ) + + except Exception as e: + logger.error(f"[FeishuAccessor] Failed to access {source}: {e}", exc_info=True) + raise + + async def _fetch_document(self, url: str) -> FeishuDocument: + """ + Fetch a Feishu document and convert to Markdown. + + This method extracts and adapts the logic from FeishuParser.parse(). + """ + import asyncio + + doc_type, token = self._parse_feishu_url(url) + title = None + meta = {} + + if doc_type == "wiki": + # Resolve wiki node to actual document type + real_type, real_token, title = await asyncio.to_thread(self._resolve_wiki_node, token) + doc_type, token = real_type, real_token + meta["wiki_resolved"] = True + + # Only docx is supported + if doc_type != "docx": + raise ValueError( + f"Unsupported Feishu document type: {doc_type}. " + f"Only docx is supported in this version." + ) + + # Call the handler (in thread pool since lark-oapi is sync) + markdown, doc_title = await asyncio.to_thread(self._parse_docx, token) + + if title: + doc_title = title + + meta["original_url"] = url + + return FeishuDocument( + doc_type=doc_type, + token=token, + markdown_content=markdown, + title=doc_title, + meta=meta, + ) + + @staticmethod + def _is_feishu_url(url: str) -> bool: + """Check if URL is a Feishu/Lark cloud document.""" + parsed = urlparse(url) + host = parsed.hostname or "" + path = parsed.path + is_feishu_domain = host.endswith(".feishu.cn") or host.endswith(".larksuite.com") + has_doc_path = any(path == f"/{t}" or path.startswith(f"/{t}/") for t in ("docx", "wiki")) + return is_feishu_domain and has_doc_path + + @staticmethod + def _parse_feishu_url(url: str) -> Tuple[str, str]: + """ + Extract doc_type and token from Feishu URL. + + Returns: + (doc_type, token) e.g. ("docx", "doxcnABC123") + """ + parsed = urlparse(url) + path_parts = [p for p in parsed.path.split("/") if p] + if len(path_parts) < 2: + raise ValueError(f"Cannot parse Feishu URL: {url}") + doc_type = path_parts[0] # docx, wiki + token = path_parts[1] + return doc_type, token + + # ========== Configuration & Client ========== + + def _get_config(self): + """Get FeishuConfig from OpenViking config.""" + if self._config is None: + from openviking_cli.utils.config import get_openviking_config + + self._config = get_openviking_config().feishu + return self._config + + def _get_client(self): + """Lazy-init lark-oapi client.""" + if self._client is None: + try: + import lark_oapi as lark + except ImportError: + raise ImportError( + "lark-oapi is required for Feishu document parsing. " + "Install it with: pip install 'openviking[bot-feishu]'" + ) + config = self._get_config() + app_id = config.app_id or os.getenv("FEISHU_APP_ID", "") + app_secret = config.app_secret or os.getenv("FEISHU_APP_SECRET", "") + if not app_id or not app_secret: + raise ValueError( + "Feishu credentials not configured. Set FEISHU_APP_ID and " + "FEISHU_APP_SECRET environment variables, or configure in ov.conf." + ) + domain = config.domain or "https://open.feishu.cn" + self._client = ( + lark.Client.builder().app_id(app_id).app_secret(app_secret).domain(domain).build() + ) + return self._client + + # ========== Wiki Resolution ========== + + def _resolve_wiki_node(self, token: str) -> Tuple[str, str, Optional[str]]: + """ + Resolve wiki token to actual document type, token, and title. + + Returns: + (doc_type, obj_token, title) + """ + from lark_oapi.api.wiki.v2 import GetNodeSpaceRequest + + client = self._get_client() + request = GetNodeSpaceRequest.builder().token(token).build() + response = client.wiki.v2.space.get_node(request) + if not response.success(): + raise RuntimeError( + f"Failed to resolve wiki node {token}: code={response.code}, msg={response.msg}" + ) + node = response.data.node + obj_type = node.obj_type or "" + obj_token = node.obj_token or "" + title = node.title + + # Normalize type names + doc_type = self._WIKI_TYPE_MAP.get(obj_type, obj_type) + + return doc_type, obj_token, title + + # ========== Docx Parsing ========== + + def _parse_docx(self, document_id: str) -> Tuple[str, str]: + """ + Fetch all blocks and convert to Markdown. + + Returns: + (markdown_content, document_title) + """ + blocks = self._fetch_all_blocks(document_id) + if not blocks: + return "", "Untitled" + + # Build block lookup by block_id + block_map = {b.block_id: b for b in blocks} + + # Find title from page block + doc_title = "Untitled" + for b in blocks: + if b.page is not None: + if b.page.elements: + doc_title = self._extract_text_from_elements(b.page.elements) + break + + # Convert blocks to markdown + markdown_lines = [] + ordered_counter: Dict[str, int] = {} + + for block in blocks: + if block.page is not None: + continue # Skip page container + + line = self._block_to_markdown( + block, block_map, ordered_counter, document_id=document_id + ) + if line is not None: + markdown_lines.append(line) + + markdown = "\n\n".join(markdown_lines) + + if doc_title and doc_title != "Untitled": + markdown = f"# {doc_title}\n\n{markdown}" + + return markdown, doc_title + + def _fetch_all_blocks(self, document_id: str) -> list: + """Fetch all blocks with pagination. Returns list of SDK block objects.""" + from lark_oapi.api.docx.v1 import ListDocumentBlockRequest + + client = self._get_client() + all_blocks = [] + page_token = None + + while True: + builder = ( + ListDocumentBlockRequest.builder() + .document_id(document_id) + .page_size(500) + .document_revision_id(-1) + ) + if page_token: + builder = builder.page_token(page_token) + + request = builder.build() + response = client.docx.v1.document_block.list(request) + + if not response.success(): + raise RuntimeError( + f"Failed to fetch blocks for {document_id}: " + f"code={response.code}, msg={response.msg}" + ) + + items = response.data.items or [] + all_blocks.extend(items) + + if not response.data.has_more: + break + page_token = response.data.page_token + + return all_blocks + + # ========== Block -> Markdown Conversion ========== + + def _detect_block_attr(self, block) -> Optional[str]: + """Detect which content attribute is populated on a block object. + + Uses block_type integer as the primary dispatch (reliable), falling + back to attribute inspection over a known whitelist for unknown types. + """ + # Primary: lookup by block_type integer + block_type = getattr(block, "block_type", None) + if block_type is not None: + attr = self._BLOCK_TYPE_TO_ATTR.get(block_type) + if attr: + return attr + + # Fallback: scan known content attributes for unknown block types + for attr in self._KNOWN_CONTENT_ATTRS: + if getattr(block, attr, None) is not None: + return attr + return None + + def _block_to_markdown( + self, block, block_map: Dict, ordered_counter: Dict[str, int], document_id: str = "" + ) -> Optional[str]: + """Convert a single SDK block object to markdown string. + + Uses block_type integer for primary dispatch, with attribute whitelist + fallback for unknown types. Formatting is data-driven via _TEXT_FORMAT + and _SPECIAL_BLOCK_HANDLERS tables. + """ + attr = self._detect_block_attr(block) + + if attr is None: + return None + + # Skip structural containers (processed via their children) + if attr in self._SKIP_ATTRS: + return None + + # Reset ordered list counter when any non-ordered block appears + if attr != "ordered": + parent_id = block.parent_id or "" + if parent_id in ordered_counter: + del ordered_counter[parent_id] + + # Special blocks (non-text: divider, image, table) + special_handler = self._SPECIAL_BLOCK_HANDLERS.get(attr) + if special_handler: + return getattr(self, special_handler)(block, block_map, document_id=document_id) + + # --- Text-bearing blocks: extract elements, apply formatting --- + content_obj = getattr(block, attr, None) + if not content_obj or not hasattr(content_obj, "elements") or not content_obj.elements: + return None + + text = self._extract_text_from_elements(content_obj.elements) + if not text: + return None + + # Headings: heading1 -> #, heading2 -> ##, ... + if attr.startswith("heading"): + level = int(attr.replace("heading", "") or "1") + return f"{'#' * level} {text}" + + # Ordered list (needs counter state) + if attr == "ordered": + parent_id = block.parent_id or "" + counter = ordered_counter.get(parent_id, 0) + 1 + ordered_counter[parent_id] = counter + return f"{counter}. {text}" + + # Code block (needs language from style) + if attr == "code": + lang = "" + if hasattr(content_obj, "style") and content_obj.style: + lang = str(getattr(content_obj.style, "language", "") or "") + return f"```{lang}\n{text}\n```" + + # Todo (needs done state from style) + if attr == "todo": + done = False + if hasattr(content_obj, "style") and content_obj.style: + done = getattr(content_obj.style, "done", False) + checkbox = "[x]" if done else "[ ]" + return f"- {checkbox} {text}" + + # Simple template formatting (bullet, quote, etc.) + fmt = self._TEXT_FORMAT.get(attr) + if fmt: + return fmt.format(text=text) + + # Default: return plain text (covers callout, equation, task, unknown, etc.) + return text + + @staticmethod + def _handle_divider(block, block_map: Dict = None, **_) -> str: + """Convert divider block to markdown.""" + return "---" + + @staticmethod + def _handle_image(block, block_map: Dict = None, **_) -> Optional[str]: + """Convert image block to markdown.""" + image = block.image + if not image: + return None + file_token = image.token or "" + alt_text = getattr(image, "alt", "") or "image" + return f"![{alt_text}](feishu://image/{file_token})" + + def _extract_block_text(self, block, attr_name: str) -> str: + """Extract text from a block's named attribute (e.g. block.text, block.heading2).""" + content_obj = getattr(block, attr_name, None) + if content_obj and hasattr(content_obj, "elements") and content_obj.elements: + return self._extract_text_from_elements(content_obj.elements) + return "" + + def _extract_text_from_elements(self, elements) -> str: + """Convert Feishu TextElement SDK objects to formatted text.""" + if not elements: + return "" + parts = [] + for element in elements: + # TextRun + text_run = element.text_run + if text_run: + content = text_run.content or "" + style = text_run.text_element_style + content = self._apply_text_style(content, style) + parts.append(content) + continue + + # MentionUser + mention_user = element.mention_user + if mention_user: + user_id = _getattr_safe(mention_user, "user_id", "user") + parts.append(f"@{user_id}") + continue + + # MentionDoc + mention_doc = element.mention_doc + if mention_doc: + title = _getattr_safe(mention_doc, "title", "document") + url = _getattr_safe(mention_doc, "url", "") + parts.append(f"[{title}]({url})" if url else str(title)) + continue + + # Equation + equation = element.equation + if equation: + parts.append(f"${_getattr_safe(equation, 'content', '')}$") + continue + + return "".join(parts) + + @staticmethod + def _apply_text_style(text: str, style) -> str: + """Apply markdown formatting based on TextElementStyle SDK object.""" + if not text or not style: + return text + # inline_code (SDK uses 'inline_code', not 'code_inline') + if getattr(style, "inline_code", False): + return f"`{text}`" + # link + link = getattr(style, "link", None) + if link: + url = _getattr_safe(link, "url", "") + if url: + text = f"[{text}]({url})" + if getattr(style, "bold", False): + text = f"**{text}**" + if getattr(style, "italic", False): + text = f"*{text}*" + if getattr(style, "strikethrough", False): + text = f"~~{text}~~" + return text + + def _table_block_to_markdown(self, block, block_map: Dict, **_) -> Optional[str]: + """Convert table block to markdown table.""" + table = block.table + children = block.children + if not table or not children: + return None + + prop = table.property + if not prop: + return None + row_size = prop.row_size or 0 + col_size = prop.column_size or 0 + if not row_size or not col_size: + return None + + rows = [] + for row_idx in range(row_size): + row = [] + for col_idx in range(col_size): + cell_idx = row_idx * col_size + col_idx + if cell_idx < len(children): + cell_block_id = children[cell_idx] + cell_block = block_map.get(cell_block_id) + cell_text = self._extract_cell_text(cell_block, block_map) + row.append(cell_text) + else: + row.append("") + rows.append(row) + + from openviking.parse.base import format_table_to_markdown + + return format_table_to_markdown(rows, has_header=True) if rows else None + + def _extract_cell_text(self, cell_block, block_map: Dict) -> str: + """Extract text from a table cell block by reading its children.""" + if not cell_block or not cell_block.children: + return "" + texts = [] + for child_id in cell_block.children: + child = block_map.get(child_id) + if not child: + continue + # Use attribute-driven detection to find text in any block type + attr = self._detect_block_attr(child) + if attr: + text = self._extract_block_text(child, attr) + if text: + texts.append(text) + return " ".join(texts) diff --git a/openviking/parse/accessors/git.py b/openviking/parse/accessors/git.py new file mode 100644 index 000000000..1590bca0d --- /dev/null +++ b/openviking/parse/accessors/git.py @@ -0,0 +1,581 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +Git Repository Accessor. + +Fetches git repositories and zip archives of codebases to local directories. +This is the DataAccessor layer extracted from CodeRepositoryParser. +""" + +import asyncio +import os +import shutil +import stat +import tempfile +import urllib.request +import zipfile +from pathlib import Path, PurePosixPath +from typing import Any, Optional, Tuple, Union +from urllib.parse import unquote, urlparse + +from openviking.utils import is_github_url, is_gitlab_url, parse_code_hosting_url +from openviking.utils.code_hosting_utils import ( + is_code_hosting_url, + is_git_repo_url, + validate_git_ssh_uri, +) +from openviking_cli.utils.config import get_openviking_config +from openviking_cli.utils.logger import get_logger + +from .base import DataAccessor, LocalResource, SourceType + +logger = get_logger(__name__) + + +class GitAccessor(DataAccessor): + """ + Accessor for Git repositories and code archives. + + Supports: + - Git SSH URLs: git@github.com:org/repo.git + - Git HTTP/HTTPS URLs: https://github.com/org/repo.git + - GitHub/GitLab repository pages: https://github.com/org/repo + - Local ZIP files (containing code repositories) + """ + + PRIORITY = 80 # Higher than generic HTTP, lower than specific services + + @property + def priority(self) -> int: + return self.PRIORITY + + def can_handle(self, source: Union[str, Path]) -> bool: + """ + Check if this accessor can handle the source. + + Handles: + - git@ SSH URLs + - git://, ssh:// URLs + - GitHub/GitLab repository URLs (http/https) + - Local paths ending with .git or .zip + """ + source_str = str(source) + + # Git protocol URLs + if source_str.startswith(("git@", "git://", "ssh://")): + try: + if source_str.startswith("git@"): + validate_git_ssh_uri(source_str) + return is_code_hosting_url(source_str) + except ValueError: + return False + + # HTTP/HTTPS URLs to code hosting repos + if source_str.startswith(("http://", "https://")): + return is_git_repo_url(source_str) + + # Local .git or .zip files + if isinstance(source, Path): + path = source + else: + path = Path(source_str) + + suffix = path.suffix.lower() + return suffix in (".git", ".zip") + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + Fetch the git repository or code archive to a local directory. + + Args: + source: Repository URL (git/http) or local zip path + **kwargs: Additional arguments (branch, commit, etc.) + + Returns: + LocalResource pointing to the local directory + """ + source_str = str(source) + temp_local_dir = None + branch = kwargs.get("branch") or kwargs.get("ref") + commit = kwargs.get("commit") + + try: + # Create local temp directory + temp_local_dir = tempfile.mkdtemp(prefix="ov_git_") + logger.info(f"[GitAccessor] Created local temp dir for git: {temp_local_dir}") + + # Fetch content (Clone or Extract) + repo_name = "repository" + local_dir = Path(temp_local_dir) + + if source_str.startswith("git@"): + # git@ SSH URL + repo_name = await self._git_clone( + source_str, + temp_local_dir, + branch=branch, + commit=commit, + ) + elif source_str.startswith(("http://", "https://", "git://", "ssh://")): + repo_url, branch, commit = self._parse_repo_source(source_str, **kwargs) + if self._is_github_url(repo_url): + # Use GitHub ZIP API + local_dir, repo_name = await self._github_zip_download( + repo_url, branch or commit, temp_local_dir + ) + elif self._is_gitlab_url(repo_url): + # Use GitLab ZIP API + local_dir, repo_name = await self._gitlab_zip_download( + repo_url, branch or commit, temp_local_dir + ) + else: + # Non-GitHub/GitLab URL: use git clone + repo_name = await self._git_clone( + repo_url, + temp_local_dir, + branch=branch, + commit=commit, + ) + elif str(source).endswith(".zip"): + repo_name = await self._extract_zip(source_str, temp_local_dir) + else: + raise ValueError(f"Unsupported source for GitAccessor: {source}") + + # Build metadata + meta = {"repo_name": repo_name} + if branch: + meta["repo_ref"] = branch + if commit: + meta["repo_commit"] = commit + + return LocalResource( + path=local_dir, + source_type=SourceType.GIT, + original_source=source_str, + meta=meta, + is_temporary=True, + ) + + except Exception as e: + logger.error( + f"[GitAccessor] Failed to access git repository {source}: {e}", exc_info=True + ) + # Clean up on error + if temp_local_dir and os.path.exists(temp_local_dir): + try: + shutil.rmtree(temp_local_dir, ignore_errors=True) + except Exception: + pass + raise + + def _parse_repo_source(self, source: str, **kwargs) -> Tuple[str, Optional[str], Optional[str]]: + """Parse repository source URL to extract branch/commit info.""" + branch = kwargs.get("branch") or kwargs.get("ref") + commit = kwargs.get("commit") + repo_url = source + if source.startswith(("http://", "https://", "git://", "ssh://")): + parsed = urlparse(source) + repo_url = parsed._replace(query="", fragment="").geturl() + if commit is None or branch is None: + branch, commit = self._extract_ref_from_url(parsed, branch, commit) + repo_url = self._normalize_repo_url(repo_url) + return repo_url, branch, commit + + def _extract_ref_from_url( + self, + parsed: Any, + branch: Optional[str], + commit: Optional[str], + ) -> Tuple[Optional[str], Optional[str]]: + """Extract branch/commit from URL path.""" + if parsed.path: + path_branch, path_commit = self._parse_ref_from_path(parsed.path) + commit = path_commit or commit + # If commit is present in path, ignore branch entirely + if commit: + branch = None + else: + branch = branch or path_branch + return branch, commit + + def _parse_ref_from_path(self, path: str) -> Tuple[Optional[str], Optional[str]]: + """Parse ref from URL path components.""" + parts = [p for p in path.split("/") if p] + branch = None + commit = None + if "commit" in parts: + idx = parts.index("commit") + if idx + 1 < len(parts): + commit = parts[idx + 1] + if "tree" in parts: + idx = parts.index("tree") + if idx + 1 < len(parts): + ref = unquote(parts[idx + 1]) + if self._looks_like_sha(ref): + commit = ref + else: + branch = ref + return branch, commit + + @staticmethod + def _looks_like_sha(ref: str) -> bool: + """Return True if ref looks like a git commit SHA (7-40 hex chars).""" + return 7 <= len(ref) <= 40 and all(c in "0123456789abcdefABCDEF" for c in ref) + + def _normalize_repo_url(self, url: str) -> str: + """Normalize repository URL to base form.""" + if url.startswith(("http://", "https://", "git://", "ssh://")): + parsed = urlparse(url) + path_parts = [p for p in parsed.path.split("/") if p] + base_parts = path_parts + git_index = next((i for i, p in enumerate(path_parts) if p.endswith(".git")), None) + if git_index is not None: + base_parts = path_parts[: git_index + 1] + + config = get_openviking_config() + if ( + parsed.netloc in config.code.github_domains + config.code.gitlab_domains + and len(path_parts) >= 2 + ): + base_parts = path_parts[:2] + base_path = "/" + "/".join(base_parts) + return parsed._replace(path=base_path, query="", fragment="").geturl() + return url + + def _get_repo_name(self, url: str) -> str: + """Get repository name with organization for GitHub/GitLab URLs.""" + # First try to parse as code hosting URL + parsed_org_repo = parse_code_hosting_url(url) + if parsed_org_repo: + return parsed_org_repo + + # Fallback for other URLs + name_source = url + if url.startswith(("http://", "https://", "git://", "ssh://")): + name_source = urlparse(url).path.rstrip("/") + elif ":" in url and not url.startswith("file://"): + name_source = url.split(":", 1)[1] + + # Original logic for non-GitHub/GitLab URLs + name = name_source.rstrip("/").split("/")[-1] + if name.endswith(".git"): + name = name[:-4] + name = "".join(c if c.isalnum() or c in "-_" else "_" for c in name) + return name or "repository" + + async def _run_git(self, args: list[str], cwd: Optional[str] = None) -> str: + """Run a git command.""" + proc = await asyncio.create_subprocess_exec( + *args, + cwd=cwd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + error_msg = stderr.decode().strip() + user_msg = "Git command failed." + if "Could not resolve hostname" in error_msg: + user_msg = ( + "Git command failed: could not resolve hostname. Check the URL or your network." + ) + elif "Permission denied" in error_msg or "publickey" in error_msg: + user_msg = ( + "Git command failed: authentication error. Check your SSH keys or credentials." + ) + raise RuntimeError( + f"{user_msg} Command: git {' '.join(args[1:])}. Details: {error_msg}" + ) + return stdout.decode().strip() + + async def _has_commit(self, repo_dir: str, commit: str) -> bool: + """Check if a commit exists in the repository.""" + try: + await self._run_git(["git", "-C", repo_dir, "rev-parse", "--verify", commit]) + return True + except RuntimeError: + return False + + @staticmethod + def _is_github_url(url: str) -> bool: + """Return True for github.com URLs (supports ZIP archive API).""" + return is_github_url(url) + + @staticmethod + def _is_gitlab_url(url: str) -> bool: + """Return True for gitlab.com URLs (supports ZIP archive API).""" + return is_gitlab_url(url) + + async def _git_clone( + self, + url: str, + target_dir: str, + branch: Optional[str] = None, + commit: Optional[str] = None, + ) -> str: + """Clone a git repository into target_dir; return the repo name.""" + name = self._get_repo_name(url) + logger.info(f"[GitAccessor] Cloning {url} to {target_dir}...") + + clone_args = [ + "git", + "clone", + "--depth", + "1", + "--recursive", + ] + if branch and not commit: + clone_args.extend(["--branch", branch]) + clone_args.extend([url, target_dir]) + await self._run_git(clone_args) + + if commit: + try: + await self._run_git(["git", "-C", target_dir, "fetch", "origin", commit]) + except RuntimeError: + try: + await self._run_git( + ["git", "-C", target_dir, "fetch", "--all", "--tags", "--prune"] + ) + except RuntimeError: + pass + ok = await self._has_commit(target_dir, commit) + if not ok: + try: + await self._run_git( + ["git", "-C", target_dir, "fetch", "--unshallow", "origin"] + ) + except RuntimeError: + pass + ok = await self._has_commit(target_dir, commit) + if not ok: + await self._run_git( + [ + "git", + "-C", + target_dir, + "fetch", + "origin", + "+refs/heads/*:refs/remotes/origin/*", + ] + ) + ok = await self._has_commit(target_dir, commit) + if not ok: + raise RuntimeError(f"Failed to fetch commit {commit} from {url}") + await self._run_git(["git", "-C", target_dir, "checkout", commit]) + + return name + + async def _github_zip_download( + self, + repo_url: str, + branch: Optional[str], + target_dir: str, + ) -> Tuple[Path, str]: + """Download a GitHub repo as a ZIP archive and extract it.""" + repo_name = self._get_repo_name(repo_url) + + # Build archive URL from owner/repo path components. + parsed = urlparse(repo_url) + path_parts = [p for p in parsed.path.split("/") if p] + owner = path_parts[0] + repo_raw = path_parts[1] + # Strip .git suffix for the archive URL + repo_slug = repo_raw[:-4] if repo_raw.endswith(".git") else repo_raw + + if branch: + zip_url = f"https://github.com/{owner}/{repo_slug}/archive/{branch}.zip" + else: + zip_url = f"https://github.com/{owner}/{repo_slug}/archive/HEAD.zip" + + logger.info(f"[GitAccessor] Downloading GitHub ZIP: {zip_url}") + + zip_path = os.path.join(target_dir, "_archive.zip") + extract_dir = os.path.join(target_dir, "_extracted") + os.makedirs(extract_dir, exist_ok=True) + + # Download (blocking HTTP; run in thread pool) + def _download() -> None: + headers = {"User-Agent": "OpenViking"} + github_token = os.environ.get("GITHUB_TOKEN") + if github_token: + headers["Authorization"] = f"token {github_token}" + + req = urllib.request.Request(zip_url, headers=headers) + with urllib.request.urlopen(req, timeout=1800) as resp, open(zip_path, "wb") as f: + shutil.copyfileobj(resp, f) + + try: + await asyncio.to_thread(_download) + except Exception as exc: + raise RuntimeError(f"Failed to download GitHub ZIP {zip_url}: {exc}") + + # Safe extraction with Zip Slip validation + target = Path(extract_dir).resolve() + with zipfile.ZipFile(zip_path, "r") as zf: + for info in zf.infolist(): + mode = info.external_attr >> 16 + if info.is_dir() or stat.S_ISDIR(mode): + continue + if stat.S_ISLNK(mode): + logger.warning( + f"[GitAccessor] Skipping symlink entry in GitHub ZIP: {info.filename}" + ) + continue + raw = info.filename.replace("\\", "/") + raw_parts = [p for p in raw.split("/") if p] + if ".." in raw_parts: + raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") + if PurePosixPath(raw).is_absolute(): + raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") + extracted = Path(zf.extract(info, extract_dir)).resolve() + if not extracted.is_relative_to(target): + extracted.unlink(missing_ok=True) + raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") + + # Remove downloaded archive to free disk space. + try: + os.unlink(zip_path) + except OSError: + pass + + # GitHub ZIPs have a single top-level directory + top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()] + content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir) + + logger.info(f"[GitAccessor] GitHub ZIP extracted to {content_dir} ({repo_name})") + return content_dir, repo_name + + async def _gitlab_zip_download( + self, + repo_url: str, + branch: Optional[str], + target_dir: str, + ) -> Tuple[Path, str]: + """Download a GitLab repo as a ZIP archive and extract it.""" + repo_name = self._get_repo_name(repo_url) + + # Build archive URL from owner/repo path components. + # GitLab archive URL format: https://gitlab.com/{owner}/{repo}/-/archive/{ref}/{repo}-{ref}.zip + parsed = urlparse(repo_url) + path_parts = [p for p in parsed.path.split("/") if p] + owner = path_parts[0] + repo_raw = path_parts[1] + # Strip .git suffix for the archive URL + repo_slug = repo_raw[:-4] if repo_raw.endswith(".git") else repo_raw + + ref = branch or "HEAD" + # GitLab uses the format: /{owner}/{repo}/-/archive/{ref}/{repo}-{ref}.zip + zip_url = f"{parsed.scheme}://{parsed.netloc}/{owner}/{repo_slug}/-/archive/{ref}/{repo_slug}-{ref}.zip" + + logger.info(f"[GitAccessor] Downloading GitLab ZIP: {zip_url}") + + zip_path = os.path.join(target_dir, "_archive.zip") + extract_dir = os.path.join(target_dir, "_extracted") + os.makedirs(extract_dir, exist_ok=True) + + # Download (blocking HTTP; run in thread pool) + def _download() -> None: + headers = {"User-Agent": "OpenViking"} + + req = urllib.request.Request(zip_url, headers=headers) + with urllib.request.urlopen(req, timeout=1800) as resp, open(zip_path, "wb") as f: + shutil.copyfileobj(resp, f) + + try: + await asyncio.to_thread(_download) + except Exception as exc: + raise RuntimeError(f"Failed to download GitLab ZIP {zip_url}: {exc}") + + # Safe extraction with Zip Slip validation + target = Path(extract_dir).resolve() + with zipfile.ZipFile(zip_path, "r") as zf: + for info in zf.infolist(): + mode = info.external_attr >> 16 + if info.is_dir() or stat.S_ISDIR(mode): + continue + if stat.S_ISLNK(mode): + logger.warning( + f"[GitAccessor] Skipping symlink entry in GitLab ZIP: {info.filename}" + ) + continue + raw = info.filename.replace("\\", "/") + raw_parts = [p for p in raw.split("/") if p] + if ".." in raw_parts: + raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") + if PurePosixPath(raw).is_absolute(): + raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") + extracted = Path(zf.extract(info, extract_dir)).resolve() + if not extracted.is_relative_to(target): + extracted.unlink(missing_ok=True) + raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") + + # Remove downloaded archive to free disk space. + try: + os.unlink(zip_path) + except OSError: + pass + + # GitLab ZIPs have a single top-level directory: {repo}-{ref}/ + top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()] + content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir) + + logger.info(f"[GitAccessor] GitLab ZIP extracted to {content_dir} ({repo_name})") + return content_dir, repo_name + + async def _extract_zip(self, zip_path: str, target_dir: str) -> str: + """Extract a local zip file into target_dir.""" + if zip_path.startswith(("http://", "https://")): + raise NotImplementedError("Zip URL download not yet implemented in GitAccessor") + + path = Path(zip_path) + name = path.stem + + with zipfile.ZipFile(zip_path, "r") as zip_ref: + target = Path(target_dir).resolve() + for info in zip_ref.infolist(): + mode = info.external_attr >> 16 + # Skip directory entries + if info.is_dir() or stat.S_ISDIR(mode): + continue + # Skip symlink entries + if stat.S_ISLNK(mode): + logger.warning(f"[GitAccessor] Skipping symlink entry in zip: {info.filename}") + continue + # Reject entries with suspicious raw path components + raw = info.filename.replace("\\", "/") + raw_parts = [p for p in raw.split("/") if p] + if ".." in raw_parts: + raise ValueError(f"Zip Slip detected: entry {info.filename!r} contains '..'") + if PurePosixPath(raw).is_absolute() or (len(raw) >= 2 and raw[1] == ":"): + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} is an absolute path" + ) + # Normalize and verify + arcname = info.filename.replace("/", os.sep) + if os.path.altsep: + arcname = arcname.replace(os.path.altsep, os.sep) + arcname = os.path.splitdrive(arcname)[1] + arcname = os.sep.join(p for p in arcname.split(os.sep) if p not in ("", ".", "..")) + if not arcname: + continue + member_path = (Path(target_dir) / arcname).resolve() + if not member_path.is_relative_to(target): + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} escapes target directory" + ) + # Extract and verify + extracted = Path(zip_ref.extract(info, target_dir)).resolve() + if not extracted.is_relative_to(target): + # Best-effort cleanup + try: + extracted.unlink(missing_ok=True) + except OSError as cleanup_err: + logger.warning( + f"[GitAccessor] Failed to clean up escaped file {extracted}: {cleanup_err}" + ) + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} escapes target directory" + ) + + return name diff --git a/openviking/parse/accessors/http.py b/openviking/parse/accessors/http.py new file mode 100644 index 000000000..1d228889a --- /dev/null +++ b/openviking/parse/accessors/http.py @@ -0,0 +1,228 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +HTTP URL Accessor. + +Fetches HTTP/HTTPS URLs and makes them available as local files. +This is the DataAccessor layer extracted from HTMLParser. +""" + +import tempfile +from pathlib import Path +from typing import Optional, Union +from urllib.parse import unquote, urlparse + +from openviking.parse.base import lazy_import +from openviking.utils.network_guard import build_httpx_request_validation_hooks +from openviking_cli.utils.logger import get_logger + +from .base import DataAccessor, LocalResource, SourceType + +logger = get_logger(__name__) + + +class HTTPAccessor(DataAccessor): + """ + Accessor for HTTP/HTTPS URLs. + + Features: + - Downloads web pages to local HTML files + - Downloads files (PDF, Markdown, etc.) to local files + - Supports GitHub/GitLab blob to raw URL conversion + - Follows redirects + - Network guard integration + - Detailed error classification (network, timeout, auth, etc.) + """ + + PRIORITY = 50 # Lower than GitAccessor, higher than fallback + + DEFAULT_USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ) + + def __init__( + self, + timeout: float = 30.0, + user_agent: Optional[str] = None, + ): + """Initialize HTTP accessor.""" + self.timeout = timeout + self.user_agent = user_agent or self.DEFAULT_USER_AGENT + + @property + def priority(self) -> int: + return self.PRIORITY + + def can_handle(self, source: Union[str, Path]) -> bool: + """ + Check if this accessor can handle the source. + + Handles any HTTP/HTTPS URL. + NOTE: GitAccessor and FeishuAccessor have higher priority + and will be checked first for their specific URL types. + """ + source_str = str(source) + + # Only handle http/https URLs + return source_str.startswith(("http://", "https://")) + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + Fetch the HTTP URL to a local file. + + Args: + source: HTTP/HTTPS URL + **kwargs: Additional arguments (request_validator, etc.) + + Returns: + LocalResource pointing to the downloaded file + """ + source_str = str(source) + request_validator = kwargs.get("request_validator") + + # Download the URL + temp_path = await self._download_url( + source_str, + request_validator=request_validator, + ) + + # Build metadata + meta = { + "url": source_str, + "downloaded": True, + } + + return LocalResource( + path=Path(temp_path), + source_type=SourceType.HTTP, + original_source=source_str, + meta=meta, + is_temporary=True, + ) + + @staticmethod + def _extract_filename_from_url(url: str) -> str: + """ + Extract and URL-decode the original filename from a URL. + + Args: + url: URL to extract filename from + + Returns: + Decoded filename (e.g., "schemas.py" from ".../schemas.py") + Falls back to "download" if no filename can be extracted. + """ + parsed = urlparse(url) + # URL-decode path to handle encoded characters (e.g., %E7%99%BE -> Chinese chars) + decoded_path = unquote(parsed.path) + basename = Path(decoded_path).name + return basename if basename else "download" + + async def _download_url( + self, + url: str, + request_validator=None, + ) -> str: + """ + Download URL content to a temporary file. + + Args: + url: URL to download + request_validator: Optional network request validator + + Returns: + Path to the temporary file + """ + httpx = lazy_import("httpx") + + # Convert GitHub/GitLab blob URLs to raw + url = self._convert_to_raw_url(url) + + # Determine file extension from URL (decode first to handle encoded paths) + parsed = urlparse(url) + decoded_path = unquote(parsed.path) + ext = Path(decoded_path).suffix or ".html" + + # Create temp file + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext) + temp_path = temp_file.name + temp_file.close() + + try: + # Download content + client_kwargs = { + "timeout": self.timeout, + "follow_redirects": True, + } + event_hooks = build_httpx_request_validation_hooks(request_validator) + if event_hooks: + client_kwargs["event_hooks"] = event_hooks + client_kwargs["trust_env"] = False + + async with httpx.AsyncClient(**client_kwargs) as client: + headers = {"User-Agent": self.user_agent} + try: + response = await client.get(url, headers=headers) + response.raise_for_status() + except httpx.ConnectError as e: + user_msg = "HTTP request failed: could not connect to server. Check the URL or your network." + raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e + except httpx.TimeoutException as e: + user_msg = "HTTP request failed: timeout. The server took too long to respond." + raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e + except httpx.HTTPStatusError as e: + status_code = e.response.status_code if e.response else "unknown" + if status_code == 401 or status_code == 403: + user_msg = f"HTTP request failed: authentication error ({status_code}). Check your credentials or permissions." + elif status_code == 404: + user_msg = f"HTTP request failed: not found ({status_code}). The URL may be invalid or the resource was removed." + elif 500 <= status_code < 600: + user_msg = f"HTTP request failed: server error ({status_code}). The server encountered an error." + else: + user_msg = f"HTTP request failed: status code {status_code}." + raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e + except Exception as e: + user_msg = "HTTP request failed: unexpected error." + raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e + + # Write to temp file + Path(temp_path).write_bytes(response.content) + + return temp_path + except Exception: + # Clean up on error + try: + p = Path(temp_path) + if p.exists(): + p.unlink(missing_ok=True) + except Exception: + pass + raise + + def _convert_to_raw_url(self, url: str) -> str: + """Convert GitHub/GitLab blob URL to raw URL.""" + parsed = urlparse(url) + try: + from openviking_cli.utils.config import get_openviking_config + + config = get_openviking_config() + github_domains = config.html.github_domains + gitlab_domains = config.html.gitlab_domains + github_raw_domain = config.code.github_raw_domain + + if parsed.netloc in github_domains: + path_parts = parsed.path.strip("/").split("/") + if len(path_parts) >= 4 and path_parts[2] == "blob": + # Remove 'blob' + new_path = "/".join(path_parts[:2] + path_parts[3:]) + return f"https://{github_raw_domain}/{new_path}" + + if parsed.netloc in gitlab_domains and "/blob/" in parsed.path: + return url.replace("/blob/", "/raw/") + + except Exception: + pass + + return url diff --git a/openviking/parse/accessors/registry.py b/openviking/parse/accessors/registry.py new file mode 100644 index 000000000..d2bfb8b63 --- /dev/null +++ b/openviking/parse/accessors/registry.py @@ -0,0 +1,204 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +Registry for Data Accessors. + +Manages DataAccessor registration and provides automatic source routing. +""" + +import logging +from pathlib import Path +from typing import List, Optional, Union + +from .base import DataAccessor, LocalResource, SourceType + +logger = logging.getLogger(__name__) + + +class AccessorRegistry: + """ + Registry for data accessors. + + Provides automatic accessor selection based on source type and priority. + """ + + def __init__(self, register_default: bool = True): + """ + Initialize the accessor registry. + + Args: + register_default: Whether to register default accessors + """ + self._accessors: List[DataAccessor] = [] + + if register_default: + self._register_defaults() + + def _register_defaults(self) -> None: + """Register default accessors.""" + # GitAccessor - handles git repositories + try: + from .git import GitAccessor + + self.register(GitAccessor()) + except Exception as e: + logger.debug(f"[AccessorRegistry] Failed to register GitAccessor: {e}") + + # HTTPAccessor - handles HTTP/HTTPS URLs + try: + from .http import HTTPAccessor + + self.register(HTTPAccessor()) + except Exception as e: + logger.debug(f"[AccessorRegistry] Failed to register HTTPAccessor: {e}") + + # FeishuAccessor - handles Feishu/Lark documents + try: + from .feishu import FeishuAccessor + + self.register(FeishuAccessor()) + except Exception as e: + logger.debug(f"[AccessorRegistry] Failed to register FeishuAccessor: {e}") + + def register(self, accessor: DataAccessor) -> None: + """ + Register an accessor. + + Accessors are stored in descending order of priority. + When multiple accessors can handle the same source, + the one with the highest priority is selected. + + Args: + accessor: DataAccessor instance to register + """ + # Insert in priority order (descending) + insert_idx = len(self._accessors) + for i, existing in enumerate(self._accessors): + if accessor.priority > existing.priority: + insert_idx = i + break + + self._accessors.insert(insert_idx, accessor) + logger.debug( + f"[AccessorRegistry] Registered accessor {accessor.__class__.__name__} with priority {accessor.priority}" + ) + + def unregister(self, accessor_name: str) -> bool: + """ + Unregister an accessor by class name. + + Args: + accessor_name: Name of the accessor class to unregister + + Returns: + True if an accessor was unregistered, False otherwise + """ + for i, accessor in enumerate(self._accessors): + if accessor.__class__.__name__ == accessor_name: + del self._accessors[i] + logger.debug(f"[AccessorRegistry] Unregistered accessor {accessor_name}") + return True + return False + + def get_accessor(self, source: Union[str, Path]) -> Optional[DataAccessor]: + """ + Get the highest priority accessor that can handle the source. + + Args: + source: Source string or Path to check + + Returns: + DataAccessor instance or None if no accessor can handle the source + """ + for accessor in self._accessors: + if accessor.can_handle(source): + return accessor + return None + + def list_accessors(self, source: Optional[Union[str, Path]] = None) -> List[DataAccessor]: + """ + List registered accessors. + + Args: + source: Optional source to filter by (only returns accessors that can handle it) + + Returns: + List of DataAccessor instances + """ + if source is None: + return list(self._accessors) + return [a for a in self._accessors if a.can_handle(source)] + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + Access a source by routing to the appropriate accessor. + + If no accessor can handle the source, it is treated as a local file. + + Args: + source: Source string (URL, path, etc.) or Path object + **kwargs: Additional arguments passed to the accessor + + Returns: + LocalResource pointing to the locally available data + """ + source_str = str(source) + + # Try to find an accessor + accessor = self.get_accessor(source) + if accessor: + logger.debug( + f"[AccessorRegistry] Using accessor {accessor.__class__.__name__} for source: {source_str}" + ) + return await accessor.access(source, **kwargs) + + # Default: treat as local file + logger.debug(f"[AccessorRegistry] No accessor found, treating as local file: {source_str}") + path = Path(source) + return LocalResource( + path=path, + source_type=SourceType.LOCAL, + original_source=source_str, + meta={}, + is_temporary=False, + ) + + def clear(self) -> None: + """Remove all registered accessors.""" + self._accessors.clear() + logger.debug("[AccessorRegistry] Cleared all accessors from registry") + + +# Global registry instance +_default_registry: Optional[AccessorRegistry] = None + + +def get_accessor_registry(register_default: bool = True) -> AccessorRegistry: + """ + Get the default accessor registry. + + Args: + register_default: Whether to register default accessors if creating + a new registry instance (only used on first call) + + Returns: + The global AccessorRegistry instance + """ + global _default_registry + if _default_registry is None: + _default_registry = AccessorRegistry(register_default=register_default) + return _default_registry + + +async def access(source: Union[str, Path], **kwargs) -> LocalResource: + """ + Access a source using the default registry. + + Args: + source: Source string or Path + **kwargs: Additional arguments + + Returns: + LocalResource + """ + return await get_accessor_registry().access(source, **kwargs) diff --git a/openviking/parse/parsers/__init__.py b/openviking/parse/parsers/__init__.py index 6ce39e4b3..5f17740d0 100644 --- a/openviking/parse/parsers/__init__.py +++ b/openviking/parse/parsers/__init__.py @@ -2,10 +2,9 @@ # SPDX-License-Identifier: AGPL-3.0 from .base_parser import BaseParser -from .code import CodeRepositoryParser from .epub import EPubParser from .excel import ExcelParser -from .html import HTMLParser, URLType, URLTypeDetector +from .html import HTMLParser from .markdown import MarkdownParser from .pdf import PDFParser from .powerpoint import PowerPointParser @@ -15,12 +14,9 @@ __all__ = [ "BaseParser", - "CodeRepositoryParser", "EPubParser", "ExcelParser", "HTMLParser", - "URLType", - "URLTypeDetector", "MarkdownParser", "PDFParser", "PowerPointParser", diff --git a/openviking/parse/parsers/code/code.py b/openviking/parse/parsers/code/code.py index a9c1c7da0..9c829583e 100644 --- a/openviking/parse/parsers/code/code.py +++ b/openviking/parse/parsers/code/code.py @@ -13,7 +13,6 @@ import os import shutil import stat -import tempfile import time import urllib.request import zipfile @@ -54,6 +53,30 @@ class CodeRepositoryParser(BaseParser): - Automatic filtering of non-code directories (.git, node_modules, etc.) - Direct mapping to VikingFS temp directory - Preserves directory structure without chunking + + 代码仓库入库处理流程 + + 输入: https://github.com/markwhen/gogetxueqiu + ↓ + [GitAccessor] → LocalResource + - path: /tmp/.../extracted/repo + - source_type: SourceType.GIT + - original_source: "https://github.com/markwhen/gogetxueqiu" + - meta: {repo_name: "markwhen/gogetxueqiu", ...} + ↓ + [media_processor] → 所有目录都用 DirectoryParser!← 简化了! + ↓ + [DirectoryParser.parse()] + ├─→ 检测到 (path/.git).exists() ← 新增! + ├─→ 收集 git 元数据 + └─→ 委托给 CodeRepositoryParser.parse() + ↓ + [CodeRepositoryParser.parse()] + - source_path = original_source (https://github.com/...) + ↓ + [TreeBuilder.finalize_from_temp()] + - 从 source_path 解析出 "markwhen/gogetxueqiu" + - root_uri = "viking://resources/markwhen/gogetxueqiu" """ # Class constants imported from constants.py @@ -87,57 +110,48 @@ def _detect_file_type(self, file_path: Path) -> str: async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ - Parse code repository. + Parse code repository (only accepts local directories, as network fetching is done in Accessor layer). Args: - source: Repository URL (git/http) or local zip path + source: Local directory path (already fetched by DataAccessor) instruction: Processing instruction (unused in parser phase) **kwargs: Additional arguments + _source_meta: Metadata from DataAccessor (must contain repo_name, repo_ref, repo_commit) + original_source: Original URL (for repo name extraction if needed) Returns: ParseResult with temp_dir_path pointing to the uploaded content """ start_time = time.time() - source_str = str(source) - temp_local_dir = None - branch = None - commit = None + source_path = Path(source) + + # Check if source is already a local directory (should always be true) + if not source_path.is_dir(): + raise ValueError( + f"CodeRepositoryParser only accepts local directories. " + f"Source type: {type(source)}, value: {source}" + ) + + logger.info(f"[CodeRepositoryParser] Parsing code repository: {source_path}") try: - # 1. Prepare local temp directory - temp_local_dir = tempfile.mkdtemp(prefix="ov_repo_") - logger.info(f"Created local temp dir: {temp_local_dir}") - - # 2. Fetch content (Clone or Extract) - repo_name = "repository" - local_dir = Path(temp_local_dir) - if source_str.startswith("git@"): - # git@ SSH URL: use git clone directly (no GitHub ZIP optimization) - repo_name = await self._git_clone( - source_str, - temp_local_dir, - branch=branch, - commit=commit, + # Get metadata from DataAccessor + source_meta = kwargs.get("_source_meta", {}) + repo_name = source_meta.get("repo_name", "repository") + branch = source_meta.get("repo_ref") + commit = source_meta.get("repo_commit") + + # If repo_name is still default, try to extract from original source + if repo_name == "repository": + original_source = source_meta.get("original_source") or kwargs.get( + "original_source" ) - elif source_str.startswith(("http://", "https://", "git://", "ssh://")): - repo_url, branch, commit = self._parse_repo_source(source_str, **kwargs) - if self._is_github_url(repo_url): - # Use GitHub ZIP API: supports branch names, tags, and commit SHAs - local_dir, repo_name = await self._github_zip_download( - repo_url, branch or commit, temp_local_dir - ) - else: - # Non-GitHub URL: use git clone - repo_name = await self._git_clone( - repo_url, - temp_local_dir, - branch=branch, - commit=commit, - ) - elif str(source).endswith(".zip"): - repo_name = await self._extract_zip(source_str, temp_local_dir) - else: - raise ValueError(f"Unsupported source for CodeRepositoryParser: {source}") + if original_source: + parsed_org_repo = parse_code_hosting_url(original_source) + if parsed_org_repo: + repo_name = parsed_org_repo + + local_dir = source_path # 3. Create VikingFS temp URI viking_fs = self._get_viking_fs() @@ -161,9 +175,11 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) meta={"name": repo_name, "type": "repository"}, ) + # Use original URL as source_path instead of local temp dir for proper org/repo parsing in TreeBuilder + original_source = source_meta.get("original_source") or kwargs.get("original_source") result = create_parse_result( root=root, - source_path=source_str, + source_path=original_source or str(source), source_format="repository", parser_name="CodeRepositoryParser", parse_time=time.time() - start_time, @@ -180,24 +196,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) except Exception as e: logger.error(f"Failed to parse repository {source}: {e}", exc_info=True) + # Use original URL for error case as well + original_source = source_meta.get("original_source") or kwargs.get("original_source") return create_parse_result( root=ResourceNode(type=NodeType.ROOT, content_path=None), - source_path=source_str, + source_path=original_source or str(source), source_format="repository", parser_name="CodeRepositoryParser", parse_time=time.time() - start_time, warnings=[f"Failed to parse repository: {str(e)}"], ) - finally: - # Cleanup local temp dir - if temp_local_dir and os.path.exists(temp_local_dir): - try: - shutil.rmtree(temp_local_dir) - logger.debug(f"Cleaned up local temp dir: {temp_local_dir}") - except Exception as e: - logger.warning(f"Failed to cleanup local temp dir {temp_local_dir}: {e}") - async def parse_content( self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs ) -> ParseResult: diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py index d8d2b1c98..d190d1367 100644 --- a/openviking/parse/parsers/directory.py +++ b/openviking/parse/parsers/directory.py @@ -87,6 +87,16 @@ async def parse( if not source_path.is_dir(): raise NotADirectoryError(f"Not a directory: {source_path}") + # Check if this is a git repository, delegate to CodeRepositoryParser + if await self._is_git_repository(source_path): + logger.debug( + f"Directory {source_path} is a git repository, delegating to CodeRepositoryParser" + ) + from openviking.parse.parsers.code.code import CodeRepositoryParser + + await self._add_git_metadata(source_path, kwargs) + return await CodeRepositoryParser().parse(str(source_path), instruction, **kwargs) + dir_name = kwargs.get("source_name") or source_path.name warnings: List[str] = [] @@ -452,6 +462,59 @@ async def _merge_temp( except Exception: pass + @staticmethod + async def _is_git_repository(source_path: Path) -> bool: + """Check if the directory contains a git repository.""" + try: + return (source_path / ".git").exists() and (source_path / ".git").is_dir() + except (OSError, PermissionError): + return False + + @staticmethod + async def _add_git_metadata(source_path: Path, kwargs: dict) -> None: + """Add git metadata (branch, commit, repo_name) to kwargs dictionary.""" + try: + from openviking.parse.accessors.git import GitAccessor + + git_accessor = GitAccessor() + + # Get branch + try: + branch = await git_accessor._run_git( + ["git", "-C", str(source_path), "rev-parse", "--abbrev-ref", "HEAD"] + ) + kwargs["repo_ref"] = branch + except Exception as e: + logger.debug(f"Failed to get git branch: {e}") + + # Get commit + try: + commit = await git_accessor._run_git( + ["git", "-C", str(source_path), "rev-parse", "HEAD"] + ) + kwargs["repo_commit"] = commit + except Exception as e: + logger.debug(f"Failed to get git commit: {e}") + + # Get repo name + try: + remote_url = await git_accessor._run_git( + ["git", "-C", str(source_path), "config", "--get", "remote.origin.url"] + ) + if remote_url: + repo_name = git_accessor._get_repo_name(remote_url) + if repo_name and repo_name != "repository": + kwargs["repo_name"] = repo_name + else: + kwargs["repo_name"] = source_path.name + except Exception as e: + logger.debug(f"Failed to get git remote info: {e}") + kwargs["repo_name"] = source_path.name + + except Exception as e: + logger.debug(f"Failed to get git metadata: {e}") + kwargs["repo_name"] = source_path.name + @staticmethod async def _recursive_move( viking_fs: Any, diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 97d3ece62..24331113c 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -1,227 +1,47 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: AGPL-3.0 """ -HTML and URL parser for OpenViking. +HTML Parser for OpenViking. -Unified parser that handles: -- Local HTML files -- Web pages (URL -> fetch -> parse) -- Download links (URL -> download -> delegate to appropriate parser) +Parses local HTML files. -Preserves natural document hierarchy and filters out navigation/ads. +For URL downloading, use HTTPAccessor in the new two-layer architecture. """ import hashlib import re -import tempfile import time -from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union -from urllib.parse import unquote, urlparse +from typing import List, Optional, Union from openviking.parse.base import ( NodeType, ParseResult, ResourceNode, create_parse_result, - lazy_import, ) from openviking.parse.parsers.base_parser import BaseParser -from openviking.parse.parsers.constants import CODE_EXTENSIONS -from openviking.utils.network_guard import build_httpx_request_validation_hooks -from openviking_cli.exceptions import PermissionDeniedError -from openviking_cli.utils.config import get_openviking_config - -class URLType(Enum): - """URL content types.""" - - WEBPAGE = "webpage" # HTML webpage to parse - DOWNLOAD_PDF = "download_pdf" # PDF file download link - DOWNLOAD_MD = "download_md" # Markdown file download link - DOWNLOAD_TXT = "download_txt" # Text file download link - DOWNLOAD_HTML = "download_html" # HTML file download link - CODE_REPOSITORY = "code_repository" # Code repository (GitHub, GitLab, etc.) - UNKNOWN = "unknown" # Unknown or unsupported type - - -class URLTypeDetector: - """ - Detector for URL content types. - - Uses extension and HTTP HEAD request to determine if a URL is: - - A webpage to scrape - - A file download link (and what type) - """ - - # Extension to URL type mapping - # CODE_EXTENSIONS spread comes first so explicit entries below override - # (e.g., .html/.htm -> DOWNLOAD_HTML instead of DOWNLOAD_TXT) - EXTENSION_MAP = { - **dict.fromkeys(CODE_EXTENSIONS, URLType.DOWNLOAD_TXT), - ".pdf": URLType.DOWNLOAD_PDF, - ".md": URLType.DOWNLOAD_MD, - ".markdown": URLType.DOWNLOAD_MD, - ".txt": URLType.DOWNLOAD_TXT, - ".text": URLType.DOWNLOAD_TXT, - ".html": URLType.DOWNLOAD_HTML, - ".htm": URLType.DOWNLOAD_HTML, - ".git": URLType.CODE_REPOSITORY, - } - - # Content-Type to URL type mapping - CONTENT_TYPE_MAP = { - "application/pdf": URLType.DOWNLOAD_PDF, - "text/markdown": URLType.DOWNLOAD_MD, - "text/plain": URLType.DOWNLOAD_TXT, - "text/html": URLType.WEBPAGE, - "application/xhtml+xml": URLType.WEBPAGE, - } - - async def detect( - self, - url: str, - timeout: float = 10.0, - request_validator=None, - ) -> Tuple[URLType, Dict[str, Any]]: - """ - Detect URL content type. - - Args: - url: URL to detect - timeout: HTTP request timeout - - Returns: - (URLType, metadata dict) - """ - meta = {"url": url, "detected_by": "unknown"} - parsed = urlparse(url) - path_lower = parsed.path.lower() - - # 0. Check for code repository URLs first - if self._is_code_repository_url(url): - meta["detected_by"] = "code_repository_pattern" - return URLType.CODE_REPOSITORY, meta - - # 1. Check extension first - for ext, url_type in self.EXTENSION_MAP.items(): - if path_lower.endswith(ext): - meta["detected_by"] = "extension" - meta["extension"] = ext - return url_type, meta - - # 2. Send HEAD request to check Content-Type - try: - httpx = lazy_import("httpx") - client_kwargs = { - "timeout": timeout, - "follow_redirects": True, - } - event_hooks = build_httpx_request_validation_hooks(request_validator) - if event_hooks: - client_kwargs["event_hooks"] = event_hooks - client_kwargs["trust_env"] = False - - async with httpx.AsyncClient(**client_kwargs) as client: - response = await client.head(url) - content_type = response.headers.get("content-type", "").lower() - - # Remove charset info - if ";" in content_type: - content_type = content_type.split(";")[0].strip() - - meta["content_type"] = content_type - meta["detected_by"] = "content_type" - meta["status_code"] = response.status_code - - # Map content type - for ct_prefix, url_type in self.CONTENT_TYPE_MAP.items(): - if content_type.startswith(ct_prefix): - return url_type, meta - - # Default to webpage for HTML-like content - if "html" in content_type or "xml" in content_type: - return URLType.WEBPAGE, meta - - except PermissionDeniedError: - raise - except Exception as e: - meta["detection_error"] = str(e) - - # 3. Default: assume webpage - return URLType.WEBPAGE, meta - - def _is_code_repository_url(self, url: str) -> bool: - """ - Check if URL is a code repository URL. - - Args: - url: URL to check - - Returns: - True if URL matches code repository patterns - """ - import re - - config = get_openviking_config() - github_domains = list(set(config.html.github_domains + config.code.github_domains)) - gitlab_domains = list(set(config.html.gitlab_domains + config.code.gitlab_domains)) - # Build repository URL patterns from config - repo_patterns = [] - - # Add patterns for GitHub domains - for domain in github_domains: - repo_patterns.append(rf"^https?://{re.escape(domain)}/[^/]+/[^/]+/?$") - - # Add patterns for GitLab domains - for domain in gitlab_domains: - repo_patterns.append(rf"^https?://{re.escape(domain)}/[^/]+/[^/]+/?$") - - # Add other patterns - repo_patterns.extend( - [ - r"^.*\.git$", - r"^git@", - ] - ) - - # Check for URL patterns - for pattern in repo_patterns: - if re.match(pattern, url): - return True - - return False +logger = __import__("openviking_cli.utils.logger").utils.logger.get_logger(__name__) class HTMLParser(BaseParser): """ - Unified parser for HTML files and URLs. + Parser for local HTML files. Features: - Parse local HTML files - - Fetch and parse web pages - - Detect and handle download links - Build hierarchy based on heading tags (h1-h6) - Filter out navigation, ads, and boilerplate - Extract tables and preserve structure + + NOTE: URL downloading functionality has been moved to HTTPAccessor + in the new two-layer architecture. This parser only handles local files. """ - DEFAULT_USER_AGENT = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36" - ) - - def __init__( - self, - timeout: float = 30.0, - user_agent: Optional[str] = None, - ): + def __init__(self): """Initialize HTML parser.""" - self.timeout = timeout - self.user_agent = user_agent or self.DEFAULT_USER_AGENT - self._url_detector = URLTypeDetector() + pass def _get_readabilipy(self): """Lazy import of readabilipy.""" @@ -258,10 +78,10 @@ def supported_extensions(self) -> List[str]: async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: """ - Unified parse method for HTML files and URLs. + Parse a local HTML file. Args: - source: HTML file path or URL + source: HTML file path instruction: Processing instruction, guides LLM how to understand the resource **kwargs: Additional options @@ -269,238 +89,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) ParseResult with document tree """ start_time = time.time() - source_str = str(source) - - # Detect if source is a URL - if source_str.startswith(("http://", "https://")): - return await self._parse_url(source_str, start_time, **kwargs) - else: - return await self._parse_local_file(Path(source), start_time, **kwargs) - - async def _parse_url(self, url: str, start_time: float, **kwargs) -> ParseResult: - """ - Parse URL (webpage or download link). - - Args: - url: URL to parse - start_time: Parse start timestamp - - Returns: - ParseResult - """ - # Detect URL type - request_validator = kwargs.get("request_validator") - url_type, meta = await self._url_detector.detect( - url, - timeout=self.timeout, - request_validator=request_validator, - ) - - if url_type == URLType.WEBPAGE: - # Fetch and parse as webpage - return await self._parse_webpage(url, start_time, meta, **kwargs) - - elif url_type == URLType.DOWNLOAD_PDF: - # Download and delegate to PDF parser - return await self._handle_download_link(url, "pdf", start_time, meta, **kwargs) - - elif url_type == URLType.DOWNLOAD_MD: - # Download and delegate to Markdown parser - return await self._handle_download_link(url, "markdown", start_time, meta, **kwargs) - - elif url_type == URLType.DOWNLOAD_TXT: - # Download and delegate to Text parser - return await self._handle_download_link(url, "text", start_time, meta, **kwargs) - - elif url_type == URLType.DOWNLOAD_HTML: - # Download HTML file and parse - return await self._handle_download_link(url, "html", start_time, meta, **kwargs) - - elif url_type == URLType.CODE_REPOSITORY: - # Delegate to CodeRepositoryParser - return await self._handle_code_repository(url, start_time, meta, **kwargs) - - else: - # Unknown type - try as webpage - return await self._parse_webpage(url, start_time, meta, **kwargs) - - async def _parse_webpage( - self, url: str, start_time: float, meta: Dict[str, Any], **kwargs - ) -> ParseResult: - """ - Fetch and parse a webpage. - - Args: - url: URL to fetch - start_time: Parse start time - meta: Detection metadata - - Returns: - ParseResult - """ - try: - # Fetch HTML - html_content = await self._fetch_html( - url, - request_validator=kwargs.get("request_validator"), - ) - - # Convert to Markdown - markdown_content = self._html_to_markdown(html_content, base_url=url) - - # Parse using MarkdownParser - from openviking.parse.parsers.markdown import MarkdownParser - - md_parser = MarkdownParser() - result = await md_parser.parse_content(markdown_content, source_path=url, **kwargs) - - # Update metadata - result.source_format = "html" - result.parser_name = "HTMLParser" - result.parse_time = time.time() - start_time - result.parse_timestamp = None # Will be set by __post_init__ - result.meta.update(meta) - result.meta["url_type"] = "webpage" - result.meta["intermediate_markdown"] = markdown_content[:500] # Preview - - return result - - except PermissionDeniedError: - raise - except Exception as e: - return create_parse_result( - root=ResourceNode(type=NodeType.ROOT, content_path=None), - source_path=url, - source_format="html", - parser_name="HTMLParser", - parse_time=time.time() - start_time, - warnings=[f"Failed to fetch webpage: {e}"], - ) - - @staticmethod - def _extract_filename_from_url(url: str) -> str: - """ - Extract and URL-decode the original filename from a URL. - - Args: - url: URL to extract filename from - - Returns: - Decoded filename (e.g., "schemas.py" from ".../schemas.py") - Falls back to "download" if no filename can be extracted. - """ - parsed = urlparse(url) - # URL-decode path to handle encoded characters (e.g., %E7%99%BE -> Chinese chars) - decoded_path = unquote(parsed.path) - basename = Path(decoded_path).name - return basename if basename else "download" - - async def _handle_download_link( - self, url: str, file_type: str, start_time: float, meta: Dict[str, Any], **kwargs - ) -> ParseResult: - """ - Download file and delegate to appropriate parser. - - Args: - url: URL to download - file_type: File type ("pdf", "markdown", "text", "html") - start_time: Parse start time - meta: Detection metadata - - Returns: - ParseResult from delegated parser - """ - temp_path = None - try: - # Download to temporary file - temp_path = await self._download_file( - url, - request_validator=kwargs.get("request_validator"), - ) - - # Extract original filename from URL for use as source_path, - # so parsers use it instead of the temp file name. - original_filename = self._extract_filename_from_url(url) - - # Get appropriate parser - if file_type == "pdf": - from openviking.parse.parsers.pdf import PDFParser - - parser = PDFParser() - result = await parser.parse(temp_path, resource_name=Path(original_filename).stem) - elif file_type == "markdown": - from openviking.parse.parsers.markdown import MarkdownParser - - parser = MarkdownParser() - content = Path(temp_path).read_text(encoding="utf-8") - result = await parser.parse_content( - content, source_path=original_filename, **kwargs - ) - elif file_type == "text": - # For text/code files, preserve the original filename and extension. - # Read the downloaded content and save it with the original name - # instead of routing through TextParser->MarkdownParser which - # would rename it to .md and split it into sections. - result = await self._save_downloaded_text(temp_path, original_filename, start_time) - elif file_type == "html": - # Parse downloaded HTML locally - return await self._parse_local_file(Path(temp_path), start_time, **kwargs) - else: - raise ValueError(f"Unsupported file type: {file_type}") - - result.meta.update(meta) - result.meta["downloaded_from"] = url - result.meta["url_type"] = f"download_{file_type}" - return result - - except PermissionDeniedError: - raise - except Exception as e: - return create_parse_result( - root=ResourceNode(type=NodeType.ROOT, content_path=None), - source_path=url, - source_format=file_type, - parser_name="HTMLParser", - parse_time=time.time() - start_time, - warnings=[f"Failed to download/parse link: {e}"], - ) - finally: - if temp_path: - try: - p = Path(temp_path) - if p.exists(): - p.unlink() - except Exception: - pass - - async def _handle_code_repository( - self, url: str, start_time: float, meta: Dict[str, Any], **kwargs - ) -> ParseResult: - """ - Handle code repository URL by delegating to CodeRepositoryParser. - """ - try: - from openviking.parse.parsers.code import CodeRepositoryParser + path = Path(source) - parser = CodeRepositoryParser() - result = await parser.parse(url, **kwargs) - result.meta.update(meta) - result.meta["downloaded_from"] = url - result.meta["url_type"] = "code_repository" - - return result - - except PermissionDeniedError: - raise - except Exception as e: - return create_parse_result( - root=ResourceNode(type=NodeType.ROOT, content_path=None), - source_path=url, - source_format="code_repository", - parser_name="HTMLParser", - parse_time=time.time() - start_time, - warnings=[f"Failed to parse code repository: {e}"], - ) + return await self._parse_local_file(path, start_time, **kwargs) async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> ParseResult: """Parse local HTML file.""" @@ -521,7 +112,6 @@ async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> Pa # Add timing info result.parse_time = time.time() - start_time result.parser_name = "HTMLParser" - result.parser_version = "2.0" return result except Exception as e: @@ -534,154 +124,6 @@ async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> Pa warnings=[f"Failed to read HTML: {e}"], ) - async def _fetch_html(self, url: str, request_validator=None) -> str: - """ - Fetch HTML content from URL. - - Args: - url: URL to fetch - - Returns: - HTML content string - - Raises: - Exception: If fetch fails - """ - httpx = lazy_import("httpx") - - client_kwargs = { - "timeout": self.timeout, - "follow_redirects": True, - } - event_hooks = build_httpx_request_validation_hooks(request_validator) - if event_hooks: - client_kwargs["event_hooks"] = event_hooks - client_kwargs["trust_env"] = False - - async with httpx.AsyncClient(**client_kwargs) as client: - headers = {"User-Agent": self.user_agent} - response = await client.get(url, headers=headers) - response.raise_for_status() - return response.text - - def _convert_to_raw_url(self, url: str) -> str: - """Convert GitHub/GitLab blob URL to raw URL.""" - parsed = urlparse(url) - config = get_openviking_config() - github_domains = config.html.github_domains - gitlab_domains = config.html.gitlab_domains - github_raw_domain = config.code.github_raw_domain - - if parsed.netloc in github_domains: - path_parts = parsed.path.strip("/").split("/") - if len(path_parts) >= 4 and path_parts[2] == "blob": - # Remove 'blob' - new_path = "/".join(path_parts[:2] + path_parts[3:]) - return f"https://{github_raw_domain}/{new_path}" - - if parsed.netloc in gitlab_domains and "/blob/" in parsed.path: - return url.replace("/blob/", "/raw/") - - return url - - async def _save_downloaded_text( - self, temp_path: str, original_filename: str, start_time: float - ) -> ParseResult: - """ - Save a downloaded text/code file preserving its original filename and extension. - - Instead of routing through TextParser -> MarkdownParser (which renames to .md - and splits into sections), this saves the file directly into a VikingFS temp - directory with its original name. - - Args: - temp_path: Path to the downloaded temporary file - original_filename: Original filename from URL (e.g., "schemas.py") - start_time: Parse start timestamp - - Returns: - ParseResult with temp_dir_path set - """ - from openviking.storage.viking_fs import get_viking_fs - - content = Path(temp_path).read_text(encoding="utf-8") - doc_name = Path(original_filename).stem - - viking_fs = get_viking_fs() - temp_uri = viking_fs.create_temp_uri() - await viking_fs.mkdir(temp_uri) - - # Create document root directory (TreeBuilder expects exactly one dir) - root_dir = f"{temp_uri}/{doc_name}" - await viking_fs.mkdir(root_dir) - - # Save with original filename (preserving extension) - file_uri = f"{root_dir}/{original_filename}" - await viking_fs.write_file(file_uri, content) - - root = ResourceNode( - type=NodeType.ROOT, - title=doc_name, - level=0, - ) - - result = create_parse_result( - root=root, - source_path=original_filename, - source_format="text", - parser_name="HTMLParser", - parse_time=time.time() - start_time, - ) - result.temp_dir_path = temp_uri - return result - - async def _download_file(self, url: str, request_validator=None) -> str: - """ - Download file from URL to temporary location. - - Args: - url: URL to download - - Returns: - Path to downloaded temporary file - - Raises: - Exception: If download fails - """ - httpx = lazy_import("httpx") - - url = self._convert_to_raw_url(url) - - # Determine file extension from URL (decode first to handle encoded paths) - parsed = urlparse(url) - decoded_path = unquote(parsed.path) - ext = Path(decoded_path).suffix or ".tmp" - - # Create temp file - temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext) - temp_path = temp_file.name - temp_file.close() - - # Download - client_kwargs = { - "timeout": self.timeout, - "follow_redirects": True, - } - event_hooks = build_httpx_request_validation_hooks(request_validator) - if event_hooks: - client_kwargs["event_hooks"] = event_hooks - client_kwargs["trust_env"] = False - - async with httpx.AsyncClient(**client_kwargs) as client: - headers = {"User-Agent": self.user_agent} - response = await client.get(url, headers=headers) - response.raise_for_status() - - # Write to temp file - Path(temp_path).write_bytes(response.content) - - return temp_path - def _html_to_markdown(self, html: str, base_url: str = "") -> str: """ Convert HTML to Markdown using readabilipy + markdownify (Anthropic approach). @@ -730,7 +172,7 @@ async def parse_content( """ Parse HTML content. - Converts HTML to Markdown and delegates to MarkdownParser (three-phase architecture). + Converts HTML to Markdown and delegates to MarkdownParser. Args: content: HTML content string @@ -742,7 +184,7 @@ async def parse_content( # Convert HTML to Markdown markdown_content = self._html_to_markdown(content, base_url=source_path or "") - # Delegate to MarkdownParser (using three-phase architecture) + # Delegate to MarkdownParser from openviking.parse.parsers.markdown import MarkdownParser md_parser = MarkdownParser() diff --git a/openviking/parse/parsers/zip_parser.py b/openviking/parse/parsers/zip_parser.py index 3418770ec..5dbce8174 100644 --- a/openviking/parse/parsers/zip_parser.py +++ b/openviking/parse/parsers/zip_parser.py @@ -3,18 +3,19 @@ """ ZIP archive parser for OpenViking. -Lists and describes contents of ZIP files. -Converts to markdown and delegates to MarkdownParser. +Extracts ZIP archives and delegates to DirectoryParser for recursive processing. +Supports nested ZIP files via DirectoryParser's recursive parser invocation. """ +import shutil +import tempfile import zipfile -from datetime import datetime from pathlib import Path from typing import List, Optional, Union from openviking.parse.base import ParseResult from openviking.parse.parsers.base_parser import BaseParser -from openviking_cli.utils.config.parser_config import ParserConfig +from openviking.utils.zip_safe import safe_extract_zip from openviking_cli.utils.logger import get_logger logger = get_logger(__name__) @@ -27,200 +28,91 @@ class ZipParser(BaseParser): Supports: .zip Features: - - Lists all files in the archive - - Shows file sizes and modification dates - - Groups files by type/extension + - Extracts ZIP archive to temporary directory + - Delegates to DirectoryParser for recursive content processing + - Supports nested ZIP files (via DirectoryParser recursion) + - Preserves temporary directory for TreeBuilder """ - def __init__(self, config: Optional[ParserConfig] = None): - """Initialize ZIP parser.""" - from openviking.parse.parsers.markdown import MarkdownParser - - self._md_parser = MarkdownParser(config=config) - self.config = config or ParserConfig() - @property def supported_extensions(self) -> List[str]: return [".zip"] async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult: - """Parse from file path.""" - path = Path(source) - - if path.exists(): - markdown_content = self._convert_zip_to_markdown(path) - result = await self._md_parser.parse_content( - markdown_content, - source_path=str(path), - instruction=instruction, - **kwargs, - ) - else: - # Treat as raw content string - result = await self._md_parser.parse_content( - str(source), instruction=instruction, **kwargs - ) - result.source_format = "zip" - result.parser_name = "ZipParser" - return result - - async def parse_content( - self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs - ) -> ParseResult: - """Parse content - for zip, content should be a file path.""" - result = await self._md_parser.parse_content(content, source_path, **kwargs) - result.source_format = "zip" - result.parser_name = "ZipParser" - return result - - def _convert_zip_to_markdown(self, path: Path) -> str: - """ - Convert ZIP file information to markdown format. + """Parse a ZIP file by extracting it and delegating to DirectoryParser. Args: - path: Path to .zip file + source: Path to the .zip file + instruction: Processing instruction (forwarded to DirectoryParser) + **kwargs: Extra options forwarded to DirectoryParser Returns: - Markdown formatted string + ParseResult from DirectoryParser, with temp_dir_path preserved + for TreeBuilder to use later. """ + + path = Path(source) + if not path.exists(): + raise FileNotFoundError(f"ZIP file not found: {path}") + + if not zipfile.is_zipfile(path): + raise ValueError(f"Not a valid ZIP file: {path}") + + # Extract ZIP to temporary directory + temp_dir = Path(tempfile.mkdtemp(prefix="ov_zip_")) try: - with zipfile.ZipFile(path, "r") as zf: - return self._process_zip_contents(zf, path) - except zipfile.BadZipFile: - raise ValueError(f"Invalid or corrupted ZIP file: {path}") - except Exception as e: - raise ValueError(f"Error reading ZIP file: {e}") - - def _process_zip_contents(self, zf: zipfile.ZipFile, path: Path) -> str: - """Process ZIP file contents and return markdown.""" - md_parts = [] - - # Title - md_parts.append(f"# ZIP Archive: {path.name}") - md_parts.append("") - - # Archive info - md_parts.append("## Archive Information") - md_parts.append("") - md_parts.append(f"- **File:** {path.name}") - md_parts.append(f"- **Total files:** {len(zf.namelist())}") - md_parts.append( - f"- **Comment:** {zf.comment.decode('utf-8', errors='ignore') if zf.comment else 'None'}" - ) - md_parts.append("") - - # Group files by extension - files_by_ext = self._group_files_by_extension(zf.namelist()) - - # File listing by category - md_parts.append("## Contents") - md_parts.append("") - - # Summary table - if files_by_ext: - md_parts.append("### File Types Summary") - md_parts.append("") - md_parts.append("| Extension | Count |") - md_parts.append("|-----------|-------|") - for ext, files in sorted(files_by_ext.items(), key=lambda x: -len(x[1])): - display_ext = ext if ext else "(no extension)" - md_parts.append(f"| {display_ext} | {len(files)} |") - md_parts.append("") - - # Detailed listing - md_parts.append("### File List") - md_parts.append("") - - # Create a table with file info - md_parts.append("| File | Size | Modified |") - md_parts.append("|------|------|----------|") - - for info in zf.infolist(): - # Skip directories - if info.is_dir(): - continue - - filename = info.filename - size = self._format_size(info.file_size) - modified = self._format_datetime(info.date_time) - - # Escape pipe characters - filename = filename.replace("|", "\\|") - - md_parts.append(f"| {filename} | {size} | {modified} |") - - md_parts.append("") - - # Directory structure - md_parts.append("## Directory Structure") - md_parts.append("") - md_parts.append("```") - md_parts.append(self._generate_tree_view(zf.namelist())) - md_parts.append("```") - - return "\n".join(md_parts) - - def _group_files_by_extension(self, filenames: List[str]) -> dict: - """Group files by their extension.""" - groups = {} - for name in filenames: - if name.endswith("/"): # Skip directories - continue - ext = Path(name).suffix.lower() - if ext not in groups: - groups[ext] = [] - groups[ext].append(name) - return groups - - def _format_size(self, size: int) -> str: - """Format file size in human-readable format.""" - for unit in ["B", "KB", "MB", "GB", "TB"]: - if size < 1024.0: - return f"{size:.1f} {unit}" - size /= 1024.0 - return f"{size:.1f} PB" - - def _format_datetime(self, dt_tuple) -> str: - """Format datetime tuple from ZIP info.""" - try: - dt = datetime(*dt_tuple) - return dt.strftime("%Y-%m-%d %H:%M") - except: - return "Unknown" - - def _generate_tree_view(self, filenames: List[str]) -> str: - """Generate a tree-like view of the archive contents.""" - # Build a simple tree structure - lines = [] - - # Get unique directories - dirs = set() - for name in filenames: - parts = name.split("/") - for i in range(len(parts) - 1): - dirs.add("/".join(parts[: i + 1]) + "/") - - # Sort all items - all_items = sorted(set(filenames) | dirs) - - for item in all_items: - # Calculate depth - depth = item.count("/") - if item.endswith("/"): - depth -= 1 - - # Create indentation - indent = " " * depth - - # Get just the name part - name = item.rstrip("/").split("/")[-1] if "/" in item else item - - # Add prefix for directories vs files - if item.endswith("/"): - prefix = "[dir] " - else: - prefix = "" + with zipfile.ZipFile(path, "r") as zipf: + safe_extract_zip(zipf, temp_dir) + + # Check if extracted content has a single root directory + extracted_entries = [p for p in temp_dir.iterdir() if p.name not in {".", ".."}] - lines.append(f"{indent}{prefix}{name}") + # Prepare kwargs for DirectoryParser + dir_kwargs = dict(kwargs) + dir_kwargs["instruction"] = instruction - return "\n".join(lines) + # Use DirectoryParser to process the extracted content + from openviking.parse.parsers.directory import DirectoryParser + + parser = DirectoryParser() + + if len(extracted_entries) == 1 and extracted_entries[0].is_dir(): + # Single root directory - parse that directly + dir_kwargs.pop("source_name", None) + result = await parser.parse(str(extracted_entries[0]), **dir_kwargs) + else: + # Multiple entries at root - parse the temp dir itself + # Set source_name from zip filename if not provided + if "source_name" not in dir_kwargs or not dir_kwargs["source_name"]: + dir_kwargs["source_name"] = path.stem + result = await parser.parse(str(temp_dir), **dir_kwargs) + + # Ensure the temporary directory is preserved for TreeBuilder + if not result.temp_dir_path: + result.temp_dir_path = str(temp_dir) + else: + # If DirectoryParser created its own temp, clean up our extraction dir + try: + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception: + pass + + result.source_format = "zip" + result.parser_name = "ZipParser" + return result + + except Exception: + # Clean up on error + try: + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception: + pass + raise + + async def parse_content( + self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs + ) -> ParseResult: + """Parse content - not applicable for ZIP files (needs a file path).""" + raise NotImplementedError( + "ZipParser does not support parse_content. Please provide a file path to parse()." + ) diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py index add62ddb1..d4294a6f1 100644 --- a/openviking/parse/registry.py +++ b/openviking/parse/registry.py @@ -12,7 +12,6 @@ from openviking.parse.base import ParseResult from openviking.parse.parsers.base_parser import BaseParser -from openviking.parse.parsers.code import CodeRepositoryParser from openviking.parse.parsers.directory import DirectoryParser from openviking.parse.parsers.epub import EPubParser from openviking.parse.parsers.excel import ExcelParser @@ -67,9 +66,6 @@ def __init__(self, register_optional: bool = True): self.register("powerpoint", PowerPointParser()) self.register("excel", ExcelParser()) self.register("epub", EPubParser()) - # CodeRepositoryParser also uses .zip; register it before ZipParser - # so that .zip resolves to ZipParser (file) rather than code repo. - self.register("code", CodeRepositoryParser()) self.register("zip", ZipParser()) self.register("directory", DirectoryParser()) @@ -77,14 +73,6 @@ def __init__(self, register_optional: bool = True): self.register("audio", AudioParser()) self.register("video", VideoParser()) - # Optional: Feishu/Lark document parser (requires lark-oapi) - try: - from openviking.parse.parsers.feishu import FeishuParser - - self.register("feishu", FeishuParser()) - except ImportError: - pass - def register(self, name: str, parser: BaseParser) -> None: """ Register a parser. @@ -215,13 +203,16 @@ def get_parser_for_file(self, path: Union[str, Path]) -> Optional[BaseParser]: async def parse(self, source: Union[str, Path], **kwargs) -> ParseResult: """ - Parse a file or content string. + Parse a local file or content string. Automatically selects parser based on file extension. Falls back to text parser for unknown types. + NOTE: For URL handling, use AccessorRegistry in the two-layer architecture. + This registry only handles local files and raw content. + Args: - source: File path or content string + source: Local file path or content string **kwargs: Additional arguments passed to parser Returns: @@ -229,20 +220,6 @@ async def parse(self, source: Union[str, Path], **kwargs) -> ParseResult: """ source_str = str(source) - # First, check if it's a code repository URL - code_parser = self._parsers.get("code") - if code_parser: - # Check if the parser has the is_repository_url method - try: - if hasattr(code_parser, "is_repository_url") and code_parser.is_repository_url( - source_str - ): - logger.info(f"Detected code repository URL: {source_str}") - return await code_parser.parse(source_str, **kwargs) - except Exception as e: - logger.warning(f"Error checking if source is repository URL: {e}") - # Continue with normal parsing flow - # Check if source looks like a file path (short enough and no newlines) is_potential_path = len(source_str) <= 1024 and "\n" not in source_str diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py index 3d1e19eda..346912273 100644 --- a/openviking/utils/media_processor.py +++ b/openviking/utils/media_processor.py @@ -2,18 +2,16 @@ # SPDX-License-Identifier: AGPL-3.0 """Unified resource processor with strategy-based routing.""" -import tempfile -import zipfile from pathlib import Path from typing import TYPE_CHECKING, Optional from openviking.parse import DocumentConverter, parse +from openviking.parse.accessors.base import SourceType from openviking.parse.base import ParseResult from openviking.server.local_input_guard import ( is_remote_resource_source, looks_like_local_path, ) -from openviking.utils.zip_safe import safe_extract_zip from openviking_cli.exceptions import PermissionDeniedError from openviking_cli.utils.logger import get_logger @@ -25,7 +23,12 @@ class UnifiedResourceProcessor: - """Unified resource processing for files, URLs, and raw content.""" + """Unified resource processing for files, URLs, and raw content. + + Uses two-layer architecture: + - Phase 1: AccessorRegistry gets LocalResource from source + - Phase 2: ParserRegistry parses LocalResource to ParseResult + """ def __init__( self, @@ -35,6 +38,7 @@ def __init__( self.storage = storage self._vlm_processor = vlm_processor self._document_converter = None + self._accessor_registry = None def _get_vlm_processor(self) -> Optional["VLMProcessor"]: if self._vlm_processor is None: @@ -48,6 +52,14 @@ def _get_document_converter(self) -> DocumentConverter: self._document_converter = DocumentConverter() return self._document_converter + def _get_accessor_registry(self): + """Lazy initialize AccessorRegistry for two-layer mode.""" + if self._accessor_registry is None: + from openviking.parse.accessors import get_accessor_registry + + self._accessor_registry = get_accessor_registry() + return self._accessor_registry + async def process( self, source: str, @@ -55,141 +67,83 @@ async def process( allow_local_path_resolution: bool = True, **kwargs, ) -> ParseResult: - """Process any source (file/URL/content) with appropriate strategy.""" - # Check if URL - if self._is_url(source): - return await self._process_url(source, instruction) + """Process any source (file/URL/content) with two-layer architecture. + + Phase 1: Use AccessorRegistry to get LocalResource + Phase 2: Use ParserRegistry to parse LocalResource + + Resource Lifecycle: + - Temporary resources are managed via context manager or temp_dir_path + - Directories needed for TreeBuilder are preserved via ParseResult.temp_dir_path + """ - # Check if looks like a file path (short enough and no newlines) + # First check if source is raw content (not URL/path) is_potential_path = ( allow_local_path_resolution and len(source) <= 1024 and "\n" not in source ) - if is_potential_path: - path = Path(source) - if path.exists(): - if path.is_dir(): - return await self._process_directory(path, instruction, **kwargs) - return await self._process_file(path, instruction, **kwargs) - else: - logger.warning(f"Path {path} does not exist") - raise FileNotFoundError(f"Path {path} does not exist") - - if not allow_local_path_resolution and looks_like_local_path(source): + if not is_potential_path and not self._is_url(source): + # Treat as raw content + return await parse(source, instruction=instruction) + + # Block local paths in HTTP server mode, but allow remote URLs + if ( + not allow_local_path_resolution + and not is_remote_resource_source(source) + and looks_like_local_path(source) + ): raise PermissionDeniedError( "HTTP server only accepts remote resource URLs or temp-uploaded files; " "direct host filesystem paths are not allowed." ) - # Treat as raw content - return await parse(source, instruction=instruction) + # Phase 1: Accessor - get local resource + registry = self._get_accessor_registry() + local_resource = await registry.access(source, **kwargs) + + # Use context manager for automatic cleanup, but preserve directories for TreeBuilder + try: + # Phase 2: Parser - parse the local resource + parse_kwargs = dict(kwargs) + parse_kwargs["instruction"] = instruction + parse_kwargs["vlm_processor"] = self._get_vlm_processor() + parse_kwargs["storage"] = self.storage + parse_kwargs["_source_meta"] = local_resource.meta + parse_kwargs["original_source"] = local_resource.original_source + + # Set resource_name from source_name or path + source_name = kwargs.get("source_name") + if source_name: + parse_kwargs["resource_name"] = Path(source_name).stem + parse_kwargs.setdefault("source_name", source_name) + else: + # For git repositories, use repo_name from meta if available + repo_name = local_resource.meta.get("repo_name") + if repo_name and local_resource.source_type == SourceType.GIT: + # Use the last part of repo_name as the resource_name (e.g., "OpenViking" from "volcengine/OpenViking") + parse_kwargs["resource_name"] = repo_name.split("/")[-1] + else: + parse_kwargs.setdefault("resource_name", local_resource.path.stem) + + # If it's a directory, use DirectoryParser which will delegate to CodeRepositoryParser if it's a git repo + if local_resource.path.is_dir(): + from openviking.parse.parsers.directory import DirectoryParser + + parser = DirectoryParser() + + result = await parser.parse(str(local_resource.path), **parse_kwargs) + # Preserve temporary directory for TreeBuilder + if local_resource.is_temporary and not result.temp_dir_path: + result.temp_dir_path = str(local_resource.path) + # Mark as non-temporary so context manager doesn't clean it up + local_resource.is_temporary = False + return result + + # For files, use the unified parse function (including .zip files via ZipParser) + return await parse(str(local_resource.path), **parse_kwargs) + finally: + # Clean up temporary resources unless they need to be preserved + local_resource.cleanup() def _is_url(self, source: str) -> bool: """Check if source is a URL.""" return is_remote_resource_source(source) - - async def _process_url(self, url: str, instruction: str, **kwargs) -> ParseResult: - """Process URL source.""" - from openviking.utils.code_hosting_utils import is_git_repo_url, validate_git_ssh_uri - - # Validate git@ SSH URIs early - if url.startswith("git@"): - validate_git_ssh_uri(url) - - # Route Feishu/Lark cloud document URLs to FeishuParser - if self._is_feishu_url(url): - from openviking.parse.registry import get_registry - - parser = get_registry().get_parser("feishu") - if parser is None: - raise ImportError( - "FeishuParser not available. " - "Install lark-oapi: pip install 'openviking[bot-feishu]'" - ) - return await parser.parse(url, instruction=instruction, **kwargs) - - # Route git protocols and repo URLs to CodeRepositoryParser - if url.startswith(("git@", "git://", "ssh://")) or is_git_repo_url(url): - from openviking.parse.parsers.code.code import CodeRepositoryParser - - parser = CodeRepositoryParser() - return await parser.parse(url, instruction=instruction, **kwargs) - - from openviking.parse.parsers.html import HTMLParser - - parser = HTMLParser() - return await parser.parse(url, instruction=instruction, **kwargs) - - @staticmethod - def _is_feishu_url(url: str) -> bool: - """Check if URL is a Feishu/Lark cloud document.""" - from urllib.parse import urlparse - - parsed = urlparse(url) - host = parsed.hostname or "" - path = parsed.path - is_feishu_domain = host.endswith(".feishu.cn") or host.endswith(".larksuite.com") - has_doc_path = any( - path == f"/{t}" or path.startswith(f"/{t}/") for t in ("docx", "wiki", "sheets", "base") - ) - return is_feishu_domain and has_doc_path - - async def _process_directory( - self, - dir_path: Path, - instruction: str, - **kwargs, - ) -> ParseResult: - """Process directory source via DirectoryParser. - - Args: - dir_path: Path to the directory. - instruction: Processing instruction. - **kwargs: Forwarded to ``DirectoryParser.parse()`` → - ``scan_directory()``: ``strict``, ``ignore_dirs``, - ``include``, ``exclude``. - """ - from openviking.parse.parsers.directory import DirectoryParser - - parser = DirectoryParser() - return await parser.parse(str(dir_path), instruction=instruction, **kwargs) - - async def _process_file( - self, - file_path: Path, - instruction: str, - **kwargs, - ) -> ParseResult: - """Process file with unified parsing.""" - ext = file_path.suffix.lower() - # Only treat .zip files as archives to extract. - if ext == ".zip" and zipfile.is_zipfile(file_path): - temp_dir = Path(tempfile.mkdtemp()) - try: - with zipfile.ZipFile(file_path, "r") as zipf: - safe_extract_zip(zipf, temp_dir) - - extracted_entries = [p for p in temp_dir.iterdir() if p.name not in {".", ".."}] - if len(extracted_entries) == 1 and extracted_entries[0].is_dir(): - dir_kwargs = dict(kwargs) - dir_kwargs.pop("source_name", None) - return await self._process_directory( - extracted_entries[0], instruction, **dir_kwargs - ) - - return await self._process_directory(temp_dir, instruction, **kwargs) - finally: - pass # Don't delete temp_dir yet, it will be used by TreeBuilder - source_name = kwargs.get("source_name") - if source_name: - kwargs["resource_name"] = Path(source_name).stem - kwargs.setdefault("source_name", source_name) - else: - kwargs.setdefault("resource_name", file_path.stem) - - return await parse( - str(file_path), - instruction=instruction, - vlm_processor=self._get_vlm_processor(), - storage=self.storage, - **kwargs, - ) diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 01aaf75df..122f389e0 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -216,6 +216,7 @@ async def process_resource( # ============ Phase 3.5: 首次添加立即落盘 + 生命周期锁 ============ root_uri = result.get("root_uri") temp_uri = result.get("temp_uri") # temp_doc_uri + original_temp_uri = temp_uri # 保存原始 temp_uri 用于最终输出 candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None lifecycle_lock_handle_id = "" @@ -257,7 +258,8 @@ async def process_resource( except Exception: pass - result["temp_uri"] = root_uri + # 数据已移动到 root_uri,后续处理使用 root_uri + temp_uri = root_uri else: # 增量更新:对目标目录加 SUBTREE 锁 resource_path = viking_fs._uri_to_path(root_uri, ctx=ctx) @@ -267,7 +269,7 @@ async def process_resource( # ============ Phase 4: Optional Steps ============ build_index = kwargs.get("build_index", True) - temp_uri_for_summarize = result.get("temp_uri") or parse_result.temp_dir_path + temp_uri_for_summarize = temp_uri or parse_result.temp_dir_path should_summarize = summarize or build_index if should_summarize: skip_vec = not build_index @@ -294,6 +296,10 @@ async def process_resource( if handle: await get_lock_manager().release(handle) + # 恢复原始 temp_uri 用于输出 + if original_temp_uri is not None: + result["temp_uri"] = original_temp_uri + return result @staticmethod diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py index 6c369915c..fbdfdad0b 100644 --- a/openviking_cli/utils/config/parser_config.py +++ b/openviking_cli/utils/config/parser_config.py @@ -415,7 +415,7 @@ def validate(self) -> None: @dataclass -class HTMLConfig(CodeHostingConfig): +class HTMLConfig(ParserConfig): """ Configuration for HTML parsing. diff --git a/tests/unit/test_accessors_base.py b/tests/unit/test_accessors_base.py new file mode 100644 index 000000000..cd2602fc6 --- /dev/null +++ b/tests/unit/test_accessors_base.py @@ -0,0 +1,135 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Unit tests for DataAccessor base classes and LocalResource.""" + +from pathlib import Path +from typing import Union + +import pytest + +from openviking.parse.accessors.base import DataAccessor, LocalResource + + +class TestLocalResource: + """Tests for LocalResource dataclass.""" + + def test_create_local_resource(self, tmp_path: Path) -> None: + """Basic LocalResource creation.""" + test_file = tmp_path / "test.txt" + test_file.write_text("content") + + resource = LocalResource( + path=test_file, + source_type="local", + original_source="/original/path", + meta={"key": "value"}, + is_temporary=False, + ) + + assert resource.path == test_file + assert resource.source_type == "local" + assert resource.original_source == "/original/path" + assert resource.meta == {"key": "value"} + assert resource.is_temporary is False + + def test_local_resource_defaults(self, tmp_path: Path) -> None: + """LocalResource with default values.""" + test_file = tmp_path / "test.txt" + test_file.write_text("content") + + resource = LocalResource( + path=test_file, + source_type="local", + original_source="/original/path", + ) + + assert resource.meta == {} + assert resource.is_temporary is True + + def test_local_resource_cleanup_temporary(self, tmp_path: Path) -> None: + """cleanup() removes temporary resources.""" + test_dir = tmp_path / "test_dir" + test_dir.mkdir() + (test_dir / "file.txt").write_text("content") + + resource = LocalResource( + path=test_dir, + source_type="test", + original_source="test", + is_temporary=True, + ) + + assert test_dir.exists() + resource.cleanup() + assert not test_dir.exists() + + def test_local_resource_cleanup_non_temporary(self, tmp_path: Path) -> None: + """cleanup() does not remove non-temporary resources.""" + test_file = tmp_path / "test.txt" + test_file.write_text("content") + + resource = LocalResource( + path=test_file, + source_type="local", + original_source="test", + is_temporary=False, + ) + + assert test_file.exists() + resource.cleanup() + assert test_file.exists() + + def test_local_resource_cleanup_missing(self, tmp_path: Path) -> None: + """cleanup() handles missing paths gracefully.""" + missing_path = tmp_path / "missing" + + resource = LocalResource( + path=missing_path, + source_type="test", + original_source="test", + is_temporary=True, + ) + + # Should not raise + resource.cleanup() + + +class TestDataAccessor: + """Tests for DataAccessor abstract base class.""" + + def test_cannot_instantiate_abstract(self) -> None: + """Cannot directly instantiate DataAccessor.""" + with pytest.raises(TypeError): + DataAccessor() # type: ignore + + def test_subclass_must_implement_methods(self) -> None: + """Subclasses must implement all abstract methods.""" + + class IncompleteAccessor(DataAccessor): + pass + + with pytest.raises(TypeError): + IncompleteAccessor() + + def test_complete_subclass(self) -> None: + """A complete subclass can be instantiated.""" + + class TestAccessor(DataAccessor): + def can_handle(self, source: Union[str, Path]) -> bool: + return str(source).startswith("test:") + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + return LocalResource( + path=Path("/tmp/test"), + source_type="test", + original_source=str(source), + ) + + @property + def priority(self) -> int: + return 50 + + accessor = TestAccessor() + assert accessor.can_handle("test:something") + assert not accessor.can_handle("other:something") + assert accessor.priority == 50 diff --git a/tests/unit/test_accessors_git.py b/tests/unit/test_accessors_git.py new file mode 100644 index 000000000..c8f316c38 --- /dev/null +++ b/tests/unit/test_accessors_git.py @@ -0,0 +1,84 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Unit tests for GitAccessor.""" + +from pathlib import Path + +import pytest + +from openviking.parse.accessors import GitAccessor + + +class TestGitAccessor: + """Tests for GitAccessor.""" + + @pytest.fixture + def accessor(self) -> GitAccessor: + """Create a GitAccessor instance.""" + return GitAccessor() + + def test_priority(self, accessor: GitAccessor) -> None: + """GitAccessor should have correct priority.""" + assert accessor.priority == 80 + + @pytest.mark.parametrize( + "source", + [ + "git@github.com:volcengine/OpenViking.git", + "git@gitlab.com:org/repo.git", + ], + ) + def test_can_handle_git_ssh_url(self, accessor: GitAccessor, source: str) -> None: + """GitAccessor should handle git@ SSH URLs.""" + assert accessor.can_handle(source) is True + + @pytest.mark.parametrize( + "source", + [ + "https://github.com/volcengine/OpenViking", + "https://github.com/volcengine/OpenViking.git", + "https://gitlab.com/org/repo", + "http://github.com/org/repo", + ], + ) + def test_can_handle_github_http_url(self, accessor: GitAccessor, source: str) -> None: + """GitAccessor should handle GitHub/GitLab HTTP URLs.""" + assert accessor.can_handle(source) is True + + @pytest.mark.parametrize( + "source", + [ + "https://github.com/volcengine/OpenViking/tree/main", + "https://github.com/volcengine/OpenViking/tree/abc1234", + ], + ) + def test_can_handle_github_with_ref(self, accessor: GitAccessor, source: str) -> None: + """GitAccessor should handle GitHub URLs with branch/commit.""" + assert accessor.can_handle(source) is True + + def test_can_handle_git_protocol_url(self, accessor: GitAccessor) -> None: + """GitAccessor should handle git:// URLs.""" + assert accessor.can_handle("git://github.com/volcengine/OpenViking.git") is True + + @pytest.mark.parametrize( + "source", + [ + "/path/to/repo.git", + "/path/to/archive.zip", + ], + ) + def test_can_handle_local_files(self, accessor: GitAccessor, source: str) -> None: + """GitAccessor should handle local .git and .zip files.""" + assert accessor.can_handle(Path(source)) is True + + @pytest.mark.parametrize( + "source", + [ + "https://example.com/page.html", + "https://github.com/volcengine/OpenViking/issues/123", + "git@example.com:repo", + ], + ) + def test_cannot_handle_other_urls(self, accessor: GitAccessor, source: str) -> None: + """GitAccessor should not handle non-git URLs or files.""" + assert accessor.can_handle(source) is False diff --git a/tests/unit/test_accessors_http.py b/tests/unit/test_accessors_http.py new file mode 100644 index 000000000..7bf99e6ed --- /dev/null +++ b/tests/unit/test_accessors_http.py @@ -0,0 +1,116 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Unit tests for HTTPAccessor.""" + +import pytest + +from openviking.parse.accessors import AccessorRegistry, GitAccessor, HTTPAccessor + + +class TestHTTPAccessor: + """Tests for HTTPAccessor.""" + + @pytest.fixture + def accessor(self) -> HTTPAccessor: + """Create a HTTPAccessor instance.""" + return HTTPAccessor() + + def test_priority(self, accessor: HTTPAccessor) -> None: + """HTTPAccessor should have correct priority.""" + assert accessor.priority == 50 + + @pytest.mark.parametrize( + "source", + [ + "https://example.com/page.html", + "http://example.com/document.pdf", + "https://example.org/file.md", + ], + ) + def test_can_handle_http_urls(self, accessor: HTTPAccessor, source: str) -> None: + """HTTPAccessor should handle regular HTTP/HTTPS URLs.""" + assert accessor.can_handle(source) is True + + @pytest.mark.parametrize( + "source", + [ + "/path/to/file.html", + "git@github.com:org/repo.git", + "plain text content", + ], + ) + def test_cannot_handle_non_http(self, accessor: HTTPAccessor, source: str) -> None: + """HTTPAccessor should NOT handle non-HTTP sources.""" + assert accessor.can_handle(source) is False + + @pytest.mark.parametrize( + "url, expected", + [ + ("https://example.com/path/file.html", "file.html"), + ("https://example.com/path/doc.pdf", "doc.pdf"), + ("https://example.com/path/", "path"), + ("https://example.com", "download"), + ], + ) + def test_extract_filename_from_url(self, url: str, expected: str) -> None: + """Test filename extraction from URLs.""" + assert HTTPAccessor._extract_filename_from_url(url) == expected + + +class TestHTTPAccessorPriorityRouting: + """Tests that verify HTTPAccessor works correctly with priority-based routing.""" + + def test_git_url_routed_to_git_accessor(self) -> None: + """Git URLs should be routed to GitAccessor, not HTTPAccessor.""" + registry = AccessorRegistry(register_default=False) + http = HTTPAccessor() + git = GitAccessor() + registry.register(http) + registry.register(git) + + test_url = "https://github.com/volcengine/OpenViking" + + # Both can handle the URL individually (this is OK!) + assert git.can_handle(test_url) is True + assert http.can_handle(test_url) is True + + # But registry picks the higher priority one (GitAccessor) + accessor = registry.get_accessor(test_url) + assert accessor is not None + assert accessor.__class__.__name__ == "GitAccessor" + + def test_regular_http_url_routed_to_http_accessor(self) -> None: + """Regular HTTP URLs should be routed to HTTPAccessor.""" + registry = AccessorRegistry(register_default=False) + http = HTTPAccessor() + git = GitAccessor() + registry.register(http) + registry.register(git) + + test_url = "https://example.com/page.html" + + # Only HTTPAccessor can handle this + assert git.can_handle(test_url) is False + assert http.can_handle(test_url) is True + + # Registry picks HTTPAccessor + accessor = registry.get_accessor(test_url) + assert accessor is not None + assert accessor.__class__.__name__ == "HTTPAccessor" + + def test_accessor_priority_order(self) -> None: + """Accessors should be registered in descending priority order.""" + registry = AccessorRegistry(register_default=False) + http = HTTPAccessor() + git = GitAccessor() + + # Register in any order + registry.register(http) + registry.register(git) + + accessors = registry.list_accessors() + + # GitAccessor (priority 80) should come before HTTPAccessor (priority 50) + assert len(accessors) == 2 + assert accessors[0].__class__.__name__ == "GitAccessor" + assert accessors[1].__class__.__name__ == "HTTPAccessor" diff --git a/tests/unit/test_accessors_registry.py b/tests/unit/test_accessors_registry.py new file mode 100644 index 000000000..858345a69 --- /dev/null +++ b/tests/unit/test_accessors_registry.py @@ -0,0 +1,155 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Unit tests for AccessorRegistry.""" + +from pathlib import Path +from typing import Union + +import pytest + +from openviking.parse.accessors.base import DataAccessor, LocalResource +from openviking.parse.accessors.registry import AccessorRegistry, get_accessor_registry + + +class TestAccessor(DataAccessor): + """Test accessor implementation.""" + + def __init__(self, name: str, prefix: str, priority: int = 50): + self.name = name + self.prefix = prefix + self._priority = priority + + def can_handle(self, source: Union[str, Path]) -> bool: + return str(source).startswith(self.prefix) + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + return LocalResource( + path=Path(f"/tmp/{self.name}"), + source_type=self.name, + original_source=str(source), + meta={"accessor": self.name, **kwargs}, + ) + + @property + def priority(self) -> int: + return self._priority + + +class TestAccessorRegistry: + """Tests for AccessorRegistry.""" + + @pytest.fixture + def registry(self) -> AccessorRegistry: + """Create a fresh registry (without default accessors).""" + return AccessorRegistry(register_default=False) + + def test_register_and_list(self, registry: AccessorRegistry) -> None: + """Register an accessor and list it.""" + accessor = TestAccessor("test", "test:", 50) + registry.register(accessor) + + accessors = registry.list_accessors() + assert len(accessors) == 1 + assert accessors[0] == accessor + + def test_get_accessor(self, registry: AccessorRegistry) -> None: + """Get an accessor that can handle a source.""" + accessor1 = TestAccessor("test1", "test1:", 50) + accessor2 = TestAccessor("test2", "test2:", 50) + registry.register(accessor1) + registry.register(accessor2) + + result = registry.get_accessor("test1:source") + assert result is accessor1 + + result = registry.get_accessor("test2:source") + assert result is accessor2 + + result = registry.get_accessor("unknown:source") + assert result is None + + def test_priority_ordering(self, registry: AccessorRegistry) -> None: + """Accessors are ordered by priority (descending).""" + low_prio = TestAccessor("low", "common:", 10) + mid_prio = TestAccessor("mid", "common:", 50) + high_prio = TestAccessor("high", "common:", 100) + + # Register in random order + registry.register(low_prio) + registry.register(high_prio) + registry.register(mid_prio) + + accessors = registry.list_accessors() + assert [a.name for a in accessors] == ["high", "mid", "low"] + + # Highest priority should be selected + result = registry.get_accessor("common:source") + assert result is high_prio + + def test_list_accessors_with_filter(self, registry: AccessorRegistry) -> None: + """list_accessors filters by source capability.""" + accessor1 = TestAccessor("a", "a:", 50) + accessor2 = TestAccessor("b", "b:", 50) + accessor3 = TestAccessor("both", "a:", 50) # also handles "a:" + registry.register(accessor1) + registry.register(accessor2) + registry.register(accessor3) + + filtered = registry.list_accessors("a:source") + assert len(filtered) == 2 + names = {a.name for a in filtered} + assert names == {"a", "both"} + + def test_unregister(self, registry: AccessorRegistry) -> None: + """Unregister removes accessor by name.""" + accessor = TestAccessor("test", "test:", 50) + registry.register(accessor) + + assert registry.unregister("TestAccessor") is True + assert len(registry.list_accessors()) == 0 + + assert registry.unregister("NotFound") is False + + def test_clear(self, registry: AccessorRegistry) -> None: + """Clear removes all accessors.""" + registry.register(TestAccessor("a", "a:", 50)) + registry.register(TestAccessor("b", "b:", 50)) + + assert len(registry.list_accessors()) == 2 + registry.clear() + assert len(registry.list_accessors()) == 0 + + @pytest.mark.asyncio + async def test_access_fallback_to_local( + self, registry: AccessorRegistry, tmp_path: Path + ) -> None: + """access() falls back to local file when no accessor matches.""" + test_file = tmp_path / "test.txt" + test_file.write_text("content") + + result = await registry.access(str(test_file)) + + assert result.source_type == "local" + assert result.path == test_file + assert result.is_temporary is False + + @pytest.mark.asyncio + async def test_access_with_accessor(self, registry: AccessorRegistry) -> None: + """access() uses the matching accessor.""" + accessor = TestAccessor("test", "test:", 50) + registry.register(accessor) + + result = await registry.access("test:source", extra="value") + + assert result.source_type == "test" + assert result.meta == {"accessor": "test", "extra": "value"} + + +class TestGlobalRegistry: + """Tests for the global registry.""" + + def test_get_accessor_registry(self) -> None: + """get_accessor_registry returns a singleton.""" + r1 = get_accessor_registry() + r2 = get_accessor_registry() + assert r1 is r2 From 6a2d1898f59d96d1b9a5f18609dc529fe450859d Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 11:30:45 +0800 Subject: [PATCH 02/15] reort: split parser layer to 2-layer: accessor and parser, so that we can reuse more code --- crates/ov_cli/src/client.rs | 12 ++- crates/ov_cli/src/commands/resources.rs | 14 +++ openviking/parse/accessors/__init__.py | 8 +- .../{feishu.py => feishu_accessor.py} | 0 .../accessors/{git.py => git_accessor.py} | 0 .../accessors/{http.py => http_accessor.py} | 0 openviking/parse/accessors/local_accessor.py | 99 +++++++++++++++++++ openviking/parse/accessors/registry.py | 33 ++++--- openviking/parse/parsers/directory.py | 2 +- tests/unit/test_accessors_registry.py | 5 + 10 files changed, 150 insertions(+), 23 deletions(-) rename openviking/parse/accessors/{feishu.py => feishu_accessor.py} (100%) rename openviking/parse/accessors/{git.py => git_accessor.py} (100%) rename openviking/parse/accessors/{http.py => http_accessor.py} (100%) create mode 100644 openviking/parse/accessors/local_accessor.py diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index 77afe352e..5c3ed3ae7 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -102,14 +102,20 @@ impl HttpClient { // Remove Content-Type: application/json, let reqwest set multipart/form-data automatically headers.remove(reqwest::header::CONTENT_TYPE); - let response = self - .http + // Use a separate HTTP client with a long timeout for large file uploads + // Long but finite timeout (30 minutes) to prevent hanging indefinitely + let long_timeout_client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(1800)) + .build() + .map_err(|e| Error::Network(format!("Failed to build long-timeout HTTP client: {}", e)))?; + + let response = long_timeout_client .post(&url) .headers(headers) .multipart(form) .send() .await - .map_err(|e| Error::Network(format!("HTTP request failed: {}", e)))?; + .map_err(|e| Error::Network(format!("File upload failed: {}", e)))?; let result: Value = self.handle_response(response).await?; result diff --git a/crates/ov_cli/src/commands/resources.rs b/crates/ov_cli/src/commands/resources.rs index dc29317f5..3bb8a38e9 100644 --- a/crates/ov_cli/src/commands/resources.rs +++ b/crates/ov_cli/src/commands/resources.rs @@ -37,6 +37,13 @@ pub async fn add_resource( watch_interval, ) .await?; + + // Print helpful message for async processing + if !wait && matches!(format, OutputFormat::Table) { + eprintln!("Note: Resource is being processed in the background."); + eprintln!("Use 'ov wait' to wait for completion, or 'ov observer queue' to check status."); + } + output_success(&result, format, compact); Ok(()) } @@ -50,6 +57,13 @@ pub async fn add_skill( compact: bool, ) -> Result<()> { let result = client.add_skill(data, wait, timeout).await?; + + // Print helpful message for async processing + if !wait && matches!(format, OutputFormat::Table) { + eprintln!("Note: Skill is being processed in the background."); + eprintln!("Use 'ov wait' to wait for completion, or 'ov observer queue' to check status."); + } + output_success(&result, format, compact); Ok(()) } diff --git a/openviking/parse/accessors/__init__.py b/openviking/parse/accessors/__init__.py index 1836f30b5..6ae4fe2c5 100644 --- a/openviking/parse/accessors/__init__.py +++ b/openviking/parse/accessors/__init__.py @@ -9,9 +9,10 @@ """ from .base import DataAccessor, LocalResource -from .feishu import FeishuAccessor -from .git import GitAccessor -from .http import HTTPAccessor +from .feishu_accessor import FeishuAccessor +from .git_accessor import GitAccessor +from .http_accessor import HTTPAccessor +from .local_accessor import LocalAccessor from .registry import ( AccessorRegistry, access, @@ -30,4 +31,5 @@ "GitAccessor", "HTTPAccessor", "FeishuAccessor", + "LocalAccessor", ] diff --git a/openviking/parse/accessors/feishu.py b/openviking/parse/accessors/feishu_accessor.py similarity index 100% rename from openviking/parse/accessors/feishu.py rename to openviking/parse/accessors/feishu_accessor.py diff --git a/openviking/parse/accessors/git.py b/openviking/parse/accessors/git_accessor.py similarity index 100% rename from openviking/parse/accessors/git.py rename to openviking/parse/accessors/git_accessor.py diff --git a/openviking/parse/accessors/http.py b/openviking/parse/accessors/http_accessor.py similarity index 100% rename from openviking/parse/accessors/http.py rename to openviking/parse/accessors/http_accessor.py diff --git a/openviking/parse/accessors/local_accessor.py b/openviking/parse/accessors/local_accessor.py new file mode 100644 index 000000000..48305121b --- /dev/null +++ b/openviking/parse/accessors/local_accessor.py @@ -0,0 +1,99 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +Local file system accessor for OpenViking. + +Provides a DataAccessor implementation for local files and directories. +This is the lowest-priority accessor that handles any path-like source +that isn't handled by other accessors. +""" + +from pathlib import Path +from typing import Union + +from .base import DataAccessor, LocalResource, SourceType + + +class LocalAccessor(DataAccessor): + """ + Local file system accessor. + + This accessor handles local files and directories. It should be + registered with the lowest priority so that it only handles sources + that aren't picked up by other accessors (Git, HTTP, Feishu, etc.). + + Features: + - Handles any existing local path (file or directory) + - Marks resources as non-temporary (since they're already local) + - Provides clear local source type metadata + """ + + def can_handle(self, source: Union[str, Path]) -> bool: + """ + Check if this accessor can handle the source. + + LocalAccessor accepts any source that: + 1. Is a Path object, OR + 2. Is a string that looks like a local path and exists, OR + 3. Is a string that doesn't look like a URL (for fallback) + + Since this is a fallback accessor, it returns True for most sources. + The priority system ensures other accessors get a chance first. + + Args: + source: Source string or Path object + + Returns: + True - this is a fallback accessor that can handle any source + """ + # As the fallback accessor, we can handle anything + # The registry will try higher-priority accessors first + return True + + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: + """ + Access a local file or directory. + + Simply wraps the local path in a LocalResource without any + fetching or copying (since it's already local). + + Args: + source: Local file path or Path object + **kwargs: Additional arguments (unused for local accessor) + + Returns: + LocalResource pointing to the local path + """ + path = Path(source) + + return LocalResource( + path=path, + source_type=SourceType.LOCAL, + original_source=str(source), + meta={ + "filename": path.name, + "suffix": path.suffix.lower() if path.suffix else None, + "is_dir": path.is_dir() if path.exists() else None, + }, + is_temporary=False, + ) + + @property + def priority(self) -> int: + """ + Priority of this accessor. + + Returns 1 - the lowest priority, ensuring this accessor is only + used when no other accessor can handle the source. + + Standard priority levels: + - 100: Specific service (Feishu, etc.) + - 80: Version control (Git, etc.) + - 50: Generic protocols (HTTP, etc.) + - 10: Fallback accessors + - 1: Local file system (this one) + + Returns: + Priority level 1 + """ + return 1 diff --git a/openviking/parse/accessors/registry.py b/openviking/parse/accessors/registry.py index d2bfb8b63..985cb2bf1 100644 --- a/openviking/parse/accessors/registry.py +++ b/openviking/parse/accessors/registry.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import List, Optional, Union -from .base import DataAccessor, LocalResource, SourceType +from .base import DataAccessor, LocalResource logger = logging.getLogger(__name__) @@ -38,7 +38,7 @@ def _register_defaults(self) -> None: """Register default accessors.""" # GitAccessor - handles git repositories try: - from .git import GitAccessor + from .git_accessor import GitAccessor self.register(GitAccessor()) except Exception as e: @@ -46,7 +46,7 @@ def _register_defaults(self) -> None: # HTTPAccessor - handles HTTP/HTTPS URLs try: - from .http import HTTPAccessor + from .http_accessor import HTTPAccessor self.register(HTTPAccessor()) except Exception as e: @@ -54,12 +54,20 @@ def _register_defaults(self) -> None: # FeishuAccessor - handles Feishu/Lark documents try: - from .feishu import FeishuAccessor + from .feishu_accessor import FeishuAccessor self.register(FeishuAccessor()) except Exception as e: logger.debug(f"[AccessorRegistry] Failed to register FeishuAccessor: {e}") + # LocalAccessor - handles local files (lowest priority) + try: + from .local_accessor import LocalAccessor + + self.register(LocalAccessor()) + except Exception as e: + logger.debug(f"[AccessorRegistry] Failed to register LocalAccessor: {e}") + def register(self, accessor: DataAccessor) -> None: """ Register an accessor. @@ -133,8 +141,6 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: """ Access a source by routing to the appropriate accessor. - If no accessor can handle the source, it is treated as a local file. - Args: source: Source string (URL, path, etc.) or Path object **kwargs: Additional arguments passed to the accessor @@ -144,7 +150,7 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: """ source_str = str(source) - # Try to find an accessor + # Find an accessor - LocalAccessor should always be registered as fallback accessor = self.get_accessor(source) if accessor: logger.debug( @@ -152,15 +158,10 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: ) return await accessor.access(source, **kwargs) - # Default: treat as local file - logger.debug(f"[AccessorRegistry] No accessor found, treating as local file: {source_str}") - path = Path(source) - return LocalResource( - path=path, - source_type=SourceType.LOCAL, - original_source=source_str, - meta={}, - is_temporary=False, + # This should not happen if LocalAccessor is registered + raise RuntimeError( + f"No accessor found for source: {source_str}. " + "LocalAccessor should be registered as a fallback." ) def clear(self) -> None: diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py index d190d1367..027f13c37 100644 --- a/openviking/parse/parsers/directory.py +++ b/openviking/parse/parsers/directory.py @@ -474,7 +474,7 @@ async def _is_git_repository(source_path: Path) -> bool: async def _add_git_metadata(source_path: Path, kwargs: dict) -> None: """Add git metadata (branch, commit, repo_name) to kwargs dictionary.""" try: - from openviking.parse.accessors.git import GitAccessor + from openviking.parse.accessors.git_accessor import GitAccessor git_accessor = GitAccessor() diff --git a/tests/unit/test_accessors_registry.py b/tests/unit/test_accessors_registry.py index 858345a69..5d5803a6f 100644 --- a/tests/unit/test_accessors_registry.py +++ b/tests/unit/test_accessors_registry.py @@ -124,6 +124,11 @@ async def test_access_fallback_to_local( self, registry: AccessorRegistry, tmp_path: Path ) -> None: """access() falls back to local file when no accessor matches.""" + from openviking.parse.accessors.local_accessor import LocalAccessor + + # Register LocalAccessor as fallback + registry.register(LocalAccessor()) + test_file = tmp_path / "test.txt" test_file.write_text("content") From daaead12657eff00ba84f036e48bd35d6a1e961f Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 11:39:25 +0800 Subject: [PATCH 03/15] reort: split parser layer to 2-layer: accessor and parser, so that we can reuse more code --- crates/ov_cli/src/client.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index 5c3ed3ae7..6250fa6bf 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -103,8 +103,10 @@ impl HttpClient { headers.remove(reqwest::header::CONTENT_TYPE); // Use a separate HTTP client with a long timeout for large file uploads - // Long but finite timeout (30 minutes) to prevent hanging indefinitely + // - Connect timeout: 30 seconds (to fail fast if server is unreachable) + // - Total request timeout: 30 minutes (to allow large file transfers) let long_timeout_client = reqwest::Client::builder() + .connect_timeout(std::time::Duration::from_secs(30)) .timeout(std::time::Duration::from_secs(1800)) .build() .map_err(|e| Error::Network(format!("Failed to build long-timeout HTTP client: {}", e)))?; From 42f7160eb7b26eb5992af2e8da37306c4b1021c6 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 12:06:19 +0800 Subject: [PATCH 04/15] reort: split parser layer to 2-layer: accessor and parser, so that we can reuse more code --- openviking/parse/parsers/pdf.py | 41 +++++++++++++++++++------- openviking/server/local_input_guard.py | 31 +++++++++++++++++-- openviking/server/routers/resources.py | 32 ++++++++++++++++++-- 3 files changed, 88 insertions(+), 16 deletions(-) diff --git a/openviking/parse/parsers/pdf.py b/openviking/parse/parsers/pdf.py index 97c1ce6d1..0372b3e0d 100644 --- a/openviking/parse/parsers/pdf.py +++ b/openviking/parse/parsers/pdf.py @@ -93,7 +93,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) Args: source: Path to PDF file - **kwargs: Additional options (currently unused) + **kwargs: Additional options (resource_name/source_name for original filename) Returns: ParseResult with document tree @@ -105,6 +105,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) start_time = time.time() pdf_path = Path(source) + # Get resource name from kwargs, prefer original filename from upload + resource_name = kwargs.get("resource_name") or kwargs.get("source_name") + if not pdf_path.exists(): return create_parse_result( root=ResourceNode(type=NodeType.ROOT), @@ -117,11 +120,19 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) try: # Step 1: Convert PDF to Markdown - markdown_content, conversion_meta = await self._convert_to_markdown(pdf_path) + markdown_content, conversion_meta = await self._convert_to_markdown( + pdf_path, + resource_name=resource_name, + ) - # Step 2: Parse Markdown using MarkdownParser + # Step 2: Parse Markdown using MarkdownParser, pass through resource name md_parser = self._get_markdown_parser() - result = await md_parser.parse_content(markdown_content, source_path=str(pdf_path)) + result = await md_parser.parse_content( + markdown_content, + source_path=str(pdf_path), + resource_name=resource_name, + source_name=resource_name, + ) # Step 3: Update metadata for PDF origin result.source_format = "pdf" # Override markdown format @@ -152,12 +163,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) warnings=[f"Failed to parse PDF: {e}"], ) - async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any]]: + async def _convert_to_markdown( + self, + pdf_path: Path, + resource_name: Optional[str] = None, + ) -> tuple[str, Dict[str, Any]]: """ Convert PDF to Markdown using configured strategy. Args: pdf_path: Path to PDF file + resource_name: Optional resource name for organizing saved images Returns: Tuple of (markdown_content, metadata_dict) @@ -166,22 +182,22 @@ async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any ValueError: If all conversion strategies fail """ if self.config.strategy == "local": - return await self._convert_local(pdf_path) + return await self._convert_local(pdf_path, resource_name=resource_name) elif self.config.strategy == "mineru": - return await self._convert_mineru(pdf_path) + return await self._convert_mineru(pdf_path, resource_name=resource_name) elif self.config.strategy == "auto": # Try local first try: - return await self._convert_local(pdf_path) + return await self._convert_local(pdf_path, resource_name=resource_name) except Exception as e: logger.warning(f"Local conversion failed: {e}") # Fallback to MinerU if configured if self.config.mineru_endpoint: logger.info("Falling back to MinerU API") - return await self._convert_mineru(pdf_path) + return await self._convert_mineru(pdf_path, resource_name=resource_name) else: raise ValueError( f"Local conversion failed and no MinerU endpoint configured: {e}" @@ -594,12 +610,17 @@ def _extract_image_from_page(self, page, img_info: dict) -> Optional[bytes]: logger.debug(f"Image extraction error: {e}") return None - async def _convert_mineru(self, pdf_path: Path) -> tuple[str, Dict[str, Any]]: + async def _convert_mineru( + self, + pdf_path: Path, + resource_name: Optional[str] = None, + ) -> tuple[str, Dict[str, Any]]: """ Convert PDF to Markdown using MinerU API. Args: pdf_path: Path to PDF file + resource_name: Optional resource name (unused in MinerU conversion) Returns: Tuple of (markdown_content, metadata) diff --git a/openviking/server/local_input_guard.py b/openviking/server/local_input_guard.py index d7a08a360..ff70620ef 100644 --- a/openviking/server/local_input_guard.py +++ b/openviking/server/local_input_guard.py @@ -4,8 +4,10 @@ from __future__ import annotations +import json import re from pathlib import Path +from typing import Optional, Tuple from openviking.utils.network_guard import ensure_public_remote_target from openviking_cli.exceptions import PermissionDeniedError @@ -51,8 +53,26 @@ def deny_direct_local_skill_input(value: str) -> None: ) -def resolve_uploaded_temp_file_id(temp_file_id: str, upload_temp_dir: Path) -> str: - """Resolve a temp upload id to a regular file under the server upload temp dir.""" +def _read_upload_meta(meta_path: Path) -> Optional[dict]: + """Read upload metadata file if it exists.""" + try: + if meta_path.exists(): + with open(meta_path, "r") as f: + return json.load(f) + except Exception: + pass + return None + + +def resolve_uploaded_temp_file_id( + temp_file_id: str, upload_temp_dir: Path +) -> Tuple[str, Optional[str]]: + """Resolve a temp upload id to a regular file under the server upload temp dir. + + Returns: + Tuple of (resolved_file_path, original_filename) + original_filename is None if no meta file exists. + """ if not temp_file_id or temp_file_id in {".", ".."}: raise PermissionDeniedError( "HTTP server only accepts regular files from the upload temp directory." @@ -90,4 +110,9 @@ def resolve_uploaded_temp_file_id(temp_file_id: str, upload_temp_dir: Path) -> s "HTTP server only accepts regular files from the upload temp directory." ) - return str(resolved_path) + # Try to read original filename from meta file + meta_path = upload_temp_dir / f"{temp_file_id}.ov_upload.meta" + meta = _read_upload_meta(meta_path) + original_filename = meta.get("original_filename") if meta else None + + return (str(resolved_path), original_filename) diff --git a/openviking/server/routers/resources.py b/openviking/server/routers/resources.py index 1f03b5190..0d9703402 100644 --- a/openviking/server/routers/resources.py +++ b/openviking/server/routers/resources.py @@ -130,6 +130,12 @@ def _cleanup_temp_files(temp_dir: Path, max_age_hours: int = 1): if file_age > max_age_seconds: file_path.unlink(missing_ok=True) + # Also clean up corresponding .ov_upload.meta file + if not file_path.name.endswith(".ov_upload.meta"): + meta_path = temp_dir / f"{file_path.name}.ov_upload.meta" + if meta_path.exists(): + meta_path.unlink(missing_ok=True) + @router.post("/resources/temp_upload") async def temp_upload( @@ -140,6 +146,8 @@ async def temp_upload( """Upload a temporary file for add_resource or import_ovpack.""" async def _upload() -> dict[str, str]: + import json + config = get_openviking_config() temp_dir = config.storage.get_upload_temp_dir() @@ -154,6 +162,16 @@ async def _upload() -> dict[str, str]: with open(temp_file_path, "wb") as f: f.write(await file.read()) + # Save metadata with original filename + if file.filename: + meta_path = temp_dir / f"{temp_filename}.ov_upload.meta" + meta = { + "original_filename": file.filename, + "upload_time": time.time(), + } + with open(meta_path, "w") as f: + json.dump(meta, f) + return {"temp_file_id": temp_filename} execution = await run_operation( @@ -181,17 +199,25 @@ async def add_resource( upload_temp_dir = get_openviking_config().storage.get_upload_temp_dir() path = request.path allow_local_path_resolution = False + original_filename = None if request.temp_file_id: - path = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir) + path, original_filename = resolve_uploaded_temp_file_id( + request.temp_file_id, upload_temp_dir + ) allow_local_path_resolution = True elif path is not None: path = require_remote_resource_source(path) if path is None: raise InvalidArgumentError("Either 'path' or 'temp_file_id' must be provided.") + # Use original_filename from upload if source_name not explicitly provided + source_name = request.source_name + if source_name is None and original_filename is not None: + source_name = original_filename + kwargs = { "strict": request.strict, - "source_name": request.source_name, + "source_name": source_name, "ignore_dirs": request.ignore_dirs, "include": request.include, "exclude": request.exclude, @@ -236,7 +262,7 @@ async def add_skill( data = request.data allow_local_path_resolution = False if request.temp_file_id: - data = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir) + data, _ = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir) allow_local_path_resolution = True execution = await run_operation( From 1df49f18140d7f9cc7d166f214c32ace53d9d3f1 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 13:49:49 +0800 Subject: [PATCH 05/15] fix: zip optimization --- openviking/parse/accessors/git_accessor.py | 242 ++++++++++-------- openviking/parse/parsers/zip_parser.py | 52 ++-- openviking/utils/network_guard.py | 61 ++++- .../utils/config/open_viking_config.py | 14 +- 4 files changed, 235 insertions(+), 134 deletions(-) diff --git a/openviking/parse/accessors/git_accessor.py b/openviking/parse/accessors/git_accessor.py index 1590bca0d..7f2595159 100644 --- a/openviking/parse/accessors/git_accessor.py +++ b/openviking/parse/accessors/git_accessor.py @@ -100,8 +100,8 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: commit = kwargs.get("commit") try: - # Create local temp directory - temp_local_dir = tempfile.mkdtemp(prefix="ov_git_") + # Create local temp directory (non-blocking) + temp_local_dir = await asyncio.to_thread(tempfile.mkdtemp, prefix="ov_git_") logger.info(f"[GitAccessor] Created local temp dir for git: {temp_local_dir}") # Fetch content (Clone or Extract) @@ -410,38 +410,47 @@ def _download() -> None: except Exception as exc: raise RuntimeError(f"Failed to download GitHub ZIP {zip_url}: {exc}") - # Safe extraction with Zip Slip validation - target = Path(extract_dir).resolve() - with zipfile.ZipFile(zip_path, "r") as zf: - for info in zf.infolist(): - mode = info.external_attr >> 16 - if info.is_dir() or stat.S_ISDIR(mode): - continue - if stat.S_ISLNK(mode): - logger.warning( - f"[GitAccessor] Skipping symlink entry in GitHub ZIP: {info.filename}" - ) - continue - raw = info.filename.replace("\\", "/") - raw_parts = [p for p in raw.split("/") if p] - if ".." in raw_parts: - raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") - if PurePosixPath(raw).is_absolute(): - raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") - extracted = Path(zf.extract(info, extract_dir)).resolve() - if not extracted.is_relative_to(target): - extracted.unlink(missing_ok=True) - raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") - - # Remove downloaded archive to free disk space. - try: - os.unlink(zip_path) - except OSError: - pass + # Safe extraction with Zip Slip validation (non-blocking) + def _extract_zip(): + target = Path(extract_dir).resolve() + with zipfile.ZipFile(zip_path, "r") as zf: + for info in zf.infolist(): + mode = info.external_attr >> 16 + if info.is_dir() or stat.S_ISDIR(mode): + continue + if stat.S_ISLNK(mode): + logger.warning( + f"[GitAccessor] Skipping symlink entry in GitHub ZIP: {info.filename}" + ) + continue + raw = info.filename.replace("\\", "/") + raw_parts = [p for p in raw.split("/") if p] + if ".." in raw_parts: + raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") + if PurePosixPath(raw).is_absolute(): + raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") + extracted = Path(zf.extract(info, extract_dir)).resolve() + if not extracted.is_relative_to(target): + extracted.unlink(missing_ok=True) + raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}") + + await asyncio.to_thread(_extract_zip) + + # Remove downloaded archive to free disk space (non-blocking) + def _cleanup_archive(): + try: + os.unlink(zip_path) + except OSError: + pass - # GitHub ZIPs have a single top-level directory - top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()] - content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir) + await asyncio.to_thread(_cleanup_archive) + + # GitHub ZIPs have a single top-level directory (non-blocking) + def _find_content_dir() -> Path: + top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()] + return top_level[0] if len(top_level) == 1 else Path(extract_dir) + + content_dir = await asyncio.to_thread(_find_content_dir) logger.info(f"[GitAccessor] GitHub ZIP extracted to {content_dir} ({repo_name})") return content_dir, repo_name @@ -487,38 +496,47 @@ def _download() -> None: except Exception as exc: raise RuntimeError(f"Failed to download GitLab ZIP {zip_url}: {exc}") - # Safe extraction with Zip Slip validation - target = Path(extract_dir).resolve() - with zipfile.ZipFile(zip_path, "r") as zf: - for info in zf.infolist(): - mode = info.external_attr >> 16 - if info.is_dir() or stat.S_ISDIR(mode): - continue - if stat.S_ISLNK(mode): - logger.warning( - f"[GitAccessor] Skipping symlink entry in GitLab ZIP: {info.filename}" - ) - continue - raw = info.filename.replace("\\", "/") - raw_parts = [p for p in raw.split("/") if p] - if ".." in raw_parts: - raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") - if PurePosixPath(raw).is_absolute(): - raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") - extracted = Path(zf.extract(info, extract_dir)).resolve() - if not extracted.is_relative_to(target): - extracted.unlink(missing_ok=True) - raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") - - # Remove downloaded archive to free disk space. - try: - os.unlink(zip_path) - except OSError: - pass + # Safe extraction with Zip Slip validation (non-blocking) + def _extract_zip(): + target = Path(extract_dir).resolve() + with zipfile.ZipFile(zip_path, "r") as zf: + for info in zf.infolist(): + mode = info.external_attr >> 16 + if info.is_dir() or stat.S_ISDIR(mode): + continue + if stat.S_ISLNK(mode): + logger.warning( + f"[GitAccessor] Skipping symlink entry in GitLab ZIP: {info.filename}" + ) + continue + raw = info.filename.replace("\\", "/") + raw_parts = [p for p in raw.split("/") if p] + if ".." in raw_parts: + raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") + if PurePosixPath(raw).is_absolute(): + raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") + extracted = Path(zf.extract(info, extract_dir)).resolve() + if not extracted.is_relative_to(target): + extracted.unlink(missing_ok=True) + raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}") - # GitLab ZIPs have a single top-level directory: {repo}-{ref}/ - top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()] - content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir) + await asyncio.to_thread(_extract_zip) + + # Remove downloaded archive to free disk space (non-blocking) + def _cleanup_archive(): + try: + os.unlink(zip_path) + except OSError: + pass + + await asyncio.to_thread(_cleanup_archive) + + # GitLab ZIPs have a single top-level directory: {repo}-{ref}/ (non-blocking) + def _find_content_dir() -> Path: + top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()] + return top_level[0] if len(top_level) == 1 else Path(extract_dir) + + content_dir = await asyncio.to_thread(_find_content_dir) logger.info(f"[GitAccessor] GitLab ZIP extracted to {content_dir} ({repo_name})") return content_dir, repo_name @@ -531,51 +549,61 @@ async def _extract_zip(self, zip_path: str, target_dir: str) -> str: path = Path(zip_path) name = path.stem - with zipfile.ZipFile(zip_path, "r") as zip_ref: - target = Path(target_dir).resolve() - for info in zip_ref.infolist(): - mode = info.external_attr >> 16 - # Skip directory entries - if info.is_dir() or stat.S_ISDIR(mode): - continue - # Skip symlink entries - if stat.S_ISLNK(mode): - logger.warning(f"[GitAccessor] Skipping symlink entry in zip: {info.filename}") - continue - # Reject entries with suspicious raw path components - raw = info.filename.replace("\\", "/") - raw_parts = [p for p in raw.split("/") if p] - if ".." in raw_parts: - raise ValueError(f"Zip Slip detected: entry {info.filename!r} contains '..'") - if PurePosixPath(raw).is_absolute() or (len(raw) >= 2 and raw[1] == ":"): - raise ValueError( - f"Zip Slip detected: entry {info.filename!r} is an absolute path" - ) - # Normalize and verify - arcname = info.filename.replace("/", os.sep) - if os.path.altsep: - arcname = arcname.replace(os.path.altsep, os.sep) - arcname = os.path.splitdrive(arcname)[1] - arcname = os.sep.join(p for p in arcname.split(os.sep) if p not in ("", ".", "..")) - if not arcname: - continue - member_path = (Path(target_dir) / arcname).resolve() - if not member_path.is_relative_to(target): - raise ValueError( - f"Zip Slip detected: entry {info.filename!r} escapes target directory" - ) - # Extract and verify - extracted = Path(zip_ref.extract(info, target_dir)).resolve() - if not extracted.is_relative_to(target): - # Best-effort cleanup - try: - extracted.unlink(missing_ok=True) - except OSError as cleanup_err: + # Extract zip file (non-blocking) + def _extract_zip_sync(): + with zipfile.ZipFile(zip_path, "r") as zip_ref: + target = Path(target_dir).resolve() + for info in zip_ref.infolist(): + mode = info.external_attr >> 16 + # Skip directory entries + if info.is_dir() or stat.S_ISDIR(mode): + continue + # Skip symlink entries + if stat.S_ISLNK(mode): logger.warning( - f"[GitAccessor] Failed to clean up escaped file {extracted}: {cleanup_err}" + f"[GitAccessor] Skipping symlink entry in zip: {info.filename}" + ) + continue + # Reject entries with suspicious raw path components + raw = info.filename.replace("\\", "/") + raw_parts = [p for p in raw.split("/") if p] + if ".." in raw_parts: + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} contains '..'" + ) + if PurePosixPath(raw).is_absolute() or (len(raw) >= 2 and raw[1] == ":"): + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} is an absolute path" ) - raise ValueError( - f"Zip Slip detected: entry {info.filename!r} escapes target directory" + # Normalize and verify + arcname = info.filename.replace("/", os.sep) + if os.path.altsep: + arcname = arcname.replace(os.path.altsep, os.sep) + arcname = os.path.splitdrive(arcname)[1] + arcname = os.sep.join( + p for p in arcname.split(os.sep) if p not in ("", ".", "..") ) + if not arcname: + continue + member_path = (Path(target_dir) / arcname).resolve() + if not member_path.is_relative_to(target): + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} escapes target directory" + ) + # Extract and verify + extracted = Path(zip_ref.extract(info, target_dir)).resolve() + if not extracted.is_relative_to(target): + # Best-effort cleanup + try: + extracted.unlink(missing_ok=True) + except OSError as cleanup_err: + logger.warning( + f"[GitAccessor] Failed to clean up escaped file {extracted}: {cleanup_err}" + ) + raise ValueError( + f"Zip Slip detected: entry {info.filename!r} escapes target directory" + ) + + await asyncio.to_thread(_extract_zip_sync) return name diff --git a/openviking/parse/parsers/zip_parser.py b/openviking/parse/parsers/zip_parser.py index 5dbce8174..17d782fc2 100644 --- a/openviking/parse/parsers/zip_parser.py +++ b/openviking/parse/parsers/zip_parser.py @@ -7,6 +7,7 @@ Supports nested ZIP files via DirectoryParser's recursive parser invocation. """ +import asyncio import shutil import tempfile import zipfile @@ -55,17 +56,28 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) if not path.exists(): raise FileNotFoundError(f"ZIP file not found: {path}") - if not zipfile.is_zipfile(path): + # Check if it's a valid ZIP file (non-blocking) + def _is_zipfile() -> bool: + return zipfile.is_zipfile(path) + + if not await asyncio.to_thread(_is_zipfile): raise ValueError(f"Not a valid ZIP file: {path}") - # Extract ZIP to temporary directory - temp_dir = Path(tempfile.mkdtemp(prefix="ov_zip_")) + # Extract ZIP to temporary directory (non-blocking) + temp_dir = Path(await asyncio.to_thread(tempfile.mkdtemp, prefix="ov_zip_")) try: - with zipfile.ZipFile(path, "r") as zipf: - safe_extract_zip(zipf, temp_dir) + # Extract zip (non-blocking) + def _extract_zip(): + with zipfile.ZipFile(path, "r") as zipf: + safe_extract_zip(zipf, temp_dir) + + await asyncio.to_thread(_extract_zip) - # Check if extracted content has a single root directory - extracted_entries = [p for p in temp_dir.iterdir() if p.name not in {".", ".."}] + # Check if extracted content has a single root directory (non-blocking) + def _list_entries(): + return [p for p in temp_dir.iterdir() if p.name not in {".", ".."}] + + extracted_entries = await asyncio.to_thread(_list_entries) # Prepare kwargs for DirectoryParser dir_kwargs = dict(kwargs) @@ -91,22 +103,28 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) if not result.temp_dir_path: result.temp_dir_path = str(temp_dir) else: - # If DirectoryParser created its own temp, clean up our extraction dir - try: - shutil.rmtree(temp_dir, ignore_errors=True) - except Exception: - pass + # If DirectoryParser created its own temp, clean up our extraction dir (non-blocking) + def _cleanup_temp(): + try: + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception: + pass + + await asyncio.to_thread(_cleanup_temp) result.source_format = "zip" result.parser_name = "ZipParser" return result except Exception: - # Clean up on error - try: - shutil.rmtree(temp_dir, ignore_errors=True) - except Exception: - pass + # Clean up on error (non-blocking) + def _cleanup_temp_on_error(): + try: + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception: + pass + + await asyncio.to_thread(_cleanup_temp_on_error) raise async def parse_content( diff --git a/openviking/utils/network_guard.py b/openviking/utils/network_guard.py index a9dad2c68..ba5d29b08 100644 --- a/openviking/utils/network_guard.py +++ b/openviking/utils/network_guard.py @@ -11,6 +11,7 @@ from urllib.parse import urlparse from openviking_cli.exceptions import PermissionDeniedError +from openviking_cli.utils.config import get_openviking_config RequestValidator = Callable[[str], None] @@ -20,13 +21,48 @@ } +def _get_allowed_code_hosting_domains() -> set[str]: + """Get allowed code hosting domains from config.""" + allowed = set() + try: + config = get_openviking_config() + # Add configured GitHub and GitLab domains + if hasattr(config, "code"): + if hasattr(config.code, "github_domains"): + allowed.update(config.code.github_domains) + if hasattr(config.code, "gitlab_domains"): + allowed.update(config.code.gitlab_domains) + if hasattr(config.code, "code_hosting_domains"): + allowed.update(config.code.code_hosting_domains) + except Exception: + # If config can't be loaded, use defaults + allowed.update({"github.com", "gitlab.com", "www.github.com", "www.gitlab.com"}) + return allowed + + +def _is_allow_private_networks() -> bool: + """Check if private networks are allowed by config.""" + try: + config = get_openviking_config() + return getattr(config, "allow_private_networks", False) + except Exception: + return False + + def extract_remote_host(source: str) -> Optional[str]: """Extract the destination host from a remote resource source.""" if source.startswith("git@"): rest = source[4:] - if ":" not in rest: + # Find the colon separator, handling IPv6 addresses in brackets + if "]:" in rest: + # IPv6 address: git@[::1]:user/repo.git + host_part = rest.split("]:", 1)[0] + "]" + elif ":" in rest: + # Regular hostname: git@github.com:user/repo.git + host_part = rest.split(":", 1)[0] + else: return None - return rest.split(":", 1)[0].strip().strip("[]") + return host_part.strip().strip("[]") parsed = urlparse(source) if parsed.hostname is None: @@ -63,7 +99,12 @@ def _is_public_ip(address: str) -> bool: def ensure_public_remote_target(source: str) -> None: - """Reject loopback, link-local, private, and other non-public targets.""" + """Reject loopback, link-local, private, and other non-public targets. + + Skips validation if: + - allow_private_networks is True in config + - Host is in configured github_domains/gitlab_domains/code_hosting_domains + """ host = extract_remote_host(source) if not host: raise PermissionDeniedError( @@ -77,6 +118,16 @@ def ensure_public_remote_target(source: str) -> None: "loopback, link-local, private, and otherwise non-public destinations are not allowed." ) + # Check if private networks are allowed globally + if _is_allow_private_networks(): + return + + # Check if host is in allowed code hosting domains + allowed_domains = _get_allowed_code_hosting_domains() + normalized_domains = {_normalize_host(d) for d in allowed_domains} + if normalized_host in normalized_domains: + return + resolved_addresses = _resolve_host_addresses(host) if not resolved_addresses: return @@ -85,7 +136,9 @@ def ensure_public_remote_target(source: str) -> None: if non_public: raise PermissionDeniedError( "HTTP server only accepts public remote resource targets; " - f"host '{host}' resolves to non-public address '{non_public[0]}'." + f"host '{host}' resolves to non-public address '{non_public[0]}'. " + "To allow this, add the domain to code.gitlab_domains/code.github_domains/code.code_hosting_domains " + "or set allow_private_networks=true in your ov.conf." ) diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py index 9d0ee615c..fab364471 100644 --- a/openviking_cli/utils/config/open_viking_config.py +++ b/openviking_cli/utils/config/open_viking_config.py @@ -132,12 +132,6 @@ class OpenVikingConfig(BaseModel): default_search_limit: int = Field(default=3, description="Default number of results to return") - enable_memory_decay: bool = Field(default=True, description="Enable automatic memory decay") - - memory_decay_check_interval: int = Field( - default=3600, description="Interval (seconds) to check for expired memories" - ) - language_fallback: str = Field( default="en", description=( @@ -146,6 +140,14 @@ class OpenVikingConfig(BaseModel): ), ) + allow_private_networks: bool = Field( + default=False, + description=( + "Allow fetching resources from private/non-public network addresses. " + "When disabled (default), only public IP addresses and hostnames are allowed." + ), + ) + log: LogConfig = Field(default_factory=lambda: LogConfig(), description="Logging configuration") memory: MemoryConfig = Field( From bf5b3232f778302cb9798be43065c90769bcaa11 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 13:56:39 +0800 Subject: [PATCH 06/15] fix: zip optimization for gitlab --- openviking/parse/accessors/git_accessor.py | 58 +++++++++++++++++++--- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/openviking/parse/accessors/git_accessor.py b/openviking/parse/accessors/git_accessor.py index 7f2595159..8828e4aea 100644 --- a/openviking/parse/accessors/git_accessor.py +++ b/openviking/parse/accessors/git_accessor.py @@ -119,15 +119,57 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: elif source_str.startswith(("http://", "https://", "git://", "ssh://")): repo_url, branch, commit = self._parse_repo_source(source_str, **kwargs) if self._is_github_url(repo_url): - # Use GitHub ZIP API - local_dir, repo_name = await self._github_zip_download( - repo_url, branch or commit, temp_local_dir - ) + # Try GitHub ZIP API first, fall back to git clone + try: + local_dir, repo_name = await self._github_zip_download( + repo_url, branch or commit, temp_local_dir + ) + except Exception as zip_exc: + logger.warning( + f"[GitAccessor] GitHub ZIP download failed, falling back to git clone: {zip_exc}" + ) + + # Clean up any partial content before cloning + def _cleanup(): + for p in Path(temp_local_dir).iterdir(): + if p.is_file(): + p.unlink(missing_ok=True) + elif p.is_dir(): + shutil.rmtree(p, ignore_errors=True) + + await asyncio.to_thread(_cleanup) + repo_name = await self._git_clone( + repo_url, + temp_local_dir, + branch=branch, + commit=commit, + ) elif self._is_gitlab_url(repo_url): - # Use GitLab ZIP API - local_dir, repo_name = await self._gitlab_zip_download( - repo_url, branch or commit, temp_local_dir - ) + # Try GitLab ZIP API first, fall back to git clone + try: + local_dir, repo_name = await self._gitlab_zip_download( + repo_url, branch or commit, temp_local_dir + ) + except Exception as zip_exc: + logger.warning( + f"[GitAccessor] GitLab ZIP download failed, falling back to git clone: {zip_exc}" + ) + + # Clean up any partial content before cloning + def _cleanup(): + for p in Path(temp_local_dir).iterdir(): + if p.is_file(): + p.unlink(missing_ok=True) + elif p.is_dir(): + shutil.rmtree(p, ignore_errors=True) + + await asyncio.to_thread(_cleanup) + repo_name = await self._git_clone( + repo_url, + temp_local_dir, + branch=branch, + commit=commit, + ) else: # Non-GitHub/GitLab URL: use git clone repo_name = await self._git_clone( From 52438286c700d7fa3b09576fb19ba04eab91b18b Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 14:37:39 +0800 Subject: [PATCH 07/15] fix: zip docs --- docs/design/parser-two-layer-refactor-plan.md | 440 +++++++----------- 1 file changed, 176 insertions(+), 264 deletions(-) diff --git a/docs/design/parser-two-layer-refactor-plan.md b/docs/design/parser-two-layer-refactor-plan.md index 1d99a7cc7..de67d469d 100644 --- a/docs/design/parser-two-layer-refactor-plan.md +++ b/docs/design/parser-two-layer-refactor-plan.md @@ -1,355 +1,267 @@ -# OpenViking 解析器两层架构重构计划 +# OpenViking 解析器两层架构重构 | 项目 | 信息 | |-----|------| -| 状态 | `规划中` | +| 状态 | `已完成` | | 创建日期 | 2026-04-13 | -| 目标版本 | v6.0 | +| 完成日期 | 2026-04-14 | --- -## 一、问题分析 +## 概述 + +将原有的单一层 Parser 架构拆分为 **Accessor(数据访问层)** 和 **Parser(数据解析层)** 两层,实现职责分离和代码复用。 + +--- + +## 目录 + +- [背景与问题](#背景与问题) +- [架构设计](#架构设计) +- [核心抽象](#核心抽象) +- [实现进度](#实现进度) +- [文件结构](#文件结构) + +--- + +## 背景与问题 ### 当前架构的问题 -1. **平铺式注册**:所有 Parser 在同一层级,职责不清晰 -2. **后缀冲突**:`.zip` 可被 `CodeRepositoryParser` 和 `ZipParser` 同时处理 -3. **URL 处理逻辑分散**:`UnifiedResourceProcessor._process_url()` 和 `ParserRegistry.parse()` 都有 URL 检测 -4. **职责混合**:部分 Parser 既负责下载又负责解析(如 `CodeRepositoryParser`, `HTMLParser`, `FeishuParser`) +| 问题 | 说明 | +|-----|------| +| 平铺式注册 | 所有 Parser 在同一层级,职责不清晰 | +| 后缀冲突 | `.zip` 可被 `CodeRepositoryParser` 和 `ZipParser` 同时处理 | +| URL 处理逻辑分散 | `UnifiedResourceProcessor._process_url()` 和 `ParserRegistry.parse()` 都有 URL 检测 | +| 职责混合 | 部分 Parser 既负责下载又负责解析 | + +### 重构目标 + +1. **职责分离**:数据获取 ≠ 数据解析 +2. **代码复用**:HTTP/Git 下载逻辑可被多个 Parser 复用 +3. **易于扩展**:新增数据源只需添加 Accessor +4. **解决冲突**:通过优先级机制解决 `.zip` 等后缀冲突 --- -## 二、新架构设计 +## 架构设计 -### 核心概念 +### 两层架构 | 层级 | 抽象接口 | 职责 | 示例 | |-----|---------|------|------| -| **L1** | `DataAccessor` | 获取数据:将远程 URL / 特殊路径 → 本地文件/目录 | GitAccessor, HTTPAccessor, FeishuAccessor | -| **L2** | `DataParser` | 解析数据:本地文件/目录 → `ParseResult` | MarkdownParser, PDFParser, DirectoryParser | -| **混合** | `HybridParser` | 同时实现两个接口(简化插件开发) | (按需使用) | +| **L1: Accessor** | `DataAccessor` | 获取数据:远程 URL / 特殊路径 → 本地文件/目录 | `GitAccessor`, `HTTPAccessor`, `FeishuAccessor` | +| **L2: Parser** | `BaseParser` | 解析数据:本地文件/目录 → `ParseResult` | `MarkdownParser`, `PDFParser`, `ZipParser` | -### 新的调用流程 +### 调用流程 ``` add_resource(path) ↓ ResourceProcessor.process_resource() ↓ -UnifiedResourceProcessor.process() [重构] +UnifiedResourceProcessor.process() ↓ -【第一阶段:数据访问】 -AccessorRegistry.route(source) - ├─→ 是 URL/远程资源? - │ ├─→ GitAccessor (git@, git://, github.com, ...) - │ ├─→ FeishuAccessor (.feishu.cn, .larksuite.com) - │ ├─→ HTTPAccessor (http://, https://) - │ └─→ 其他 → 下一步 - └─→ 返回: LocalResource (本地路径 + 元数据) - ↓ -【第二阶段:数据解析】 -ParserRegistry.route(local_resource) - ├─→ 是目录? → DirectoryParser - ├─→ 是文件? → 按扩展名匹配 - │ ├─→ .md → MarkdownParser - │ ├─→ .pdf → PDFParser - │ ├─→ .zip → ZipParser - │ └─→ ... - └─→ 返回: ParseResult - ↓ +┌─────────────────────────────────────────┐ +│ 第一阶段:数据访问 (Accessor Layer) │ +├─────────────────────────────────────────┤ +│ AccessorRegistry.route(source) │ +│ ├─→ GitAccessor (priority: 80) │ +│ ├─→ FeishuAccessor (priority: 100) │ +│ ├─→ HTTPAccessor (priority: 50) │ +│ └─→ LocalAccessor (priority: 10) │ +│ ↓ │ +│ 返回: LocalResource │ +│ (本地路径 + 元数据) │ +└─────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────┐ +│ 第二阶段:数据解析 (Parser Layer) │ +├─────────────────────────────────────────┤ +│ ParserRegistry.route(local_resource) │ +│ ├─→ 是目录? → DirectoryParser │ +│ ├─→ 是文件? → 按扩展名匹配 │ +│ │ ├─→ .md → MarkdownParser │ +│ │ ├─→ .pdf → PDFParser │ +│ │ ├─→ .zip → ZipParser │ +│ │ └─→ ... │ +│ └─→ 返回: ParseResult │ +└─────────────────────────────────────────┘ + ↓ TreeBuilder + SemanticQueue (保持不变) ``` --- -## 三、详细设计 +## 核心抽象 -### 3.1 核心抽象接口 +### 1. LocalResource(数据类) -#### 文件 1: `openviking/parse/accessors/base.py` +位置:`openviking/parse/accessors/base.py` -```python -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Optional, Dict, Any, Union -from dataclasses import dataclass +表示一个可在本地访问的资源,是 Accessor 层的输出。 +```python @dataclass class LocalResource: - """数据访问层的输出:表示可访问的本地资源""" path: Path # 本地文件/目录路径 - source_type: str # 原始来源类型: "git", "http", "feishu", "local", ... + source_type: str # 原始来源类型 (SourceType.GIT/HTTP/FEISHU/LOCAL) original_source: str # 原始 source 字符串 - meta: Dict[str, Any] # 元数据(repo_name, branch, 等) + meta: Dict[str, Any] # 元数据(repo_name, branch, content_type 等) is_temporary: bool = True # 是否为临时文件,解析后可清理 -class DataAccessor(ABC): - """数据访问器:负责获取数据到本地""" + def cleanup(self) -> None # 清理临时资源 + def __enter__/__exit__ # 支持上下文管理器 +``` +### 2. DataAccessor(抽象基类) + +位置:`openviking/parse/accessors/base.py` + +```python +class DataAccessor(ABC): @abstractmethod - def can_handle(self, source: Union[str, Path]) -> bool: + def can_handle(self, source: Union[str, Path]) -> bool """判断是否能处理该来源""" - pass @abstractmethod - async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: - """ - 获取数据到本地 - - 返回: LocalResource,包含本地路径和元数据 - """ - pass + async def access(self, source: Union[str, Path], **kwargs) -> LocalResource + """获取数据到本地,返回 LocalResource""" @property @abstractmethod - def priority(self) -> int: + def priority(self) -> int + """优先级:数字越大优先级越高 + - 100: 特定服务 (Feishu) + - 80: 版本控制 (Git) + - 50: 通用协议 (HTTP) + - 10: 兜底 (Local) """ - 优先级,数字越大优先级越高 - 用于解决冲突:多个 Accessor 都 can_handle 时,选优先级高的 - """ - pass - def cleanup(self, resource: LocalResource) -> None: - """ - 清理临时资源(可选) - 默认:如果是临时资源,删除本地文件/目录 - """ - if resource.is_temporary: - # 实现清理逻辑 - pass + def cleanup(self, resource: LocalResource) -> None + """清理资源(默认调用 resource.cleanup())""" ``` -#### 文件 2: `openviking/parse/parsers/base_parser.py` (重构) - -```python -# 保持现有 BaseParser 接口,但明确其职责为 DataParser -# 可以考虑重命名为 DataParser 或保留 BaseParser 作为别名 -``` +### 3. AccessorRegistry -#### 文件 3: `openviking/parse/accessors/registry.py` +位置:`openviking/parse/accessors/registry.py` ```python -from typing import Union, Path, Optional, List -from .base import DataAccessor, LocalResource - class AccessorRegistry: - """数据访问器注册表""" - - def __init__(self): - self._accessors: List[DataAccessor] = [] - - def register(self, accessor: DataAccessor) -> None: - """注册访问器(按优先级插入)""" - # 按 priority 降序插入 - idx = 0 - for i, a in enumerate(self._accessors): - if accessor.priority > a.priority: - idx = i - break - else: - idx = len(self._accessors) - self._accessors.insert(idx, accessor) - - def get_accessor(self, source: Union[str, Path]) -> Optional[DataAccessor]: - """获取能处理该 source 的访问器(按优先级)""" - for accessor in self._accessors: - if accessor.can_handle(source): - return accessor - return None - - async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: - """ - 路由到合适的访问器获取数据 - - 如果没有访问器能处理,视为本地文件返回 - """ - accessor = self.get_accessor(source) - if accessor: - return await accessor.access(source, **kwargs) - - # 默认:视为本地文件 - path = Path(source) - return LocalResource( - path=path, - source_type="local", - original_source=str(source), - meta={}, - is_temporary=False - ) -``` - -### 3.2 现有 Parser 拆分方案 - -| 当前 Parser | 拆分后 | 说明 | -|-----------|--------|------| -| `CodeRepositoryParser` | `GitAccessor` + `DirectoryParser` | Git clone 逻辑移到 Accessor | -| `HTMLParser` | `HTTPAccessor` + `HTMLParser` | HTTP 下载移到 Accessor | -| `FeishuParser` | `FeishuAccessor` + (新) `FeishuDocumentParser` | 飞书 API 调用移到 Accessor | -| `ZipParser` | 保持为 `DataParser` | 只处理本地 .zip 文件 | -| `DirectoryParser` | 保持为 `DataParser` | 只处理本地目录 | -| 其他 (Markdown, PDF, ...) | 保持为 `DataParser` | 无需变动 | - -### 3.3 Accessor 实现示例 - -#### `GitAccessor` (`openviking/parse/accessors/git.py`) - -```python -# 从 CodeRepositoryParser 提取 git clone / GitHub ZIP 下载逻辑 -``` + def __init__(self, register_default: bool = True) + """初始化注册表,可选是否注册默认 Accessor""" -#### `HTTPAccessor` (`openviking/parse/accessors/http.py`) + def register(self, accessor: DataAccessor) -> None + """注册 Accessor(按优先级降序插入)""" -```python -# 从 HTMLParser 提取下载逻辑 -# 支持内容类型检测,下载到临时文件 -``` + def unregister(self, accessor_name: str) -> bool + """注销 Accessor""" -#### `FeishuAccessor` (`openviking/parse/accessors/feishu.py`) + def get_accessor(self, source) -> Optional[DataAccessor] + """获取能处理该 source 的最高优先级 Accessor""" -```python -# 从 FeishuParser 提取 API 调用逻辑 + async def access(self, source, **kwargs) -> LocalResource + """路由到合适的 Accessor 获取数据""" ``` -### 3.4 HybridParser(简化插件开发) - -对于简单的插件,允许同时实现两个接口: - -```python -class HybridParser(DataAccessor, DataParser): - """ - 混合解析器:同时实现访问和解析 - 适用于简单的自定义解析器,不需要拆分 - """ - # 同时实现 DataAccessor 和 DataParser 的接口 - pass -``` +默认注册的 Accessor(按优先级): +1. `FeishuAccessor` (100) - 处理飞书/ Lark 文档 +2. `GitAccessor` (80) - 处理 Git 仓库 +3. `HTTPAccessor` (50) - 处理 HTTP/HTTPS URL +4. `LocalAccessor` (10) - 处理本地文件(兜底) --- -## 四、实施步骤 - -### Phase 1: 基础设施(核心接口) - -**目标**:建立新架构的基础框架,不影响现有功能 - -- [ ] 创建 `openviking/parse/accessors/` 目录结构 -- [ ] 实现 `DataAccessor` 抽象基类和 `LocalResource` 数据类 -- [ ] 实现 `AccessorRegistry` -- [ ] 编写单元测试 -- [ ] 更新 `ParserRegistry`,保持向后兼容 - -### Phase 2: 重构 CodeRepositoryParser - -**目标**:将 CodeRepositoryParser 拆分为 GitAccessor + DirectoryParser +## 实现进度 -- [ ] 实现 `GitAccessor`(从 `CodeRepositoryParser` 提取 clone/download 逻辑) -- [ ] 更新 `CodeRepositoryParser` 为混合模式(向后兼容)或标记为 deprecated -- [ ] 在 `AccessorRegistry` 中注册 `GitAccessor` -- [ ] 更新 `UnifiedResourceProcessor.process()` 使用新流程 -- [ ] 测试:Git URL, GitHub URL, 本地 .zip 等场景 +### ✅ 已完成 -### Phase 3: 重构 HTMLParser 和 FeishuParser - -- [ ] 实现 `HTTPAccessor` -- [ ] 实现 `FeishuAccessor` -- [ ] 重构 `HTMLParser` 只负责解析本地 HTML 文件 -- [ ] 重构 `FeishuParser` 只负责解析下载后的内容 - -### Phase 4: 重构 UnifiedResourceProcessor - -**目标**:简化 `UnifiedResourceProcessor`,使用新的两层架构 - -- [ ] 重构 `UnifiedResourceProcessor.process()`: - ```python - async def process(self, source, ...): - # 第一阶段:获取数据 - local_resource = await accessor_registry.access(source, **kwargs) - - # 第二阶段:解析数据 - parse_result = await parser_registry.parse(local_resource, **kwargs) - - # 清理(如果需要) - # ... - - return parse_result - ``` -- [ ] 移除 `_process_url()`, `_process_file()`, `_process_directory()` 中的重复逻辑 - -### Phase 5: 扩展和优化 - -- [ ] 实现优先级机制解决冲突 -- [ ] 添加 `HybridParser` 支持 -- [ ] 编写迁移文档 -- [ ] 性能优化和测试覆盖 +- [x] 创建 `openviking/parse/accessors/` 目录结构 +- [x] 实现 `DataAccessor` 抽象基类和 `LocalResource` 数据类 +- [x] 实现 `AccessorRegistry`(含优先级机制) +- [x] 实现 `GitAccessor` - 处理 Git 仓库 +- [x] 实现 `HTTPAccessor` - 处理 HTTP URL +- [x] 实现 `FeishuAccessor` - 处理飞书文档 +- [x] 实现 `LocalAccessor` - 处理本地文件 +- [x] 全局注册表 `get_accessor_registry()` +- [x] 更新 `PDFParser`、`resources.py`、`local_input_guard.py` 使用新架构 --- -## 五、兼容性策略 - -### 5.1 向后兼容 - -1. **保持现有 API 不变**: - - `ParserRegistry` 接口保持不变 - - `registry.parse()` 仍然可以工作 - - 自定义 Parser 注册方式不变 - -2. **渐进式迁移**: - - 现有 Parser 可以继续使用 - - 新 Parser 鼓励使用新架构 - - 提供迁移指南 - -### 5.2 废弃策略 - -- 标记旧的 `CodeRepositoryParser` 等为 `@deprecated` -- 在 v6.0 或未来版本移除 - ---- - -## 六、文件结构变更 +## 文件结构 ``` openviking/parse/ -├── accessors/ [新增] +├── accessors/ # ✅ 新增:数据访问层 │ ├── __init__.py -│ ├── base.py # DataAccessor, LocalResource -│ ├── registry.py # AccessorRegistry -│ ├── git.py # GitAccessor -│ ├── http.py # HTTPAccessor -│ └── feishu.py # FeishuAccessor -├── parsers/ -│ ├── base_parser.py # 明确为 DataParser -│ ├── code/ # 保留但简化 +│ ├── base.py # DataAccessor, LocalResource, SourceType +│ ├── registry.py # AccessorRegistry +│ ├── git_accessor.py # GitAccessor +│ ├── http_accessor.py # HTTPAccessor +│ ├── feishu_accessor.py # FeishuAccessor +│ └── local_accessor.py # LocalAccessor +├── parsers/ # 数据解析层(保持不变) +│ ├── base_parser.py │ ├── markdown.py │ ├── pdf.py +│ ├── zip_parser.py │ └── ... -├── registry.py # ParserRegistry (重构) -└── ... +└── registry.py # ParserRegistry(保持不变) ``` --- -## 七、测试计划 +## 使用示例 -| 测试类型 | 测试内容 | -|---------|---------| -| 单元测试 | AccessorRegistry, 各 Accessor, 各 Parser | -| 集成测试 | 完整流程:add_resource → Accessor → Parser → TreeBuilder | -| 回归测试 | 确保现有功能不被破坏 | -| 冲突测试 | 测试优先级机制解决 .zip, URL 等冲突场景 | +### 使用 AccessorRegistry ---- +```python +from openviking.parse.accessors import get_accessor_registry + +# 获取全局注册表 +registry = get_accessor_registry() + +# 访问资源(自动路由) +async with await registry.access("https://github.com/user/repo") as resource: + print(f"本地路径: {resource.path}") + print(f"来源类型: {resource.source_type}") + # 使用 resource.path 进行解析... + # 退出 with 块时自动清理临时资源 +``` -## 八、风险与应对 +### 自定义 Accessor -| 风险 | 影响 | 概率 | 应对措施 | -|-----|------|------|---------| -| 重构范围过大 | 高 | 中 | 分阶段实施,每阶段可独立发布 | -| 性能回退 | 中 | 低 | 保持缓存机制,性能基准测试 | -| 自定义插件受影响 | 高 | 低 | 保持向后兼容,提供迁移工具 | +```python +from openviking.parse.accessors import DataAccessor, LocalResource + +class MyAccessor(DataAccessor): + @property + def priority(self) -> int: + return 90 + + def can_handle(self, source: str) -> bool: + return source.startswith("myprotocol://") + + async def access(self, source: str, **kwargs) -> LocalResource: + # 获取数据到本地... + temp_path = self._create_temp_dir() + # ... 下载逻辑 ... + return LocalResource( + path=temp_path, + source_type="myprotocol", + original_source=source, + meta={}, + is_temporary=True + ) + +# 注册 +registry = get_accessor_registry() +registry.register(MyAccessor()) +``` --- -## 九、相关文档 +## 相关文档 - [解析系统 README](../../openviking/parse/parsers/README.md) - [OpenViking 整体架构](../zh/concepts/01-architecture.md) From a3d2a13a7dc704b8c695d4f777c7b5724e726496 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 14:59:12 +0800 Subject: [PATCH 08/15] fix: download http files --- openviking/parse/accessors/http_accessor.py | 175 ++++++++++++++++++-- openviking/parse/parsers/html.py | 29 ++++ 2 files changed, 193 insertions(+), 11 deletions(-) diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py index 1d228889a..f4ed4f7fe 100644 --- a/openviking/parse/accessors/http_accessor.py +++ b/openviking/parse/accessors/http_accessor.py @@ -5,14 +5,25 @@ Fetches HTTP/HTTPS URLs and makes them available as local files. This is the DataAccessor layer extracted from HTMLParser. + +Features: +- Downloads web pages to local HTML files +- Downloads files (PDF, Markdown, etc.) to local files +- Supports GitHub/GitLab blob to raw URL conversion +- Follows redirects +- Network guard integration +- Detailed error classification (network, timeout, auth, etc.) +- Content-Type detection for URLs without file extensions """ import tempfile +from enum import Enum from pathlib import Path -from typing import Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from urllib.parse import unquote, urlparse from openviking.parse.base import lazy_import +from openviking.parse.parsers.constants import CODE_EXTENSIONS from openviking.utils.network_guard import build_httpx_request_validation_hooks from openviking_cli.utils.logger import get_logger @@ -21,6 +32,134 @@ logger = get_logger(__name__) +class URLType(Enum): + """URL content types.""" + + WEBPAGE = "webpage" # HTML webpage to parse + DOWNLOAD_PDF = "download_pdf" # PDF file download link + DOWNLOAD_MD = "download_md" # Markdown file download link + DOWNLOAD_TXT = "download_txt" # Text file download link + DOWNLOAD_HTML = "download_html" # HTML file download link + UNKNOWN = "unknown" # Unknown or unsupported type + + +class URLTypeDetector: + """ + Detector for URL content types. + + Uses extension and HTTP HEAD request to determine if a URL is: + - A webpage to scrape + - A file download link (and what type) + """ + + # Extension to URL type mapping + # CODE_EXTENSIONS spread comes first so explicit entries below override + # (e.g., .html/.htm -> DOWNLOAD_HTML instead of DOWNLOAD_TXT) + EXTENSION_MAP: Dict[str, URLType] = { + **dict.fromkeys(CODE_EXTENSIONS, URLType.DOWNLOAD_TXT), + ".pdf": URLType.DOWNLOAD_PDF, + ".md": URLType.DOWNLOAD_MD, + ".markdown": URLType.DOWNLOAD_MD, + ".txt": URLType.DOWNLOAD_TXT, + ".text": URLType.DOWNLOAD_TXT, + ".html": URLType.DOWNLOAD_HTML, + ".htm": URLType.DOWNLOAD_HTML, + } + + # Content-Type to URL type mapping + CONTENT_TYPE_MAP: Dict[str, URLType] = { + "application/pdf": URLType.DOWNLOAD_PDF, + "text/markdown": URLType.DOWNLOAD_MD, + "text/plain": URLType.DOWNLOAD_TXT, + "text/html": URLType.WEBPAGE, + "application/xhtml+xml": URLType.WEBPAGE, + } + + # URLType to file extension mapping + URL_TYPE_TO_EXT: Dict[URLType, str] = { + URLType.WEBPAGE: ".html", + URLType.DOWNLOAD_PDF: ".pdf", + URLType.DOWNLOAD_MD: ".md", + URLType.DOWNLOAD_TXT: ".txt", + URLType.DOWNLOAD_HTML: ".html", + URLType.UNKNOWN: ".html", + } + + def __init__(self, timeout: float = 10.0): + """Initialize URL type detector.""" + self.timeout = timeout + + async def detect( + self, + url: str, + request_validator=None, + ) -> Tuple[URLType, Dict[str, Any]]: + """ + Detect URL content type. + + Args: + url: URL to detect + request_validator: Optional network request validator + + Returns: + (URLType, metadata dict) + """ + meta = {"url": url, "detected_by": "unknown"} + parsed = urlparse(url) + path_lower = parsed.path.lower() + + # 1. Check extension first + for ext, url_type in self.EXTENSION_MAP.items(): + if path_lower.endswith(ext): + meta["detected_by"] = "extension" + meta["extension"] = ext + return url_type, meta + + # 2. Send HEAD request to check Content-Type + try: + httpx = lazy_import("httpx") + client_kwargs = { + "timeout": self.timeout, + "follow_redirects": True, + } + event_hooks = build_httpx_request_validation_hooks(request_validator) + if event_hooks: + client_kwargs["event_hooks"] = event_hooks + client_kwargs["trust_env"] = False + + async with httpx.AsyncClient(**client_kwargs) as client: + response = await client.head(url) + content_type = response.headers.get("content-type", "").lower() + + # Remove charset info + if ";" in content_type: + content_type = content_type.split(";")[0].strip() + + meta["content_type"] = content_type + meta["detected_by"] = "content_type" + meta["status_code"] = response.status_code + + # Map content type + for ct_prefix, url_type in self.CONTENT_TYPE_MAP.items(): + if content_type.startswith(ct_prefix): + return url_type, meta + + # Default to webpage for HTML-like content + if "html" in content_type or "xml" in content_type: + return URLType.WEBPAGE, meta + + except Exception as e: + meta["detection_error"] = str(e) + logger.debug(f"[URLTypeDetector] HEAD request failed: {e}, falling back to default") + + # 3. Default: assume webpage + return URLType.WEBPAGE, meta + + def get_extension_for_type(self, url_type: URLType) -> str: + """Get file extension for URL type.""" + return self.URL_TYPE_TO_EXT.get(url_type, ".html") + + class HTTPAccessor(DataAccessor): """ Accessor for HTTP/HTTPS URLs. @@ -50,6 +189,7 @@ def __init__( """Initialize HTTP accessor.""" self.timeout = timeout self.user_agent = user_agent or self.DEFAULT_USER_AGENT + self._url_detector = URLTypeDetector(timeout=min(timeout, 10.0)) @property def priority(self) -> int: @@ -83,16 +223,19 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: request_validator = kwargs.get("request_validator") # Download the URL - temp_path = await self._download_url( + temp_path, url_type, meta = await self._download_url( source_str, request_validator=request_validator, ) # Build metadata - meta = { - "url": source_str, - "downloaded": True, - } + meta.update( + { + "url": source_str, + "downloaded": True, + "url_type": url_type.value, + } + ) return LocalResource( path=Path(temp_path), @@ -124,7 +267,7 @@ async def _download_url( self, url: str, request_validator=None, - ) -> str: + ) -> Tuple[str, URLType, Dict[str, Any]]: """ Download URL content to a temporary file. @@ -133,23 +276,33 @@ async def _download_url( request_validator: Optional network request validator Returns: - Path to the temporary file + Tuple of (path to temporary file, URLType, metadata dict) """ httpx = lazy_import("httpx") # Convert GitHub/GitLab blob URLs to raw url = self._convert_to_raw_url(url) - # Determine file extension from URL (decode first to handle encoded paths) + # Detect URL type first to get proper extension + url_type, detect_meta = await self._url_detector.detect( + url, + request_validator=request_validator, + ) + + # Determine file extension parsed = urlparse(url) decoded_path = unquote(parsed.path) - ext = Path(decoded_path).suffix or ".html" + ext = Path(decoded_path).suffix + if not ext: + ext = self._url_detector.get_extension_for_type(url_type) # Create temp file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext) temp_path = temp_file.name temp_file.close() + meta = {**detect_meta, "extension": ext} + try: # Download content client_kwargs = { @@ -190,7 +343,7 @@ async def _download_url( # Write to temp file Path(temp_path).write_bytes(response.content) - return temp_path + return temp_path, url_type, meta except Exception: # Clean up on error try: diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 24331113c..3acf32b1d 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -25,6 +25,14 @@ logger = __import__("openviking_cli.utils.logger").utils.logger.get_logger(__name__) +# Backward compatibility: Re-export URLType and URLTypeDetector from http_accessor +try: + from openviking.parse.accessors.http_accessor import URLType, URLTypeDetector +except ImportError: + URLType = None + URLTypeDetector = None + + class HTMLParser(BaseParser): """ Parser for local HTML files. @@ -43,6 +51,27 @@ def __init__(self): """Initialize HTML parser.""" pass + @staticmethod + def _extract_filename_from_url(url: str) -> str: + """ + Extract and URL-decode the original filename from a URL. + + Args: + url: URL to extract filename from + + Returns: + Decoded filename (e.g., "schemas.py" from ".../schemas.py") + Falls back to "download" if no filename can be extracted. + """ + from pathlib import Path + from urllib.parse import unquote, urlparse + + parsed = urlparse(url) + # URL-decode path to handle encoded characters (e.g., %E7%99%BE -> Chinese chars) + decoded_path = unquote(parsed.path) + basename = Path(decoded_path).name + return basename if basename else "download" + def _get_readabilipy(self): """Lazy import of readabilipy.""" if not hasattr(self, "_readabilipy") or self._readabilipy is None: From abf19d5028fdd8b9b021f02e7346054ce2fe878c Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 15:34:09 +0800 Subject: [PATCH 09/15] fix: download pdf files, and fit IANA standards --- openviking/parse/accessors/http_accessor.py | 269 +++++++++++--- openviking/parse/accessors/mime_types.py | 391 ++++++++++++++++++++ 2 files changed, 617 insertions(+), 43 deletions(-) create mode 100644 openviking/parse/accessors/mime_types.py diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py index f4ed4f7fe..258200fec 100644 --- a/openviking/parse/accessors/http_accessor.py +++ b/openviking/parse/accessors/http_accessor.py @@ -13,7 +13,7 @@ - Follows redirects - Network guard integration - Detailed error classification (network, timeout, auth, etc.) -- Content-Type detection for URLs without file extensions +- IANA Media Type (MIME) based content detection for URLs without file extensions """ import tempfile @@ -28,12 +28,13 @@ from openviking_cli.utils.logger import get_logger from .base import DataAccessor, LocalResource, SourceType +from .mime_types import MEDIA_TYPE_ALIASES, IANAMediaType, get_preferred_extension logger = get_logger(__name__) class URLType(Enum): - """URL content types.""" + """URL content types for routing to appropriate parsers.""" WEBPAGE = "webpage" # HTML webpage to parse DOWNLOAD_PDF = "download_pdf" # PDF file download link @@ -47,12 +48,18 @@ class URLTypeDetector: """ Detector for URL content types. - Uses extension and HTTP HEAD request to determine if a URL is: - - A webpage to scrape - - A file download link (and what type) + Uses IANA Media Type (MIME) standards for robust content detection: + 1. Check file extension (fast path) + 2. Check Content-Disposition header for filename (most reliable) + 3. Check Content-Type header (IANA standard media types) + 4. Fall back to default behavior + + References: + - RFC 6838: Media Type Specifications and Registration Procedures + - RFC 6266: Use of the Content-Disposition Header Field in HTTP """ - # Extension to URL type mapping + # === Extension to URL type mapping === # CODE_EXTENSIONS spread comes first so explicit entries below override # (e.g., .html/.htm -> DOWNLOAD_HTML instead of DOWNLOAD_TXT) EXTENSION_MAP: Dict[str, URLType] = { @@ -66,13 +73,27 @@ class URLTypeDetector: ".htm": URLType.DOWNLOAD_HTML, } - # Content-Type to URL type mapping - CONTENT_TYPE_MAP: Dict[str, URLType] = { + # === IANA Media Type to URL type mapping === + # Maps IANA registered media types to our internal URLType + # Patterns can be: + # - Exact match: "application/pdf" + # - Wildcard: "text/*" + # - Type only: "image" (treated as "image/*") + # NOTE: .html/.htm extensions are mapped to DOWNLOAD_HTML via EXTENSION_MAP, + # while text/html Content-Type is mapped to WEBPAGE here for URLs + # without extensions (like https://example.com/page) + MEDIA_TYPE_MAP: Dict[str, URLType] = { + # PDF "application/pdf": URLType.DOWNLOAD_PDF, + # Markdown "text/markdown": URLType.DOWNLOAD_MD, - "text/plain": URLType.DOWNLOAD_TXT, + "text/x-markdown": URLType.DOWNLOAD_MD, + # HTML/webpage (for URLs without .html extension) "text/html": URLType.WEBPAGE, "application/xhtml+xml": URLType.WEBPAGE, + # Plain text + "text/plain": URLType.DOWNLOAD_TXT, + "text/*": URLType.DOWNLOAD_TXT, } # URLType to file extension mapping @@ -95,27 +116,39 @@ async def detect( request_validator=None, ) -> Tuple[URLType, Dict[str, Any]]: """ - Detect URL content type. + Detect URL content type using IANA standards. + + Detection order (most reliable to least reliable): + 1. File extension from URL path (if valid and recognized) + 2. Filename from Content-Disposition header (RFC 6266) + 3. IANA Media Type from Content-Type header (RFC 6838) + 4. Default to WEBPAGE Args: url: URL to detect request_validator: Optional network request validator Returns: - (URLType, metadata dict) + (URLType, metadata dict with detection details) """ - meta = {"url": url, "detected_by": "unknown"} + meta = { + "url": url, + "detected_by": "unknown", + } parsed = urlparse(url) path_lower = parsed.path.lower() - - # 1. Check extension first - for ext, url_type in self.EXTENSION_MAP.items(): - if path_lower.endswith(ext): - meta["detected_by"] = "extension" - meta["extension"] = ext - return url_type, meta - - # 2. Send HEAD request to check Content-Type + valid_extensions = set(self.EXTENSION_MAP.keys()) + + # === Step 1: Check extension from URL path === + path_ext = Path(path_lower).suffix + if path_ext and path_ext in valid_extensions: + for ext, url_type in self.EXTENSION_MAP.items(): + if path_lower.endswith(ext): + meta["detected_by"] = "extension" + meta["extension"] = ext + return url_type, meta + + # === Step 2: Send HEAD request for headers === try: httpx = lazy_import("httpx") client_kwargs = { @@ -129,32 +162,138 @@ async def detect( async with httpx.AsyncClient(**client_kwargs) as client: response = await client.head(url) - content_type = response.headers.get("content-type", "").lower() - # Remove charset info - if ";" in content_type: - content_type = content_type.split(";")[0].strip() + # Get raw headers for debugging + content_type_raw = response.headers.get("content-type", "") + content_disposition = response.headers.get("content-disposition", "") - meta["content_type"] = content_type - meta["detected_by"] = "content_type" + meta["content_type_raw"] = content_type_raw + meta["content_disposition_raw"] = content_disposition meta["status_code"] = response.status_code - # Map content type - for ct_prefix, url_type in self.CONTENT_TYPE_MAP.items(): - if content_type.startswith(ct_prefix): + # === Step 2a: Check Content-Disposition for filename (RFC 6266) === + filename_from_disposition = self._extract_filename_from_disposition( + content_disposition + ) + if filename_from_disposition: + meta["filename_from_disposition"] = filename_from_disposition + filename_lower = filename_from_disposition.lower() + for ext, url_type in self.EXTENSION_MAP.items(): + if filename_lower.endswith(ext): + meta["detected_by"] = "content_disposition" + meta["extension"] = ext + return url_type, meta + + # === Step 2b: Check Content-Type (RFC 6838) === + if content_type_raw: + url_type = self._detect_from_media_type(content_type_raw, meta) + if url_type != URLType.UNKNOWN: return url_type, meta - # Default to webpage for HTML-like content - if "html" in content_type or "xml" in content_type: - return URLType.WEBPAGE, meta - except Exception as e: meta["detection_error"] = str(e) logger.debug(f"[URLTypeDetector] HEAD request failed: {e}, falling back to default") - # 3. Default: assume webpage + # === Step 3: Default behavior === + meta["detected_by"] = "default" return URLType.WEBPAGE, meta + def _detect_from_media_type(self, content_type: str, meta: Dict[str, Any]) -> URLType: + """ + Detect URL type from IANA media type. + + Args: + content_type: Content-Type header value + meta: Metadata dict to update + + Returns: + Detected URLType, or URLType.UNKNOWN if no match + """ + # Normalize and parse according to IANA standards + media_type_str = content_type.lower().strip() + + # Handle common aliases + if media_type_str in MEDIA_TYPE_ALIASES: + meta["media_type_alias"] = media_type_str + media_type_str = MEDIA_TYPE_ALIASES[media_type_str] + + # Parse into structured IANAMediaType + try: + media_type = IANAMediaType.parse(media_type_str) + meta["media_type"] = str(media_type) + meta["media_type_type"] = media_type.type + meta["media_type_subtype"] = media_type.subtype + if media_type.suffix: + meta["media_type_suffix"] = media_type.suffix + except Exception as e: + logger.debug(f"[URLTypeDetector] Failed to parse media type: {e}") + meta["media_type_parse_error"] = str(e) + return URLType.UNKNOWN + + # Check for exact match first + media_type_key = f"{media_type.type}/{media_type.subtype}" + if media_type.suffix: + media_type_with_suffix = f"{media_type_key}+{media_type.suffix}" + if media_type_with_suffix in self.MEDIA_TYPE_MAP: + meta["detected_by"] = "media_type_suffix" + return self.MEDIA_TYPE_MAP[media_type_with_suffix] + + if media_type_key in self.MEDIA_TYPE_MAP: + meta["detected_by"] = "media_type" + return self.MEDIA_TYPE_MAP[media_type_key] + + # Check for wildcard matches + for pattern, url_type in self.MEDIA_TYPE_MAP.items(): + if media_type.matches(pattern): + meta["detected_by"] = "media_type_pattern" + meta["media_type_pattern"] = pattern + return url_type + + return URLType.UNKNOWN + + @staticmethod + def _extract_filename_from_disposition(content_disposition: str) -> Optional[str]: + """ + Extract filename from Content-Disposition header per RFC 6266. + + Handles formats: + - inline; filename="2601.00014v1.pdf" + - attachment; filename=document.pdf + - attachment; filename*=UTF-8''encoded.pdf + - attachment; filename="foo.pdf"; size=12345 + + Args: + content_disposition: Content-Disposition header value + + Returns: + Extracted filename, or None if not found + """ + if not content_disposition: + return None + + import re + + content_disposition = content_disposition.strip() + + # Try filename*=UTF-8''... format first (RFC 5987) + utf8_match = re.search(r"filename\*=UTF-8''([^;]+)", content_disposition, re.I) + if utf8_match: + from urllib.parse import unquote + + return unquote(utf8_match.group(1)) + + # Try filename="..." format (quoted-string) + quoted_match = re.search(r'filename="([^"]+)"', content_disposition, re.I) + if quoted_match: + return quoted_match.group(1) + + # Try filename=... format (token) + simple_match = re.search(r"filename=([^;]+)", content_disposition, re.I) + if simple_match: + return simple_match.group(1).strip() + + return None + def get_extension_for_type(self, url_type: URLType) -> str: """Get file extension for URL type.""" return self.URL_TYPE_TO_EXT.get(url_type, ".html") @@ -171,6 +310,7 @@ class HTTPAccessor(DataAccessor): - Follows redirects - Network guard integration - Detailed error classification (network, timeout, auth, etc.) + - IANA Media Type based detection for URLs without extensions """ PRIORITY = 50 # Lower than GitAccessor, higher than fallback @@ -204,8 +344,6 @@ def can_handle(self, source: Union[str, Path]) -> bool: and will be checked first for their specific URL types. """ source_str = str(source) - - # Only handle http/https URLs return source_str.startswith(("http://", "https://")) async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: @@ -289,12 +427,8 @@ async def _download_url( request_validator=request_validator, ) - # Determine file extension - parsed = urlparse(url) - decoded_path = unquote(parsed.path) - ext = Path(decoded_path).suffix - if not ext: - ext = self._url_detector.get_extension_for_type(url_type) + # Determine file extension using IANA standards + ext = self._determine_file_extension(url, url_type, detect_meta) # Create temp file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext) @@ -354,6 +488,55 @@ async def _download_url( pass raise + def _determine_file_extension( + self, + url: str, + url_type: URLType, + detect_meta: Dict[str, Any], + ) -> str: + """ + Determine appropriate file extension using multiple strategies. + + Strategy order (most reliable first): + 1. Extension from Content-Disposition filename + 2. Extension from URL path (if valid) + 3. Use IANA media type mapping + 4. Use URL type based extension + + Args: + url: Original URL + url_type: Detected URL type + detect_meta: Detection metadata + + Returns: + File extension including dot (e.g., ".pdf") + """ + valid_extensions = set(URLTypeDetector.EXTENSION_MAP.keys()) + + # 1. Try extension from Content-Disposition filename + filename_from_disposition = detect_meta.get("filename_from_disposition") + if filename_from_disposition: + ext = Path(filename_from_disposition.lower()).suffix + if ext and ext in valid_extensions: + return ext + + # 2. Try extension from URL path (if valid) + parsed = urlparse(url) + decoded_path = unquote(parsed.path) + ext = Path(decoded_path).suffix + if ext and ext.lower() in valid_extensions: + return ext.lower() + + # 3. Try IANA media type to extension mapping + media_type_str = detect_meta.get("media_type") or detect_meta.get("content_type_raw") + if media_type_str: + iana_ext = get_preferred_extension(media_type_str) + if iana_ext: + return iana_ext + + # 4. Fall back to URL type based extension + return self._url_detector.get_extension_for_type(url_type) + def _convert_to_raw_url(self, url: str) -> str: """Convert GitHub/GitLab blob URL to raw URL.""" parsed = urlparse(url) diff --git a/openviking/parse/accessors/mime_types.py b/openviking/parse/accessors/mime_types.py new file mode 100644 index 000000000..da9fe119e --- /dev/null +++ b/openviking/parse/accessors/mime_types.py @@ -0,0 +1,391 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +""" +IANA Media Type (MIME Type) handling. + +References: +- RFC 6838: Media Type Specifications and Registration Procedures +- https://www.iana.org/assignments/media-types/media-types.xhtml +""" + +from dataclasses import dataclass +from typing import Dict, List, Optional, Set + + +@dataclass(frozen=True) +class IANAMediaType: + """ + Represents an IANA media type. + + Format: type "/" [tree "."] subtype ["+" suffix] *[";" parameter] + + Examples: + - text/html + - application/pdf + - application/vnd.openxmlformats-officedocument.wordprocessingml.document + - text/plain; charset=utf-8 + """ + + type: str # The top-level type (text, application, image, etc.) + subtype: str # The subtype (may include tree prefix) + suffix: Optional[str] = None # Optional structured syntax suffix (e.g., +xml, +json) + parameters: Dict[str, str] = None # Optional parameters (e.g., charset=utf-8) + + def __post_init__(self): + if self.parameters is None: + object.__setattr__(self, "parameters", {}) + + @classmethod + def parse(cls, media_type: str) -> "IANAMediaType": + """ + Parse a media type string into an IANAMediaType object. + + Args: + media_type: Media type string (e.g., "text/plain; charset=utf-8") + + Returns: + Parsed IANAMediaType object + """ + media_type = media_type.strip().lower() + + # Split into type/subtype and parameters + parts = media_type.split(";", 1) + type_subtype = parts[0].strip() + params_str = parts[1].strip() if len(parts) > 1 else "" + + # Parse type and subtype + if "/" not in type_subtype: + # Invalid format, treat as unknown + return cls(type="application", subtype="octet-stream") + + type_part, subtype_part = type_subtype.split("/", 1) + type_part = type_part.strip() + subtype_part = subtype_part.strip() + + # Check for suffix (e.g., +xml, +json) + suffix = None + if "+" in subtype_part: + subtype_parts = subtype_part.rsplit("+", 1) + # Only recognize standard suffixes + if subtype_parts[1] in { + "xml", + "json", + "yaml", + "ber", + "fastinfoset", + "wbxml", + "zip", + "cbor", + }: + subtype_part = subtype_parts[0] + suffix = subtype_parts[1] + + # Parse parameters + parameters = {} + if params_str: + for param in params_str.split(";"): + param = param.strip() + if not param: + continue + if "=" in param: + key, value = param.split("=", 1) + parameters[key.strip()] = value.strip() + else: + parameters[param] = "" + + return cls( + type=type_part, + subtype=subtype_part, + suffix=suffix, + parameters=parameters, + ) + + def matches(self, pattern: str) -> bool: + """ + Check if this media type matches a pattern. + + Patterns can include wildcards: + - "text/*" matches any text type + - "application/*" matches any application type + - "*/*" matches any type + - "text/plain" matches exactly + - "text" is treated as "text/*" + + Args: + pattern: Pattern to match against + + Returns: + True if matches, False otherwise + """ + pattern = pattern.lower().strip() + + if pattern == "*/*": + return True + + if "/" not in pattern: + # Treat as type/* + return self.type == pattern + + pattern_type, pattern_subtype = pattern.split("/", 1) + pattern_type = pattern_type.strip() + pattern_subtype = pattern_subtype.strip() + + if pattern_type != "*" and self.type != pattern_type: + return False + + if pattern_subtype == "*": + return True + + return self.subtype == pattern_subtype + + def __str__(self) -> str: + """Return the canonical string representation.""" + parts = [f"{self.type}/{self.subtype}"] + if self.suffix: + parts[-1] += f"+{self.suffix}" + for key, value in self.parameters.items(): + if value: + parts.append(f"; {key}={value}") + else: + parts.append(f"; {key}") + return "".join(parts) + + +# IANA registered top-level media types +IANA_TOP_LEVEL_TYPES: Set[str] = { + "application", + "audio", + "example", + "font", + "image", + "message", + "model", + "multipart", + "text", + "video", +} + +# Common media type aliases (non-standard but widely used) +MEDIA_TYPE_ALIASES: Dict[str, str] = { + "application/x-pdf": "application/pdf", + "application/x-zip-compressed": "application/zip", + "application/x-gzip": "application/gzip", + "application/x-tar": "application/x-tar", + "text/x-markdown": "text/markdown", + "text/x-c": "text/plain", + "text/x-c++": "text/plain", + "text/x-python": "text/plain", + "text/x-java": "text/plain", + "text/x-javascript": "text/plain", + "text/x-script.python": "text/plain", + "image/jpg": "image/jpeg", + "audio/mp3": "audio/mpeg", + "video/mp4": "video/mp4", # Keep as is (standard) +} + +# Comprehensive IANA media type to file extension mapping +# Based on IANA registry and common usage +IANA_MEDIA_TYPE_TO_EXTENSION: Dict[str, List[str]] = { + # === Text types === + "text/plain": [".txt", ".text"], + "text/html": [".html", ".htm"], + "text/css": [".css"], + "text/csv": [".csv"], + "text/markdown": [".md", ".markdown"], + "text/calendar": [".ics"], + "text/vcard": [".vcf"], + "text/xml": [".xml"], + "text/yaml": [".yaml", ".yml"], + "text/tab-separated-values": [".tsv"], + "text/sgml": [".sgml"], + "text/rtf": [".rtf"], + "text/jinja2": [".j2", ".jinja2"], + "text/x-yaml": [".yaml", ".yml"], # Non-standard but common + "text/x-markdown": [".md", ".markdown"], # Non-standard but common + # === Application types - Documents === + "application/pdf": [".pdf"], + "application/rtf": [".rtf"], + "application/epub+zip": [".epub"], + "application/x-mobipocket-ebook": [".mobi"], + # === Application types - Office === + "application/msword": [".doc"], + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [".docx"], + "application/vnd.ms-excel": [".xls"], + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [".xlsx"], + "application/vnd.ms-powerpoint": [".ppt"], + "application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"], + "application/vnd.oasis.opendocument.text": [".odt"], + "application/vnd.oasis.opendocument.spreadsheet": [".ods"], + "application/vnd.oasis.opendocument.presentation": [".odp"], + # === Application types - Code === + "application/javascript": [".js"], + "application/json": [".json"], + "application/ld+json": [".jsonld"], + "application/xml": [".xml"], + "application/yaml": [".yaml", ".yml"], + "application/x-yaml": [".yaml", ".yml"], + "application/xhtml+xml": [".xhtml"], + "application/rss+xml": [".rss"], + "application/atom+xml": [".atom"], + "application/soap+xml": [".wsdl"], + # === Application types - Archives === + "application/zip": [".zip"], + "application/x-zip-compressed": [".zip"], + "application/gzip": [".gz"], + "application/x-gzip": [".gz"], + "application/x-tar": [".tar"], + "application/x-7z-compressed": [".7z"], + "application/x-rar-compressed": [".rar"], + "application/x-bzip2": [".bz2"], + "application/x-lzip": [".lz"], + "application/x-xz": [".xz"], + # === Application types - Other === + "application/octet-stream": [".bin"], + "application/pgp-encrypted": [".pgp"], + "application/pgp-signature": [".sig"], + "application/pkcs12": [".p12", ".pfx"], + "application/pkcs8": [".p8"], + "application/x-pem-file": [".pem"], + "application/wasm": [".wasm"], + # === Image types === + "image/jpeg": [".jpg", ".jpeg"], + "image/png": [".png"], + "image/gif": [".gif"], + "image/svg+xml": [".svg"], + "image/webp": [".webp"], + "image/bmp": [".bmp"], + "image/tiff": [".tiff", ".tif"], + "image/vnd.microsoft.icon": [".ico"], + "image/x-icon": [".ico"], + "image/heic": [".heic"], + "image/heif": [".heif"], + "image/avif": [".avif"], + # === Audio types === + "audio/mpeg": [".mp3", ".mpeg"], + "audio/mp4": [".m4a", ".mp4a"], + "audio/ogg": [".ogg", ".oga"], + "audio/wav": [".wav"], + "audio/webm": [".webm"], + "audio/flac": [".flac"], + "audio/aac": [".aac"], + "audio/x-wav": [".wav"], + # === Video types === + "video/mp4": [".mp4"], + "video/mpeg": [".mpeg", ".mpg"], + "video/quicktime": [".mov"], + "video/webm": [".webm"], + "video/x-msvideo": [".avi"], + "video/x-flv": [".flv"], + "video/x-matroska": [".mkv"], + "video/ogg": [".ogv"], + # === Font types === + "font/otf": [".otf"], + "font/ttf": [".ttf"], + "font/woff": [".woff"], + "font/woff2": [".woff2"], + "application/vnd.ms-fontobject": [".eot"], + "application/font-woff": [".woff"], + "application/x-font-ttf": [".ttf"], + "application/x-font-otf": [".otf"], + # === Model types === + "model/gltf+json": [".gltf"], + "model/gltf-binary": [".glb"], + "model/obj": [".obj"], + "model/stl": [".stl"], + # === Message types === + "message/rfc822": [".eml", ".mht"], + # === Multipart types === + "multipart/mixed": [], + "multipart/alternative": [], + "multipart/related": [], + "multipart/form-data": [], +} + + +def get_preferred_extension(media_type: str) -> Optional[str]: + """ + Get the preferred file extension for a media type. + + Args: + media_type: IANA media type string + + Returns: + Preferred extension (with dot), or None if unknown + """ + if not media_type: + return None + + # Normalize and parse + media_type = media_type.lower().strip() + + # Handle aliases first + if media_type in MEDIA_TYPE_ALIASES: + media_type = MEDIA_TYPE_ALIASES[media_type] + + # Strip parameters + if ";" in media_type: + media_type = media_type.split(";", 1)[0].strip() + + # Exact match + if media_type in IANA_MEDIA_TYPE_TO_EXTENSION: + exts = IANA_MEDIA_TYPE_TO_EXTENSION[media_type] + if exts: + return exts[0] + + # Try parsing for suffix matching + try: + parsed = IANAMediaType.parse(media_type) + # Try with suffix + if parsed.suffix: + full_type = f"{parsed.type}/{parsed.subtype}+{parsed.suffix}" + if full_type in IANA_MEDIA_TYPE_TO_EXTENSION: + exts = IANA_MEDIA_TYPE_TO_EXTENSION[full_type] + if exts: + return exts[0] + # Try without tree prefix if it's a vendor type + if parsed.subtype.startswith("vnd."): + base_subtype = parsed.subtype[4:] # Remove "vnd." + if base_subtype in IANA_MEDIA_TYPE_TO_EXTENSION: + exts = IANA_MEDIA_TYPE_TO_EXTENSION[base_subtype] + if exts: + return exts[0] + except Exception: + pass + + # Try wildcard matches + if media_type.startswith("text/"): + return ".txt" + if media_type.startswith("image/"): + return ".png" + if media_type.startswith("audio/"): + return ".mp3" + if media_type.startswith("video/"): + return ".mp4" + + return None + + +def get_all_extensions(media_type: str) -> List[str]: + """ + Get all known file extensions for a media type. + + Args: + media_type: IANA media type string + + Returns: + List of extensions (with dots), empty list if unknown + """ + if not media_type: + return [] + + media_type = media_type.lower().strip() + + # Handle aliases first + if media_type in MEDIA_TYPE_ALIASES: + media_type = MEDIA_TYPE_ALIASES[media_type] + + # Strip parameters + if ";" in media_type: + media_type = media_type.split(";", 1)[0].strip() + + return IANA_MEDIA_TYPE_TO_EXTENSION.get(media_type, []) From bf2dd288b677d8ac9a391fb61e2e3297e56ba07a Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 15:39:10 +0800 Subject: [PATCH 10/15] fix: read github/gitlab domains from CodeConfig instead of HTMLConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix config lookup: github_domains/gitlab_domains are in CodeConfig now - Add debug logging instead of swallowing exceptions - Add comment about config location 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- openviking/parse/accessors/http_accessor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py index 258200fec..f8a96d8ac 100644 --- a/openviking/parse/accessors/http_accessor.py +++ b/openviking/parse/accessors/http_accessor.py @@ -544,8 +544,9 @@ def _convert_to_raw_url(self, url: str) -> str: from openviking_cli.utils.config import get_openviking_config config = get_openviking_config() - github_domains = config.html.github_domains - gitlab_domains = config.html.gitlab_domains + # NOTE: github_domains/gitlab_domains are in CodeConfig, not HTMLConfig + github_domains = config.code.github_domains + gitlab_domains = config.code.gitlab_domains github_raw_domain = config.code.github_raw_domain if parsed.netloc in github_domains: @@ -558,7 +559,10 @@ def _convert_to_raw_url(self, url: str) -> str: if parsed.netloc in gitlab_domains and "/blob/" in parsed.path: return url.replace("/blob/", "/raw/") - except Exception: - pass + except Exception as e: + logger.debug( + f"[HTTPAccessor] Failed to convert blob URL to raw: {e}, " + f"falling back to original URL: {url}" + ) return url From 9b7d0cc9eb1fb57f0d65cf1838298e3d59f7713e Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 15:43:24 +0800 Subject: [PATCH 11/15] fix: preserve original filename from URL downloads in two-layer flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - HTTPAccessor now stores original_filename in LocalResource.meta - Priority: Content-Disposition filename > URL path filename - UnifiedResourceProcessor now prefers original_filename from meta - Uses original_filename for resource_name and source_name - Falls back to temp path stem only if no original filename available 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- openviking/parse/accessors/http_accessor.py | 7 ++++++- openviking/utils/media_processor.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py index f8a96d8ac..9d54c6d9c 100644 --- a/openviking/parse/accessors/http_accessor.py +++ b/openviking/parse/accessors/http_accessor.py @@ -435,7 +435,12 @@ async def _download_url( temp_path = temp_file.name temp_file.close() - meta = {**detect_meta, "extension": ext} + # Get original filename from URL or Content-Disposition + original_filename = detect_meta.get("filename_from_disposition") + if not original_filename: + original_filename = self._extract_filename_from_url(url) + + meta = {**detect_meta, "extension": ext, "original_filename": original_filename} try: # Download content diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py index 346912273..9bbf7ad6e 100644 --- a/openviking/utils/media_processor.py +++ b/openviking/utils/media_processor.py @@ -122,7 +122,13 @@ async def process( # Use the last part of repo_name as the resource_name (e.g., "OpenViking" from "volcengine/OpenViking") parse_kwargs["resource_name"] = repo_name.split("/")[-1] else: - parse_kwargs.setdefault("resource_name", local_resource.path.stem) + # Prefer original_filename from meta for HTTP downloads + original_filename = local_resource.meta.get("original_filename") + if original_filename: + parse_kwargs.setdefault("resource_name", Path(original_filename).stem) + parse_kwargs.setdefault("source_name", original_filename) + else: + parse_kwargs.setdefault("resource_name", local_resource.path.stem) # If it's a directory, use DirectoryParser which will delegate to CodeRepositoryParser if it's a git repo if local_resource.path.is_dir(): From 8705331a721a981272528418534462d0c3680621 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 16:38:17 +0800 Subject: [PATCH 12/15] fix: review comments about accessors --- openviking/parse/accessors/git_accessor.py | 8 ++-- openviking/parse/accessors/http_accessor.py | 4 +- openviking/parse/accessors/local_accessor.py | 39 +++++++++++++++----- openviking/parse/parsers/html.py | 11 +++++- openviking/server/routers/pack.py | 2 +- 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/openviking/parse/accessors/git_accessor.py b/openviking/parse/accessors/git_accessor.py index 8828e4aea..5bb27db1c 100644 --- a/openviking/parse/accessors/git_accessor.py +++ b/openviking/parse/accessors/git_accessor.py @@ -57,7 +57,7 @@ def can_handle(self, source: Union[str, Path]) -> bool: - git@ SSH URLs - git://, ssh:// URLs - GitHub/GitLab repository URLs (http/https) - - Local paths ending with .git or .zip + - Local paths ending with .git (NOT .zip - those go to ZipParser) """ source_str = str(source) @@ -74,14 +74,14 @@ def can_handle(self, source: Union[str, Path]) -> bool: if source_str.startswith(("http://", "https://")): return is_git_repo_url(source_str) - # Local .git or .zip files + # Local .git files (NOT .zip - .zip goes to ZipParser via LocalAccessor) if isinstance(source, Path): path = source else: path = Path(source_str) suffix = path.suffix.lower() - return suffix in (".git", ".zip") + return suffix == ".git" async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: """ @@ -178,8 +178,6 @@ def _cleanup(): branch=branch, commit=commit, ) - elif str(source).endswith(".zip"): - repo_name = await self._extract_zip(source_str, temp_local_dir) else: raise ValueError(f"Unsupported source for GitAccessor: {source}") diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py index 9d54c6d9c..19434705e 100644 --- a/openviking/parse/accessors/http_accessor.py +++ b/openviking/parse/accessors/http_accessor.py @@ -113,6 +113,7 @@ def __init__(self, timeout: float = 10.0): async def detect( self, url: str, + timeout: Optional[float] = None, request_validator=None, ) -> Tuple[URLType, Dict[str, Any]]: """ @@ -126,6 +127,7 @@ async def detect( Args: url: URL to detect + timeout: HTTP request timeout in seconds (optional, overrides detector's default) request_validator: Optional network request validator Returns: @@ -152,7 +154,7 @@ async def detect( try: httpx = lazy_import("httpx") client_kwargs = { - "timeout": self.timeout, + "timeout": timeout if timeout is not None else self.timeout, "follow_redirects": True, } event_hooks = build_httpx_request_validation_hooks(request_validator) diff --git a/openviking/parse/accessors/local_accessor.py b/openviking/parse/accessors/local_accessor.py index 48305121b..a5f71acd1 100644 --- a/openviking/parse/accessors/local_accessor.py +++ b/openviking/parse/accessors/local_accessor.py @@ -32,22 +32,34 @@ def can_handle(self, source: Union[str, Path]) -> bool: """ Check if this accessor can handle the source. - LocalAccessor accepts any source that: - 1. Is a Path object, OR - 2. Is a string that looks like a local path and exists, OR - 3. Is a string that doesn't look like a URL (for fallback) + LocalAccessor accepts: + 1. Any Path object, OR + 2. Any string that looks like a local path (exists or not - we'll + validate in access()) - Since this is a fallback accessor, it returns True for most sources. - The priority system ensures other accessors get a chance first. + Since this is a fallback accessor, it only returns True for sources + that appear to be local paths (not URLs). The priority system ensures + other accessors get a chance first. Args: source: Source string or Path object Returns: - True - this is a fallback accessor that can handle any source + True if source appears to be a local path """ - # As the fallback accessor, we can handle anything - # The registry will try higher-priority accessors first + from openviking.server.local_input_guard import is_remote_resource_source + + if isinstance(source, Path): + return True + + source_str = str(source) + + # Don't handle remote URLs - those go to HTTPAccessor/GitAccessor + if is_remote_resource_source(source_str): + return False + + # For strings, accept anything that could be a local path + # (we'll validate existence in access()) return True async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: @@ -63,9 +75,16 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: Returns: LocalResource pointing to the local path + + Raises: + FileNotFoundError: If the path does not exist """ path = Path(source) + # Validate that the path exists - preserve original behavior + if not path.exists(): + raise FileNotFoundError(f"Path does not exist: {path}") + return LocalResource( path=path, source_type=SourceType.LOCAL, @@ -73,7 +92,7 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource: meta={ "filename": path.name, "suffix": path.suffix.lower() if path.suffix else None, - "is_dir": path.is_dir() if path.exists() else None, + "is_dir": path.is_dir(), }, is_temporary=False, ) diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 3acf32b1d..898acacae 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -47,8 +47,15 @@ class HTMLParser(BaseParser): in the new two-layer architecture. This parser only handles local files. """ - def __init__(self): - """Initialize HTML parser.""" + def __init__(self, timeout: float = 30.0, **kwargs): + """ + Initialize HTML parser. + + Args: + timeout: [DEPRECATED] Kept for backward compatibility. + URL downloading has been moved to HTTPAccessor. + **kwargs: Additional arguments (kept for backward compatibility) + """ pass @staticmethod diff --git a/openviking/server/routers/pack.py b/openviking/server/routers/pack.py index 7984183be..ca5ab16ee 100644 --- a/openviking/server/routers/pack.py +++ b/openviking/server/routers/pack.py @@ -94,7 +94,7 @@ async def import_ovpack( service = get_service() upload_temp_dir = get_openviking_config().storage.get_upload_temp_dir() - file_path = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir) + file_path, _ = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir) result = await service.pack.import_ovpack( file_path, From 0c7d353149497530e95bc532ed181f590089f2d9 Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 17:19:34 +0800 Subject: [PATCH 13/15] fix: review comments about accessors --- tests/misc/test_media_processor_zip_root.py | 101 +++++++++++--------- 1 file changed, 58 insertions(+), 43 deletions(-) diff --git a/tests/misc/test_media_processor_zip_root.py b/tests/misc/test_media_processor_zip_root.py index 567ca8fb5..204a0ada2 100644 --- a/tests/misc/test_media_processor_zip_root.py +++ b/tests/misc/test_media_processor_zip_root.py @@ -1,14 +1,20 @@ #!/usr/bin/env python3 # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: AGPL-3.0 +"""Tests for ZipParser single-root directory handling. + +Verifies that ZipParser correctly handles: +1. ZIP with single top-level directory -> uses that directory name +2. ZIP with multiple top-level entries -> uses the extract directory +""" import zipfile from pathlib import Path -from unittest.mock import AsyncMock +from unittest.mock import AsyncMock, patch import pytest -from openviking.utils.media_processor import UnifiedResourceProcessor +from openviking.parse.parsers.zip_parser import ZipParser @pytest.mark.asyncio @@ -17,15 +23,20 @@ async def test_zip_single_top_level_dir_uses_real_root(tmp_path: Path): with zipfile.ZipFile(zip_path, "w") as zf: zf.writestr("tt_b/bb/readme.md", "# hello\n") - processor = UnifiedResourceProcessor() - processor._process_directory = AsyncMock(return_value="ok") + parser = ZipParser() + + # Mock DirectoryParser.parse to capture what directory it's called with + with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse: + mock_result = AsyncMock() + mock_result.temp_dir_path = None + mock_dir_parse.return_value = mock_result - result = await processor._process_file(zip_path, instruction="") + await parser.parse(zip_path, instruction="") - assert result == "ok" - called_dir = processor._process_directory.await_args.args[0] - assert isinstance(called_dir, Path) - assert called_dir.name == "tt_b" + # Verify DirectoryParser was called with the real root dir "tt_b" + assert mock_dir_parse.called + called_path = Path(mock_dir_parse.await_args.args[0]) + assert called_path.name == "tt_b" @pytest.mark.asyncio @@ -34,20 +45,20 @@ async def test_zip_single_top_level_dir_ignores_zip_source_name(tmp_path: Path): with zipfile.ZipFile(zip_path, "w") as zf: zf.writestr("tt_b/bb/readme.md", "# hello\n") - processor = UnifiedResourceProcessor() - processor._process_directory = AsyncMock(return_value="ok") + parser = ZipParser() - result = await processor._process_file( - zip_path, - instruction="", - source_name="tt_b.zip", - ) + with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse: + mock_result = AsyncMock() + mock_result.temp_dir_path = None + mock_dir_parse.return_value = mock_result - assert result == "ok" - called_dir = processor._process_directory.await_args.args[0] - assert isinstance(called_dir, Path) - assert called_dir.name == "tt_b" - assert "source_name" not in processor._process_directory.await_args.kwargs + await parser.parse(zip_path, instruction="", source_name="tt_b.zip") + + # Verify DirectoryParser was called with the real root dir "tt_b" + called_path = Path(mock_dir_parse.await_args.args[0]) + assert called_path.name == "tt_b" + # source_name should NOT be passed to DirectoryParser in this case + assert "source_name" not in mock_dir_parse.await_args.kwargs @pytest.mark.asyncio @@ -57,35 +68,39 @@ async def test_zip_multiple_top_level_entries_keeps_extract_root(tmp_path: Path) zf.writestr("a/readme.md", "# a\n") zf.writestr("b/readme.md", "# b\n") - processor = UnifiedResourceProcessor() - processor._process_directory = AsyncMock(return_value="ok") + parser = ZipParser() + + with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse: + mock_result = AsyncMock() + mock_result.temp_dir_path = None + mock_dir_parse.return_value = mock_result - result = await processor._process_file(zip_path, instruction="") + await parser.parse(zip_path, instruction="") - assert result == "ok" - called_dir = processor._process_directory.await_args.args[0] - assert isinstance(called_dir, Path) - assert called_dir.name != "a" - assert called_dir.name != "b" + # Verify DirectoryParser was called with the extract dir, not "a" or "b" + called_path = Path(mock_dir_parse.await_args.args[0]) + assert called_path.name != "a" + assert called_path.name != "b" + # Should have the ov_zip_ prefix + assert called_path.name.startswith("ov_zip_") @pytest.mark.asyncio async def test_single_file_uses_source_name_for_resource_name(tmp_path: Path): - file_path = tmp_path / "upload_123.txt" - file_path.write_text("hello\n") + """Test that source_name is passed through correctly when needed.""" + zip_path = tmp_path / "mixed.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + zf.writestr("a/readme.md", "# a\n") + zf.writestr("b/readme.md", "# b\n") - processor = UnifiedResourceProcessor() + parser = ZipParser() - with pytest.MonkeyPatch.context() as mp: - parse_mock = AsyncMock(return_value="ok") - mp.setattr("openviking.utils.media_processor.parse", parse_mock) + with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse: + mock_result = AsyncMock() + mock_result.temp_dir_path = None + mock_dir_parse.return_value = mock_result - result = await processor._process_file( - file_path, - instruction="", - source_name="aa.txt", - ) + await parser.parse(zip_path, instruction="", source_name="aa.txt") - assert result == "ok" - assert parse_mock.await_args.kwargs["resource_name"] == "aa" - assert parse_mock.await_args.kwargs["source_name"] == "aa.txt" + # Verify source_name is passed when we use the extract root + assert mock_dir_parse.await_args.kwargs.get("source_name") == "aa.txt" From 59f3f057042b87f633eedd13eb8ac2c38a186adb Mon Sep 17 00:00:00 2001 From: openviking Date: Tue, 14 Apr 2026 19:32:41 +0800 Subject: [PATCH 14/15] fix: multiple improvements for filename handling, network access, and ragfs exceptions --- crates/ragfs-python/src/lib.rs | 51 +++++++++++- openviking/parse/parsers/html.py | 18 ----- openviking/parse/parsers/markdown.py | 49 ++++++++++-- openviking/pyagfs/__init__.py | 34 ++++++++ openviking/pyagfs/exceptions.py | 102 ++++++++++++++++++++++++ openviking/server/routers/content.py | 46 ++++++++++- openviking/server/routers/filesystem.py | 92 +++++++++++++++------ openviking/server/routers/search.py | 44 +++++++--- openviking/storage/viking_fs.py | 34 +++++++- openviking/utils/media_processor.py | 46 ++++++++++- openviking_cli/utils/uri.py | 14 ++-- 11 files changed, 452 insertions(+), 78 deletions(-) diff --git a/crates/ragfs-python/src/lib.rs b/crates/ragfs-python/src/lib.rs index 249257e33..20f073119 100644 --- a/crates/ragfs-python/src/lib.rs +++ b/crates/ragfs-python/src/lib.rs @@ -6,7 +6,7 @@ use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyDict, PyList}; +use pyo3::types::{PyBytes, PyDict, PyList, PyType}; use std::collections::HashMap; use std::sync::Arc; use std::time::UNIX_EPOCH; @@ -18,9 +18,54 @@ use ragfs::plugins::{ KVFSPlugin, LocalFSPlugin, MemFSPlugin, QueueFSPlugin, SQLFSPlugin, ServerInfoFSPlugin, }; -/// Convert a ragfs error into a Python RuntimeError +/// Get a Python exception class from the pyagfs module +fn get_exception<'py>(py: Python<'py>, name: &str) -> PyResult> { + let pyagfs = PyModule::import(py, "openviking.pyagfs")?; + let exc = pyagfs.getattr(name)?; + Ok(exc.cast_into()?) +} + +/// Create a PyErr from an exception type name and message +fn new_py_err(name: &str, msg: String) -> PyErr { + Python::attach(|py| { + if let Ok(exc) = get_exception(py, name) { + PyErr::from_type(exc, msg) + } else { + PyRuntimeError::new_err(msg) + } + }) +} + +/// Convert a ragfs error into the appropriate Python exception fn to_py_err(e: ragfs::core::Error) -> PyErr { - PyRuntimeError::new_err(e.to_string()) + let msg = e.to_string(); + match e { + ragfs::core::Error::NotFound(_) => new_py_err("AGFSNotFoundError", msg), + ragfs::core::Error::AlreadyExists(_) => new_py_err("AGFSAlreadyExistsError", msg), + ragfs::core::Error::PermissionDenied(_) => new_py_err("AGFSPermissionDeniedError", msg), + ragfs::core::Error::InvalidPath(_) => new_py_err("AGFSInvalidPathError", msg), + ragfs::core::Error::NotADirectory(_) => new_py_err("AGFSNotADirectoryError", msg), + ragfs::core::Error::IsADirectory(_) => new_py_err("AGFSIsADirectoryError", msg), + ragfs::core::Error::DirectoryNotEmpty(_) => new_py_err("AGFSDirectoryNotEmptyError", msg), + ragfs::core::Error::InvalidOperation(_) => new_py_err("AGFSInvalidOperationError", msg), + ragfs::core::Error::Io(_) => new_py_err("AGFSIoError", msg), + ragfs::core::Error::Plugin(_) => { + // Check if the plugin error message contains known patterns + let err_msg = msg.to_lowercase(); + if err_msg.contains("directory not empty") { + new_py_err("AGFSDirectoryNotEmptyError", msg) + } else { + new_py_err("AGFSPluginError", msg) + } + } + ragfs::core::Error::Config(_) => new_py_err("AGFSConfigError", msg), + ragfs::core::Error::MountPointNotFound(_) => new_py_err("AGFSMountPointNotFoundError", msg), + ragfs::core::Error::MountPointExists(_) => new_py_err("AGFSMountPointExistsError", msg), + ragfs::core::Error::Serialization(_) => new_py_err("AGFSSerializationError", msg), + ragfs::core::Error::Network(_) => new_py_err("AGFSNetworkError", msg), + ragfs::core::Error::Timeout(_) => new_py_err("AGFSTimeoutError", msg), + ragfs::core::Error::Internal(_) => new_py_err("AGFSInternalError", msg), + } } /// Convert FileInfo to a Python dict matching the Go binding JSON format: diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index 898acacae..f21c8d8d0 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -8,8 +8,6 @@ For URL downloading, use HTTPAccessor in the new two-layer architecture. """ -import hashlib -import re import time from pathlib import Path from typing import List, Optional, Union @@ -231,19 +229,3 @@ async def parse_content( result.parser_name = "HTMLParser" return result - - def _sanitize_for_path(self, text: str, max_length: int = 50) -> str: - """Sanitize text for use in file path, hash & shorten if too long.""" - safe = re.sub( - r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", - "", - text, - ) - safe = re.sub(r"\s+", "_", safe) - safe = safe.strip("_") - if not safe: - return "section" - if len(safe) > max_length: - hash_suffix = hashlib.sha256(text.encode()).hexdigest()[:8] - return f"{safe[: max_length - 9]}_{hash_suffix}" - return safe diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py index a812ba1d1..d84285925 100644 --- a/openviking/parse/parsers/markdown.py +++ b/openviking/parse/parsers/markdown.py @@ -23,11 +23,48 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from openviking.parse.accessors.mime_types import IANA_MEDIA_TYPE_TO_EXTENSION from openviking.parse.base import NodeType, ParseResult, ResourceNode, create_parse_result from openviking.parse.parsers.base_parser import BaseParser +from openviking.parse.parsers.constants import ( + CODE_EXTENSIONS, + DOCUMENTATION_EXTENSIONS, + IGNORE_EXTENSIONS, +) from openviking_cli.utils.config.parser_config import ParserConfig from openviking_cli.utils.logger import get_logger +# All known valid extensions - only these should be stripped when getting stem +KNOWN_EXTENSIONS: set[str] = set() +for extensions in IANA_MEDIA_TYPE_TO_EXTENSION.values(): + KNOWN_EXTENSIONS.update(extensions) +KNOWN_EXTENSIONS.update(CODE_EXTENSIONS) +KNOWN_EXTENSIONS.update(DOCUMENTATION_EXTENSIONS) +KNOWN_EXTENSIONS.update(IGNORE_EXTENSIONS) + + +def _smart_stem(path_or_name: str | Path) -> str: + """Get the stem of a filename, but only strip known valid extensions. + + For filenames like "2601.00014" where ".00014" is not a valid extension, + returns the full name instead of just "2601". + + Args: + path_or_name: Path object or string filename + + Returns: + Stem with only known extensions stripped + """ + path = Path(path_or_name) + suffix = path.suffix.lower() + + if suffix in KNOWN_EXTENSIONS: + return path.stem + + # If the suffix is not a known extension, treat the whole name as the stem + return path.name + + logger = get_logger(__name__) if TYPE_CHECKING: @@ -180,9 +217,9 @@ async def parse_content( # the temp upload name (e.g. upload_.txt). doc_title = meta.get("frontmatter", {}).get( "title", - Path(explicit_name).stem + _smart_stem(explicit_name) if explicit_name - else Path(source_path).stem + else _smart_stem(source_path) if source_path else "Document", ) @@ -200,7 +237,7 @@ async def parse_content( headings, root_dir, source_path, - doc_name=self._sanitize_for_path(Path(doc_title).stem), + doc_name=self._sanitize_for_path(doc_title), ) parse_time = time.time() - start_time @@ -358,12 +395,12 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]: def _sanitize_for_path(self, text: str, max_length: int = 50) -> str: safe = re.sub( - r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]", + r"[^\w\u0080-\u02af\u0400-\u052f\u0600-\u077f\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s.-]", "", text, ) safe = re.sub(r"\s+", "_", safe) - safe = safe.strip("_") + safe = safe.strip("._") if not safe: return "section" if len(safe) > max_length: @@ -411,7 +448,7 @@ async def _parse_and_create_structure( # Get document name doc_name = doc_name or self._sanitize_for_path( - Path(source_path).stem if source_path else "content" + _smart_stem(source_path) if source_path else "content" ) # Small document: save as single file (check both token and char limits) diff --git a/openviking/pyagfs/__init__.py b/openviking/pyagfs/__init__.py index 7b487a27e..6ae6bc8cc 100644 --- a/openviking/pyagfs/__init__.py +++ b/openviking/pyagfs/__init__.py @@ -11,10 +11,27 @@ from .client import AGFSClient, FileHandle from .exceptions import ( + AGFSAlreadyExistsError, AGFSClientError, + AGFSConfigError, AGFSConnectionError, + AGFSDirectoryNotEmptyError, + AGFSFileExistsError, AGFSHTTPError, + AGFSInternalError, + AGFSInvalidOperationError, + AGFSInvalidPathError, + AGFSIoError, + AGFSIsADirectoryError, + AGFSMountPointExistsError, + AGFSMountPointNotFoundError, + AGFSNetworkError, + AGFSNotADirectoryError, + AGFSNotFoundError, AGFSNotSupportedError, + AGFSPermissionDeniedError, + AGFSPluginError, + AGFSSerializationError, AGFSTimeoutError, ) from .helpers import cp, download, upload @@ -109,6 +126,23 @@ def get_binding_client(): "AGFSTimeoutError", "AGFSHTTPError", "AGFSNotSupportedError", + "AGFSNotFoundError", + "AGFSAlreadyExistsError", + "AGFSFileExistsError", + "AGFSPermissionDeniedError", + "AGFSInvalidPathError", + "AGFSNotADirectoryError", + "AGFSIsADirectoryError", + "AGFSDirectoryNotEmptyError", + "AGFSInvalidOperationError", + "AGFSIoError", + "AGFSConfigError", + "AGFSMountPointNotFoundError", + "AGFSMountPointExistsError", + "AGFSSerializationError", + "AGFSNetworkError", + "AGFSInternalError", + "AGFSPluginError", "cp", "upload", "download", diff --git a/openviking/pyagfs/exceptions.py b/openviking/pyagfs/exceptions.py index 7247215c9..73fd756a3 100644 --- a/openviking/pyagfs/exceptions.py +++ b/openviking/pyagfs/exceptions.py @@ -31,3 +31,105 @@ class AGFSNotSupportedError(AGFSClientError): """Operation not supported by the server or filesystem (HTTP 501)""" pass + + +class AGFSNotFoundError(AGFSClientError): + """File or directory not found""" + + pass + + +class AGFSAlreadyExistsError(AGFSClientError): + """File or directory already exists""" + + pass + + +class AGFSFileExistsError(AGFSAlreadyExistsError): + """File already exists (alias for AGFSAlreadyExistsError)""" + + pass + + +class AGFSPermissionDeniedError(AGFSClientError): + """Permission denied""" + + pass + + +class AGFSInvalidPathError(AGFSClientError): + """Invalid path""" + + pass + + +class AGFSNotADirectoryError(AGFSClientError): + """Not a directory""" + + pass + + +class AGFSIsADirectoryError(AGFSClientError): + """Is a directory (when file operation expected)""" + + pass + + +class AGFSDirectoryNotEmptyError(AGFSClientError): + """Directory not empty""" + + pass + + +class AGFSInvalidOperationError(AGFSClientError): + """Invalid operation""" + + pass + + +class AGFSIoError(AGFSClientError): + """I/O error""" + + pass + + +class AGFSConfigError(AGFSClientError): + """Configuration error""" + + pass + + +class AGFSMountPointNotFoundError(AGFSClientError): + """Mount point not found""" + + pass + + +class AGFSMountPointExistsError(AGFSClientError): + """Mount point already exists""" + + pass + + +class AGFSSerializationError(AGFSClientError): + """Serialization error""" + + pass + + +class AGFSNetworkError(AGFSClientError): + """Network error""" + + pass + + +class AGFSInternalError(AGFSClientError): + """Internal error""" + + pass + + +class AGFSPluginError(AGFSClientError): + """Plugin error""" + + pass diff --git a/openviking/server/routers/content.py b/openviking/server/routers/content.py index 7801546d0..188166814 100644 --- a/openviking/server/routers/content.py +++ b/openviking/server/routers/content.py @@ -9,12 +9,14 @@ from fastapi.responses import Response as FastAPIResponse from pydantic import BaseModel, ConfigDict +from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError from openviking.server.auth import get_request_context from openviking.server.dependencies import get_service from openviking.server.identity import RequestContext from openviking.server.models import ErrorInfo, Response from openviking.server.telemetry import run_operation from openviking.telemetry import TelemetryRequest +from openviking_cli.exceptions import NotFoundError from openviking_cli.utils import get_logger logger = get_logger(__name__) @@ -55,7 +57,16 @@ async def read( ): """Read file content (L2).""" service = get_service() - result = await service.fs.read(uri, ctx=_ctx, offset=offset, limit=limit) + try: + result = await service.fs.read(uri, ctx=_ctx, offset=offset, limit=limit) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise # 清理MEMORY_FIELDS隐藏注释(v2记忆加工过程中的临时内部数据,不暴露给外部用户) if isinstance(result, bytes): @@ -80,7 +91,16 @@ async def abstract( ): """Read L0 abstract.""" service = get_service() - result = await service.fs.abstract(uri, ctx=_ctx) + try: + result = await service.fs.abstract(uri, ctx=_ctx) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise return Response(status="ok", result=result) @@ -91,7 +111,16 @@ async def overview( ): """Read L1 overview.""" service = get_service() - result = await service.fs.overview(uri, ctx=_ctx) + try: + result = await service.fs.overview(uri, ctx=_ctx) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise return Response(status="ok", result=result) @@ -102,7 +131,16 @@ async def download( ): """Download file as raw bytes (for images, binaries, etc.).""" service = get_service() - content = await service.fs.read_file_bytes(uri, ctx=_ctx) + try: + content = await service.fs.read_file_bytes(uri, ctx=_ctx) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise # Try to get filename from stat filename = "download" diff --git a/openviking/server/routers/filesystem.py b/openviking/server/routers/filesystem.py index b5dcdb379..4c27e0f88 100644 --- a/openviking/server/routers/filesystem.py +++ b/openviking/server/routers/filesystem.py @@ -7,7 +7,7 @@ from fastapi import APIRouter, Depends, Query from pydantic import BaseModel -from openviking.pyagfs.exceptions import AGFSClientError +from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError from openviking.server.auth import get_request_context from openviking.server.dependencies import get_service from openviking.server.identity import RequestContext @@ -32,16 +32,25 @@ async def ls( """List directory contents.""" service = get_service() actual_node_limit = limit if limit is not None else node_limit - result = await service.fs.ls( - uri, - ctx=_ctx, - recursive=recursive, - simple=simple, - output=output, - abs_limit=abs_limit, - show_all_hidden=show_all_hidden, - node_limit=actual_node_limit, - ) + try: + result = await service.fs.ls( + uri, + ctx=_ctx, + recursive=recursive, + simple=simple, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=actual_node_limit, + ) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise return Response(status="ok", result=result) @@ -59,15 +68,24 @@ async def tree( """Get directory tree.""" service = get_service() actual_node_limit = limit if limit is not None else node_limit - result = await service.fs.tree( - uri, - ctx=_ctx, - output=output, - abs_limit=abs_limit, - show_all_hidden=show_all_hidden, - node_limit=actual_node_limit, - level_limit=level_limit, - ) + try: + result = await service.fs.tree( + uri, + ctx=_ctx, + output=output, + abs_limit=abs_limit, + show_all_hidden=show_all_hidden, + node_limit=actual_node_limit, + level_limit=level_limit, + ) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise return Response(status="ok", result=result) @@ -81,7 +99,10 @@ async def stat( try: result = await service.fs.stat(uri, ctx=_ctx) return Response(status="ok", result=result) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") except AGFSClientError as e: + # Fallback for older versions without typed exceptions err_msg = str(e).lower() if "not found" in err_msg or "no such file or directory" in err_msg: raise NotFoundError(uri, "file") @@ -101,7 +122,14 @@ async def mkdir( ): """Create directory.""" service = get_service() - await service.fs.mkdir(request.uri, ctx=_ctx) + try: + await service.fs.mkdir(request.uri, ctx=_ctx) + except AGFSClientError as e: + # Handle common AGFS errors + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(request.uri, "file") + raise return Response(status="ok", result={"uri": request.uri}) @@ -113,7 +141,16 @@ async def rm( ): """Remove resource.""" service = get_service() - await service.fs.rm(uri, ctx=_ctx, recursive=recursive) + try: + await service.fs.rm(uri, ctx=_ctx, recursive=recursive) + except AGFSNotFoundError: + raise NotFoundError(uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(uri, "file") + raise return Response(status="ok", result={"uri": uri}) @@ -131,5 +168,14 @@ async def mv( ): """Move resource.""" service = get_service() - await service.fs.mv(request.from_uri, request.to_uri, ctx=_ctx) + try: + await service.fs.mv(request.from_uri, request.to_uri, ctx=_ctx) + except AGFSNotFoundError: + raise NotFoundError(request.from_uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(request.from_uri, "file") + raise return Response(status="ok", result={"from": request.from_uri, "to": request.to_uri}) diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py index f8c14ead8..001351fca 100644 --- a/openviking/server/routers/search.py +++ b/openviking/server/routers/search.py @@ -8,12 +8,14 @@ from fastapi import APIRouter, Depends from pydantic import BaseModel +from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError from openviking.server.auth import get_request_context from openviking.server.dependencies import get_service from openviking.server.identity import RequestContext from openviking.server.models import Response from openviking.server.telemetry import run_operation from openviking.telemetry import TelemetryRequest +from openviking_cli.exceptions import NotFoundError def _sanitize_floats(obj: Any) -> Any: @@ -156,15 +158,24 @@ async def grep( ): """Content search with pattern.""" service = get_service() - result = await service.fs.grep( - request.uri, - request.pattern, - ctx=_ctx, - exclude_uri=request.exclude_uri, - case_insensitive=request.case_insensitive, - node_limit=request.node_limit, - level_limit=request.level_limit, - ) + try: + result = await service.fs.grep( + request.uri, + request.pattern, + ctx=_ctx, + exclude_uri=request.exclude_uri, + case_insensitive=request.case_insensitive, + node_limit=request.node_limit, + level_limit=request.level_limit, + ) + except AGFSNotFoundError: + raise NotFoundError(request.uri, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(request.uri, "file") + raise return Response(status="ok", result=result) @@ -175,7 +186,16 @@ async def glob( ): """File pattern matching.""" service = get_service() - result = await service.fs.glob( - request.pattern, ctx=_ctx, uri=request.uri, node_limit=request.node_limit - ) + try: + result = await service.fs.glob( + request.pattern, ctx=_ctx, uri=request.uri, node_limit=request.node_limit + ) + except AGFSNotFoundError: + raise NotFoundError(request.uri or request.pattern, "file") + except AGFSClientError as e: + # Fallback for older versions without typed exceptions + err_msg = str(e).lower() + if "not found" in err_msg or "no such file or directory" in err_msg: + raise NotFoundError(request.uri or request.pattern, "file") + raise return Response(status="ok", result=result) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index f1bbaf906..c83cc67cd 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -23,12 +23,12 @@ from pathlib import PurePath from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from openviking.pyagfs.exceptions import AGFSClientError, AGFSHTTPError +from openviking.pyagfs.exceptions import AGFSClientError, AGFSDirectoryNotEmptyError, AGFSHTTPError from openviking.resource.watch_storage import is_watch_task_control_uri from openviking.server.identity import RequestContext, Role from openviking.telemetry import get_current_telemetry from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime -from openviking_cli.exceptions import NotFoundError +from openviking_cli.exceptions import FailedPreconditionError, NotFoundError from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.logger import get_logger from openviking_cli.utils.uri import VikingURI @@ -41,6 +41,22 @@ logger = get_logger(__name__) +def _is_directory_not_empty_error(message: str) -> bool: + """Check if an error message indicates a directory not empty error. + + Handles multiple possible error message formats from different backends. + """ + msg = message.lower() + return any( + pattern in msg + for pattern in [ + "directory not empty", + "dir not empty", + "directory is not empty", + ] + ) + + # ========== Dataclass ========== @@ -410,7 +426,19 @@ async def rm( uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) uris_to_delete.append(target_uri) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - result = self.agfs.rm(path, recursive=recursive) + try: + result = self.agfs.rm(path, recursive=recursive) + except AGFSDirectoryNotEmptyError: + raise FailedPreconditionError( + f"Directory not empty: {uri}. Use recursive=True to delete non-empty directories." + ) + except RuntimeError as e: + # Fallback for older versions without typed exceptions + if _is_directory_not_empty_error(str(e)): + raise FailedPreconditionError( + f"Directory not empty: {uri}. Use recursive=True to delete non-empty directories." + ) + raise return result except LockAcquisitionError: raise ResourceBusyError(f"Resource is being processed: {uri}") diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py index 9bbf7ad6e..105f5fb77 100644 --- a/openviking/utils/media_processor.py +++ b/openviking/utils/media_processor.py @@ -7,7 +7,13 @@ from openviking.parse import DocumentConverter, parse from openviking.parse.accessors.base import SourceType +from openviking.parse.accessors.mime_types import IANA_MEDIA_TYPE_TO_EXTENSION from openviking.parse.base import ParseResult +from openviking.parse.parsers.constants import ( + CODE_EXTENSIONS, + DOCUMENTATION_EXTENSIONS, + IGNORE_EXTENSIONS, +) from openviking.server.local_input_guard import ( is_remote_resource_source, looks_like_local_path, @@ -15,6 +21,40 @@ from openviking_cli.exceptions import PermissionDeniedError from openviking_cli.utils.logger import get_logger +# All known valid extensions - only these should be stripped when getting stem +# Build from multiple sources: +# 1. IANA media type mappings (comprehensive) +# 2. Code/documentation/ignore extensions (parser-specific) +KNOWN_EXTENSIONS: set[str] = set() +for extensions in IANA_MEDIA_TYPE_TO_EXTENSION.values(): + KNOWN_EXTENSIONS.update(extensions) +KNOWN_EXTENSIONS.update(CODE_EXTENSIONS) +KNOWN_EXTENSIONS.update(DOCUMENTATION_EXTENSIONS) +KNOWN_EXTENSIONS.update(IGNORE_EXTENSIONS) + + +def _smart_stem(path_or_name: str | Path) -> str: + """Get the stem of a filename, but only strip known valid extensions. + + For filenames like "2601.00014" where ".00014" is not a valid extension, + returns the full name instead of just "2601". + + Args: + path_or_name: Path object or string filename + + Returns: + Stem with only known extensions stripped + """ + path = Path(path_or_name) + suffix = path.suffix.lower() + + if suffix in KNOWN_EXTENSIONS: + return path.stem + + # If the suffix is not a known extension, treat the whole name as the stem + return path.name + + if TYPE_CHECKING: from openviking.parse.vlm import VLMProcessor from openviking_cli.utils.storage import StoragePath @@ -113,7 +153,7 @@ async def process( # Set resource_name from source_name or path source_name = kwargs.get("source_name") if source_name: - parse_kwargs["resource_name"] = Path(source_name).stem + parse_kwargs["resource_name"] = _smart_stem(source_name) parse_kwargs.setdefault("source_name", source_name) else: # For git repositories, use repo_name from meta if available @@ -125,10 +165,10 @@ async def process( # Prefer original_filename from meta for HTTP downloads original_filename = local_resource.meta.get("original_filename") if original_filename: - parse_kwargs.setdefault("resource_name", Path(original_filename).stem) + parse_kwargs.setdefault("resource_name", _smart_stem(original_filename)) parse_kwargs.setdefault("source_name", original_filename) else: - parse_kwargs.setdefault("resource_name", local_resource.path.stem) + parse_kwargs.setdefault("resource_name", _smart_stem(local_resource.path)) # If it's a directory, use DirectoryParser which will delegate to CodeRepositoryParser if it's a git repo if local_resource.path.is_dir(): diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py index 61ef4d1c7..d4ab5af9b 100644 --- a/openviking_cli/utils/uri.py +++ b/openviking_cli/utils/uri.py @@ -227,21 +227,23 @@ def sanitize_segment(text: str) -> str: URI-safe string """ # Preserve: - # - Letters, numbers, underscores, hyphens (\w includes [a-zA-Z0-9_]) + # - Letters, numbers, underscores, hyphens, dots (\w includes [a-zA-Z0-9_]) + # - Latin-1 Supplement, Latin Extended-A/B (Western European languages: Spanish, Portuguese, French, German, etc.) + # - Cyrillic (Russian, Bulgarian, Serbian, etc.) + # - Arabic (Arabic, Persian, Urdu, etc.) # - CJK Unified Ideographs (Chinese, Japanese Kanji, Korean Hanja) # - Hiragana and Katakana (Japanese) # - Hangul Syllables (Korean) - # - CJK Unified Ideographs Extension A - # - CJK Unified Ideographs Extension B + # - CJK Unified Ideographs Extension A/B safe = re.sub( - r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-]", + r"[^\w\u0080-\u02af\u0400-\u052f\u0600-\u077f\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-.]", "_", text, ) # Merge consecutive underscores safe = re.sub(r"_+", "_", safe) - # Strip leading/trailing underscores and limit length - safe = safe.strip("_")[:50] + # Strip leading/trailing underscores and dots, limit length + safe = safe.strip("_.")[:50] return safe or "unnamed" def __str__(self) -> str: From c4550e733ffb9a06cc1a91980432dbc8553132b1 Mon Sep 17 00:00:00 2001 From: openviking Date: Wed, 15 Apr 2026 10:25:09 +0800 Subject: [PATCH 15/15] fix: multiple improvements for filename handling, network access, and ragfs exceptions --- openviking/parse/accessors/http_accessor.py | 3 +++ tests/server/test_api_local_input_security.py | 12 +----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py index 19434705e..6c9941145 100644 --- a/openviking/parse/accessors/http_accessor.py +++ b/openviking/parse/accessors/http_accessor.py @@ -25,6 +25,7 @@ from openviking.parse.base import lazy_import from openviking.parse.parsers.constants import CODE_EXTENSIONS from openviking.utils.network_guard import build_httpx_request_validation_hooks +from openviking_cli.exceptions import PermissionDeniedError from openviking_cli.utils.logger import get_logger from .base import DataAccessor, LocalResource, SourceType @@ -192,6 +193,8 @@ async def detect( if url_type != URLType.UNKNOWN: return url_type, meta + except PermissionDeniedError: + raise except Exception as e: meta["detection_error"] = str(e) logger.debug(f"[URLTypeDetector] HEAD request failed: {e}, falling back to default") diff --git a/tests/server/test_api_local_input_security.py b/tests/server/test_api_local_input_security.py index 351f2be39..9564ea163 100644 --- a/tests/server/test_api_local_input_security.py +++ b/tests/server/test_api_local_input_security.py @@ -11,7 +11,7 @@ import httpx import pytest -from openviking.parse.parsers.html import HTMLParser, URLTypeDetector +from openviking.parse.parsers.html import URLTypeDetector from openviking.utils.network_guard import ensure_public_remote_target from openviking_cli.exceptions import PermissionDeniedError @@ -227,13 +227,3 @@ async def test_url_detector_request_validator_blocks_loopback_head(loopback_http timeout=2.0, request_validator=ensure_public_remote_target, ) - - -async def test_html_parser_request_validator_blocks_loopback_fetch(loopback_http_url: str): - parser = HTMLParser(timeout=2.0) - - with pytest.raises(PermissionDeniedError): - await parser._fetch_html( - loopback_http_url, - request_validator=ensure_public_remote_target, - )