From 6afc68b7f1d9d3b71ee20449be93558f9b85ad32 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 10:29:23 +0800
Subject: [PATCH 01/15] reort: split parser layer to 2-layer: accessor and
 parser, so that we can reuse more code

---
 docs/design/parser-two-layer-refactor-plan.md | 355 ++++++++++
 examples/ov.conf.example                      |   5 +-
 openviking/parse/__init__.py                  |   2 -
 openviking/parse/accessors/__init__.py        |  33 +
 openviking/parse/accessors/base.py            | 191 ++++++
 openviking/parse/accessors/feishu.py          | 646 ++++++++++++++++++
 openviking/parse/accessors/git.py             | 581 ++++++++++++++++
 openviking/parse/accessors/http.py            | 228 +++++++
 openviking/parse/accessors/registry.py        | 204 ++++++
 openviking/parse/parsers/__init__.py          |   6 +-
 openviking/parse/parsers/code/code.py         | 111 +--
 openviking/parse/parsers/directory.py         |  63 ++
 openviking/parse/parsers/html.py              | 592 +---------------
 openviking/parse/parsers/zip_parser.py        | 264 +++----
 openviking/parse/registry.py                  |  33 +-
 openviking/utils/media_processor.py           | 212 +++---
 openviking/utils/resource_processor.py        |  10 +-
 openviking_cli/utils/config/parser_config.py  |   2 +-
 tests/unit/test_accessors_base.py             | 135 ++++
 tests/unit/test_accessors_git.py              |  84 +++
 tests/unit/test_accessors_http.py             | 116 ++++
 tests/unit/test_accessors_registry.py         | 155 +++++
 22 files changed, 3045 insertions(+), 983 deletions(-)
 create mode 100644 docs/design/parser-two-layer-refactor-plan.md
 create mode 100644 openviking/parse/accessors/__init__.py
 create mode 100644 openviking/parse/accessors/base.py
 create mode 100644 openviking/parse/accessors/feishu.py
 create mode 100644 openviking/parse/accessors/git.py
 create mode 100644 openviking/parse/accessors/http.py
 create mode 100644 openviking/parse/accessors/registry.py
 create mode 100644 tests/unit/test_accessors_base.py
 create mode 100644 tests/unit/test_accessors_git.py
 create mode 100644 tests/unit/test_accessors_http.py
 create mode 100644 tests/unit/test_accessors_registry.py

diff --git a/docs/design/parser-two-layer-refactor-plan.md b/docs/design/parser-two-layer-refactor-plan.md
new file mode 100644
index 000000000..1d99a7cc7
--- /dev/null
+++ b/docs/design/parser-two-layer-refactor-plan.md
@@ -0,0 +1,355 @@
+# OpenViking 解析器两层架构重构计划
+
+| 项目 | 信息 |
+|-----|------|
+| 状态 | `规划中` |
+| 创建日期 | 2026-04-13 |
+| 目标版本 | v6.0 |
+
+---
+
+## 一、问题分析
+
+### 当前架构的问题
+
+1. **平铺式注册**：所有 Parser 在同一层级，职责不清晰
+2. **后缀冲突**：`.zip` 可被 `CodeRepositoryParser` 和 `ZipParser` 同时处理
+3. **URL 处理逻辑分散**：`UnifiedResourceProcessor._process_url()` 和 `ParserRegistry.parse()` 都有 URL 检测
+4. **职责混合**：部分 Parser 既负责下载又负责解析（如 `CodeRepositoryParser`, `HTMLParser`, `FeishuParser`）
+
+---
+
+## 二、新架构设计
+
+### 核心概念
+
+| 层级 | 抽象接口 | 职责 | 示例 |
+|-----|---------|------|------|
+| **L1** | `DataAccessor` | 获取数据：将远程 URL / 特殊路径 → 本地文件/目录 | GitAccessor, HTTPAccessor, FeishuAccessor |
+| **L2** | `DataParser` | 解析数据：本地文件/目录 → `ParseResult` | MarkdownParser, PDFParser, DirectoryParser |
+| **混合** | `HybridParser` | 同时实现两个接口（简化插件开发） | （按需使用） |
+
+### 新的调用流程
+
+```
+add_resource(path)
+    ↓
+ResourceProcessor.process_resource()
+    ↓
+UnifiedResourceProcessor.process()  [重构]
+    ↓
+【第一阶段：数据访问】
+AccessorRegistry.route(source)
+    ├─→ 是 URL/远程资源?
+    │     ├─→ GitAccessor (git@, git://, github.com, ...)
+    │     ├─→ FeishuAccessor (.feishu.cn, .larksuite.com)
+    │     ├─→ HTTPAccessor (http://, https://)
+    │     └─→ 其他 → 下一步
+    └─→ 返回: LocalResource (本地路径 + 元数据)
+           ↓
+【第二阶段：数据解析】
+ParserRegistry.route(local_resource)
+    ├─→ 是目录? → DirectoryParser
+    ├─→ 是文件? → 按扩展名匹配
+    │     ├─→ .md → MarkdownParser
+    │     ├─→ .pdf → PDFParser
+    │     ├─→ .zip → ZipParser
+    │     └─→ ...
+    └─→ 返回: ParseResult
+           ↓
+TreeBuilder + SemanticQueue (保持不变)
+```
+
+---
+
+## 三、详细设计
+
+### 3.1 核心抽象接口
+
+#### 文件 1: `openviking/parse/accessors/base.py`
+
+```python
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional, Dict, Any, Union
+from dataclasses import dataclass
+
+@dataclass
+class LocalResource:
+    """数据访问层的输出：表示可访问的本地资源"""
+    path: Path                    # 本地文件/目录路径
+    source_type: str              # 原始来源类型: "git", "http", "feishu", "local", ...
+    original_source: str           # 原始 source 字符串
+    meta: Dict[str, Any]          # 元数据（repo_name, branch, 等）
+    is_temporary: bool = True      # 是否为临时文件，解析后可清理
+
+class DataAccessor(ABC):
+    """数据访问器：负责获取数据到本地"""
+
+    @abstractmethod
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        """判断是否能处理该来源"""
+        pass
+
+    @abstractmethod
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        获取数据到本地
+
+        返回: LocalResource，包含本地路径和元数据
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def priority(self) -> int:
+        """
+        优先级，数字越大优先级越高
+        用于解决冲突：多个 Accessor 都 can_handle 时，选优先级高的
+        """
+        pass
+
+    def cleanup(self, resource: LocalResource) -> None:
+        """
+        清理临时资源（可选）
+        默认：如果是临时资源，删除本地文件/目录
+        """
+        if resource.is_temporary:
+            # 实现清理逻辑
+            pass
+```
+
+#### 文件 2: `openviking/parse/parsers/base_parser.py` (重构)
+
+```python
+# 保持现有 BaseParser 接口，但明确其职责为 DataParser
+# 可以考虑重命名为 DataParser 或保留 BaseParser 作为别名
+```
+
+#### 文件 3: `openviking/parse/accessors/registry.py`
+
+```python
+from typing import Union, Path, Optional, List
+from .base import DataAccessor, LocalResource
+
+class AccessorRegistry:
+    """数据访问器注册表"""
+
+    def __init__(self):
+        self._accessors: List[DataAccessor] = []
+
+    def register(self, accessor: DataAccessor) -> None:
+        """注册访问器（按优先级插入）"""
+        # 按 priority 降序插入
+        idx = 0
+        for i, a in enumerate(self._accessors):
+            if accessor.priority > a.priority:
+                idx = i
+                break
+        else:
+            idx = len(self._accessors)
+        self._accessors.insert(idx, accessor)
+
+    def get_accessor(self, source: Union[str, Path]) -> Optional[DataAccessor]:
+        """获取能处理该 source 的访问器（按优先级）"""
+        for accessor in self._accessors:
+            if accessor.can_handle(source):
+                return accessor
+        return None
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        路由到合适的访问器获取数据
+
+        如果没有访问器能处理，视为本地文件返回
+        """
+        accessor = self.get_accessor(source)
+        if accessor:
+            return await accessor.access(source, **kwargs)
+
+        # 默认：视为本地文件
+        path = Path(source)
+        return LocalResource(
+            path=path,
+            source_type="local",
+            original_source=str(source),
+            meta={},
+            is_temporary=False
+        )
+```
+
+### 3.2 现有 Parser 拆分方案
+
+| 当前 Parser | 拆分后 | 说明 |
+|-----------|--------|------|
+| `CodeRepositoryParser` | `GitAccessor` + `DirectoryParser` | Git clone 逻辑移到 Accessor |
+| `HTMLParser` | `HTTPAccessor` + `HTMLParser` | HTTP 下载移到 Accessor |
+| `FeishuParser` | `FeishuAccessor` + (新) `FeishuDocumentParser` | 飞书 API 调用移到 Accessor |
+| `ZipParser` | 保持为 `DataParser` | 只处理本地 .zip 文件 |
+| `DirectoryParser` | 保持为 `DataParser` | 只处理本地目录 |
+| 其他 (Markdown, PDF, ...) | 保持为 `DataParser` | 无需变动 |
+
+### 3.3 Accessor 实现示例
+
+#### `GitAccessor` (`openviking/parse/accessors/git.py`)
+
+```python
+# 从 CodeRepositoryParser 提取 git clone / GitHub ZIP 下载逻辑
+```
+
+#### `HTTPAccessor` (`openviking/parse/accessors/http.py`)
+
+```python
+# 从 HTMLParser 提取下载逻辑
+# 支持内容类型检测，下载到临时文件
+```
+
+#### `FeishuAccessor` (`openviking/parse/accessors/feishu.py`)
+
+```python
+# 从 FeishuParser 提取 API 调用逻辑
+```
+
+### 3.4 HybridParser（简化插件开发）
+
+对于简单的插件，允许同时实现两个接口：
+
+```python
+class HybridParser(DataAccessor, DataParser):
+    """
+    混合解析器：同时实现访问和解析
+    适用于简单的自定义解析器，不需要拆分
+    """
+    # 同时实现 DataAccessor 和 DataParser 的接口
+    pass
+```
+
+---
+
+## 四、实施步骤
+
+### Phase 1: 基础设施（核心接口）
+
+**目标**：建立新架构的基础框架，不影响现有功能
+
+- [ ] 创建 `openviking/parse/accessors/` 目录结构
+- [ ] 实现 `DataAccessor` 抽象基类和 `LocalResource` 数据类
+- [ ] 实现 `AccessorRegistry`
+- [ ] 编写单元测试
+- [ ] 更新 `ParserRegistry`，保持向后兼容
+
+### Phase 2: 重构 CodeRepositoryParser
+
+**目标**：将 CodeRepositoryParser 拆分为 GitAccessor + DirectoryParser
+
+- [ ] 实现 `GitAccessor`（从 `CodeRepositoryParser` 提取 clone/download 逻辑）
+- [ ] 更新 `CodeRepositoryParser` 为混合模式（向后兼容）或标记为 deprecated
+- [ ] 在 `AccessorRegistry` 中注册 `GitAccessor`
+- [ ] 更新 `UnifiedResourceProcessor.process()` 使用新流程
+- [ ] 测试：Git URL, GitHub URL, 本地 .zip 等场景
+
+### Phase 3: 重构 HTMLParser 和 FeishuParser
+
+- [ ] 实现 `HTTPAccessor`
+- [ ] 实现 `FeishuAccessor`
+- [ ] 重构 `HTMLParser` 只负责解析本地 HTML 文件
+- [ ] 重构 `FeishuParser` 只负责解析下载后的内容
+
+### Phase 4: 重构 UnifiedResourceProcessor
+
+**目标**：简化 `UnifiedResourceProcessor`，使用新的两层架构
+
+- [ ] 重构 `UnifiedResourceProcessor.process()`：
+  ```python
+  async def process(self, source, ...):
+      # 第一阶段：获取数据
+      local_resource = await accessor_registry.access(source, **kwargs)
+
+      # 第二阶段：解析数据
+      parse_result = await parser_registry.parse(local_resource, **kwargs)
+
+      # 清理（如果需要）
+      # ...
+
+      return parse_result
+  ```
+- [ ] 移除 `_process_url()`, `_process_file()`, `_process_directory()` 中的重复逻辑
+
+### Phase 5: 扩展和优化
+
+- [ ] 实现优先级机制解决冲突
+- [ ] 添加 `HybridParser` 支持
+- [ ] 编写迁移文档
+- [ ] 性能优化和测试覆盖
+
+---
+
+## 五、兼容性策略
+
+### 5.1 向后兼容
+
+1. **保持现有 API 不变**：
+   - `ParserRegistry` 接口保持不变
+   - `registry.parse()` 仍然可以工作
+   - 自定义 Parser 注册方式不变
+
+2. **渐进式迁移**：
+   - 现有 Parser 可以继续使用
+   - 新 Parser 鼓励使用新架构
+   - 提供迁移指南
+
+### 5.2 废弃策略
+
+- 标记旧的 `CodeRepositoryParser` 等为 `@deprecated`
+- 在 v6.0 或未来版本移除
+
+---
+
+## 六、文件结构变更
+
+```
+openviking/parse/
+├── accessors/              [新增]
+│   ├── __init__.py
+│   ├── base.py            # DataAccessor, LocalResource
+│   ├── registry.py        # AccessorRegistry
+│   ├── git.py             # GitAccessor
+│   ├── http.py            # HTTPAccessor
+│   └── feishu.py          # FeishuAccessor
+├── parsers/
+│   ├── base_parser.py     # 明确为 DataParser
+│   ├── code/              # 保留但简化
+│   ├── markdown.py
+│   ├── pdf.py
+│   └── ...
+├── registry.py            # ParserRegistry (重构)
+└── ...
+```
+
+---
+
+## 七、测试计划
+
+| 测试类型 | 测试内容 |
+|---------|---------|
+| 单元测试 | AccessorRegistry, 各 Accessor, 各 Parser |
+| 集成测试 | 完整流程：add_resource → Accessor → Parser → TreeBuilder |
+| 回归测试 | 确保现有功能不被破坏 |
+| 冲突测试 | 测试优先级机制解决 .zip, URL 等冲突场景 |
+
+---
+
+## 八、风险与应对
+
+| 风险 | 影响 | 概率 | 应对措施 |
+|-----|------|------|---------|
+| 重构范围过大 | 高 | 中 | 分阶段实施，每阶段可独立发布 |
+| 性能回退 | 中 | 低 | 保持缓存机制，性能基准测试 |
+| 自定义插件受影响 | 高 | 低 | 保持向后兼容，提供迁移工具 |
+
+---
+
+## 九、相关文档
+
+- [解析系统 README](../../openviking/parse/parsers/README.md)
+- [OpenViking 整体架构](../zh/concepts/01-architecture.md)
diff --git a/examples/ov.conf.example b/examples/ov.conf.example
index 456c995d5..b5227d734 100644
--- a/examples/ov.conf.example
+++ b/examples/ov.conf.example
@@ -157,10 +157,7 @@
       "extract_text_only": false,
       "preserve_structure": true,
       "clean_html": true,
-      "extract_metadata": true,
-      "code_hosting_domains": ["github.com", "gitlab.com"],
-      "github_domains": ["github.com", "www.github.com"],
-      "gitlab_domains": ["gitlab.com", "www.gitlab.com"]
+      "extract_metadata": true
     },
     "text": {
       "detect_language": true,
diff --git a/openviking/parse/__init__.py b/openviking/parse/__init__.py
index b266a1dae..e447710c1 100644
--- a/openviking/parse/__init__.py
+++ b/openviking/parse/__init__.py
@@ -13,7 +13,6 @@
     scan_directory,
 )
 from openviking.parse.parsers.base_parser import BaseParser
-from openviking.parse.parsers.code import CodeRepositoryParser
 from openviking.parse.parsers.html import HTMLParser
 from openviking.parse.parsers.markdown import MarkdownParser
 from openviking.parse.parsers.pdf import PDFParser
@@ -34,7 +33,6 @@
     "MarkdownParser",
     "PDFParser",
     "HTMLParser",
-    "CodeRepositoryParser",
     "DocumentConverter",
     # Custom parser support
     "CustomParserProtocol",
diff --git a/openviking/parse/accessors/__init__.py b/openviking/parse/accessors/__init__.py
new file mode 100644
index 000000000..1836f30b5
--- /dev/null
+++ b/openviking/parse/accessors/__init__.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+Data Accessors for OpenViking.
+
+This module provides the two-layer architecture for resource processing:
+- DataAccessor: Fetches data from remote sources or special paths
+- DataParser: Parses local files/directories (existing Parser system)
+"""
+
+from .base import DataAccessor, LocalResource
+from .feishu import FeishuAccessor
+from .git import GitAccessor
+from .http import HTTPAccessor
+from .registry import (
+    AccessorRegistry,
+    access,
+    get_accessor_registry,
+)
+
+__all__ = [
+    # Base classes
+    "DataAccessor",
+    "LocalResource",
+    # Registry
+    "AccessorRegistry",
+    "get_accessor_registry",
+    "access",
+    # Accessors
+    "GitAccessor",
+    "HTTPAccessor",
+    "FeishuAccessor",
+]
diff --git a/openviking/parse/accessors/base.py b/openviking/parse/accessors/base.py
new file mode 100644
index 000000000..8d5b4c90e
--- /dev/null
+++ b/openviking/parse/accessors/base.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+Base classes for Data Accessors.
+
+Data Accessors are responsible for fetching data from remote sources
+or special paths and making them available as local files/directories.
+"""
+
+import os
+import shutil
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Union
+
+
+class SourceType:
+    """
+    Enumeration of valid source types for LocalResource.
+
+    Provides type safety and consistency across the system.
+    """
+
+    LOCAL = "local"
+    """Local file system resource."""
+
+    GIT = "git"
+    """Git repository (from GitAccessor)."""
+
+    HTTP = "http"
+    """HTTP/HTTPS resource (from HTTPAccessor)."""
+
+    FEISHU = "feishu"
+    """Feishu/Lark document (from FeishuAccessor)."""
+
+
+@dataclass
+class LocalResource:
+    """
+    Represents a locally accessible resource.
+
+    This is the output of the DataAccessor layer, containing the local
+    path to the resource along with metadata about its origin.
+    """
+
+    path: Path
+    """Local file/directory path to the resource."""
+
+    source_type: str
+    """Original source type: one of SourceType constants."""
+
+    original_source: str
+    """Original source string (URL, path, etc.)."""
+
+    meta: Dict[str, Any] = field(default_factory=dict)
+    """Additional metadata (repo_name, branch, content_type, etc.)."""
+
+    is_temporary: bool = True
+    """Whether this is a temporary resource that can be cleaned up after parsing."""
+
+    def cleanup(self) -> None:
+        """
+        Clean up the local resource if it's temporary.
+
+        Removes the file/directory from the local filesystem.
+        """
+        if not self.is_temporary:
+            return
+
+        if not self.path.exists():
+            return
+
+        try:
+            if self.path.is_dir():
+                shutil.rmtree(self.path, ignore_errors=True)
+            else:
+                self.path.unlink(missing_ok=True)
+        except Exception as e:
+            from openviking_cli.utils.logger import get_logger
+
+            logger = get_logger(__name__)
+            logger.warning(f"[LocalResource] Failed to cleanup resource {self.path}: {e}")
+
+    def __enter__(self) -> "LocalResource":
+        """Support context manager protocol."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Support context manager protocol - cleanup on exit."""
+        self.cleanup()
+
+
+class DataAccessor(ABC):
+    """
+    Abstract base class for data accessors.
+
+    Data Accessors are responsible for:
+    - Detecting if they can handle a given source
+    - Fetching the data from the source to a local path
+    - Providing metadata about the source
+    - Cleaning up temporary resources when done
+    """
+
+    @abstractmethod
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        """
+        Check if this accessor can handle the given source.
+
+        Args:
+            source: Source string (URL, path, etc.) or Path object
+
+        Returns:
+            True if this accessor can handle the source
+        """
+        pass
+
+    @abstractmethod
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        Fetch the source and make it available locally.
+
+        Args:
+            source: Source string (URL, path, etc.) or Path object
+            **kwargs: Additional accessor-specific arguments
+
+        Returns:
+            LocalResource pointing to the locally available data
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def priority(self) -> int:
+        """
+        Priority of this accessor.
+
+        Higher numbers mean higher priority. When multiple accessors
+        can handle the same source, the one with the highest priority wins.
+
+        Standard priority levels:
+        - 100: Specific service (Feishu, etc.)
+        - 80: Version control (Git, etc.)
+        - 50: Generic protocols (HTTP, etc.)
+        - 10: Fallback accessors
+        """
+        pass
+
+    def cleanup(self, resource: LocalResource) -> None:
+        """
+        Clean up the local resource.
+
+        Default implementation calls resource.cleanup().
+        Subclasses can override for custom cleanup logic.
+
+        Args:
+            resource: The LocalResource to clean up
+        """
+        resource.cleanup()
+
+    def _create_temp_dir(self, prefix: str = "ov_accessor_") -> Path:
+        """
+        Create a temporary directory for this accessor.
+
+        Args:
+            prefix: Prefix for the temporary directory name
+
+        Returns:
+            Path to the created temporary directory
+        """
+        import tempfile
+
+        temp_dir = tempfile.mkdtemp(prefix=prefix)
+        return Path(temp_dir)
+
+    def _create_temp_file(self, suffix: str = "", prefix: str = "ov_accessor_") -> Path:
+        """
+        Create a temporary file for this accessor.
+
+        Args:
+            suffix: Suffix for the temporary file name
+            prefix: Prefix for the temporary file name
+
+        Returns:
+            Path to the created temporary file
+        """
+        import tempfile
+
+        fd, temp_path = tempfile.mkstemp(suffix=suffix, prefix=prefix)
+        os.close(fd)
+        return Path(temp_path)
diff --git a/openviking/parse/accessors/feishu.py b/openviking/parse/accessors/feishu.py
new file mode 100644
index 000000000..a1cc6e0fa
--- /dev/null
+++ b/openviking/parse/accessors/feishu.py
@@ -0,0 +1,646 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+Feishu/Lark Accessor.
+
+Fetches Feishu/Lark cloud documents using the lark-oapi SDK.
+This is the DataAccessor layer extracted from FeishuParser.
+
+Note: This accessor requires the `lark-oapi` package.
+Install with: pip install 'openviking[bot-feishu]'
+"""
+
+import os
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple, Union
+from urllib.parse import urlparse
+
+from openviking_cli.utils.logger import get_logger
+
+from .base import DataAccessor, LocalResource, SourceType
+
+logger = get_logger(__name__)
+
+
+def _getattr_safe(obj, key: str, default=None):
+    """Get attribute from SDK object or dict, with safe fallback."""
+    if isinstance(obj, dict):
+        return obj.get(key, default)
+    return getattr(obj, key, default)
+
+
+@dataclass
+class FeishuDocument:
+    """Result from fetching a Feishu document."""
+
+    doc_type: str  # "docx" only for now
+    token: str
+    markdown_content: str
+    title: str
+    meta: Dict[str, Any]
+
+
+class FeishuAccessor(DataAccessor):
+    """
+    Accessor for Feishu/Lark cloud documents.
+
+    Supports:
+    - Documents: https://*.feishu.cn/docx/{document_id}
+    - Wiki pages: https://*.feishu.cn/wiki/{token} (resolves to docx)
+
+    Requires:
+    - lark-oapi package
+    - FEISHU_APP_ID and FEISHU_APP_SECRET environment variables,
+      or configuration in ov.conf
+    """
+
+    PRIORITY = 100  # Higher than Git/HTTP, very specific
+
+    # Wiki obj_type normalization (API returns short names)
+    _WIKI_TYPE_MAP = {"doc": "docx", "sheet": "sheets", "bitable": "base"}
+
+    # Attributes that skip processing (structural containers or metadata)
+    _SKIP_ATTRS = {"page", "table_cell", "quote_container", "grid", "grid_column"}
+
+    # Attribute → special handler method (non-text blocks)
+    _SPECIAL_BLOCK_HANDLERS = {
+        "divider": "_handle_divider",
+        "image": "_handle_image",
+        "table": "_table_block_to_markdown",
+    }
+
+    # Attribute → markdown prefix template for text-bearing blocks.
+    # "{text}" is replaced with extracted text content.
+    # Headings are handled dynamically (heading1-heading9 → # through #########).
+    _TEXT_FORMAT = {
+        "bullet": "- {text}",
+        "quote": "> {text}",
+    }
+
+    # Known block_type integer → SDK attribute name mapping.
+    # Primary dispatch mechanism for reliable block detection.
+    # Source: Feishu OpenAPI documentation + lark-oapi SDK Block class.
+    _BLOCK_TYPE_TO_ATTR = {
+        1: "page",
+        2: "text",
+        3: "heading1",
+        4: "heading2",
+        5: "heading3",
+        6: "heading4",
+        7: "heading5",
+        8: "heading6",
+        9: "heading7",
+        10: "heading8",
+        11: "heading9",
+        12: "bullet",
+        13: "ordered",
+        14: "code",
+        15: "quote",
+        17: "todo",
+        19: "callout",
+        22: "divider",
+        27: "image",
+        31: "table",
+        32: "table_cell",
+        34: "quote_container",
+    }
+
+    # All known content attribute names on SDK Block objects (for fallback detection).
+    _KNOWN_CONTENT_ATTRS = frozenset(
+        {
+            "page",
+            "text",
+            "heading1",
+            "heading2",
+            "heading3",
+            "heading4",
+            "heading5",
+            "heading6",
+            "heading7",
+            "heading8",
+            "heading9",
+            "bullet",
+            "ordered",
+            "code",
+            "quote",
+            "todo",
+            "callout",
+            "divider",
+            "image",
+            "table",
+            "table_cell",
+            "quote_container",
+            "equation",
+            "task",
+            "grid",
+            "grid_column",
+        }
+    )
+
+    def __init__(self):
+        """Initialize Feishu accessor."""
+        self._client = None
+        self._config = None
+
+    @property
+    def priority(self) -> int:
+        return self.PRIORITY
+
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        """
+        Check if this accessor can handle the source.
+
+        Handles Feishu/Lark cloud document URLs.
+        """
+        source_str = str(source)
+
+        # Only handle http/https URLs
+        if not source_str.startswith(("http://", "https://")):
+            return False
+
+        return self._is_feishu_url(source_str)
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        Fetch a Feishu document and save to a temporary Markdown file.
+
+        Args:
+            source: Feishu document URL
+            **kwargs: Additional arguments
+
+        Returns:
+            LocalResource pointing to the temporary Markdown file
+        """
+        source_str = str(source)
+
+        try:
+            # Fetch the document and convert to Markdown
+            doc = await self._fetch_document(source_str)
+
+            # Create temporary file
+            temp_file = tempfile.NamedTemporaryFile(
+                mode="w",
+                suffix=".md",
+                prefix="ov_feishu_",
+                delete=False,
+            )
+            temp_file.write(doc.markdown_content)
+            temp_file.close()
+
+            # Build metadata
+            meta = {
+                "feishu_doc_type": doc.doc_type,
+                "feishu_token": doc.token,
+                "feishu_title": doc.title,
+                **doc.meta,
+            }
+
+            return LocalResource(
+                path=Path(temp_file.name),
+                source_type=SourceType.FEISHU,
+                original_source=source_str,
+                meta=meta,
+                is_temporary=True,
+            )
+
+        except Exception as e:
+            logger.error(f"[FeishuAccessor] Failed to access {source}: {e}", exc_info=True)
+            raise
+
+    async def _fetch_document(self, url: str) -> FeishuDocument:
+        """
+        Fetch a Feishu document and convert to Markdown.
+
+        This method extracts and adapts the logic from FeishuParser.parse().
+        """
+        import asyncio
+
+        doc_type, token = self._parse_feishu_url(url)
+        title = None
+        meta = {}
+
+        if doc_type == "wiki":
+            # Resolve wiki node to actual document type
+            real_type, real_token, title = await asyncio.to_thread(self._resolve_wiki_node, token)
+            doc_type, token = real_type, real_token
+            meta["wiki_resolved"] = True
+
+        # Only docx is supported
+        if doc_type != "docx":
+            raise ValueError(
+                f"Unsupported Feishu document type: {doc_type}. "
+                f"Only docx is supported in this version."
+            )
+
+        # Call the handler (in thread pool since lark-oapi is sync)
+        markdown, doc_title = await asyncio.to_thread(self._parse_docx, token)
+
+        if title:
+            doc_title = title
+
+        meta["original_url"] = url
+
+        return FeishuDocument(
+            doc_type=doc_type,
+            token=token,
+            markdown_content=markdown,
+            title=doc_title,
+            meta=meta,
+        )
+
+    @staticmethod
+    def _is_feishu_url(url: str) -> bool:
+        """Check if URL is a Feishu/Lark cloud document."""
+        parsed = urlparse(url)
+        host = parsed.hostname or ""
+        path = parsed.path
+        is_feishu_domain = host.endswith(".feishu.cn") or host.endswith(".larksuite.com")
+        has_doc_path = any(path == f"/{t}" or path.startswith(f"/{t}/") for t in ("docx", "wiki"))
+        return is_feishu_domain and has_doc_path
+
+    @staticmethod
+    def _parse_feishu_url(url: str) -> Tuple[str, str]:
+        """
+        Extract doc_type and token from Feishu URL.
+
+        Returns:
+            (doc_type, token) e.g. ("docx", "doxcnABC123")
+        """
+        parsed = urlparse(url)
+        path_parts = [p for p in parsed.path.split("/") if p]
+        if len(path_parts) < 2:
+            raise ValueError(f"Cannot parse Feishu URL: {url}")
+        doc_type = path_parts[0]  # docx, wiki
+        token = path_parts[1]
+        return doc_type, token
+
+    # ========== Configuration & Client ==========
+
+    def _get_config(self):
+        """Get FeishuConfig from OpenViking config."""
+        if self._config is None:
+            from openviking_cli.utils.config import get_openviking_config
+
+            self._config = get_openviking_config().feishu
+        return self._config
+
+    def _get_client(self):
+        """Lazy-init lark-oapi client."""
+        if self._client is None:
+            try:
+                import lark_oapi as lark
+            except ImportError:
+                raise ImportError(
+                    "lark-oapi is required for Feishu document parsing. "
+                    "Install it with: pip install 'openviking[bot-feishu]'"
+                )
+            config = self._get_config()
+            app_id = config.app_id or os.getenv("FEISHU_APP_ID", "")
+            app_secret = config.app_secret or os.getenv("FEISHU_APP_SECRET", "")
+            if not app_id or not app_secret:
+                raise ValueError(
+                    "Feishu credentials not configured. Set FEISHU_APP_ID and "
+                    "FEISHU_APP_SECRET environment variables, or configure in ov.conf."
+                )
+            domain = config.domain or "https://open.feishu.cn"
+            self._client = (
+                lark.Client.builder().app_id(app_id).app_secret(app_secret).domain(domain).build()
+            )
+        return self._client
+
+    # ========== Wiki Resolution ==========
+
+    def _resolve_wiki_node(self, token: str) -> Tuple[str, str, Optional[str]]:
+        """
+        Resolve wiki token to actual document type, token, and title.
+
+        Returns:
+            (doc_type, obj_token, title)
+        """
+        from lark_oapi.api.wiki.v2 import GetNodeSpaceRequest
+
+        client = self._get_client()
+        request = GetNodeSpaceRequest.builder().token(token).build()
+        response = client.wiki.v2.space.get_node(request)
+        if not response.success():
+            raise RuntimeError(
+                f"Failed to resolve wiki node {token}: code={response.code}, msg={response.msg}"
+            )
+        node = response.data.node
+        obj_type = node.obj_type or ""
+        obj_token = node.obj_token or ""
+        title = node.title
+
+        # Normalize type names
+        doc_type = self._WIKI_TYPE_MAP.get(obj_type, obj_type)
+
+        return doc_type, obj_token, title
+
+    # ========== Docx Parsing ==========
+
+    def _parse_docx(self, document_id: str) -> Tuple[str, str]:
+        """
+        Fetch all blocks and convert to Markdown.
+
+        Returns:
+            (markdown_content, document_title)
+        """
+        blocks = self._fetch_all_blocks(document_id)
+        if not blocks:
+            return "", "Untitled"
+
+        # Build block lookup by block_id
+        block_map = {b.block_id: b for b in blocks}
+
+        # Find title from page block
+        doc_title = "Untitled"
+        for b in blocks:
+            if b.page is not None:
+                if b.page.elements:
+                    doc_title = self._extract_text_from_elements(b.page.elements)
+                break
+
+        # Convert blocks to markdown
+        markdown_lines = []
+        ordered_counter: Dict[str, int] = {}
+
+        for block in blocks:
+            if block.page is not None:
+                continue  # Skip page container
+
+            line = self._block_to_markdown(
+                block, block_map, ordered_counter, document_id=document_id
+            )
+            if line is not None:
+                markdown_lines.append(line)
+
+        markdown = "\n\n".join(markdown_lines)
+
+        if doc_title and doc_title != "Untitled":
+            markdown = f"# {doc_title}\n\n{markdown}"
+
+        return markdown, doc_title
+
+    def _fetch_all_blocks(self, document_id: str) -> list:
+        """Fetch all blocks with pagination. Returns list of SDK block objects."""
+        from lark_oapi.api.docx.v1 import ListDocumentBlockRequest
+
+        client = self._get_client()
+        all_blocks = []
+        page_token = None
+
+        while True:
+            builder = (
+                ListDocumentBlockRequest.builder()
+                .document_id(document_id)
+                .page_size(500)
+                .document_revision_id(-1)
+            )
+            if page_token:
+                builder = builder.page_token(page_token)
+
+            request = builder.build()
+            response = client.docx.v1.document_block.list(request)
+
+            if not response.success():
+                raise RuntimeError(
+                    f"Failed to fetch blocks for {document_id}: "
+                    f"code={response.code}, msg={response.msg}"
+                )
+
+            items = response.data.items or []
+            all_blocks.extend(items)
+
+            if not response.data.has_more:
+                break
+            page_token = response.data.page_token
+
+        return all_blocks
+
+    # ========== Block -> Markdown Conversion ==========
+
+    def _detect_block_attr(self, block) -> Optional[str]:
+        """Detect which content attribute is populated on a block object.
+
+        Uses block_type integer as the primary dispatch (reliable), falling
+        back to attribute inspection over a known whitelist for unknown types.
+        """
+        # Primary: lookup by block_type integer
+        block_type = getattr(block, "block_type", None)
+        if block_type is not None:
+            attr = self._BLOCK_TYPE_TO_ATTR.get(block_type)
+            if attr:
+                return attr
+
+        # Fallback: scan known content attributes for unknown block types
+        for attr in self._KNOWN_CONTENT_ATTRS:
+            if getattr(block, attr, None) is not None:
+                return attr
+        return None
+
+    def _block_to_markdown(
+        self, block, block_map: Dict, ordered_counter: Dict[str, int], document_id: str = ""
+    ) -> Optional[str]:
+        """Convert a single SDK block object to markdown string.
+
+        Uses block_type integer for primary dispatch, with attribute whitelist
+        fallback for unknown types. Formatting is data-driven via _TEXT_FORMAT
+        and _SPECIAL_BLOCK_HANDLERS tables.
+        """
+        attr = self._detect_block_attr(block)
+
+        if attr is None:
+            return None
+
+        # Skip structural containers (processed via their children)
+        if attr in self._SKIP_ATTRS:
+            return None
+
+        # Reset ordered list counter when any non-ordered block appears
+        if attr != "ordered":
+            parent_id = block.parent_id or ""
+            if parent_id in ordered_counter:
+                del ordered_counter[parent_id]
+
+        # Special blocks (non-text: divider, image, table)
+        special_handler = self._SPECIAL_BLOCK_HANDLERS.get(attr)
+        if special_handler:
+            return getattr(self, special_handler)(block, block_map, document_id=document_id)
+
+        # --- Text-bearing blocks: extract elements, apply formatting ---
+        content_obj = getattr(block, attr, None)
+        if not content_obj or not hasattr(content_obj, "elements") or not content_obj.elements:
+            return None
+
+        text = self._extract_text_from_elements(content_obj.elements)
+        if not text:
+            return None
+
+        # Headings: heading1 -> #, heading2 -> ##, ...
+        if attr.startswith("heading"):
+            level = int(attr.replace("heading", "") or "1")
+            return f"{'#' * level} {text}"
+
+        # Ordered list (needs counter state)
+        if attr == "ordered":
+            parent_id = block.parent_id or ""
+            counter = ordered_counter.get(parent_id, 0) + 1
+            ordered_counter[parent_id] = counter
+            return f"{counter}. {text}"
+
+        # Code block (needs language from style)
+        if attr == "code":
+            lang = ""
+            if hasattr(content_obj, "style") and content_obj.style:
+                lang = str(getattr(content_obj.style, "language", "") or "")
+            return f"```{lang}\n{text}\n```"
+
+        # Todo (needs done state from style)
+        if attr == "todo":
+            done = False
+            if hasattr(content_obj, "style") and content_obj.style:
+                done = getattr(content_obj.style, "done", False)
+            checkbox = "[x]" if done else "[ ]"
+            return f"- {checkbox} {text}"
+
+        # Simple template formatting (bullet, quote, etc.)
+        fmt = self._TEXT_FORMAT.get(attr)
+        if fmt:
+            return fmt.format(text=text)
+
+        # Default: return plain text (covers callout, equation, task, unknown, etc.)
+        return text
+
+    @staticmethod
+    def _handle_divider(block, block_map: Dict = None, **_) -> str:
+        """Convert divider block to markdown."""
+        return "---"
+
+    @staticmethod
+    def _handle_image(block, block_map: Dict = None, **_) -> Optional[str]:
+        """Convert image block to markdown."""
+        image = block.image
+        if not image:
+            return None
+        file_token = image.token or ""
+        alt_text = getattr(image, "alt", "") or "image"
+        return f"![{alt_text}](feishu://image/{file_token})"
+
+    def _extract_block_text(self, block, attr_name: str) -> str:
+        """Extract text from a block's named attribute (e.g. block.text, block.heading2)."""
+        content_obj = getattr(block, attr_name, None)
+        if content_obj and hasattr(content_obj, "elements") and content_obj.elements:
+            return self._extract_text_from_elements(content_obj.elements)
+        return ""
+
+    def _extract_text_from_elements(self, elements) -> str:
+        """Convert Feishu TextElement SDK objects to formatted text."""
+        if not elements:
+            return ""
+        parts = []
+        for element in elements:
+            # TextRun
+            text_run = element.text_run
+            if text_run:
+                content = text_run.content or ""
+                style = text_run.text_element_style
+                content = self._apply_text_style(content, style)
+                parts.append(content)
+                continue
+
+            # MentionUser
+            mention_user = element.mention_user
+            if mention_user:
+                user_id = _getattr_safe(mention_user, "user_id", "user")
+                parts.append(f"@{user_id}")
+                continue
+
+            # MentionDoc
+            mention_doc = element.mention_doc
+            if mention_doc:
+                title = _getattr_safe(mention_doc, "title", "document")
+                url = _getattr_safe(mention_doc, "url", "")
+                parts.append(f"[{title}]({url})" if url else str(title))
+                continue
+
+            # Equation
+            equation = element.equation
+            if equation:
+                parts.append(f"${_getattr_safe(equation, 'content', '')}$")
+                continue
+
+        return "".join(parts)
+
+    @staticmethod
+    def _apply_text_style(text: str, style) -> str:
+        """Apply markdown formatting based on TextElementStyle SDK object."""
+        if not text or not style:
+            return text
+        # inline_code (SDK uses 'inline_code', not 'code_inline')
+        if getattr(style, "inline_code", False):
+            return f"`{text}`"
+        # link
+        link = getattr(style, "link", None)
+        if link:
+            url = _getattr_safe(link, "url", "")
+            if url:
+                text = f"[{text}]({url})"
+        if getattr(style, "bold", False):
+            text = f"**{text}**"
+        if getattr(style, "italic", False):
+            text = f"*{text}*"
+        if getattr(style, "strikethrough", False):
+            text = f"~~{text}~~"
+        return text
+
+    def _table_block_to_markdown(self, block, block_map: Dict, **_) -> Optional[str]:
+        """Convert table block to markdown table."""
+        table = block.table
+        children = block.children
+        if not table or not children:
+            return None
+
+        prop = table.property
+        if not prop:
+            return None
+        row_size = prop.row_size or 0
+        col_size = prop.column_size or 0
+        if not row_size or not col_size:
+            return None
+
+        rows = []
+        for row_idx in range(row_size):
+            row = []
+            for col_idx in range(col_size):
+                cell_idx = row_idx * col_size + col_idx
+                if cell_idx < len(children):
+                    cell_block_id = children[cell_idx]
+                    cell_block = block_map.get(cell_block_id)
+                    cell_text = self._extract_cell_text(cell_block, block_map)
+                    row.append(cell_text)
+                else:
+                    row.append("")
+            rows.append(row)
+
+        from openviking.parse.base import format_table_to_markdown
+
+        return format_table_to_markdown(rows, has_header=True) if rows else None
+
+    def _extract_cell_text(self, cell_block, block_map: Dict) -> str:
+        """Extract text from a table cell block by reading its children."""
+        if not cell_block or not cell_block.children:
+            return ""
+        texts = []
+        for child_id in cell_block.children:
+            child = block_map.get(child_id)
+            if not child:
+                continue
+            # Use attribute-driven detection to find text in any block type
+            attr = self._detect_block_attr(child)
+            if attr:
+                text = self._extract_block_text(child, attr)
+                if text:
+                    texts.append(text)
+        return " ".join(texts)
diff --git a/openviking/parse/accessors/git.py b/openviking/parse/accessors/git.py
new file mode 100644
index 000000000..1590bca0d
--- /dev/null
+++ b/openviking/parse/accessors/git.py
@@ -0,0 +1,581 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+Git Repository Accessor.
+
+Fetches git repositories and zip archives of codebases to local directories.
+This is the DataAccessor layer extracted from CodeRepositoryParser.
+"""
+
+import asyncio
+import os
+import shutil
+import stat
+import tempfile
+import urllib.request
+import zipfile
+from pathlib import Path, PurePosixPath
+from typing import Any, Optional, Tuple, Union
+from urllib.parse import unquote, urlparse
+
+from openviking.utils import is_github_url, is_gitlab_url, parse_code_hosting_url
+from openviking.utils.code_hosting_utils import (
+    is_code_hosting_url,
+    is_git_repo_url,
+    validate_git_ssh_uri,
+)
+from openviking_cli.utils.config import get_openviking_config
+from openviking_cli.utils.logger import get_logger
+
+from .base import DataAccessor, LocalResource, SourceType
+
+logger = get_logger(__name__)
+
+
+class GitAccessor(DataAccessor):
+    """
+    Accessor for Git repositories and code archives.
+
+    Supports:
+    - Git SSH URLs: git@github.com:org/repo.git
+    - Git HTTP/HTTPS URLs: https://github.com/org/repo.git
+    - GitHub/GitLab repository pages: https://github.com/org/repo
+    - Local ZIP files (containing code repositories)
+    """
+
+    PRIORITY = 80  # Higher than generic HTTP, lower than specific services
+
+    @property
+    def priority(self) -> int:
+        return self.PRIORITY
+
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        """
+        Check if this accessor can handle the source.
+
+        Handles:
+        - git@ SSH URLs
+        - git://, ssh:// URLs
+        - GitHub/GitLab repository URLs (http/https)
+        - Local paths ending with .git or .zip
+        """
+        source_str = str(source)
+
+        # Git protocol URLs
+        if source_str.startswith(("git@", "git://", "ssh://")):
+            try:
+                if source_str.startswith("git@"):
+                    validate_git_ssh_uri(source_str)
+                return is_code_hosting_url(source_str)
+            except ValueError:
+                return False
+
+        # HTTP/HTTPS URLs to code hosting repos
+        if source_str.startswith(("http://", "https://")):
+            return is_git_repo_url(source_str)
+
+        # Local .git or .zip files
+        if isinstance(source, Path):
+            path = source
+        else:
+            path = Path(source_str)
+
+        suffix = path.suffix.lower()
+        return suffix in (".git", ".zip")
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        Fetch the git repository or code archive to a local directory.
+
+        Args:
+            source: Repository URL (git/http) or local zip path
+            **kwargs: Additional arguments (branch, commit, etc.)
+
+        Returns:
+            LocalResource pointing to the local directory
+        """
+        source_str = str(source)
+        temp_local_dir = None
+        branch = kwargs.get("branch") or kwargs.get("ref")
+        commit = kwargs.get("commit")
+
+        try:
+            # Create local temp directory
+            temp_local_dir = tempfile.mkdtemp(prefix="ov_git_")
+            logger.info(f"[GitAccessor] Created local temp dir for git: {temp_local_dir}")
+
+            # Fetch content (Clone or Extract)
+            repo_name = "repository"
+            local_dir = Path(temp_local_dir)
+
+            if source_str.startswith("git@"):
+                # git@ SSH URL
+                repo_name = await self._git_clone(
+                    source_str,
+                    temp_local_dir,
+                    branch=branch,
+                    commit=commit,
+                )
+            elif source_str.startswith(("http://", "https://", "git://", "ssh://")):
+                repo_url, branch, commit = self._parse_repo_source(source_str, **kwargs)
+                if self._is_github_url(repo_url):
+                    # Use GitHub ZIP API
+                    local_dir, repo_name = await self._github_zip_download(
+                        repo_url, branch or commit, temp_local_dir
+                    )
+                elif self._is_gitlab_url(repo_url):
+                    # Use GitLab ZIP API
+                    local_dir, repo_name = await self._gitlab_zip_download(
+                        repo_url, branch or commit, temp_local_dir
+                    )
+                else:
+                    # Non-GitHub/GitLab URL: use git clone
+                    repo_name = await self._git_clone(
+                        repo_url,
+                        temp_local_dir,
+                        branch=branch,
+                        commit=commit,
+                    )
+            elif str(source).endswith(".zip"):
+                repo_name = await self._extract_zip(source_str, temp_local_dir)
+            else:
+                raise ValueError(f"Unsupported source for GitAccessor: {source}")
+
+            # Build metadata
+            meta = {"repo_name": repo_name}
+            if branch:
+                meta["repo_ref"] = branch
+            if commit:
+                meta["repo_commit"] = commit
+
+            return LocalResource(
+                path=local_dir,
+                source_type=SourceType.GIT,
+                original_source=source_str,
+                meta=meta,
+                is_temporary=True,
+            )
+
+        except Exception as e:
+            logger.error(
+                f"[GitAccessor] Failed to access git repository {source}: {e}", exc_info=True
+            )
+            # Clean up on error
+            if temp_local_dir and os.path.exists(temp_local_dir):
+                try:
+                    shutil.rmtree(temp_local_dir, ignore_errors=True)
+                except Exception:
+                    pass
+            raise
+
+    def _parse_repo_source(self, source: str, **kwargs) -> Tuple[str, Optional[str], Optional[str]]:
+        """Parse repository source URL to extract branch/commit info."""
+        branch = kwargs.get("branch") or kwargs.get("ref")
+        commit = kwargs.get("commit")
+        repo_url = source
+        if source.startswith(("http://", "https://", "git://", "ssh://")):
+            parsed = urlparse(source)
+            repo_url = parsed._replace(query="", fragment="").geturl()
+            if commit is None or branch is None:
+                branch, commit = self._extract_ref_from_url(parsed, branch, commit)
+        repo_url = self._normalize_repo_url(repo_url)
+        return repo_url, branch, commit
+
+    def _extract_ref_from_url(
+        self,
+        parsed: Any,
+        branch: Optional[str],
+        commit: Optional[str],
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract branch/commit from URL path."""
+        if parsed.path:
+            path_branch, path_commit = self._parse_ref_from_path(parsed.path)
+            commit = path_commit or commit
+            # If commit is present in path, ignore branch entirely
+            if commit:
+                branch = None
+            else:
+                branch = branch or path_branch
+        return branch, commit
+
+    def _parse_ref_from_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
+        """Parse ref from URL path components."""
+        parts = [p for p in path.split("/") if p]
+        branch = None
+        commit = None
+        if "commit" in parts:
+            idx = parts.index("commit")
+            if idx + 1 < len(parts):
+                commit = parts[idx + 1]
+        if "tree" in parts:
+            idx = parts.index("tree")
+            if idx + 1 < len(parts):
+                ref = unquote(parts[idx + 1])
+                if self._looks_like_sha(ref):
+                    commit = ref
+                else:
+                    branch = ref
+        return branch, commit
+
+    @staticmethod
+    def _looks_like_sha(ref: str) -> bool:
+        """Return True if ref looks like a git commit SHA (7-40 hex chars)."""
+        return 7 <= len(ref) <= 40 and all(c in "0123456789abcdefABCDEF" for c in ref)
+
+    def _normalize_repo_url(self, url: str) -> str:
+        """Normalize repository URL to base form."""
+        if url.startswith(("http://", "https://", "git://", "ssh://")):
+            parsed = urlparse(url)
+            path_parts = [p for p in parsed.path.split("/") if p]
+            base_parts = path_parts
+            git_index = next((i for i, p in enumerate(path_parts) if p.endswith(".git")), None)
+            if git_index is not None:
+                base_parts = path_parts[: git_index + 1]
+
+            config = get_openviking_config()
+            if (
+                parsed.netloc in config.code.github_domains + config.code.gitlab_domains
+                and len(path_parts) >= 2
+            ):
+                base_parts = path_parts[:2]
+            base_path = "/" + "/".join(base_parts)
+            return parsed._replace(path=base_path, query="", fragment="").geturl()
+        return url
+
+    def _get_repo_name(self, url: str) -> str:
+        """Get repository name with organization for GitHub/GitLab URLs."""
+        # First try to parse as code hosting URL
+        parsed_org_repo = parse_code_hosting_url(url)
+        if parsed_org_repo:
+            return parsed_org_repo
+
+        # Fallback for other URLs
+        name_source = url
+        if url.startswith(("http://", "https://", "git://", "ssh://")):
+            name_source = urlparse(url).path.rstrip("/")
+        elif ":" in url and not url.startswith("file://"):
+            name_source = url.split(":", 1)[1]
+
+        # Original logic for non-GitHub/GitLab URLs
+        name = name_source.rstrip("/").split("/")[-1]
+        if name.endswith(".git"):
+            name = name[:-4]
+        name = "".join(c if c.isalnum() or c in "-_" else "_" for c in name)
+        return name or "repository"
+
+    async def _run_git(self, args: list[str], cwd: Optional[str] = None) -> str:
+        """Run a git command."""
+        proc = await asyncio.create_subprocess_exec(
+            *args,
+            cwd=cwd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await proc.communicate()
+        if proc.returncode != 0:
+            error_msg = stderr.decode().strip()
+            user_msg = "Git command failed."
+            if "Could not resolve hostname" in error_msg:
+                user_msg = (
+                    "Git command failed: could not resolve hostname. Check the URL or your network."
+                )
+            elif "Permission denied" in error_msg or "publickey" in error_msg:
+                user_msg = (
+                    "Git command failed: authentication error. Check your SSH keys or credentials."
+                )
+            raise RuntimeError(
+                f"{user_msg} Command: git {' '.join(args[1:])}. Details: {error_msg}"
+            )
+        return stdout.decode().strip()
+
+    async def _has_commit(self, repo_dir: str, commit: str) -> bool:
+        """Check if a commit exists in the repository."""
+        try:
+            await self._run_git(["git", "-C", repo_dir, "rev-parse", "--verify", commit])
+            return True
+        except RuntimeError:
+            return False
+
+    @staticmethod
+    def _is_github_url(url: str) -> bool:
+        """Return True for github.com URLs (supports ZIP archive API)."""
+        return is_github_url(url)
+
+    @staticmethod
+    def _is_gitlab_url(url: str) -> bool:
+        """Return True for gitlab.com URLs (supports ZIP archive API)."""
+        return is_gitlab_url(url)
+
+    async def _git_clone(
+        self,
+        url: str,
+        target_dir: str,
+        branch: Optional[str] = None,
+        commit: Optional[str] = None,
+    ) -> str:
+        """Clone a git repository into target_dir; return the repo name."""
+        name = self._get_repo_name(url)
+        logger.info(f"[GitAccessor] Cloning {url} to {target_dir}...")
+
+        clone_args = [
+            "git",
+            "clone",
+            "--depth",
+            "1",
+            "--recursive",
+        ]
+        if branch and not commit:
+            clone_args.extend(["--branch", branch])
+        clone_args.extend([url, target_dir])
+        await self._run_git(clone_args)
+
+        if commit:
+            try:
+                await self._run_git(["git", "-C", target_dir, "fetch", "origin", commit])
+            except RuntimeError:
+                try:
+                    await self._run_git(
+                        ["git", "-C", target_dir, "fetch", "--all", "--tags", "--prune"]
+                    )
+                except RuntimeError:
+                    pass
+                ok = await self._has_commit(target_dir, commit)
+                if not ok:
+                    try:
+                        await self._run_git(
+                            ["git", "-C", target_dir, "fetch", "--unshallow", "origin"]
+                        )
+                    except RuntimeError:
+                        pass
+                ok = await self._has_commit(target_dir, commit)
+                if not ok:
+                    await self._run_git(
+                        [
+                            "git",
+                            "-C",
+                            target_dir,
+                            "fetch",
+                            "origin",
+                            "+refs/heads/*:refs/remotes/origin/*",
+                        ]
+                    )
+                    ok = await self._has_commit(target_dir, commit)
+                    if not ok:
+                        raise RuntimeError(f"Failed to fetch commit {commit} from {url}")
+            await self._run_git(["git", "-C", target_dir, "checkout", commit])
+
+        return name
+
+    async def _github_zip_download(
+        self,
+        repo_url: str,
+        branch: Optional[str],
+        target_dir: str,
+    ) -> Tuple[Path, str]:
+        """Download a GitHub repo as a ZIP archive and extract it."""
+        repo_name = self._get_repo_name(repo_url)
+
+        # Build archive URL from owner/repo path components.
+        parsed = urlparse(repo_url)
+        path_parts = [p for p in parsed.path.split("/") if p]
+        owner = path_parts[0]
+        repo_raw = path_parts[1]
+        # Strip .git suffix for the archive URL
+        repo_slug = repo_raw[:-4] if repo_raw.endswith(".git") else repo_raw
+
+        if branch:
+            zip_url = f"https://github.com/{owner}/{repo_slug}/archive/{branch}.zip"
+        else:
+            zip_url = f"https://github.com/{owner}/{repo_slug}/archive/HEAD.zip"
+
+        logger.info(f"[GitAccessor] Downloading GitHub ZIP: {zip_url}")
+
+        zip_path = os.path.join(target_dir, "_archive.zip")
+        extract_dir = os.path.join(target_dir, "_extracted")
+        os.makedirs(extract_dir, exist_ok=True)
+
+        # Download (blocking HTTP; run in thread pool)
+        def _download() -> None:
+            headers = {"User-Agent": "OpenViking"}
+            github_token = os.environ.get("GITHUB_TOKEN")
+            if github_token:
+                headers["Authorization"] = f"token {github_token}"
+
+            req = urllib.request.Request(zip_url, headers=headers)
+            with urllib.request.urlopen(req, timeout=1800) as resp, open(zip_path, "wb") as f:
+                shutil.copyfileobj(resp, f)
+
+        try:
+            await asyncio.to_thread(_download)
+        except Exception as exc:
+            raise RuntimeError(f"Failed to download GitHub ZIP {zip_url}: {exc}")
+
+        # Safe extraction with Zip Slip validation
+        target = Path(extract_dir).resolve()
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            for info in zf.infolist():
+                mode = info.external_attr >> 16
+                if info.is_dir() or stat.S_ISDIR(mode):
+                    continue
+                if stat.S_ISLNK(mode):
+                    logger.warning(
+                        f"[GitAccessor] Skipping symlink entry in GitHub ZIP: {info.filename}"
+                    )
+                    continue
+                raw = info.filename.replace("\\", "/")
+                raw_parts = [p for p in raw.split("/") if p]
+                if ".." in raw_parts:
+                    raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
+                if PurePosixPath(raw).is_absolute():
+                    raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
+                extracted = Path(zf.extract(info, extract_dir)).resolve()
+                if not extracted.is_relative_to(target):
+                    extracted.unlink(missing_ok=True)
+                    raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
+
+        # Remove downloaded archive to free disk space.
+        try:
+            os.unlink(zip_path)
+        except OSError:
+            pass
+
+        # GitHub ZIPs have a single top-level directory
+        top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()]
+        content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir)
+
+        logger.info(f"[GitAccessor] GitHub ZIP extracted to {content_dir} ({repo_name})")
+        return content_dir, repo_name
+
+    async def _gitlab_zip_download(
+        self,
+        repo_url: str,
+        branch: Optional[str],
+        target_dir: str,
+    ) -> Tuple[Path, str]:
+        """Download a GitLab repo as a ZIP archive and extract it."""
+        repo_name = self._get_repo_name(repo_url)
+
+        # Build archive URL from owner/repo path components.
+        # GitLab archive URL format: https://gitlab.com/{owner}/{repo}/-/archive/{ref}/{repo}-{ref}.zip
+        parsed = urlparse(repo_url)
+        path_parts = [p for p in parsed.path.split("/") if p]
+        owner = path_parts[0]
+        repo_raw = path_parts[1]
+        # Strip .git suffix for the archive URL
+        repo_slug = repo_raw[:-4] if repo_raw.endswith(".git") else repo_raw
+
+        ref = branch or "HEAD"
+        # GitLab uses the format: /{owner}/{repo}/-/archive/{ref}/{repo}-{ref}.zip
+        zip_url = f"{parsed.scheme}://{parsed.netloc}/{owner}/{repo_slug}/-/archive/{ref}/{repo_slug}-{ref}.zip"
+
+        logger.info(f"[GitAccessor] Downloading GitLab ZIP: {zip_url}")
+
+        zip_path = os.path.join(target_dir, "_archive.zip")
+        extract_dir = os.path.join(target_dir, "_extracted")
+        os.makedirs(extract_dir, exist_ok=True)
+
+        # Download (blocking HTTP; run in thread pool)
+        def _download() -> None:
+            headers = {"User-Agent": "OpenViking"}
+
+            req = urllib.request.Request(zip_url, headers=headers)
+            with urllib.request.urlopen(req, timeout=1800) as resp, open(zip_path, "wb") as f:
+                shutil.copyfileobj(resp, f)
+
+        try:
+            await asyncio.to_thread(_download)
+        except Exception as exc:
+            raise RuntimeError(f"Failed to download GitLab ZIP {zip_url}: {exc}")
+
+        # Safe extraction with Zip Slip validation
+        target = Path(extract_dir).resolve()
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            for info in zf.infolist():
+                mode = info.external_attr >> 16
+                if info.is_dir() or stat.S_ISDIR(mode):
+                    continue
+                if stat.S_ISLNK(mode):
+                    logger.warning(
+                        f"[GitAccessor] Skipping symlink entry in GitLab ZIP: {info.filename}"
+                    )
+                    continue
+                raw = info.filename.replace("\\", "/")
+                raw_parts = [p for p in raw.split("/") if p]
+                if ".." in raw_parts:
+                    raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
+                if PurePosixPath(raw).is_absolute():
+                    raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
+                extracted = Path(zf.extract(info, extract_dir)).resolve()
+                if not extracted.is_relative_to(target):
+                    extracted.unlink(missing_ok=True)
+                    raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
+
+        # Remove downloaded archive to free disk space.
+        try:
+            os.unlink(zip_path)
+        except OSError:
+            pass
+
+        # GitLab ZIPs have a single top-level directory: {repo}-{ref}/
+        top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()]
+        content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir)
+
+        logger.info(f"[GitAccessor] GitLab ZIP extracted to {content_dir} ({repo_name})")
+        return content_dir, repo_name
+
+    async def _extract_zip(self, zip_path: str, target_dir: str) -> str:
+        """Extract a local zip file into target_dir."""
+        if zip_path.startswith(("http://", "https://")):
+            raise NotImplementedError("Zip URL download not yet implemented in GitAccessor")
+
+        path = Path(zip_path)
+        name = path.stem
+
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            target = Path(target_dir).resolve()
+            for info in zip_ref.infolist():
+                mode = info.external_attr >> 16
+                # Skip directory entries
+                if info.is_dir() or stat.S_ISDIR(mode):
+                    continue
+                # Skip symlink entries
+                if stat.S_ISLNK(mode):
+                    logger.warning(f"[GitAccessor] Skipping symlink entry in zip: {info.filename}")
+                    continue
+                # Reject entries with suspicious raw path components
+                raw = info.filename.replace("\\", "/")
+                raw_parts = [p for p in raw.split("/") if p]
+                if ".." in raw_parts:
+                    raise ValueError(f"Zip Slip detected: entry {info.filename!r} contains '..'")
+                if PurePosixPath(raw).is_absolute() or (len(raw) >= 2 and raw[1] == ":"):
+                    raise ValueError(
+                        f"Zip Slip detected: entry {info.filename!r} is an absolute path"
+                    )
+                # Normalize and verify
+                arcname = info.filename.replace("/", os.sep)
+                if os.path.altsep:
+                    arcname = arcname.replace(os.path.altsep, os.sep)
+                arcname = os.path.splitdrive(arcname)[1]
+                arcname = os.sep.join(p for p in arcname.split(os.sep) if p not in ("", ".", ".."))
+                if not arcname:
+                    continue
+                member_path = (Path(target_dir) / arcname).resolve()
+                if not member_path.is_relative_to(target):
+                    raise ValueError(
+                        f"Zip Slip detected: entry {info.filename!r} escapes target directory"
+                    )
+                # Extract and verify
+                extracted = Path(zip_ref.extract(info, target_dir)).resolve()
+                if not extracted.is_relative_to(target):
+                    # Best-effort cleanup
+                    try:
+                        extracted.unlink(missing_ok=True)
+                    except OSError as cleanup_err:
+                        logger.warning(
+                            f"[GitAccessor] Failed to clean up escaped file {extracted}: {cleanup_err}"
+                        )
+                    raise ValueError(
+                        f"Zip Slip detected: entry {info.filename!r} escapes target directory"
+                    )
+
+        return name
diff --git a/openviking/parse/accessors/http.py b/openviking/parse/accessors/http.py
new file mode 100644
index 000000000..1d228889a
--- /dev/null
+++ b/openviking/parse/accessors/http.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+HTTP URL Accessor.
+
+Fetches HTTP/HTTPS URLs and makes them available as local files.
+This is the DataAccessor layer extracted from HTMLParser.
+"""
+
+import tempfile
+from pathlib import Path
+from typing import Optional, Union
+from urllib.parse import unquote, urlparse
+
+from openviking.parse.base import lazy_import
+from openviking.utils.network_guard import build_httpx_request_validation_hooks
+from openviking_cli.utils.logger import get_logger
+
+from .base import DataAccessor, LocalResource, SourceType
+
+logger = get_logger(__name__)
+
+
+class HTTPAccessor(DataAccessor):
+    """
+    Accessor for HTTP/HTTPS URLs.
+
+    Features:
+    - Downloads web pages to local HTML files
+    - Downloads files (PDF, Markdown, etc.) to local files
+    - Supports GitHub/GitLab blob to raw URL conversion
+    - Follows redirects
+    - Network guard integration
+    - Detailed error classification (network, timeout, auth, etc.)
+    """
+
+    PRIORITY = 50  # Lower than GitAccessor, higher than fallback
+
+    DEFAULT_USER_AGENT = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    )
+
+    def __init__(
+        self,
+        timeout: float = 30.0,
+        user_agent: Optional[str] = None,
+    ):
+        """Initialize HTTP accessor."""
+        self.timeout = timeout
+        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
+
+    @property
+    def priority(self) -> int:
+        return self.PRIORITY
+
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        """
+        Check if this accessor can handle the source.
+
+        Handles any HTTP/HTTPS URL.
+        NOTE: GitAccessor and FeishuAccessor have higher priority
+        and will be checked first for their specific URL types.
+        """
+        source_str = str(source)
+
+        # Only handle http/https URLs
+        return source_str.startswith(("http://", "https://"))
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        Fetch the HTTP URL to a local file.
+
+        Args:
+            source: HTTP/HTTPS URL
+            **kwargs: Additional arguments (request_validator, etc.)
+
+        Returns:
+            LocalResource pointing to the downloaded file
+        """
+        source_str = str(source)
+        request_validator = kwargs.get("request_validator")
+
+        # Download the URL
+        temp_path = await self._download_url(
+            source_str,
+            request_validator=request_validator,
+        )
+
+        # Build metadata
+        meta = {
+            "url": source_str,
+            "downloaded": True,
+        }
+
+        return LocalResource(
+            path=Path(temp_path),
+            source_type=SourceType.HTTP,
+            original_source=source_str,
+            meta=meta,
+            is_temporary=True,
+        )
+
+    @staticmethod
+    def _extract_filename_from_url(url: str) -> str:
+        """
+        Extract and URL-decode the original filename from a URL.
+
+        Args:
+            url: URL to extract filename from
+
+        Returns:
+            Decoded filename (e.g., "schemas.py" from ".../schemas.py")
+            Falls back to "download" if no filename can be extracted.
+        """
+        parsed = urlparse(url)
+        # URL-decode path to handle encoded characters (e.g., %E7%99%BE -> Chinese chars)
+        decoded_path = unquote(parsed.path)
+        basename = Path(decoded_path).name
+        return basename if basename else "download"
+
+    async def _download_url(
+        self,
+        url: str,
+        request_validator=None,
+    ) -> str:
+        """
+        Download URL content to a temporary file.
+
+        Args:
+            url: URL to download
+            request_validator: Optional network request validator
+
+        Returns:
+            Path to the temporary file
+        """
+        httpx = lazy_import("httpx")
+
+        # Convert GitHub/GitLab blob URLs to raw
+        url = self._convert_to_raw_url(url)
+
+        # Determine file extension from URL (decode first to handle encoded paths)
+        parsed = urlparse(url)
+        decoded_path = unquote(parsed.path)
+        ext = Path(decoded_path).suffix or ".html"
+
+        # Create temp file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
+        temp_path = temp_file.name
+        temp_file.close()
+
+        try:
+            # Download content
+            client_kwargs = {
+                "timeout": self.timeout,
+                "follow_redirects": True,
+            }
+            event_hooks = build_httpx_request_validation_hooks(request_validator)
+            if event_hooks:
+                client_kwargs["event_hooks"] = event_hooks
+                client_kwargs["trust_env"] = False
+
+            async with httpx.AsyncClient(**client_kwargs) as client:
+                headers = {"User-Agent": self.user_agent}
+                try:
+                    response = await client.get(url, headers=headers)
+                    response.raise_for_status()
+                except httpx.ConnectError as e:
+                    user_msg = "HTTP request failed: could not connect to server. Check the URL or your network."
+                    raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e
+                except httpx.TimeoutException as e:
+                    user_msg = "HTTP request failed: timeout. The server took too long to respond."
+                    raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e
+                except httpx.HTTPStatusError as e:
+                    status_code = e.response.status_code if e.response else "unknown"
+                    if status_code == 401 or status_code == 403:
+                        user_msg = f"HTTP request failed: authentication error ({status_code}). Check your credentials or permissions."
+                    elif status_code == 404:
+                        user_msg = f"HTTP request failed: not found ({status_code}). The URL may be invalid or the resource was removed."
+                    elif 500 <= status_code < 600:
+                        user_msg = f"HTTP request failed: server error ({status_code}). The server encountered an error."
+                    else:
+                        user_msg = f"HTTP request failed: status code {status_code}."
+                    raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e
+                except Exception as e:
+                    user_msg = "HTTP request failed: unexpected error."
+                    raise RuntimeError(f"{user_msg} URL: {url}. Details: {e}") from e
+
+                # Write to temp file
+                Path(temp_path).write_bytes(response.content)
+
+            return temp_path
+        except Exception:
+            # Clean up on error
+            try:
+                p = Path(temp_path)
+                if p.exists():
+                    p.unlink(missing_ok=True)
+            except Exception:
+                pass
+            raise
+
+    def _convert_to_raw_url(self, url: str) -> str:
+        """Convert GitHub/GitLab blob URL to raw URL."""
+        parsed = urlparse(url)
+        try:
+            from openviking_cli.utils.config import get_openviking_config
+
+            config = get_openviking_config()
+            github_domains = config.html.github_domains
+            gitlab_domains = config.html.gitlab_domains
+            github_raw_domain = config.code.github_raw_domain
+
+            if parsed.netloc in github_domains:
+                path_parts = parsed.path.strip("/").split("/")
+                if len(path_parts) >= 4 and path_parts[2] == "blob":
+                    # Remove 'blob'
+                    new_path = "/".join(path_parts[:2] + path_parts[3:])
+                    return f"https://{github_raw_domain}/{new_path}"
+
+            if parsed.netloc in gitlab_domains and "/blob/" in parsed.path:
+                return url.replace("/blob/", "/raw/")
+
+        except Exception:
+            pass
+
+        return url
diff --git a/openviking/parse/accessors/registry.py b/openviking/parse/accessors/registry.py
new file mode 100644
index 000000000..d2bfb8b63
--- /dev/null
+++ b/openviking/parse/accessors/registry.py
@@ -0,0 +1,204 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+Registry for Data Accessors.
+
+Manages DataAccessor registration and provides automatic source routing.
+"""
+
+import logging
+from pathlib import Path
+from typing import List, Optional, Union
+
+from .base import DataAccessor, LocalResource, SourceType
+
+logger = logging.getLogger(__name__)
+
+
+class AccessorRegistry:
+    """
+    Registry for data accessors.
+
+    Provides automatic accessor selection based on source type and priority.
+    """
+
+    def __init__(self, register_default: bool = True):
+        """
+        Initialize the accessor registry.
+
+        Args:
+            register_default: Whether to register default accessors
+        """
+        self._accessors: List[DataAccessor] = []
+
+        if register_default:
+            self._register_defaults()
+
+    def _register_defaults(self) -> None:
+        """Register default accessors."""
+        # GitAccessor - handles git repositories
+        try:
+            from .git import GitAccessor
+
+            self.register(GitAccessor())
+        except Exception as e:
+            logger.debug(f"[AccessorRegistry] Failed to register GitAccessor: {e}")
+
+        # HTTPAccessor - handles HTTP/HTTPS URLs
+        try:
+            from .http import HTTPAccessor
+
+            self.register(HTTPAccessor())
+        except Exception as e:
+            logger.debug(f"[AccessorRegistry] Failed to register HTTPAccessor: {e}")
+
+        # FeishuAccessor - handles Feishu/Lark documents
+        try:
+            from .feishu import FeishuAccessor
+
+            self.register(FeishuAccessor())
+        except Exception as e:
+            logger.debug(f"[AccessorRegistry] Failed to register FeishuAccessor: {e}")
+
+    def register(self, accessor: DataAccessor) -> None:
+        """
+        Register an accessor.
+
+        Accessors are stored in descending order of priority.
+        When multiple accessors can handle the same source,
+        the one with the highest priority is selected.
+
+        Args:
+            accessor: DataAccessor instance to register
+        """
+        # Insert in priority order (descending)
+        insert_idx = len(self._accessors)
+        for i, existing in enumerate(self._accessors):
+            if accessor.priority > existing.priority:
+                insert_idx = i
+                break
+
+        self._accessors.insert(insert_idx, accessor)
+        logger.debug(
+            f"[AccessorRegistry] Registered accessor {accessor.__class__.__name__} with priority {accessor.priority}"
+        )
+
+    def unregister(self, accessor_name: str) -> bool:
+        """
+        Unregister an accessor by class name.
+
+        Args:
+            accessor_name: Name of the accessor class to unregister
+
+        Returns:
+            True if an accessor was unregistered, False otherwise
+        """
+        for i, accessor in enumerate(self._accessors):
+            if accessor.__class__.__name__ == accessor_name:
+                del self._accessors[i]
+                logger.debug(f"[AccessorRegistry] Unregistered accessor {accessor_name}")
+                return True
+        return False
+
+    def get_accessor(self, source: Union[str, Path]) -> Optional[DataAccessor]:
+        """
+        Get the highest priority accessor that can handle the source.
+
+        Args:
+            source: Source string or Path to check
+
+        Returns:
+            DataAccessor instance or None if no accessor can handle the source
+        """
+        for accessor in self._accessors:
+            if accessor.can_handle(source):
+                return accessor
+        return None
+
+    def list_accessors(self, source: Optional[Union[str, Path]] = None) -> List[DataAccessor]:
+        """
+        List registered accessors.
+
+        Args:
+            source: Optional source to filter by (only returns accessors that can handle it)
+
+        Returns:
+            List of DataAccessor instances
+        """
+        if source is None:
+            return list(self._accessors)
+        return [a for a in self._accessors if a.can_handle(source)]
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        Access a source by routing to the appropriate accessor.
+
+        If no accessor can handle the source, it is treated as a local file.
+
+        Args:
+            source: Source string (URL, path, etc.) or Path object
+            **kwargs: Additional arguments passed to the accessor
+
+        Returns:
+            LocalResource pointing to the locally available data
+        """
+        source_str = str(source)
+
+        # Try to find an accessor
+        accessor = self.get_accessor(source)
+        if accessor:
+            logger.debug(
+                f"[AccessorRegistry] Using accessor {accessor.__class__.__name__} for source: {source_str}"
+            )
+            return await accessor.access(source, **kwargs)
+
+        # Default: treat as local file
+        logger.debug(f"[AccessorRegistry] No accessor found, treating as local file: {source_str}")
+        path = Path(source)
+        return LocalResource(
+            path=path,
+            source_type=SourceType.LOCAL,
+            original_source=source_str,
+            meta={},
+            is_temporary=False,
+        )
+
+    def clear(self) -> None:
+        """Remove all registered accessors."""
+        self._accessors.clear()
+        logger.debug("[AccessorRegistry] Cleared all accessors from registry")
+
+
+# Global registry instance
+_default_registry: Optional[AccessorRegistry] = None
+
+
+def get_accessor_registry(register_default: bool = True) -> AccessorRegistry:
+    """
+    Get the default accessor registry.
+
+    Args:
+        register_default: Whether to register default accessors if creating
+            a new registry instance (only used on first call)
+
+    Returns:
+        The global AccessorRegistry instance
+    """
+    global _default_registry
+    if _default_registry is None:
+        _default_registry = AccessorRegistry(register_default=register_default)
+    return _default_registry
+
+
+async def access(source: Union[str, Path], **kwargs) -> LocalResource:
+    """
+    Access a source using the default registry.
+
+    Args:
+        source: Source string or Path
+        **kwargs: Additional arguments
+
+    Returns:
+        LocalResource
+    """
+    return await get_accessor_registry().access(source, **kwargs)
diff --git a/openviking/parse/parsers/__init__.py b/openviking/parse/parsers/__init__.py
index 6ce39e4b3..5f17740d0 100644
--- a/openviking/parse/parsers/__init__.py
+++ b/openviking/parse/parsers/__init__.py
@@ -2,10 +2,9 @@
 # SPDX-License-Identifier: AGPL-3.0
 
 from .base_parser import BaseParser
-from .code import CodeRepositoryParser
 from .epub import EPubParser
 from .excel import ExcelParser
-from .html import HTMLParser, URLType, URLTypeDetector
+from .html import HTMLParser
 from .markdown import MarkdownParser
 from .pdf import PDFParser
 from .powerpoint import PowerPointParser
@@ -15,12 +14,9 @@
 
 __all__ = [
     "BaseParser",
-    "CodeRepositoryParser",
     "EPubParser",
     "ExcelParser",
     "HTMLParser",
-    "URLType",
-    "URLTypeDetector",
     "MarkdownParser",
     "PDFParser",
     "PowerPointParser",
diff --git a/openviking/parse/parsers/code/code.py b/openviking/parse/parsers/code/code.py
index a9c1c7da0..9c829583e 100644
--- a/openviking/parse/parsers/code/code.py
+++ b/openviking/parse/parsers/code/code.py
@@ -13,7 +13,6 @@
 import os
 import shutil
 import stat
-import tempfile
 import time
 import urllib.request
 import zipfile
@@ -54,6 +53,30 @@ class CodeRepositoryParser(BaseParser):
     - Automatic filtering of non-code directories (.git, node_modules, etc.)
     - Direct mapping to VikingFS temp directory
     - Preserves directory structure without chunking
+
+    代码仓库入库处理流程
+
+    输入: https://github.com/markwhen/gogetxueqiu
+        ↓
+    [GitAccessor] → LocalResource
+                    - path: /tmp/.../extracted/repo
+                    - source_type: SourceType.GIT
+                    - original_source: "https://github.com/markwhen/gogetxueqiu"
+                    - meta: {repo_name: "markwhen/gogetxueqiu", ...}
+        ↓
+    [media_processor] → 所有目录都用 DirectoryParser！← 简化了！
+        ↓
+    [DirectoryParser.parse()]
+        ├─→ 检测到 (path/.git).exists() ← 新增！
+        ├─→ 收集 git 元数据
+        └─→ 委托给 CodeRepositoryParser.parse()
+        ↓
+    [CodeRepositoryParser.parse()]
+        - source_path = original_source (https://github.com/...)
+        ↓
+    [TreeBuilder.finalize_from_temp()]
+        - 从 source_path 解析出 "markwhen/gogetxueqiu"
+        - root_uri = "viking://resources/markwhen/gogetxueqiu"
     """
 
     # Class constants imported from constants.py
@@ -87,57 +110,48 @@ def _detect_file_type(self, file_path: Path) -> str:
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
-        Parse code repository.
+        Parse code repository (only accepts local directories, as network fetching is done in Accessor layer).
 
         Args:
-            source: Repository URL (git/http) or local zip path
+            source: Local directory path (already fetched by DataAccessor)
             instruction: Processing instruction (unused in parser phase)
             **kwargs: Additional arguments
+                _source_meta: Metadata from DataAccessor (must contain repo_name, repo_ref, repo_commit)
+                original_source: Original URL (for repo name extraction if needed)
 
         Returns:
             ParseResult with temp_dir_path pointing to the uploaded content
         """
         start_time = time.time()
-        source_str = str(source)
-        temp_local_dir = None
-        branch = None
-        commit = None
+        source_path = Path(source)
+
+        # Check if source is already a local directory (should always be true)
+        if not source_path.is_dir():
+            raise ValueError(
+                f"CodeRepositoryParser only accepts local directories. "
+                f"Source type: {type(source)}, value: {source}"
+            )
+
+        logger.info(f"[CodeRepositoryParser] Parsing code repository: {source_path}")
 
         try:
-            # 1. Prepare local temp directory
-            temp_local_dir = tempfile.mkdtemp(prefix="ov_repo_")
-            logger.info(f"Created local temp dir: {temp_local_dir}")
-
-            # 2. Fetch content (Clone or Extract)
-            repo_name = "repository"
-            local_dir = Path(temp_local_dir)
-            if source_str.startswith("git@"):
-                # git@ SSH URL: use git clone directly (no GitHub ZIP optimization)
-                repo_name = await self._git_clone(
-                    source_str,
-                    temp_local_dir,
-                    branch=branch,
-                    commit=commit,
+            # Get metadata from DataAccessor
+            source_meta = kwargs.get("_source_meta", {})
+            repo_name = source_meta.get("repo_name", "repository")
+            branch = source_meta.get("repo_ref")
+            commit = source_meta.get("repo_commit")
+
+            # If repo_name is still default, try to extract from original source
+            if repo_name == "repository":
+                original_source = source_meta.get("original_source") or kwargs.get(
+                    "original_source"
                 )
-            elif source_str.startswith(("http://", "https://", "git://", "ssh://")):
-                repo_url, branch, commit = self._parse_repo_source(source_str, **kwargs)
-                if self._is_github_url(repo_url):
-                    # Use GitHub ZIP API: supports branch names, tags, and commit SHAs
-                    local_dir, repo_name = await self._github_zip_download(
-                        repo_url, branch or commit, temp_local_dir
-                    )
-                else:
-                    # Non-GitHub URL: use git clone
-                    repo_name = await self._git_clone(
-                        repo_url,
-                        temp_local_dir,
-                        branch=branch,
-                        commit=commit,
-                    )
-            elif str(source).endswith(".zip"):
-                repo_name = await self._extract_zip(source_str, temp_local_dir)
-            else:
-                raise ValueError(f"Unsupported source for CodeRepositoryParser: {source}")
+                if original_source:
+                    parsed_org_repo = parse_code_hosting_url(original_source)
+                    if parsed_org_repo:
+                        repo_name = parsed_org_repo
+
+            local_dir = source_path
 
             # 3. Create VikingFS temp URI
             viking_fs = self._get_viking_fs()
@@ -161,9 +175,11 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
                 meta={"name": repo_name, "type": "repository"},
             )
 
+            # Use original URL as source_path instead of local temp dir for proper org/repo parsing in TreeBuilder
+            original_source = source_meta.get("original_source") or kwargs.get("original_source")
             result = create_parse_result(
                 root=root,
-                source_path=source_str,
+                source_path=original_source or str(source),
                 source_format="repository",
                 parser_name="CodeRepositoryParser",
                 parse_time=time.time() - start_time,
@@ -180,24 +196,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         except Exception as e:
             logger.error(f"Failed to parse repository {source}: {e}", exc_info=True)
+            # Use original URL for error case as well
+            original_source = source_meta.get("original_source") or kwargs.get("original_source")
             return create_parse_result(
                 root=ResourceNode(type=NodeType.ROOT, content_path=None),
-                source_path=source_str,
+                source_path=original_source or str(source),
                 source_format="repository",
                 parser_name="CodeRepositoryParser",
                 parse_time=time.time() - start_time,
                 warnings=[f"Failed to parse repository: {str(e)}"],
             )
 
-        finally:
-            # Cleanup local temp dir
-            if temp_local_dir and os.path.exists(temp_local_dir):
-                try:
-                    shutil.rmtree(temp_local_dir)
-                    logger.debug(f"Cleaned up local temp dir: {temp_local_dir}")
-                except Exception as e:
-                    logger.warning(f"Failed to cleanup local temp dir {temp_local_dir}: {e}")
-
     async def parse_content(
         self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
     ) -> ParseResult:
diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py
index d8d2b1c98..d190d1367 100644
--- a/openviking/parse/parsers/directory.py
+++ b/openviking/parse/parsers/directory.py
@@ -87,6 +87,16 @@ async def parse(
         if not source_path.is_dir():
             raise NotADirectoryError(f"Not a directory: {source_path}")
 
+        # Check if this is a git repository, delegate to CodeRepositoryParser
+        if await self._is_git_repository(source_path):
+            logger.debug(
+                f"Directory {source_path} is a git repository, delegating to CodeRepositoryParser"
+            )
+            from openviking.parse.parsers.code.code import CodeRepositoryParser
+
+            await self._add_git_metadata(source_path, kwargs)
+            return await CodeRepositoryParser().parse(str(source_path), instruction, **kwargs)
+
         dir_name = kwargs.get("source_name") or source_path.name
         warnings: List[str] = []
 
@@ -452,6 +462,59 @@ async def _merge_temp(
         except Exception:
             pass
 
+    @staticmethod
+    async def _is_git_repository(source_path: Path) -> bool:
+        """Check if the directory contains a git repository."""
+        try:
+            return (source_path / ".git").exists() and (source_path / ".git").is_dir()
+        except (OSError, PermissionError):
+            return False
+
+    @staticmethod
+    async def _add_git_metadata(source_path: Path, kwargs: dict) -> None:
+        """Add git metadata (branch, commit, repo_name) to kwargs dictionary."""
+        try:
+            from openviking.parse.accessors.git import GitAccessor
+
+            git_accessor = GitAccessor()
+
+            # Get branch
+            try:
+                branch = await git_accessor._run_git(
+                    ["git", "-C", str(source_path), "rev-parse", "--abbrev-ref", "HEAD"]
+                )
+                kwargs["repo_ref"] = branch
+            except Exception as e:
+                logger.debug(f"Failed to get git branch: {e}")
+
+            # Get commit
+            try:
+                commit = await git_accessor._run_git(
+                    ["git", "-C", str(source_path), "rev-parse", "HEAD"]
+                )
+                kwargs["repo_commit"] = commit
+            except Exception as e:
+                logger.debug(f"Failed to get git commit: {e}")
+
+            # Get repo name
+            try:
+                remote_url = await git_accessor._run_git(
+                    ["git", "-C", str(source_path), "config", "--get", "remote.origin.url"]
+                )
+                if remote_url:
+                    repo_name = git_accessor._get_repo_name(remote_url)
+                    if repo_name and repo_name != "repository":
+                        kwargs["repo_name"] = repo_name
+                else:
+                    kwargs["repo_name"] = source_path.name
+            except Exception as e:
+                logger.debug(f"Failed to get git remote info: {e}")
+                kwargs["repo_name"] = source_path.name
+
+        except Exception as e:
+            logger.debug(f"Failed to get git metadata: {e}")
+            kwargs["repo_name"] = source_path.name
+
     @staticmethod
     async def _recursive_move(
         viking_fs: Any,
diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py
index 97d3ece62..24331113c 100644
--- a/openviking/parse/parsers/html.py
+++ b/openviking/parse/parsers/html.py
@@ -1,227 +1,47 @@
 # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
 # SPDX-License-Identifier: AGPL-3.0
 """
-HTML and URL parser for OpenViking.
+HTML Parser for OpenViking.
 
-Unified parser that handles:
-- Local HTML files
-- Web pages (URL -> fetch -> parse)
-- Download links (URL -> download -> delegate to appropriate parser)
+Parses local HTML files.
 
-Preserves natural document hierarchy and filters out navigation/ads.
+For URL downloading, use HTTPAccessor in the new two-layer architecture.
 """
 
 import hashlib
 import re
-import tempfile
 import time
-from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-from urllib.parse import unquote, urlparse
+from typing import List, Optional, Union
 
 from openviking.parse.base import (
     NodeType,
     ParseResult,
     ResourceNode,
     create_parse_result,
-    lazy_import,
 )
 from openviking.parse.parsers.base_parser import BaseParser
-from openviking.parse.parsers.constants import CODE_EXTENSIONS
-from openviking.utils.network_guard import build_httpx_request_validation_hooks
-from openviking_cli.exceptions import PermissionDeniedError
-from openviking_cli.utils.config import get_openviking_config
 
-
-class URLType(Enum):
-    """URL content types."""
-
-    WEBPAGE = "webpage"  # HTML webpage to parse
-    DOWNLOAD_PDF = "download_pdf"  # PDF file download link
-    DOWNLOAD_MD = "download_md"  # Markdown file download link
-    DOWNLOAD_TXT = "download_txt"  # Text file download link
-    DOWNLOAD_HTML = "download_html"  # HTML file download link
-    CODE_REPOSITORY = "code_repository"  # Code repository (GitHub, GitLab, etc.)
-    UNKNOWN = "unknown"  # Unknown or unsupported type
-
-
-class URLTypeDetector:
-    """
-    Detector for URL content types.
-
-    Uses extension and HTTP HEAD request to determine if a URL is:
-    - A webpage to scrape
-    - A file download link (and what type)
-    """
-
-    # Extension to URL type mapping
-    # CODE_EXTENSIONS spread comes first so explicit entries below override
-    # (e.g., .html/.htm -> DOWNLOAD_HTML instead of DOWNLOAD_TXT)
-    EXTENSION_MAP = {
-        **dict.fromkeys(CODE_EXTENSIONS, URLType.DOWNLOAD_TXT),
-        ".pdf": URLType.DOWNLOAD_PDF,
-        ".md": URLType.DOWNLOAD_MD,
-        ".markdown": URLType.DOWNLOAD_MD,
-        ".txt": URLType.DOWNLOAD_TXT,
-        ".text": URLType.DOWNLOAD_TXT,
-        ".html": URLType.DOWNLOAD_HTML,
-        ".htm": URLType.DOWNLOAD_HTML,
-        ".git": URLType.CODE_REPOSITORY,
-    }
-
-    # Content-Type to URL type mapping
-    CONTENT_TYPE_MAP = {
-        "application/pdf": URLType.DOWNLOAD_PDF,
-        "text/markdown": URLType.DOWNLOAD_MD,
-        "text/plain": URLType.DOWNLOAD_TXT,
-        "text/html": URLType.WEBPAGE,
-        "application/xhtml+xml": URLType.WEBPAGE,
-    }
-
-    async def detect(
-        self,
-        url: str,
-        timeout: float = 10.0,
-        request_validator=None,
-    ) -> Tuple[URLType, Dict[str, Any]]:
-        """
-        Detect URL content type.
-
-        Args:
-            url: URL to detect
-            timeout: HTTP request timeout
-
-        Returns:
-            (URLType, metadata dict)
-        """
-        meta = {"url": url, "detected_by": "unknown"}
-        parsed = urlparse(url)
-        path_lower = parsed.path.lower()
-
-        # 0. Check for code repository URLs first
-        if self._is_code_repository_url(url):
-            meta["detected_by"] = "code_repository_pattern"
-            return URLType.CODE_REPOSITORY, meta
-
-        # 1. Check extension first
-        for ext, url_type in self.EXTENSION_MAP.items():
-            if path_lower.endswith(ext):
-                meta["detected_by"] = "extension"
-                meta["extension"] = ext
-                return url_type, meta
-
-        # 2. Send HEAD request to check Content-Type
-        try:
-            httpx = lazy_import("httpx")
-            client_kwargs = {
-                "timeout": timeout,
-                "follow_redirects": True,
-            }
-            event_hooks = build_httpx_request_validation_hooks(request_validator)
-            if event_hooks:
-                client_kwargs["event_hooks"] = event_hooks
-                client_kwargs["trust_env"] = False
-
-            async with httpx.AsyncClient(**client_kwargs) as client:
-                response = await client.head(url)
-                content_type = response.headers.get("content-type", "").lower()
-
-                # Remove charset info
-                if ";" in content_type:
-                    content_type = content_type.split(";")[0].strip()
-
-                meta["content_type"] = content_type
-                meta["detected_by"] = "content_type"
-                meta["status_code"] = response.status_code
-
-                # Map content type
-                for ct_prefix, url_type in self.CONTENT_TYPE_MAP.items():
-                    if content_type.startswith(ct_prefix):
-                        return url_type, meta
-
-                # Default to webpage for HTML-like content
-                if "html" in content_type or "xml" in content_type:
-                    return URLType.WEBPAGE, meta
-
-        except PermissionDeniedError:
-            raise
-        except Exception as e:
-            meta["detection_error"] = str(e)
-
-        # 3. Default: assume webpage
-        return URLType.WEBPAGE, meta
-
-    def _is_code_repository_url(self, url: str) -> bool:
-        """
-        Check if URL is a code repository URL.
-
-        Args:
-            url: URL to check
-
-        Returns:
-            True if URL matches code repository patterns
-        """
-        import re
-
-        config = get_openviking_config()
-        github_domains = list(set(config.html.github_domains + config.code.github_domains))
-        gitlab_domains = list(set(config.html.gitlab_domains + config.code.gitlab_domains))
-        # Build repository URL patterns from config
-        repo_patterns = []
-
-        # Add patterns for GitHub domains
-        for domain in github_domains:
-            repo_patterns.append(rf"^https?://{re.escape(domain)}/[^/]+/[^/]+/?$")
-
-        # Add patterns for GitLab domains
-        for domain in gitlab_domains:
-            repo_patterns.append(rf"^https?://{re.escape(domain)}/[^/]+/[^/]+/?$")
-
-        # Add other patterns
-        repo_patterns.extend(
-            [
-                r"^.*\.git$",
-                r"^git@",
-            ]
-        )
-
-        # Check for URL patterns
-        for pattern in repo_patterns:
-            if re.match(pattern, url):
-                return True
-
-        return False
+logger = __import__("openviking_cli.utils.logger").utils.logger.get_logger(__name__)
 
 
 class HTMLParser(BaseParser):
     """
-    Unified parser for HTML files and URLs.
+    Parser for local HTML files.
 
     Features:
     - Parse local HTML files
-    - Fetch and parse web pages
-    - Detect and handle download links
     - Build hierarchy based on heading tags (h1-h6)
     - Filter out navigation, ads, and boilerplate
     - Extract tables and preserve structure
+
+    NOTE: URL downloading functionality has been moved to HTTPAccessor
+    in the new two-layer architecture. This parser only handles local files.
     """
 
-    DEFAULT_USER_AGENT = (
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/120.0.0.0 Safari/537.36"
-    )
-
-    def __init__(
-        self,
-        timeout: float = 30.0,
-        user_agent: Optional[str] = None,
-    ):
+    def __init__(self):
         """Initialize HTML parser."""
-        self.timeout = timeout
-        self.user_agent = user_agent or self.DEFAULT_USER_AGENT
-        self._url_detector = URLTypeDetector()
+        pass
 
     def _get_readabilipy(self):
         """Lazy import of readabilipy."""
@@ -258,10 +78,10 @@ def supported_extensions(self) -> List[str]:
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
         """
-        Unified parse method for HTML files and URLs.
+        Parse a local HTML file.
 
         Args:
-            source: HTML file path or URL
+            source: HTML file path
             instruction: Processing instruction, guides LLM how to understand the resource
             **kwargs: Additional options
 
@@ -269,238 +89,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             ParseResult with document tree
         """
         start_time = time.time()
-        source_str = str(source)
-
-        # Detect if source is a URL
-        if source_str.startswith(("http://", "https://")):
-            return await self._parse_url(source_str, start_time, **kwargs)
-        else:
-            return await self._parse_local_file(Path(source), start_time, **kwargs)
-
-    async def _parse_url(self, url: str, start_time: float, **kwargs) -> ParseResult:
-        """
-        Parse URL (webpage or download link).
-
-        Args:
-            url: URL to parse
-            start_time: Parse start timestamp
-
-        Returns:
-            ParseResult
-        """
-        # Detect URL type
-        request_validator = kwargs.get("request_validator")
-        url_type, meta = await self._url_detector.detect(
-            url,
-            timeout=self.timeout,
-            request_validator=request_validator,
-        )
-
-        if url_type == URLType.WEBPAGE:
-            # Fetch and parse as webpage
-            return await self._parse_webpage(url, start_time, meta, **kwargs)
-
-        elif url_type == URLType.DOWNLOAD_PDF:
-            # Download and delegate to PDF parser
-            return await self._handle_download_link(url, "pdf", start_time, meta, **kwargs)
-
-        elif url_type == URLType.DOWNLOAD_MD:
-            # Download and delegate to Markdown parser
-            return await self._handle_download_link(url, "markdown", start_time, meta, **kwargs)
-
-        elif url_type == URLType.DOWNLOAD_TXT:
-            # Download and delegate to Text parser
-            return await self._handle_download_link(url, "text", start_time, meta, **kwargs)
-
-        elif url_type == URLType.DOWNLOAD_HTML:
-            # Download HTML file and parse
-            return await self._handle_download_link(url, "html", start_time, meta, **kwargs)
-
-        elif url_type == URLType.CODE_REPOSITORY:
-            # Delegate to CodeRepositoryParser
-            return await self._handle_code_repository(url, start_time, meta, **kwargs)
-
-        else:
-            # Unknown type - try as webpage
-            return await self._parse_webpage(url, start_time, meta, **kwargs)
-
-    async def _parse_webpage(
-        self, url: str, start_time: float, meta: Dict[str, Any], **kwargs
-    ) -> ParseResult:
-        """
-        Fetch and parse a webpage.
-
-        Args:
-            url: URL to fetch
-            start_time: Parse start time
-            meta: Detection metadata
-
-        Returns:
-            ParseResult
-        """
-        try:
-            # Fetch HTML
-            html_content = await self._fetch_html(
-                url,
-                request_validator=kwargs.get("request_validator"),
-            )
-
-            # Convert to Markdown
-            markdown_content = self._html_to_markdown(html_content, base_url=url)
-
-            # Parse using MarkdownParser
-            from openviking.parse.parsers.markdown import MarkdownParser
-
-            md_parser = MarkdownParser()
-            result = await md_parser.parse_content(markdown_content, source_path=url, **kwargs)
-
-            # Update metadata
-            result.source_format = "html"
-            result.parser_name = "HTMLParser"
-            result.parse_time = time.time() - start_time
-            result.parse_timestamp = None  # Will be set by __post_init__
-            result.meta.update(meta)
-            result.meta["url_type"] = "webpage"
-            result.meta["intermediate_markdown"] = markdown_content[:500]  # Preview
-
-            return result
-
-        except PermissionDeniedError:
-            raise
-        except Exception as e:
-            return create_parse_result(
-                root=ResourceNode(type=NodeType.ROOT, content_path=None),
-                source_path=url,
-                source_format="html",
-                parser_name="HTMLParser",
-                parse_time=time.time() - start_time,
-                warnings=[f"Failed to fetch webpage: {e}"],
-            )
-
-    @staticmethod
-    def _extract_filename_from_url(url: str) -> str:
-        """
-        Extract and URL-decode the original filename from a URL.
-
-        Args:
-            url: URL to extract filename from
-
-        Returns:
-            Decoded filename (e.g., "schemas.py" from ".../schemas.py")
-            Falls back to "download" if no filename can be extracted.
-        """
-        parsed = urlparse(url)
-        # URL-decode path to handle encoded characters (e.g., %E7%99%BE -> Chinese chars)
-        decoded_path = unquote(parsed.path)
-        basename = Path(decoded_path).name
-        return basename if basename else "download"
-
-    async def _handle_download_link(
-        self, url: str, file_type: str, start_time: float, meta: Dict[str, Any], **kwargs
-    ) -> ParseResult:
-        """
-        Download file and delegate to appropriate parser.
-
-        Args:
-            url: URL to download
-            file_type: File type ("pdf", "markdown", "text", "html")
-            start_time: Parse start time
-            meta: Detection metadata
-
-        Returns:
-            ParseResult from delegated parser
-        """
-        temp_path = None
-        try:
-            # Download to temporary file
-            temp_path = await self._download_file(
-                url,
-                request_validator=kwargs.get("request_validator"),
-            )
-
-            # Extract original filename from URL for use as source_path,
-            # so parsers use it instead of the temp file name.
-            original_filename = self._extract_filename_from_url(url)
-
-            # Get appropriate parser
-            if file_type == "pdf":
-                from openviking.parse.parsers.pdf import PDFParser
-
-                parser = PDFParser()
-                result = await parser.parse(temp_path, resource_name=Path(original_filename).stem)
-            elif file_type == "markdown":
-                from openviking.parse.parsers.markdown import MarkdownParser
-
-                parser = MarkdownParser()
-                content = Path(temp_path).read_text(encoding="utf-8")
-                result = await parser.parse_content(
-                    content, source_path=original_filename, **kwargs
-                )
-            elif file_type == "text":
-                # For text/code files, preserve the original filename and extension.
-                # Read the downloaded content and save it with the original name
-                # instead of routing through TextParser->MarkdownParser which
-                # would rename it to .md and split it into sections.
-                result = await self._save_downloaded_text(temp_path, original_filename, start_time)
-            elif file_type == "html":
-                # Parse downloaded HTML locally
-                return await self._parse_local_file(Path(temp_path), start_time, **kwargs)
-            else:
-                raise ValueError(f"Unsupported file type: {file_type}")
-
-            result.meta.update(meta)
-            result.meta["downloaded_from"] = url
-            result.meta["url_type"] = f"download_{file_type}"
-            return result
-
-        except PermissionDeniedError:
-            raise
-        except Exception as e:
-            return create_parse_result(
-                root=ResourceNode(type=NodeType.ROOT, content_path=None),
-                source_path=url,
-                source_format=file_type,
-                parser_name="HTMLParser",
-                parse_time=time.time() - start_time,
-                warnings=[f"Failed to download/parse link: {e}"],
-            )
-        finally:
-            if temp_path:
-                try:
-                    p = Path(temp_path)
-                    if p.exists():
-                        p.unlink()
-                except Exception:
-                    pass
-
-    async def _handle_code_repository(
-        self, url: str, start_time: float, meta: Dict[str, Any], **kwargs
-    ) -> ParseResult:
-        """
-        Handle code repository URL by delegating to CodeRepositoryParser.
-        """
-        try:
-            from openviking.parse.parsers.code import CodeRepositoryParser
+        path = Path(source)
 
-            parser = CodeRepositoryParser()
-            result = await parser.parse(url, **kwargs)
-            result.meta.update(meta)
-            result.meta["downloaded_from"] = url
-            result.meta["url_type"] = "code_repository"
-
-            return result
-
-        except PermissionDeniedError:
-            raise
-        except Exception as e:
-            return create_parse_result(
-                root=ResourceNode(type=NodeType.ROOT, content_path=None),
-                source_path=url,
-                source_format="code_repository",
-                parser_name="HTMLParser",
-                parse_time=time.time() - start_time,
-                warnings=[f"Failed to parse code repository: {e}"],
-            )
+        return await self._parse_local_file(path, start_time, **kwargs)
 
     async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> ParseResult:
         """Parse local HTML file."""
@@ -521,7 +112,6 @@ async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> Pa
             # Add timing info
             result.parse_time = time.time() - start_time
             result.parser_name = "HTMLParser"
-            result.parser_version = "2.0"
 
             return result
         except Exception as e:
@@ -534,154 +124,6 @@ async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> Pa
                 warnings=[f"Failed to read HTML: {e}"],
             )
 
-    async def _fetch_html(self, url: str, request_validator=None) -> str:
-        """
-        Fetch HTML content from URL.
-
-        Args:
-            url: URL to fetch
-
-        Returns:
-            HTML content string
-
-        Raises:
-            Exception: If fetch fails
-        """
-        httpx = lazy_import("httpx")
-
-        client_kwargs = {
-            "timeout": self.timeout,
-            "follow_redirects": True,
-        }
-        event_hooks = build_httpx_request_validation_hooks(request_validator)
-        if event_hooks:
-            client_kwargs["event_hooks"] = event_hooks
-            client_kwargs["trust_env"] = False
-
-        async with httpx.AsyncClient(**client_kwargs) as client:
-            headers = {"User-Agent": self.user_agent}
-            response = await client.get(url, headers=headers)
-            response.raise_for_status()
-            return response.text
-
-    def _convert_to_raw_url(self, url: str) -> str:
-        """Convert GitHub/GitLab blob URL to raw URL."""
-        parsed = urlparse(url)
-        config = get_openviking_config()
-        github_domains = config.html.github_domains
-        gitlab_domains = config.html.gitlab_domains
-        github_raw_domain = config.code.github_raw_domain
-
-        if parsed.netloc in github_domains:
-            path_parts = parsed.path.strip("/").split("/")
-            if len(path_parts) >= 4 and path_parts[2] == "blob":
-                # Remove 'blob'
-                new_path = "/".join(path_parts[:2] + path_parts[3:])
-                return f"https://{github_raw_domain}/{new_path}"
-
-        if parsed.netloc in gitlab_domains and "/blob/" in parsed.path:
-            return url.replace("/blob/", "/raw/")
-
-        return url
-
-    async def _save_downloaded_text(
-        self, temp_path: str, original_filename: str, start_time: float
-    ) -> ParseResult:
-        """
-        Save a downloaded text/code file preserving its original filename and extension.
-
-        Instead of routing through TextParser -> MarkdownParser (which renames to .md
-        and splits into sections), this saves the file directly into a VikingFS temp
-        directory with its original name.
-
-        Args:
-            temp_path: Path to the downloaded temporary file
-            original_filename: Original filename from URL (e.g., "schemas.py")
-            start_time: Parse start timestamp
-
-        Returns:
-            ParseResult with temp_dir_path set
-        """
-        from openviking.storage.viking_fs import get_viking_fs
-
-        content = Path(temp_path).read_text(encoding="utf-8")
-        doc_name = Path(original_filename).stem
-
-        viking_fs = get_viking_fs()
-        temp_uri = viking_fs.create_temp_uri()
-        await viking_fs.mkdir(temp_uri)
-
-        # Create document root directory (TreeBuilder expects exactly one dir)
-        root_dir = f"{temp_uri}/{doc_name}"
-        await viking_fs.mkdir(root_dir)
-
-        # Save with original filename (preserving extension)
-        file_uri = f"{root_dir}/{original_filename}"
-        await viking_fs.write_file(file_uri, content)
-
-        root = ResourceNode(
-            type=NodeType.ROOT,
-            title=doc_name,
-            level=0,
-        )
-
-        result = create_parse_result(
-            root=root,
-            source_path=original_filename,
-            source_format="text",
-            parser_name="HTMLParser",
-            parse_time=time.time() - start_time,
-        )
-        result.temp_dir_path = temp_uri
-        return result
-
-    async def _download_file(self, url: str, request_validator=None) -> str:
-        """
-        Download file from URL to temporary location.
-
-        Args:
-            url: URL to download
-
-        Returns:
-            Path to downloaded temporary file
-
-        Raises:
-            Exception: If download fails
-        """
-        httpx = lazy_import("httpx")
-
-        url = self._convert_to_raw_url(url)
-
-        # Determine file extension from URL (decode first to handle encoded paths)
-        parsed = urlparse(url)
-        decoded_path = unquote(parsed.path)
-        ext = Path(decoded_path).suffix or ".tmp"
-
-        # Create temp file
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
-        temp_path = temp_file.name
-        temp_file.close()
-
-        # Download
-        client_kwargs = {
-            "timeout": self.timeout,
-            "follow_redirects": True,
-        }
-        event_hooks = build_httpx_request_validation_hooks(request_validator)
-        if event_hooks:
-            client_kwargs["event_hooks"] = event_hooks
-            client_kwargs["trust_env"] = False
-
-        async with httpx.AsyncClient(**client_kwargs) as client:
-            headers = {"User-Agent": self.user_agent}
-            response = await client.get(url, headers=headers)
-            response.raise_for_status()
-
-            # Write to temp file
-            Path(temp_path).write_bytes(response.content)
-
-        return temp_path
-
     def _html_to_markdown(self, html: str, base_url: str = "") -> str:
         """
         Convert HTML to Markdown using readabilipy + markdownify (Anthropic approach).
@@ -730,7 +172,7 @@ async def parse_content(
         """
         Parse HTML content.
 
-        Converts HTML to Markdown and delegates to MarkdownParser (three-phase architecture).
+        Converts HTML to Markdown and delegates to MarkdownParser.
 
         Args:
             content: HTML content string
@@ -742,7 +184,7 @@ async def parse_content(
         # Convert HTML to Markdown
         markdown_content = self._html_to_markdown(content, base_url=source_path or "")
 
-        # Delegate to MarkdownParser (using three-phase architecture)
+        # Delegate to MarkdownParser
         from openviking.parse.parsers.markdown import MarkdownParser
 
         md_parser = MarkdownParser()
diff --git a/openviking/parse/parsers/zip_parser.py b/openviking/parse/parsers/zip_parser.py
index 3418770ec..5dbce8174 100644
--- a/openviking/parse/parsers/zip_parser.py
+++ b/openviking/parse/parsers/zip_parser.py
@@ -3,18 +3,19 @@
 """
 ZIP archive parser for OpenViking.
 
-Lists and describes contents of ZIP files.
-Converts to markdown and delegates to MarkdownParser.
+Extracts ZIP archives and delegates to DirectoryParser for recursive processing.
+Supports nested ZIP files via DirectoryParser's recursive parser invocation.
 """
 
+import shutil
+import tempfile
 import zipfile
-from datetime import datetime
 from pathlib import Path
 from typing import List, Optional, Union
 
 from openviking.parse.base import ParseResult
 from openviking.parse.parsers.base_parser import BaseParser
-from openviking_cli.utils.config.parser_config import ParserConfig
+from openviking.utils.zip_safe import safe_extract_zip
 from openviking_cli.utils.logger import get_logger
 
 logger = get_logger(__name__)
@@ -27,200 +28,91 @@ class ZipParser(BaseParser):
     Supports: .zip
 
     Features:
-    - Lists all files in the archive
-    - Shows file sizes and modification dates
-    - Groups files by type/extension
+    - Extracts ZIP archive to temporary directory
+    - Delegates to DirectoryParser for recursive content processing
+    - Supports nested ZIP files (via DirectoryParser recursion)
+    - Preserves temporary directory for TreeBuilder
     """
 
-    def __init__(self, config: Optional[ParserConfig] = None):
-        """Initialize ZIP parser."""
-        from openviking.parse.parsers.markdown import MarkdownParser
-
-        self._md_parser = MarkdownParser(config=config)
-        self.config = config or ParserConfig()
-
     @property
     def supported_extensions(self) -> List[str]:
         return [".zip"]
 
     async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs) -> ParseResult:
-        """Parse from file path."""
-        path = Path(source)
-
-        if path.exists():
-            markdown_content = self._convert_zip_to_markdown(path)
-            result = await self._md_parser.parse_content(
-                markdown_content,
-                source_path=str(path),
-                instruction=instruction,
-                **kwargs,
-            )
-        else:
-            # Treat as raw content string
-            result = await self._md_parser.parse_content(
-                str(source), instruction=instruction, **kwargs
-            )
-        result.source_format = "zip"
-        result.parser_name = "ZipParser"
-        return result
-
-    async def parse_content(
-        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
-    ) -> ParseResult:
-        """Parse content - for zip, content should be a file path."""
-        result = await self._md_parser.parse_content(content, source_path, **kwargs)
-        result.source_format = "zip"
-        result.parser_name = "ZipParser"
-        return result
-
-    def _convert_zip_to_markdown(self, path: Path) -> str:
-        """
-        Convert ZIP file information to markdown format.
+        """Parse a ZIP file by extracting it and delegating to DirectoryParser.
 
         Args:
-            path: Path to .zip file
+            source: Path to the .zip file
+            instruction: Processing instruction (forwarded to DirectoryParser)
+            **kwargs: Extra options forwarded to DirectoryParser
 
         Returns:
-            Markdown formatted string
+            ParseResult from DirectoryParser, with temp_dir_path preserved
+            for TreeBuilder to use later.
         """
+
+        path = Path(source)
+        if not path.exists():
+            raise FileNotFoundError(f"ZIP file not found: {path}")
+
+        if not zipfile.is_zipfile(path):
+            raise ValueError(f"Not a valid ZIP file: {path}")
+
+        # Extract ZIP to temporary directory
+        temp_dir = Path(tempfile.mkdtemp(prefix="ov_zip_"))
         try:
-            with zipfile.ZipFile(path, "r") as zf:
-                return self._process_zip_contents(zf, path)
-        except zipfile.BadZipFile:
-            raise ValueError(f"Invalid or corrupted ZIP file: {path}")
-        except Exception as e:
-            raise ValueError(f"Error reading ZIP file: {e}")
-
-    def _process_zip_contents(self, zf: zipfile.ZipFile, path: Path) -> str:
-        """Process ZIP file contents and return markdown."""
-        md_parts = []
-
-        # Title
-        md_parts.append(f"# ZIP Archive: {path.name}")
-        md_parts.append("")
-
-        # Archive info
-        md_parts.append("## Archive Information")
-        md_parts.append("")
-        md_parts.append(f"- **File:** {path.name}")
-        md_parts.append(f"- **Total files:** {len(zf.namelist())}")
-        md_parts.append(
-            f"- **Comment:** {zf.comment.decode('utf-8', errors='ignore') if zf.comment else 'None'}"
-        )
-        md_parts.append("")
-
-        # Group files by extension
-        files_by_ext = self._group_files_by_extension(zf.namelist())
-
-        # File listing by category
-        md_parts.append("## Contents")
-        md_parts.append("")
-
-        # Summary table
-        if files_by_ext:
-            md_parts.append("### File Types Summary")
-            md_parts.append("")
-            md_parts.append("| Extension | Count |")
-            md_parts.append("|-----------|-------|")
-            for ext, files in sorted(files_by_ext.items(), key=lambda x: -len(x[1])):
-                display_ext = ext if ext else "(no extension)"
-                md_parts.append(f"| {display_ext} | {len(files)} |")
-            md_parts.append("")
-
-        # Detailed listing
-        md_parts.append("### File List")
-        md_parts.append("")
-
-        # Create a table with file info
-        md_parts.append("| File | Size | Modified |")
-        md_parts.append("|------|------|----------|")
-
-        for info in zf.infolist():
-            # Skip directories
-            if info.is_dir():
-                continue
-
-            filename = info.filename
-            size = self._format_size(info.file_size)
-            modified = self._format_datetime(info.date_time)
-
-            # Escape pipe characters
-            filename = filename.replace("|", "\\|")
-
-            md_parts.append(f"| {filename} | {size} | {modified} |")
-
-        md_parts.append("")
-
-        # Directory structure
-        md_parts.append("## Directory Structure")
-        md_parts.append("")
-        md_parts.append("```")
-        md_parts.append(self._generate_tree_view(zf.namelist()))
-        md_parts.append("```")
-
-        return "\n".join(md_parts)
-
-    def _group_files_by_extension(self, filenames: List[str]) -> dict:
-        """Group files by their extension."""
-        groups = {}
-        for name in filenames:
-            if name.endswith("/"):  # Skip directories
-                continue
-            ext = Path(name).suffix.lower()
-            if ext not in groups:
-                groups[ext] = []
-            groups[ext].append(name)
-        return groups
-
-    def _format_size(self, size: int) -> str:
-        """Format file size in human-readable format."""
-        for unit in ["B", "KB", "MB", "GB", "TB"]:
-            if size < 1024.0:
-                return f"{size:.1f} {unit}"
-            size /= 1024.0
-        return f"{size:.1f} PB"
-
-    def _format_datetime(self, dt_tuple) -> str:
-        """Format datetime tuple from ZIP info."""
-        try:
-            dt = datetime(*dt_tuple)
-            return dt.strftime("%Y-%m-%d %H:%M")
-        except:
-            return "Unknown"
-
-    def _generate_tree_view(self, filenames: List[str]) -> str:
-        """Generate a tree-like view of the archive contents."""
-        # Build a simple tree structure
-        lines = []
-
-        # Get unique directories
-        dirs = set()
-        for name in filenames:
-            parts = name.split("/")
-            for i in range(len(parts) - 1):
-                dirs.add("/".join(parts[: i + 1]) + "/")
-
-        # Sort all items
-        all_items = sorted(set(filenames) | dirs)
-
-        for item in all_items:
-            # Calculate depth
-            depth = item.count("/")
-            if item.endswith("/"):
-                depth -= 1
-
-            # Create indentation
-            indent = "    " * depth
-
-            # Get just the name part
-            name = item.rstrip("/").split("/")[-1] if "/" in item else item
-
-            # Add prefix for directories vs files
-            if item.endswith("/"):
-                prefix = "[dir] "
-            else:
-                prefix = ""
+            with zipfile.ZipFile(path, "r") as zipf:
+                safe_extract_zip(zipf, temp_dir)
+
+            # Check if extracted content has a single root directory
+            extracted_entries = [p for p in temp_dir.iterdir() if p.name not in {".", ".."}]
 
-            lines.append(f"{indent}{prefix}{name}")
+            # Prepare kwargs for DirectoryParser
+            dir_kwargs = dict(kwargs)
+            dir_kwargs["instruction"] = instruction
 
-        return "\n".join(lines)
+            # Use DirectoryParser to process the extracted content
+            from openviking.parse.parsers.directory import DirectoryParser
+
+            parser = DirectoryParser()
+
+            if len(extracted_entries) == 1 and extracted_entries[0].is_dir():
+                # Single root directory - parse that directly
+                dir_kwargs.pop("source_name", None)
+                result = await parser.parse(str(extracted_entries[0]), **dir_kwargs)
+            else:
+                # Multiple entries at root - parse the temp dir itself
+                # Set source_name from zip filename if not provided
+                if "source_name" not in dir_kwargs or not dir_kwargs["source_name"]:
+                    dir_kwargs["source_name"] = path.stem
+                result = await parser.parse(str(temp_dir), **dir_kwargs)
+
+            # Ensure the temporary directory is preserved for TreeBuilder
+            if not result.temp_dir_path:
+                result.temp_dir_path = str(temp_dir)
+            else:
+                # If DirectoryParser created its own temp, clean up our extraction dir
+                try:
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+                except Exception:
+                    pass
+
+            result.source_format = "zip"
+            result.parser_name = "ZipParser"
+            return result
+
+        except Exception:
+            # Clean up on error
+            try:
+                shutil.rmtree(temp_dir, ignore_errors=True)
+            except Exception:
+                pass
+            raise
+
+    async def parse_content(
+        self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
+    ) -> ParseResult:
+        """Parse content - not applicable for ZIP files (needs a file path)."""
+        raise NotImplementedError(
+            "ZipParser does not support parse_content. Please provide a file path to parse()."
+        )
diff --git a/openviking/parse/registry.py b/openviking/parse/registry.py
index add62ddb1..d4294a6f1 100644
--- a/openviking/parse/registry.py
+++ b/openviking/parse/registry.py
@@ -12,7 +12,6 @@
 
 from openviking.parse.base import ParseResult
 from openviking.parse.parsers.base_parser import BaseParser
-from openviking.parse.parsers.code import CodeRepositoryParser
 from openviking.parse.parsers.directory import DirectoryParser
 from openviking.parse.parsers.epub import EPubParser
 from openviking.parse.parsers.excel import ExcelParser
@@ -67,9 +66,6 @@ def __init__(self, register_optional: bool = True):
         self.register("powerpoint", PowerPointParser())
         self.register("excel", ExcelParser())
         self.register("epub", EPubParser())
-        # CodeRepositoryParser also uses .zip; register it before ZipParser
-        # so that .zip resolves to ZipParser (file) rather than code repo.
-        self.register("code", CodeRepositoryParser())
         self.register("zip", ZipParser())
         self.register("directory", DirectoryParser())
 
@@ -77,14 +73,6 @@ def __init__(self, register_optional: bool = True):
         self.register("audio", AudioParser())
         self.register("video", VideoParser())
 
-        # Optional: Feishu/Lark document parser (requires lark-oapi)
-        try:
-            from openviking.parse.parsers.feishu import FeishuParser
-
-            self.register("feishu", FeishuParser())
-        except ImportError:
-            pass
-
     def register(self, name: str, parser: BaseParser) -> None:
         """
         Register a parser.
@@ -215,13 +203,16 @@ def get_parser_for_file(self, path: Union[str, Path]) -> Optional[BaseParser]:
 
     async def parse(self, source: Union[str, Path], **kwargs) -> ParseResult:
         """
-        Parse a file or content string.
+        Parse a local file or content string.
 
         Automatically selects parser based on file extension.
         Falls back to text parser for unknown types.
 
+        NOTE: For URL handling, use AccessorRegistry in the two-layer architecture.
+        This registry only handles local files and raw content.
+
         Args:
-            source: File path or content string
+            source: Local file path or content string
             **kwargs: Additional arguments passed to parser
 
         Returns:
@@ -229,20 +220,6 @@ async def parse(self, source: Union[str, Path], **kwargs) -> ParseResult:
         """
         source_str = str(source)
 
-        # First, check if it's a code repository URL
-        code_parser = self._parsers.get("code")
-        if code_parser:
-            # Check if the parser has the is_repository_url method
-            try:
-                if hasattr(code_parser, "is_repository_url") and code_parser.is_repository_url(
-                    source_str
-                ):
-                    logger.info(f"Detected code repository URL: {source_str}")
-                    return await code_parser.parse(source_str, **kwargs)
-            except Exception as e:
-                logger.warning(f"Error checking if source is repository URL: {e}")
-                # Continue with normal parsing flow
-
         # Check if source looks like a file path (short enough and no newlines)
         is_potential_path = len(source_str) <= 1024 and "\n" not in source_str
 
diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py
index 3d1e19eda..346912273 100644
--- a/openviking/utils/media_processor.py
+++ b/openviking/utils/media_processor.py
@@ -2,18 +2,16 @@
 # SPDX-License-Identifier: AGPL-3.0
 """Unified resource processor with strategy-based routing."""
 
-import tempfile
-import zipfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
 from openviking.parse import DocumentConverter, parse
+from openviking.parse.accessors.base import SourceType
 from openviking.parse.base import ParseResult
 from openviking.server.local_input_guard import (
     is_remote_resource_source,
     looks_like_local_path,
 )
-from openviking.utils.zip_safe import safe_extract_zip
 from openviking_cli.exceptions import PermissionDeniedError
 from openviking_cli.utils.logger import get_logger
 
@@ -25,7 +23,12 @@
 
 
 class UnifiedResourceProcessor:
-    """Unified resource processing for files, URLs, and raw content."""
+    """Unified resource processing for files, URLs, and raw content.
+
+    Uses two-layer architecture:
+    - Phase 1: AccessorRegistry gets LocalResource from source
+    - Phase 2: ParserRegistry parses LocalResource to ParseResult
+    """
 
     def __init__(
         self,
@@ -35,6 +38,7 @@ def __init__(
         self.storage = storage
         self._vlm_processor = vlm_processor
         self._document_converter = None
+        self._accessor_registry = None
 
     def _get_vlm_processor(self) -> Optional["VLMProcessor"]:
         if self._vlm_processor is None:
@@ -48,6 +52,14 @@ def _get_document_converter(self) -> DocumentConverter:
             self._document_converter = DocumentConverter()
         return self._document_converter
 
+    def _get_accessor_registry(self):
+        """Lazy initialize AccessorRegistry for two-layer mode."""
+        if self._accessor_registry is None:
+            from openviking.parse.accessors import get_accessor_registry
+
+            self._accessor_registry = get_accessor_registry()
+        return self._accessor_registry
+
     async def process(
         self,
         source: str,
@@ -55,141 +67,83 @@ async def process(
         allow_local_path_resolution: bool = True,
         **kwargs,
     ) -> ParseResult:
-        """Process any source (file/URL/content) with appropriate strategy."""
-        # Check if URL
-        if self._is_url(source):
-            return await self._process_url(source, instruction)
+        """Process any source (file/URL/content) with two-layer architecture.
+
+        Phase 1: Use AccessorRegistry to get LocalResource
+        Phase 2: Use ParserRegistry to parse LocalResource
+
+        Resource Lifecycle:
+        - Temporary resources are managed via context manager or temp_dir_path
+        - Directories needed for TreeBuilder are preserved via ParseResult.temp_dir_path
+        """
 
-        # Check if looks like a file path (short enough and no newlines)
+        # First check if source is raw content (not URL/path)
         is_potential_path = (
             allow_local_path_resolution and len(source) <= 1024 and "\n" not in source
         )
-        if is_potential_path:
-            path = Path(source)
-            if path.exists():
-                if path.is_dir():
-                    return await self._process_directory(path, instruction, **kwargs)
-                return await self._process_file(path, instruction, **kwargs)
-            else:
-                logger.warning(f"Path {path} does not exist")
-                raise FileNotFoundError(f"Path {path} does not exist")
-
-        if not allow_local_path_resolution and looks_like_local_path(source):
+        if not is_potential_path and not self._is_url(source):
+            # Treat as raw content
+            return await parse(source, instruction=instruction)
+
+        # Block local paths in HTTP server mode, but allow remote URLs
+        if (
+            not allow_local_path_resolution
+            and not is_remote_resource_source(source)
+            and looks_like_local_path(source)
+        ):
             raise PermissionDeniedError(
                 "HTTP server only accepts remote resource URLs or temp-uploaded files; "
                 "direct host filesystem paths are not allowed."
             )
 
-        # Treat as raw content
-        return await parse(source, instruction=instruction)
+        # Phase 1: Accessor - get local resource
+        registry = self._get_accessor_registry()
+        local_resource = await registry.access(source, **kwargs)
+
+        # Use context manager for automatic cleanup, but preserve directories for TreeBuilder
+        try:
+            # Phase 2: Parser - parse the local resource
+            parse_kwargs = dict(kwargs)
+            parse_kwargs["instruction"] = instruction
+            parse_kwargs["vlm_processor"] = self._get_vlm_processor()
+            parse_kwargs["storage"] = self.storage
+            parse_kwargs["_source_meta"] = local_resource.meta
+            parse_kwargs["original_source"] = local_resource.original_source
+
+            # Set resource_name from source_name or path
+            source_name = kwargs.get("source_name")
+            if source_name:
+                parse_kwargs["resource_name"] = Path(source_name).stem
+                parse_kwargs.setdefault("source_name", source_name)
+            else:
+                # For git repositories, use repo_name from meta if available
+                repo_name = local_resource.meta.get("repo_name")
+                if repo_name and local_resource.source_type == SourceType.GIT:
+                    # Use the last part of repo_name as the resource_name (e.g., "OpenViking" from "volcengine/OpenViking")
+                    parse_kwargs["resource_name"] = repo_name.split("/")[-1]
+                else:
+                    parse_kwargs.setdefault("resource_name", local_resource.path.stem)
+
+            # If it's a directory, use DirectoryParser which will delegate to CodeRepositoryParser if it's a git repo
+            if local_resource.path.is_dir():
+                from openviking.parse.parsers.directory import DirectoryParser
+
+                parser = DirectoryParser()
+
+                result = await parser.parse(str(local_resource.path), **parse_kwargs)
+                # Preserve temporary directory for TreeBuilder
+                if local_resource.is_temporary and not result.temp_dir_path:
+                    result.temp_dir_path = str(local_resource.path)
+                    # Mark as non-temporary so context manager doesn't clean it up
+                    local_resource.is_temporary = False
+                return result
+
+            # For files, use the unified parse function (including .zip files via ZipParser)
+            return await parse(str(local_resource.path), **parse_kwargs)
+        finally:
+            # Clean up temporary resources unless they need to be preserved
+            local_resource.cleanup()
 
     def _is_url(self, source: str) -> bool:
         """Check if source is a URL."""
         return is_remote_resource_source(source)
-
-    async def _process_url(self, url: str, instruction: str, **kwargs) -> ParseResult:
-        """Process URL source."""
-        from openviking.utils.code_hosting_utils import is_git_repo_url, validate_git_ssh_uri
-
-        # Validate git@ SSH URIs early
-        if url.startswith("git@"):
-            validate_git_ssh_uri(url)
-
-        # Route Feishu/Lark cloud document URLs to FeishuParser
-        if self._is_feishu_url(url):
-            from openviking.parse.registry import get_registry
-
-            parser = get_registry().get_parser("feishu")
-            if parser is None:
-                raise ImportError(
-                    "FeishuParser not available. "
-                    "Install lark-oapi: pip install 'openviking[bot-feishu]'"
-                )
-            return await parser.parse(url, instruction=instruction, **kwargs)
-
-        # Route git protocols and repo URLs to CodeRepositoryParser
-        if url.startswith(("git@", "git://", "ssh://")) or is_git_repo_url(url):
-            from openviking.parse.parsers.code.code import CodeRepositoryParser
-
-            parser = CodeRepositoryParser()
-            return await parser.parse(url, instruction=instruction, **kwargs)
-
-        from openviking.parse.parsers.html import HTMLParser
-
-        parser = HTMLParser()
-        return await parser.parse(url, instruction=instruction, **kwargs)
-
-    @staticmethod
-    def _is_feishu_url(url: str) -> bool:
-        """Check if URL is a Feishu/Lark cloud document."""
-        from urllib.parse import urlparse
-
-        parsed = urlparse(url)
-        host = parsed.hostname or ""
-        path = parsed.path
-        is_feishu_domain = host.endswith(".feishu.cn") or host.endswith(".larksuite.com")
-        has_doc_path = any(
-            path == f"/{t}" or path.startswith(f"/{t}/") for t in ("docx", "wiki", "sheets", "base")
-        )
-        return is_feishu_domain and has_doc_path
-
-    async def _process_directory(
-        self,
-        dir_path: Path,
-        instruction: str,
-        **kwargs,
-    ) -> ParseResult:
-        """Process directory source via DirectoryParser.
-
-        Args:
-            dir_path: Path to the directory.
-            instruction: Processing instruction.
-            **kwargs: Forwarded to ``DirectoryParser.parse()`` →
-                ``scan_directory()``: ``strict``, ``ignore_dirs``,
-                ``include``, ``exclude``.
-        """
-        from openviking.parse.parsers.directory import DirectoryParser
-
-        parser = DirectoryParser()
-        return await parser.parse(str(dir_path), instruction=instruction, **kwargs)
-
-    async def _process_file(
-        self,
-        file_path: Path,
-        instruction: str,
-        **kwargs,
-    ) -> ParseResult:
-        """Process file with unified parsing."""
-        ext = file_path.suffix.lower()
-        # Only treat .zip files as archives to extract.
-        if ext == ".zip" and zipfile.is_zipfile(file_path):
-            temp_dir = Path(tempfile.mkdtemp())
-            try:
-                with zipfile.ZipFile(file_path, "r") as zipf:
-                    safe_extract_zip(zipf, temp_dir)
-
-                extracted_entries = [p for p in temp_dir.iterdir() if p.name not in {".", ".."}]
-                if len(extracted_entries) == 1 and extracted_entries[0].is_dir():
-                    dir_kwargs = dict(kwargs)
-                    dir_kwargs.pop("source_name", None)
-                    return await self._process_directory(
-                        extracted_entries[0], instruction, **dir_kwargs
-                    )
-
-                return await self._process_directory(temp_dir, instruction, **kwargs)
-            finally:
-                pass  # Don't delete temp_dir yet, it will be used by TreeBuilder
-        source_name = kwargs.get("source_name")
-        if source_name:
-            kwargs["resource_name"] = Path(source_name).stem
-            kwargs.setdefault("source_name", source_name)
-        else:
-            kwargs.setdefault("resource_name", file_path.stem)
-
-        return await parse(
-            str(file_path),
-            instruction=instruction,
-            vlm_processor=self._get_vlm_processor(),
-            storage=self.storage,
-            **kwargs,
-        )
diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py
index 01aaf75df..122f389e0 100644
--- a/openviking/utils/resource_processor.py
+++ b/openviking/utils/resource_processor.py
@@ -216,6 +216,7 @@ async def process_resource(
             # ============ Phase 3.5: 首次添加立即落盘 + 生命周期锁 ============
             root_uri = result.get("root_uri")
             temp_uri = result.get("temp_uri")  # temp_doc_uri
+            original_temp_uri = temp_uri  # 保存原始 temp_uri 用于最终输出
             candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None
             lifecycle_lock_handle_id = ""
 
@@ -257,7 +258,8 @@ async def process_resource(
                     except Exception:
                         pass
 
-                    result["temp_uri"] = root_uri
+                    # 数据已移动到 root_uri，后续处理使用 root_uri
+                    temp_uri = root_uri
                 else:
                     # 增量更新：对目标目录加 SUBTREE 锁
                     resource_path = viking_fs._uri_to_path(root_uri, ctx=ctx)
@@ -267,7 +269,7 @@ async def process_resource(
 
             # ============ Phase 4: Optional Steps ============
             build_index = kwargs.get("build_index", True)
-            temp_uri_for_summarize = result.get("temp_uri") or parse_result.temp_dir_path
+            temp_uri_for_summarize = temp_uri or parse_result.temp_dir_path
             should_summarize = summarize or build_index
             if should_summarize:
                 skip_vec = not build_index
@@ -294,6 +296,10 @@ async def process_resource(
                 if handle:
                     await get_lock_manager().release(handle)
 
+            # 恢复原始 temp_uri 用于输出
+            if original_temp_uri is not None:
+                result["temp_uri"] = original_temp_uri
+
             return result
 
     @staticmethod
diff --git a/openviking_cli/utils/config/parser_config.py b/openviking_cli/utils/config/parser_config.py
index 6c369915c..fbdfdad0b 100644
--- a/openviking_cli/utils/config/parser_config.py
+++ b/openviking_cli/utils/config/parser_config.py
@@ -415,7 +415,7 @@ def validate(self) -> None:
 
 
 @dataclass
-class HTMLConfig(CodeHostingConfig):
+class HTMLConfig(ParserConfig):
     """
     Configuration for HTML parsing.
 
diff --git a/tests/unit/test_accessors_base.py b/tests/unit/test_accessors_base.py
new file mode 100644
index 000000000..cd2602fc6
--- /dev/null
+++ b/tests/unit/test_accessors_base.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Unit tests for DataAccessor base classes and LocalResource."""
+
+from pathlib import Path
+from typing import Union
+
+import pytest
+
+from openviking.parse.accessors.base import DataAccessor, LocalResource
+
+
+class TestLocalResource:
+    """Tests for LocalResource dataclass."""
+
+    def test_create_local_resource(self, tmp_path: Path) -> None:
+        """Basic LocalResource creation."""
+        test_file = tmp_path / "test.txt"
+        test_file.write_text("content")
+
+        resource = LocalResource(
+            path=test_file,
+            source_type="local",
+            original_source="/original/path",
+            meta={"key": "value"},
+            is_temporary=False,
+        )
+
+        assert resource.path == test_file
+        assert resource.source_type == "local"
+        assert resource.original_source == "/original/path"
+        assert resource.meta == {"key": "value"}
+        assert resource.is_temporary is False
+
+    def test_local_resource_defaults(self, tmp_path: Path) -> None:
+        """LocalResource with default values."""
+        test_file = tmp_path / "test.txt"
+        test_file.write_text("content")
+
+        resource = LocalResource(
+            path=test_file,
+            source_type="local",
+            original_source="/original/path",
+        )
+
+        assert resource.meta == {}
+        assert resource.is_temporary is True
+
+    def test_local_resource_cleanup_temporary(self, tmp_path: Path) -> None:
+        """cleanup() removes temporary resources."""
+        test_dir = tmp_path / "test_dir"
+        test_dir.mkdir()
+        (test_dir / "file.txt").write_text("content")
+
+        resource = LocalResource(
+            path=test_dir,
+            source_type="test",
+            original_source="test",
+            is_temporary=True,
+        )
+
+        assert test_dir.exists()
+        resource.cleanup()
+        assert not test_dir.exists()
+
+    def test_local_resource_cleanup_non_temporary(self, tmp_path: Path) -> None:
+        """cleanup() does not remove non-temporary resources."""
+        test_file = tmp_path / "test.txt"
+        test_file.write_text("content")
+
+        resource = LocalResource(
+            path=test_file,
+            source_type="local",
+            original_source="test",
+            is_temporary=False,
+        )
+
+        assert test_file.exists()
+        resource.cleanup()
+        assert test_file.exists()
+
+    def test_local_resource_cleanup_missing(self, tmp_path: Path) -> None:
+        """cleanup() handles missing paths gracefully."""
+        missing_path = tmp_path / "missing"
+
+        resource = LocalResource(
+            path=missing_path,
+            source_type="test",
+            original_source="test",
+            is_temporary=True,
+        )
+
+        # Should not raise
+        resource.cleanup()
+
+
+class TestDataAccessor:
+    """Tests for DataAccessor abstract base class."""
+
+    def test_cannot_instantiate_abstract(self) -> None:
+        """Cannot directly instantiate DataAccessor."""
+        with pytest.raises(TypeError):
+            DataAccessor()  # type: ignore
+
+    def test_subclass_must_implement_methods(self) -> None:
+        """Subclasses must implement all abstract methods."""
+
+        class IncompleteAccessor(DataAccessor):
+            pass
+
+        with pytest.raises(TypeError):
+            IncompleteAccessor()
+
+    def test_complete_subclass(self) -> None:
+        """A complete subclass can be instantiated."""
+
+        class TestAccessor(DataAccessor):
+            def can_handle(self, source: Union[str, Path]) -> bool:
+                return str(source).startswith("test:")
+
+            async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+                return LocalResource(
+                    path=Path("/tmp/test"),
+                    source_type="test",
+                    original_source=str(source),
+                )
+
+            @property
+            def priority(self) -> int:
+                return 50
+
+        accessor = TestAccessor()
+        assert accessor.can_handle("test:something")
+        assert not accessor.can_handle("other:something")
+        assert accessor.priority == 50
diff --git a/tests/unit/test_accessors_git.py b/tests/unit/test_accessors_git.py
new file mode 100644
index 000000000..c8f316c38
--- /dev/null
+++ b/tests/unit/test_accessors_git.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Unit tests for GitAccessor."""
+
+from pathlib import Path
+
+import pytest
+
+from openviking.parse.accessors import GitAccessor
+
+
+class TestGitAccessor:
+    """Tests for GitAccessor."""
+
+    @pytest.fixture
+    def accessor(self) -> GitAccessor:
+        """Create a GitAccessor instance."""
+        return GitAccessor()
+
+    def test_priority(self, accessor: GitAccessor) -> None:
+        """GitAccessor should have correct priority."""
+        assert accessor.priority == 80
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "git@github.com:volcengine/OpenViking.git",
+            "git@gitlab.com:org/repo.git",
+        ],
+    )
+    def test_can_handle_git_ssh_url(self, accessor: GitAccessor, source: str) -> None:
+        """GitAccessor should handle git@ SSH URLs."""
+        assert accessor.can_handle(source) is True
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "https://github.com/volcengine/OpenViking",
+            "https://github.com/volcengine/OpenViking.git",
+            "https://gitlab.com/org/repo",
+            "http://github.com/org/repo",
+        ],
+    )
+    def test_can_handle_github_http_url(self, accessor: GitAccessor, source: str) -> None:
+        """GitAccessor should handle GitHub/GitLab HTTP URLs."""
+        assert accessor.can_handle(source) is True
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "https://github.com/volcengine/OpenViking/tree/main",
+            "https://github.com/volcengine/OpenViking/tree/abc1234",
+        ],
+    )
+    def test_can_handle_github_with_ref(self, accessor: GitAccessor, source: str) -> None:
+        """GitAccessor should handle GitHub URLs with branch/commit."""
+        assert accessor.can_handle(source) is True
+
+    def test_can_handle_git_protocol_url(self, accessor: GitAccessor) -> None:
+        """GitAccessor should handle git:// URLs."""
+        assert accessor.can_handle("git://github.com/volcengine/OpenViking.git") is True
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "/path/to/repo.git",
+            "/path/to/archive.zip",
+        ],
+    )
+    def test_can_handle_local_files(self, accessor: GitAccessor, source: str) -> None:
+        """GitAccessor should handle local .git and .zip files."""
+        assert accessor.can_handle(Path(source)) is True
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "https://example.com/page.html",
+            "https://github.com/volcengine/OpenViking/issues/123",
+            "git@example.com:repo",
+        ],
+    )
+    def test_cannot_handle_other_urls(self, accessor: GitAccessor, source: str) -> None:
+        """GitAccessor should not handle non-git URLs or files."""
+        assert accessor.can_handle(source) is False
diff --git a/tests/unit/test_accessors_http.py b/tests/unit/test_accessors_http.py
new file mode 100644
index 000000000..7bf99e6ed
--- /dev/null
+++ b/tests/unit/test_accessors_http.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Unit tests for HTTPAccessor."""
+
+import pytest
+
+from openviking.parse.accessors import AccessorRegistry, GitAccessor, HTTPAccessor
+
+
+class TestHTTPAccessor:
+    """Tests for HTTPAccessor."""
+
+    @pytest.fixture
+    def accessor(self) -> HTTPAccessor:
+        """Create a HTTPAccessor instance."""
+        return HTTPAccessor()
+
+    def test_priority(self, accessor: HTTPAccessor) -> None:
+        """HTTPAccessor should have correct priority."""
+        assert accessor.priority == 50
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "https://example.com/page.html",
+            "http://example.com/document.pdf",
+            "https://example.org/file.md",
+        ],
+    )
+    def test_can_handle_http_urls(self, accessor: HTTPAccessor, source: str) -> None:
+        """HTTPAccessor should handle regular HTTP/HTTPS URLs."""
+        assert accessor.can_handle(source) is True
+
+    @pytest.mark.parametrize(
+        "source",
+        [
+            "/path/to/file.html",
+            "git@github.com:org/repo.git",
+            "plain text content",
+        ],
+    )
+    def test_cannot_handle_non_http(self, accessor: HTTPAccessor, source: str) -> None:
+        """HTTPAccessor should NOT handle non-HTTP sources."""
+        assert accessor.can_handle(source) is False
+
+    @pytest.mark.parametrize(
+        "url, expected",
+        [
+            ("https://example.com/path/file.html", "file.html"),
+            ("https://example.com/path/doc.pdf", "doc.pdf"),
+            ("https://example.com/path/", "path"),
+            ("https://example.com", "download"),
+        ],
+    )
+    def test_extract_filename_from_url(self, url: str, expected: str) -> None:
+        """Test filename extraction from URLs."""
+        assert HTTPAccessor._extract_filename_from_url(url) == expected
+
+
+class TestHTTPAccessorPriorityRouting:
+    """Tests that verify HTTPAccessor works correctly with priority-based routing."""
+
+    def test_git_url_routed_to_git_accessor(self) -> None:
+        """Git URLs should be routed to GitAccessor, not HTTPAccessor."""
+        registry = AccessorRegistry(register_default=False)
+        http = HTTPAccessor()
+        git = GitAccessor()
+        registry.register(http)
+        registry.register(git)
+
+        test_url = "https://github.com/volcengine/OpenViking"
+
+        # Both can handle the URL individually (this is OK!)
+        assert git.can_handle(test_url) is True
+        assert http.can_handle(test_url) is True
+
+        # But registry picks the higher priority one (GitAccessor)
+        accessor = registry.get_accessor(test_url)
+        assert accessor is not None
+        assert accessor.__class__.__name__ == "GitAccessor"
+
+    def test_regular_http_url_routed_to_http_accessor(self) -> None:
+        """Regular HTTP URLs should be routed to HTTPAccessor."""
+        registry = AccessorRegistry(register_default=False)
+        http = HTTPAccessor()
+        git = GitAccessor()
+        registry.register(http)
+        registry.register(git)
+
+        test_url = "https://example.com/page.html"
+
+        # Only HTTPAccessor can handle this
+        assert git.can_handle(test_url) is False
+        assert http.can_handle(test_url) is True
+
+        # Registry picks HTTPAccessor
+        accessor = registry.get_accessor(test_url)
+        assert accessor is not None
+        assert accessor.__class__.__name__ == "HTTPAccessor"
+
+    def test_accessor_priority_order(self) -> None:
+        """Accessors should be registered in descending priority order."""
+        registry = AccessorRegistry(register_default=False)
+        http = HTTPAccessor()
+        git = GitAccessor()
+
+        # Register in any order
+        registry.register(http)
+        registry.register(git)
+
+        accessors = registry.list_accessors()
+
+        # GitAccessor (priority 80) should come before HTTPAccessor (priority 50)
+        assert len(accessors) == 2
+        assert accessors[0].__class__.__name__ == "GitAccessor"
+        assert accessors[1].__class__.__name__ == "HTTPAccessor"
diff --git a/tests/unit/test_accessors_registry.py b/tests/unit/test_accessors_registry.py
new file mode 100644
index 000000000..858345a69
--- /dev/null
+++ b/tests/unit/test_accessors_registry.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Unit tests for AccessorRegistry."""
+
+from pathlib import Path
+from typing import Union
+
+import pytest
+
+from openviking.parse.accessors.base import DataAccessor, LocalResource
+from openviking.parse.accessors.registry import AccessorRegistry, get_accessor_registry
+
+
+class TestAccessor(DataAccessor):
+    """Test accessor implementation."""
+
+    def __init__(self, name: str, prefix: str, priority: int = 50):
+        self.name = name
+        self.prefix = prefix
+        self._priority = priority
+
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        return str(source).startswith(self.prefix)
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        return LocalResource(
+            path=Path(f"/tmp/{self.name}"),
+            source_type=self.name,
+            original_source=str(source),
+            meta={"accessor": self.name, **kwargs},
+        )
+
+    @property
+    def priority(self) -> int:
+        return self._priority
+
+
+class TestAccessorRegistry:
+    """Tests for AccessorRegistry."""
+
+    @pytest.fixture
+    def registry(self) -> AccessorRegistry:
+        """Create a fresh registry (without default accessors)."""
+        return AccessorRegistry(register_default=False)
+
+    def test_register_and_list(self, registry: AccessorRegistry) -> None:
+        """Register an accessor and list it."""
+        accessor = TestAccessor("test", "test:", 50)
+        registry.register(accessor)
+
+        accessors = registry.list_accessors()
+        assert len(accessors) == 1
+        assert accessors[0] == accessor
+
+    def test_get_accessor(self, registry: AccessorRegistry) -> None:
+        """Get an accessor that can handle a source."""
+        accessor1 = TestAccessor("test1", "test1:", 50)
+        accessor2 = TestAccessor("test2", "test2:", 50)
+        registry.register(accessor1)
+        registry.register(accessor2)
+
+        result = registry.get_accessor("test1:source")
+        assert result is accessor1
+
+        result = registry.get_accessor("test2:source")
+        assert result is accessor2
+
+        result = registry.get_accessor("unknown:source")
+        assert result is None
+
+    def test_priority_ordering(self, registry: AccessorRegistry) -> None:
+        """Accessors are ordered by priority (descending)."""
+        low_prio = TestAccessor("low", "common:", 10)
+        mid_prio = TestAccessor("mid", "common:", 50)
+        high_prio = TestAccessor("high", "common:", 100)
+
+        # Register in random order
+        registry.register(low_prio)
+        registry.register(high_prio)
+        registry.register(mid_prio)
+
+        accessors = registry.list_accessors()
+        assert [a.name for a in accessors] == ["high", "mid", "low"]
+
+        # Highest priority should be selected
+        result = registry.get_accessor("common:source")
+        assert result is high_prio
+
+    def test_list_accessors_with_filter(self, registry: AccessorRegistry) -> None:
+        """list_accessors filters by source capability."""
+        accessor1 = TestAccessor("a", "a:", 50)
+        accessor2 = TestAccessor("b", "b:", 50)
+        accessor3 = TestAccessor("both", "a:", 50)  # also handles "a:"
+        registry.register(accessor1)
+        registry.register(accessor2)
+        registry.register(accessor3)
+
+        filtered = registry.list_accessors("a:source")
+        assert len(filtered) == 2
+        names = {a.name for a in filtered}
+        assert names == {"a", "both"}
+
+    def test_unregister(self, registry: AccessorRegistry) -> None:
+        """Unregister removes accessor by name."""
+        accessor = TestAccessor("test", "test:", 50)
+        registry.register(accessor)
+
+        assert registry.unregister("TestAccessor") is True
+        assert len(registry.list_accessors()) == 0
+
+        assert registry.unregister("NotFound") is False
+
+    def test_clear(self, registry: AccessorRegistry) -> None:
+        """Clear removes all accessors."""
+        registry.register(TestAccessor("a", "a:", 50))
+        registry.register(TestAccessor("b", "b:", 50))
+
+        assert len(registry.list_accessors()) == 2
+        registry.clear()
+        assert len(registry.list_accessors()) == 0
+
+    @pytest.mark.asyncio
+    async def test_access_fallback_to_local(
+        self, registry: AccessorRegistry, tmp_path: Path
+    ) -> None:
+        """access() falls back to local file when no accessor matches."""
+        test_file = tmp_path / "test.txt"
+        test_file.write_text("content")
+
+        result = await registry.access(str(test_file))
+
+        assert result.source_type == "local"
+        assert result.path == test_file
+        assert result.is_temporary is False
+
+    @pytest.mark.asyncio
+    async def test_access_with_accessor(self, registry: AccessorRegistry) -> None:
+        """access() uses the matching accessor."""
+        accessor = TestAccessor("test", "test:", 50)
+        registry.register(accessor)
+
+        result = await registry.access("test:source", extra="value")
+
+        assert result.source_type == "test"
+        assert result.meta == {"accessor": "test", "extra": "value"}
+
+
+class TestGlobalRegistry:
+    """Tests for the global registry."""
+
+    def test_get_accessor_registry(self) -> None:
+        """get_accessor_registry returns a singleton."""
+        r1 = get_accessor_registry()
+        r2 = get_accessor_registry()
+        assert r1 is r2

From 6a2d1898f59d96d1b9a5f18609dc529fe450859d Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 11:30:45 +0800
Subject: [PATCH 02/15] reort: split parser layer to 2-layer: accessor and
 parser, so that we can reuse more code

---
 crates/ov_cli/src/client.rs                   | 12 ++-
 crates/ov_cli/src/commands/resources.rs       | 14 +++
 openviking/parse/accessors/__init__.py        |  8 +-
 .../{feishu.py => feishu_accessor.py}         |  0
 .../accessors/{git.py => git_accessor.py}     |  0
 .../accessors/{http.py => http_accessor.py}   |  0
 openviking/parse/accessors/local_accessor.py  | 99 +++++++++++++++++++
 openviking/parse/accessors/registry.py        | 33 ++++---
 openviking/parse/parsers/directory.py         |  2 +-
 tests/unit/test_accessors_registry.py         |  5 +
 10 files changed, 150 insertions(+), 23 deletions(-)
 rename openviking/parse/accessors/{feishu.py => feishu_accessor.py} (100%)
 rename openviking/parse/accessors/{git.py => git_accessor.py} (100%)
 rename openviking/parse/accessors/{http.py => http_accessor.py} (100%)
 create mode 100644 openviking/parse/accessors/local_accessor.py

diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs
index 77afe352e..5c3ed3ae7 100644
--- a/crates/ov_cli/src/client.rs
+++ b/crates/ov_cli/src/client.rs
@@ -102,14 +102,20 @@ impl HttpClient {
         // Remove Content-Type: application/json, let reqwest set multipart/form-data automatically
         headers.remove(reqwest::header::CONTENT_TYPE);
 
-        let response = self
-            .http
+        // Use a separate HTTP client with a long timeout for large file uploads
+        // Long but finite timeout (30 minutes) to prevent hanging indefinitely
+        let long_timeout_client = reqwest::Client::builder()
+            .timeout(std::time::Duration::from_secs(1800))
+            .build()
+            .map_err(|e| Error::Network(format!("Failed to build long-timeout HTTP client: {}", e)))?;
+
+        let response = long_timeout_client
             .post(&url)
             .headers(headers)
             .multipart(form)
             .send()
             .await
-            .map_err(|e| Error::Network(format!("HTTP request failed: {}", e)))?;
+            .map_err(|e| Error::Network(format!("File upload failed: {}", e)))?;
 
         let result: Value = self.handle_response(response).await?;
         result
diff --git a/crates/ov_cli/src/commands/resources.rs b/crates/ov_cli/src/commands/resources.rs
index dc29317f5..3bb8a38e9 100644
--- a/crates/ov_cli/src/commands/resources.rs
+++ b/crates/ov_cli/src/commands/resources.rs
@@ -37,6 +37,13 @@ pub async fn add_resource(
             watch_interval,
         )
         .await?;
+
+    // Print helpful message for async processing
+    if !wait && matches!(format, OutputFormat::Table) {
+        eprintln!("Note: Resource is being processed in the background.");
+        eprintln!("Use 'ov wait' to wait for completion, or 'ov observer queue' to check status.");
+    }
+
     output_success(&result, format, compact);
     Ok(())
 }
@@ -50,6 +57,13 @@ pub async fn add_skill(
     compact: bool,
 ) -> Result<()> {
     let result = client.add_skill(data, wait, timeout).await?;
+
+    // Print helpful message for async processing
+    if !wait && matches!(format, OutputFormat::Table) {
+        eprintln!("Note: Skill is being processed in the background.");
+        eprintln!("Use 'ov wait' to wait for completion, or 'ov observer queue' to check status.");
+    }
+
     output_success(&result, format, compact);
     Ok(())
 }
diff --git a/openviking/parse/accessors/__init__.py b/openviking/parse/accessors/__init__.py
index 1836f30b5..6ae4fe2c5 100644
--- a/openviking/parse/accessors/__init__.py
+++ b/openviking/parse/accessors/__init__.py
@@ -9,9 +9,10 @@
 """
 
 from .base import DataAccessor, LocalResource
-from .feishu import FeishuAccessor
-from .git import GitAccessor
-from .http import HTTPAccessor
+from .feishu_accessor import FeishuAccessor
+from .git_accessor import GitAccessor
+from .http_accessor import HTTPAccessor
+from .local_accessor import LocalAccessor
 from .registry import (
     AccessorRegistry,
     access,
@@ -30,4 +31,5 @@
     "GitAccessor",
     "HTTPAccessor",
     "FeishuAccessor",
+    "LocalAccessor",
 ]
diff --git a/openviking/parse/accessors/feishu.py b/openviking/parse/accessors/feishu_accessor.py
similarity index 100%
rename from openviking/parse/accessors/feishu.py
rename to openviking/parse/accessors/feishu_accessor.py
diff --git a/openviking/parse/accessors/git.py b/openviking/parse/accessors/git_accessor.py
similarity index 100%
rename from openviking/parse/accessors/git.py
rename to openviking/parse/accessors/git_accessor.py
diff --git a/openviking/parse/accessors/http.py b/openviking/parse/accessors/http_accessor.py
similarity index 100%
rename from openviking/parse/accessors/http.py
rename to openviking/parse/accessors/http_accessor.py
diff --git a/openviking/parse/accessors/local_accessor.py b/openviking/parse/accessors/local_accessor.py
new file mode 100644
index 000000000..48305121b
--- /dev/null
+++ b/openviking/parse/accessors/local_accessor.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+Local file system accessor for OpenViking.
+
+Provides a DataAccessor implementation for local files and directories.
+This is the lowest-priority accessor that handles any path-like source
+that isn't handled by other accessors.
+"""
+
+from pathlib import Path
+from typing import Union
+
+from .base import DataAccessor, LocalResource, SourceType
+
+
+class LocalAccessor(DataAccessor):
+    """
+    Local file system accessor.
+
+    This accessor handles local files and directories. It should be
+    registered with the lowest priority so that it only handles sources
+    that aren't picked up by other accessors (Git, HTTP, Feishu, etc.).
+
+    Features:
+    - Handles any existing local path (file or directory)
+    - Marks resources as non-temporary (since they're already local)
+    - Provides clear local source type metadata
+    """
+
+    def can_handle(self, source: Union[str, Path]) -> bool:
+        """
+        Check if this accessor can handle the source.
+
+        LocalAccessor accepts any source that:
+        1. Is a Path object, OR
+        2. Is a string that looks like a local path and exists, OR
+        3. Is a string that doesn't look like a URL (for fallback)
+
+        Since this is a fallback accessor, it returns True for most sources.
+        The priority system ensures other accessors get a chance first.
+
+        Args:
+            source: Source string or Path object
+
+        Returns:
+            True - this is a fallback accessor that can handle any source
+        """
+        # As the fallback accessor, we can handle anything
+        # The registry will try higher-priority accessors first
+        return True
+
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
+        """
+        Access a local file or directory.
+
+        Simply wraps the local path in a LocalResource without any
+        fetching or copying (since it's already local).
+
+        Args:
+            source: Local file path or Path object
+            **kwargs: Additional arguments (unused for local accessor)
+
+        Returns:
+            LocalResource pointing to the local path
+        """
+        path = Path(source)
+
+        return LocalResource(
+            path=path,
+            source_type=SourceType.LOCAL,
+            original_source=str(source),
+            meta={
+                "filename": path.name,
+                "suffix": path.suffix.lower() if path.suffix else None,
+                "is_dir": path.is_dir() if path.exists() else None,
+            },
+            is_temporary=False,
+        )
+
+    @property
+    def priority(self) -> int:
+        """
+        Priority of this accessor.
+
+        Returns 1 - the lowest priority, ensuring this accessor is only
+        used when no other accessor can handle the source.
+
+        Standard priority levels:
+        - 100: Specific service (Feishu, etc.)
+        - 80: Version control (Git, etc.)
+        - 50: Generic protocols (HTTP, etc.)
+        - 10: Fallback accessors
+        - 1: Local file system (this one)
+
+        Returns:
+            Priority level 1
+        """
+        return 1
diff --git a/openviking/parse/accessors/registry.py b/openviking/parse/accessors/registry.py
index d2bfb8b63..985cb2bf1 100644
--- a/openviking/parse/accessors/registry.py
+++ b/openviking/parse/accessors/registry.py
@@ -10,7 +10,7 @@
 from pathlib import Path
 from typing import List, Optional, Union
 
-from .base import DataAccessor, LocalResource, SourceType
+from .base import DataAccessor, LocalResource
 
 logger = logging.getLogger(__name__)
 
@@ -38,7 +38,7 @@ def _register_defaults(self) -> None:
         """Register default accessors."""
         # GitAccessor - handles git repositories
         try:
-            from .git import GitAccessor
+            from .git_accessor import GitAccessor
 
             self.register(GitAccessor())
         except Exception as e:
@@ -46,7 +46,7 @@ def _register_defaults(self) -> None:
 
         # HTTPAccessor - handles HTTP/HTTPS URLs
         try:
-            from .http import HTTPAccessor
+            from .http_accessor import HTTPAccessor
 
             self.register(HTTPAccessor())
         except Exception as e:
@@ -54,12 +54,20 @@ def _register_defaults(self) -> None:
 
         # FeishuAccessor - handles Feishu/Lark documents
         try:
-            from .feishu import FeishuAccessor
+            from .feishu_accessor import FeishuAccessor
 
             self.register(FeishuAccessor())
         except Exception as e:
             logger.debug(f"[AccessorRegistry] Failed to register FeishuAccessor: {e}")
 
+        # LocalAccessor - handles local files (lowest priority)
+        try:
+            from .local_accessor import LocalAccessor
+
+            self.register(LocalAccessor())
+        except Exception as e:
+            logger.debug(f"[AccessorRegistry] Failed to register LocalAccessor: {e}")
+
     def register(self, accessor: DataAccessor) -> None:
         """
         Register an accessor.
@@ -133,8 +141,6 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
         """
         Access a source by routing to the appropriate accessor.
 
-        If no accessor can handle the source, it is treated as a local file.
-
         Args:
             source: Source string (URL, path, etc.) or Path object
             **kwargs: Additional arguments passed to the accessor
@@ -144,7 +150,7 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
         """
         source_str = str(source)
 
-        # Try to find an accessor
+        # Find an accessor - LocalAccessor should always be registered as fallback
         accessor = self.get_accessor(source)
         if accessor:
             logger.debug(
@@ -152,15 +158,10 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
             )
             return await accessor.access(source, **kwargs)
 
-        # Default: treat as local file
-        logger.debug(f"[AccessorRegistry] No accessor found, treating as local file: {source_str}")
-        path = Path(source)
-        return LocalResource(
-            path=path,
-            source_type=SourceType.LOCAL,
-            original_source=source_str,
-            meta={},
-            is_temporary=False,
+        # This should not happen if LocalAccessor is registered
+        raise RuntimeError(
+            f"No accessor found for source: {source_str}. "
+            "LocalAccessor should be registered as a fallback."
         )
 
     def clear(self) -> None:
diff --git a/openviking/parse/parsers/directory.py b/openviking/parse/parsers/directory.py
index d190d1367..027f13c37 100644
--- a/openviking/parse/parsers/directory.py
+++ b/openviking/parse/parsers/directory.py
@@ -474,7 +474,7 @@ async def _is_git_repository(source_path: Path) -> bool:
     async def _add_git_metadata(source_path: Path, kwargs: dict) -> None:
         """Add git metadata (branch, commit, repo_name) to kwargs dictionary."""
         try:
-            from openviking.parse.accessors.git import GitAccessor
+            from openviking.parse.accessors.git_accessor import GitAccessor
 
             git_accessor = GitAccessor()
 
diff --git a/tests/unit/test_accessors_registry.py b/tests/unit/test_accessors_registry.py
index 858345a69..5d5803a6f 100644
--- a/tests/unit/test_accessors_registry.py
+++ b/tests/unit/test_accessors_registry.py
@@ -124,6 +124,11 @@ async def test_access_fallback_to_local(
         self, registry: AccessorRegistry, tmp_path: Path
     ) -> None:
         """access() falls back to local file when no accessor matches."""
+        from openviking.parse.accessors.local_accessor import LocalAccessor
+
+        # Register LocalAccessor as fallback
+        registry.register(LocalAccessor())
+
         test_file = tmp_path / "test.txt"
         test_file.write_text("content")
 

From daaead12657eff00ba84f036e48bd35d6a1e961f Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 11:39:25 +0800
Subject: [PATCH 03/15] reort: split parser layer to 2-layer: accessor and
 parser, so that we can reuse more code

---
 crates/ov_cli/src/client.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs
index 5c3ed3ae7..6250fa6bf 100644
--- a/crates/ov_cli/src/client.rs
+++ b/crates/ov_cli/src/client.rs
@@ -103,8 +103,10 @@ impl HttpClient {
         headers.remove(reqwest::header::CONTENT_TYPE);
 
         // Use a separate HTTP client with a long timeout for large file uploads
-        // Long but finite timeout (30 minutes) to prevent hanging indefinitely
+        // - Connect timeout: 30 seconds (to fail fast if server is unreachable)
+        // - Total request timeout: 30 minutes (to allow large file transfers)
         let long_timeout_client = reqwest::Client::builder()
+            .connect_timeout(std::time::Duration::from_secs(30))
             .timeout(std::time::Duration::from_secs(1800))
             .build()
             .map_err(|e| Error::Network(format!("Failed to build long-timeout HTTP client: {}", e)))?;

From 42f7160eb7b26eb5992af2e8da37306c4b1021c6 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 12:06:19 +0800
Subject: [PATCH 04/15] reort: split parser layer to 2-layer: accessor and
 parser, so that we can reuse more code

---
 openviking/parse/parsers/pdf.py        | 41 +++++++++++++++++++-------
 openviking/server/local_input_guard.py | 31 +++++++++++++++++--
 openviking/server/routers/resources.py | 32 ++++++++++++++++++--
 3 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/openviking/parse/parsers/pdf.py b/openviking/parse/parsers/pdf.py
index 97c1ce6d1..0372b3e0d 100644
--- a/openviking/parse/parsers/pdf.py
+++ b/openviking/parse/parsers/pdf.py
@@ -93,7 +93,7 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         Args:
             source: Path to PDF file
-            **kwargs: Additional options (currently unused)
+            **kwargs: Additional options (resource_name/source_name for original filename)
 
         Returns:
             ParseResult with document tree
@@ -105,6 +105,9 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         start_time = time.time()
         pdf_path = Path(source)
 
+        # Get resource name from kwargs, prefer original filename from upload
+        resource_name = kwargs.get("resource_name") or kwargs.get("source_name")
+
         if not pdf_path.exists():
             return create_parse_result(
                 root=ResourceNode(type=NodeType.ROOT),
@@ -117,11 +120,19 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
 
         try:
             # Step 1: Convert PDF to Markdown
-            markdown_content, conversion_meta = await self._convert_to_markdown(pdf_path)
+            markdown_content, conversion_meta = await self._convert_to_markdown(
+                pdf_path,
+                resource_name=resource_name,
+            )
 
-            # Step 2: Parse Markdown using MarkdownParser
+            # Step 2: Parse Markdown using MarkdownParser, pass through resource name
             md_parser = self._get_markdown_parser()
-            result = await md_parser.parse_content(markdown_content, source_path=str(pdf_path))
+            result = await md_parser.parse_content(
+                markdown_content,
+                source_path=str(pdf_path),
+                resource_name=resource_name,
+                source_name=resource_name,
+            )
 
             # Step 3: Update metadata for PDF origin
             result.source_format = "pdf"  # Override markdown format
@@ -152,12 +163,17 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
                 warnings=[f"Failed to parse PDF: {e}"],
             )
 
-    async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any]]:
+    async def _convert_to_markdown(
+        self,
+        pdf_path: Path,
+        resource_name: Optional[str] = None,
+    ) -> tuple[str, Dict[str, Any]]:
         """
         Convert PDF to Markdown using configured strategy.
 
         Args:
             pdf_path: Path to PDF file
+            resource_name: Optional resource name for organizing saved images
 
         Returns:
             Tuple of (markdown_content, metadata_dict)
@@ -166,22 +182,22 @@ async def _convert_to_markdown(self, pdf_path: Path) -> tuple[str, Dict[str, Any
             ValueError: If all conversion strategies fail
         """
         if self.config.strategy == "local":
-            return await self._convert_local(pdf_path)
+            return await self._convert_local(pdf_path, resource_name=resource_name)
 
         elif self.config.strategy == "mineru":
-            return await self._convert_mineru(pdf_path)
+            return await self._convert_mineru(pdf_path, resource_name=resource_name)
 
         elif self.config.strategy == "auto":
             # Try local first
             try:
-                return await self._convert_local(pdf_path)
+                return await self._convert_local(pdf_path, resource_name=resource_name)
             except Exception as e:
                 logger.warning(f"Local conversion failed: {e}")
 
                 # Fallback to MinerU if configured
                 if self.config.mineru_endpoint:
                     logger.info("Falling back to MinerU API")
-                    return await self._convert_mineru(pdf_path)
+                    return await self._convert_mineru(pdf_path, resource_name=resource_name)
                 else:
                     raise ValueError(
                         f"Local conversion failed and no MinerU endpoint configured: {e}"
@@ -594,12 +610,17 @@ def _extract_image_from_page(self, page, img_info: dict) -> Optional[bytes]:
             logger.debug(f"Image extraction error: {e}")
             return None
 
-    async def _convert_mineru(self, pdf_path: Path) -> tuple[str, Dict[str, Any]]:
+    async def _convert_mineru(
+        self,
+        pdf_path: Path,
+        resource_name: Optional[str] = None,
+    ) -> tuple[str, Dict[str, Any]]:
         """
         Convert PDF to Markdown using MinerU API.
 
         Args:
             pdf_path: Path to PDF file
+            resource_name: Optional resource name (unused in MinerU conversion)
 
         Returns:
             Tuple of (markdown_content, metadata)
diff --git a/openviking/server/local_input_guard.py b/openviking/server/local_input_guard.py
index d7a08a360..ff70620ef 100644
--- a/openviking/server/local_input_guard.py
+++ b/openviking/server/local_input_guard.py
@@ -4,8 +4,10 @@
 
 from __future__ import annotations
 
+import json
 import re
 from pathlib import Path
+from typing import Optional, Tuple
 
 from openviking.utils.network_guard import ensure_public_remote_target
 from openviking_cli.exceptions import PermissionDeniedError
@@ -51,8 +53,26 @@ def deny_direct_local_skill_input(value: str) -> None:
         )
 
 
-def resolve_uploaded_temp_file_id(temp_file_id: str, upload_temp_dir: Path) -> str:
-    """Resolve a temp upload id to a regular file under the server upload temp dir."""
+def _read_upload_meta(meta_path: Path) -> Optional[dict]:
+    """Read upload metadata file if it exists."""
+    try:
+        if meta_path.exists():
+            with open(meta_path, "r") as f:
+                return json.load(f)
+    except Exception:
+        pass
+    return None
+
+
+def resolve_uploaded_temp_file_id(
+    temp_file_id: str, upload_temp_dir: Path
+) -> Tuple[str, Optional[str]]:
+    """Resolve a temp upload id to a regular file under the server upload temp dir.
+
+    Returns:
+        Tuple of (resolved_file_path, original_filename)
+        original_filename is None if no meta file exists.
+    """
     if not temp_file_id or temp_file_id in {".", ".."}:
         raise PermissionDeniedError(
             "HTTP server only accepts regular files from the upload temp directory."
@@ -90,4 +110,9 @@ def resolve_uploaded_temp_file_id(temp_file_id: str, upload_temp_dir: Path) -> s
             "HTTP server only accepts regular files from the upload temp directory."
         )
 
-    return str(resolved_path)
+    # Try to read original filename from meta file
+    meta_path = upload_temp_dir / f"{temp_file_id}.ov_upload.meta"
+    meta = _read_upload_meta(meta_path)
+    original_filename = meta.get("original_filename") if meta else None
+
+    return (str(resolved_path), original_filename)
diff --git a/openviking/server/routers/resources.py b/openviking/server/routers/resources.py
index 1f03b5190..0d9703402 100644
--- a/openviking/server/routers/resources.py
+++ b/openviking/server/routers/resources.py
@@ -130,6 +130,12 @@ def _cleanup_temp_files(temp_dir: Path, max_age_hours: int = 1):
             if file_age > max_age_seconds:
                 file_path.unlink(missing_ok=True)
 
+                # Also clean up corresponding .ov_upload.meta file
+                if not file_path.name.endswith(".ov_upload.meta"):
+                    meta_path = temp_dir / f"{file_path.name}.ov_upload.meta"
+                    if meta_path.exists():
+                        meta_path.unlink(missing_ok=True)
+
 
 @router.post("/resources/temp_upload")
 async def temp_upload(
@@ -140,6 +146,8 @@ async def temp_upload(
     """Upload a temporary file for add_resource or import_ovpack."""
 
     async def _upload() -> dict[str, str]:
+        import json
+
         config = get_openviking_config()
         temp_dir = config.storage.get_upload_temp_dir()
 
@@ -154,6 +162,16 @@ async def _upload() -> dict[str, str]:
         with open(temp_file_path, "wb") as f:
             f.write(await file.read())
 
+        # Save metadata with original filename
+        if file.filename:
+            meta_path = temp_dir / f"{temp_filename}.ov_upload.meta"
+            meta = {
+                "original_filename": file.filename,
+                "upload_time": time.time(),
+            }
+            with open(meta_path, "w") as f:
+                json.dump(meta, f)
+
         return {"temp_file_id": temp_filename}
 
     execution = await run_operation(
@@ -181,17 +199,25 @@ async def add_resource(
     upload_temp_dir = get_openviking_config().storage.get_upload_temp_dir()
     path = request.path
     allow_local_path_resolution = False
+    original_filename = None
     if request.temp_file_id:
-        path = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir)
+        path, original_filename = resolve_uploaded_temp_file_id(
+            request.temp_file_id, upload_temp_dir
+        )
         allow_local_path_resolution = True
     elif path is not None:
         path = require_remote_resource_source(path)
     if path is None:
         raise InvalidArgumentError("Either 'path' or 'temp_file_id' must be provided.")
 
+    # Use original_filename from upload if source_name not explicitly provided
+    source_name = request.source_name
+    if source_name is None and original_filename is not None:
+        source_name = original_filename
+
     kwargs = {
         "strict": request.strict,
-        "source_name": request.source_name,
+        "source_name": source_name,
         "ignore_dirs": request.ignore_dirs,
         "include": request.include,
         "exclude": request.exclude,
@@ -236,7 +262,7 @@ async def add_skill(
     data = request.data
     allow_local_path_resolution = False
     if request.temp_file_id:
-        data = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir)
+        data, _ = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir)
         allow_local_path_resolution = True
 
     execution = await run_operation(

From 1df49f18140d7f9cc7d166f214c32ace53d9d3f1 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 13:49:49 +0800
Subject: [PATCH 05/15] fix: zip optimization

---
 openviking/parse/accessors/git_accessor.py    | 242 ++++++++++--------
 openviking/parse/parsers/zip_parser.py        |  52 ++--
 openviking/utils/network_guard.py             |  61 ++++-
 .../utils/config/open_viking_config.py        |  14 +-
 4 files changed, 235 insertions(+), 134 deletions(-)

diff --git a/openviking/parse/accessors/git_accessor.py b/openviking/parse/accessors/git_accessor.py
index 1590bca0d..7f2595159 100644
--- a/openviking/parse/accessors/git_accessor.py
+++ b/openviking/parse/accessors/git_accessor.py
@@ -100,8 +100,8 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
         commit = kwargs.get("commit")
 
         try:
-            # Create local temp directory
-            temp_local_dir = tempfile.mkdtemp(prefix="ov_git_")
+            # Create local temp directory (non-blocking)
+            temp_local_dir = await asyncio.to_thread(tempfile.mkdtemp, prefix="ov_git_")
             logger.info(f"[GitAccessor] Created local temp dir for git: {temp_local_dir}")
 
             # Fetch content (Clone or Extract)
@@ -410,38 +410,47 @@ def _download() -> None:
         except Exception as exc:
             raise RuntimeError(f"Failed to download GitHub ZIP {zip_url}: {exc}")
 
-        # Safe extraction with Zip Slip validation
-        target = Path(extract_dir).resolve()
-        with zipfile.ZipFile(zip_path, "r") as zf:
-            for info in zf.infolist():
-                mode = info.external_attr >> 16
-                if info.is_dir() or stat.S_ISDIR(mode):
-                    continue
-                if stat.S_ISLNK(mode):
-                    logger.warning(
-                        f"[GitAccessor] Skipping symlink entry in GitHub ZIP: {info.filename}"
-                    )
-                    continue
-                raw = info.filename.replace("\\", "/")
-                raw_parts = [p for p in raw.split("/") if p]
-                if ".." in raw_parts:
-                    raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
-                if PurePosixPath(raw).is_absolute():
-                    raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
-                extracted = Path(zf.extract(info, extract_dir)).resolve()
-                if not extracted.is_relative_to(target):
-                    extracted.unlink(missing_ok=True)
-                    raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
-
-        # Remove downloaded archive to free disk space.
-        try:
-            os.unlink(zip_path)
-        except OSError:
-            pass
+        # Safe extraction with Zip Slip validation (non-blocking)
+        def _extract_zip():
+            target = Path(extract_dir).resolve()
+            with zipfile.ZipFile(zip_path, "r") as zf:
+                for info in zf.infolist():
+                    mode = info.external_attr >> 16
+                    if info.is_dir() or stat.S_ISDIR(mode):
+                        continue
+                    if stat.S_ISLNK(mode):
+                        logger.warning(
+                            f"[GitAccessor] Skipping symlink entry in GitHub ZIP: {info.filename}"
+                        )
+                        continue
+                    raw = info.filename.replace("\\", "/")
+                    raw_parts = [p for p in raw.split("/") if p]
+                    if ".." in raw_parts:
+                        raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
+                    if PurePosixPath(raw).is_absolute():
+                        raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
+                    extracted = Path(zf.extract(info, extract_dir)).resolve()
+                    if not extracted.is_relative_to(target):
+                        extracted.unlink(missing_ok=True)
+                        raise ValueError(f"Zip Slip detected in GitHub archive: {info.filename!r}")
+
+        await asyncio.to_thread(_extract_zip)
+
+        # Remove downloaded archive to free disk space (non-blocking)
+        def _cleanup_archive():
+            try:
+                os.unlink(zip_path)
+            except OSError:
+                pass
 
-        # GitHub ZIPs have a single top-level directory
-        top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()]
-        content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir)
+        await asyncio.to_thread(_cleanup_archive)
+
+        # GitHub ZIPs have a single top-level directory (non-blocking)
+        def _find_content_dir() -> Path:
+            top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()]
+            return top_level[0] if len(top_level) == 1 else Path(extract_dir)
+
+        content_dir = await asyncio.to_thread(_find_content_dir)
 
         logger.info(f"[GitAccessor] GitHub ZIP extracted to {content_dir} ({repo_name})")
         return content_dir, repo_name
@@ -487,38 +496,47 @@ def _download() -> None:
         except Exception as exc:
             raise RuntimeError(f"Failed to download GitLab ZIP {zip_url}: {exc}")
 
-        # Safe extraction with Zip Slip validation
-        target = Path(extract_dir).resolve()
-        with zipfile.ZipFile(zip_path, "r") as zf:
-            for info in zf.infolist():
-                mode = info.external_attr >> 16
-                if info.is_dir() or stat.S_ISDIR(mode):
-                    continue
-                if stat.S_ISLNK(mode):
-                    logger.warning(
-                        f"[GitAccessor] Skipping symlink entry in GitLab ZIP: {info.filename}"
-                    )
-                    continue
-                raw = info.filename.replace("\\", "/")
-                raw_parts = [p for p in raw.split("/") if p]
-                if ".." in raw_parts:
-                    raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
-                if PurePosixPath(raw).is_absolute():
-                    raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
-                extracted = Path(zf.extract(info, extract_dir)).resolve()
-                if not extracted.is_relative_to(target):
-                    extracted.unlink(missing_ok=True)
-                    raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
-
-        # Remove downloaded archive to free disk space.
-        try:
-            os.unlink(zip_path)
-        except OSError:
-            pass
+        # Safe extraction with Zip Slip validation (non-blocking)
+        def _extract_zip():
+            target = Path(extract_dir).resolve()
+            with zipfile.ZipFile(zip_path, "r") as zf:
+                for info in zf.infolist():
+                    mode = info.external_attr >> 16
+                    if info.is_dir() or stat.S_ISDIR(mode):
+                        continue
+                    if stat.S_ISLNK(mode):
+                        logger.warning(
+                            f"[GitAccessor] Skipping symlink entry in GitLab ZIP: {info.filename}"
+                        )
+                        continue
+                    raw = info.filename.replace("\\", "/")
+                    raw_parts = [p for p in raw.split("/") if p]
+                    if ".." in raw_parts:
+                        raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
+                    if PurePosixPath(raw).is_absolute():
+                        raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
+                    extracted = Path(zf.extract(info, extract_dir)).resolve()
+                    if not extracted.is_relative_to(target):
+                        extracted.unlink(missing_ok=True)
+                        raise ValueError(f"Zip Slip detected in GitLab archive: {info.filename!r}")
 
-        # GitLab ZIPs have a single top-level directory: {repo}-{ref}/
-        top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()]
-        content_dir = top_level[0] if len(top_level) == 1 else Path(extract_dir)
+        await asyncio.to_thread(_extract_zip)
+
+        # Remove downloaded archive to free disk space (non-blocking)
+        def _cleanup_archive():
+            try:
+                os.unlink(zip_path)
+            except OSError:
+                pass
+
+        await asyncio.to_thread(_cleanup_archive)
+
+        # GitLab ZIPs have a single top-level directory: {repo}-{ref}/ (non-blocking)
+        def _find_content_dir() -> Path:
+            top_level = [d for d in Path(extract_dir).iterdir() if d.is_dir()]
+            return top_level[0] if len(top_level) == 1 else Path(extract_dir)
+
+        content_dir = await asyncio.to_thread(_find_content_dir)
 
         logger.info(f"[GitAccessor] GitLab ZIP extracted to {content_dir} ({repo_name})")
         return content_dir, repo_name
@@ -531,51 +549,61 @@ async def _extract_zip(self, zip_path: str, target_dir: str) -> str:
         path = Path(zip_path)
         name = path.stem
 
-        with zipfile.ZipFile(zip_path, "r") as zip_ref:
-            target = Path(target_dir).resolve()
-            for info in zip_ref.infolist():
-                mode = info.external_attr >> 16
-                # Skip directory entries
-                if info.is_dir() or stat.S_ISDIR(mode):
-                    continue
-                # Skip symlink entries
-                if stat.S_ISLNK(mode):
-                    logger.warning(f"[GitAccessor] Skipping symlink entry in zip: {info.filename}")
-                    continue
-                # Reject entries with suspicious raw path components
-                raw = info.filename.replace("\\", "/")
-                raw_parts = [p for p in raw.split("/") if p]
-                if ".." in raw_parts:
-                    raise ValueError(f"Zip Slip detected: entry {info.filename!r} contains '..'")
-                if PurePosixPath(raw).is_absolute() or (len(raw) >= 2 and raw[1] == ":"):
-                    raise ValueError(
-                        f"Zip Slip detected: entry {info.filename!r} is an absolute path"
-                    )
-                # Normalize and verify
-                arcname = info.filename.replace("/", os.sep)
-                if os.path.altsep:
-                    arcname = arcname.replace(os.path.altsep, os.sep)
-                arcname = os.path.splitdrive(arcname)[1]
-                arcname = os.sep.join(p for p in arcname.split(os.sep) if p not in ("", ".", ".."))
-                if not arcname:
-                    continue
-                member_path = (Path(target_dir) / arcname).resolve()
-                if not member_path.is_relative_to(target):
-                    raise ValueError(
-                        f"Zip Slip detected: entry {info.filename!r} escapes target directory"
-                    )
-                # Extract and verify
-                extracted = Path(zip_ref.extract(info, target_dir)).resolve()
-                if not extracted.is_relative_to(target):
-                    # Best-effort cleanup
-                    try:
-                        extracted.unlink(missing_ok=True)
-                    except OSError as cleanup_err:
+        # Extract zip file (non-blocking)
+        def _extract_zip_sync():
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                target = Path(target_dir).resolve()
+                for info in zip_ref.infolist():
+                    mode = info.external_attr >> 16
+                    # Skip directory entries
+                    if info.is_dir() or stat.S_ISDIR(mode):
+                        continue
+                    # Skip symlink entries
+                    if stat.S_ISLNK(mode):
                         logger.warning(
-                            f"[GitAccessor] Failed to clean up escaped file {extracted}: {cleanup_err}"
+                            f"[GitAccessor] Skipping symlink entry in zip: {info.filename}"
+                        )
+                        continue
+                    # Reject entries with suspicious raw path components
+                    raw = info.filename.replace("\\", "/")
+                    raw_parts = [p for p in raw.split("/") if p]
+                    if ".." in raw_parts:
+                        raise ValueError(
+                            f"Zip Slip detected: entry {info.filename!r} contains '..'"
+                        )
+                    if PurePosixPath(raw).is_absolute() or (len(raw) >= 2 and raw[1] == ":"):
+                        raise ValueError(
+                            f"Zip Slip detected: entry {info.filename!r} is an absolute path"
                         )
-                    raise ValueError(
-                        f"Zip Slip detected: entry {info.filename!r} escapes target directory"
+                    # Normalize and verify
+                    arcname = info.filename.replace("/", os.sep)
+                    if os.path.altsep:
+                        arcname = arcname.replace(os.path.altsep, os.sep)
+                    arcname = os.path.splitdrive(arcname)[1]
+                    arcname = os.sep.join(
+                        p for p in arcname.split(os.sep) if p not in ("", ".", "..")
                     )
+                    if not arcname:
+                        continue
+                    member_path = (Path(target_dir) / arcname).resolve()
+                    if not member_path.is_relative_to(target):
+                        raise ValueError(
+                            f"Zip Slip detected: entry {info.filename!r} escapes target directory"
+                        )
+                    # Extract and verify
+                    extracted = Path(zip_ref.extract(info, target_dir)).resolve()
+                    if not extracted.is_relative_to(target):
+                        # Best-effort cleanup
+                        try:
+                            extracted.unlink(missing_ok=True)
+                        except OSError as cleanup_err:
+                            logger.warning(
+                                f"[GitAccessor] Failed to clean up escaped file {extracted}: {cleanup_err}"
+                            )
+                        raise ValueError(
+                            f"Zip Slip detected: entry {info.filename!r} escapes target directory"
+                        )
+
+        await asyncio.to_thread(_extract_zip_sync)
 
         return name
diff --git a/openviking/parse/parsers/zip_parser.py b/openviking/parse/parsers/zip_parser.py
index 5dbce8174..17d782fc2 100644
--- a/openviking/parse/parsers/zip_parser.py
+++ b/openviking/parse/parsers/zip_parser.py
@@ -7,6 +7,7 @@
 Supports nested ZIP files via DirectoryParser's recursive parser invocation.
 """
 
+import asyncio
 import shutil
 import tempfile
 import zipfile
@@ -55,17 +56,28 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
         if not path.exists():
             raise FileNotFoundError(f"ZIP file not found: {path}")
 
-        if not zipfile.is_zipfile(path):
+        # Check if it's a valid ZIP file (non-blocking)
+        def _is_zipfile() -> bool:
+            return zipfile.is_zipfile(path)
+
+        if not await asyncio.to_thread(_is_zipfile):
             raise ValueError(f"Not a valid ZIP file: {path}")
 
-        # Extract ZIP to temporary directory
-        temp_dir = Path(tempfile.mkdtemp(prefix="ov_zip_"))
+        # Extract ZIP to temporary directory (non-blocking)
+        temp_dir = Path(await asyncio.to_thread(tempfile.mkdtemp, prefix="ov_zip_"))
         try:
-            with zipfile.ZipFile(path, "r") as zipf:
-                safe_extract_zip(zipf, temp_dir)
+            # Extract zip (non-blocking)
+            def _extract_zip():
+                with zipfile.ZipFile(path, "r") as zipf:
+                    safe_extract_zip(zipf, temp_dir)
+
+            await asyncio.to_thread(_extract_zip)
 
-            # Check if extracted content has a single root directory
-            extracted_entries = [p for p in temp_dir.iterdir() if p.name not in {".", ".."}]
+            # Check if extracted content has a single root directory (non-blocking)
+            def _list_entries():
+                return [p for p in temp_dir.iterdir() if p.name not in {".", ".."}]
+
+            extracted_entries = await asyncio.to_thread(_list_entries)
 
             # Prepare kwargs for DirectoryParser
             dir_kwargs = dict(kwargs)
@@ -91,22 +103,28 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
             if not result.temp_dir_path:
                 result.temp_dir_path = str(temp_dir)
             else:
-                # If DirectoryParser created its own temp, clean up our extraction dir
-                try:
-                    shutil.rmtree(temp_dir, ignore_errors=True)
-                except Exception:
-                    pass
+                # If DirectoryParser created its own temp, clean up our extraction dir (non-blocking)
+                def _cleanup_temp():
+                    try:
+                        shutil.rmtree(temp_dir, ignore_errors=True)
+                    except Exception:
+                        pass
+
+                await asyncio.to_thread(_cleanup_temp)
 
             result.source_format = "zip"
             result.parser_name = "ZipParser"
             return result
 
         except Exception:
-            # Clean up on error
-            try:
-                shutil.rmtree(temp_dir, ignore_errors=True)
-            except Exception:
-                pass
+            # Clean up on error (non-blocking)
+            def _cleanup_temp_on_error():
+                try:
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+                except Exception:
+                    pass
+
+            await asyncio.to_thread(_cleanup_temp_on_error)
             raise
 
     async def parse_content(
diff --git a/openviking/utils/network_guard.py b/openviking/utils/network_guard.py
index a9dad2c68..ba5d29b08 100644
--- a/openviking/utils/network_guard.py
+++ b/openviking/utils/network_guard.py
@@ -11,6 +11,7 @@
 from urllib.parse import urlparse
 
 from openviking_cli.exceptions import PermissionDeniedError
+from openviking_cli.utils.config import get_openviking_config
 
 RequestValidator = Callable[[str], None]
 
@@ -20,13 +21,48 @@
 }
 
 
+def _get_allowed_code_hosting_domains() -> set[str]:
+    """Get allowed code hosting domains from config."""
+    allowed = set()
+    try:
+        config = get_openviking_config()
+        # Add configured GitHub and GitLab domains
+        if hasattr(config, "code"):
+            if hasattr(config.code, "github_domains"):
+                allowed.update(config.code.github_domains)
+            if hasattr(config.code, "gitlab_domains"):
+                allowed.update(config.code.gitlab_domains)
+            if hasattr(config.code, "code_hosting_domains"):
+                allowed.update(config.code.code_hosting_domains)
+    except Exception:
+        # If config can't be loaded, use defaults
+        allowed.update({"github.com", "gitlab.com", "www.github.com", "www.gitlab.com"})
+    return allowed
+
+
+def _is_allow_private_networks() -> bool:
+    """Check if private networks are allowed by config."""
+    try:
+        config = get_openviking_config()
+        return getattr(config, "allow_private_networks", False)
+    except Exception:
+        return False
+
+
 def extract_remote_host(source: str) -> Optional[str]:
     """Extract the destination host from a remote resource source."""
     if source.startswith("git@"):
         rest = source[4:]
-        if ":" not in rest:
+        # Find the colon separator, handling IPv6 addresses in brackets
+        if "]:" in rest:
+            # IPv6 address: git@[::1]:user/repo.git
+            host_part = rest.split("]:", 1)[0] + "]"
+        elif ":" in rest:
+            # Regular hostname: git@github.com:user/repo.git
+            host_part = rest.split(":", 1)[0]
+        else:
             return None
-        return rest.split(":", 1)[0].strip().strip("[]")
+        return host_part.strip().strip("[]")
 
     parsed = urlparse(source)
     if parsed.hostname is None:
@@ -63,7 +99,12 @@ def _is_public_ip(address: str) -> bool:
 
 
 def ensure_public_remote_target(source: str) -> None:
-    """Reject loopback, link-local, private, and other non-public targets."""
+    """Reject loopback, link-local, private, and other non-public targets.
+
+    Skips validation if:
+    - allow_private_networks is True in config
+    - Host is in configured github_domains/gitlab_domains/code_hosting_domains
+    """
     host = extract_remote_host(source)
     if not host:
         raise PermissionDeniedError(
@@ -77,6 +118,16 @@ def ensure_public_remote_target(source: str) -> None:
             "loopback, link-local, private, and otherwise non-public destinations are not allowed."
         )
 
+    # Check if private networks are allowed globally
+    if _is_allow_private_networks():
+        return
+
+    # Check if host is in allowed code hosting domains
+    allowed_domains = _get_allowed_code_hosting_domains()
+    normalized_domains = {_normalize_host(d) for d in allowed_domains}
+    if normalized_host in normalized_domains:
+        return
+
     resolved_addresses = _resolve_host_addresses(host)
     if not resolved_addresses:
         return
@@ -85,7 +136,9 @@ def ensure_public_remote_target(source: str) -> None:
     if non_public:
         raise PermissionDeniedError(
             "HTTP server only accepts public remote resource targets; "
-            f"host '{host}' resolves to non-public address '{non_public[0]}'."
+            f"host '{host}' resolves to non-public address '{non_public[0]}'. "
+            "To allow this, add the domain to code.gitlab_domains/code.github_domains/code.code_hosting_domains "
+            "or set allow_private_networks=true in your ov.conf."
         )
 
 
diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py
index 9d0ee615c..fab364471 100644
--- a/openviking_cli/utils/config/open_viking_config.py
+++ b/openviking_cli/utils/config/open_viking_config.py
@@ -132,12 +132,6 @@ class OpenVikingConfig(BaseModel):
 
     default_search_limit: int = Field(default=3, description="Default number of results to return")
 
-    enable_memory_decay: bool = Field(default=True, description="Enable automatic memory decay")
-
-    memory_decay_check_interval: int = Field(
-        default=3600, description="Interval (seconds) to check for expired memories"
-    )
-
     language_fallback: str = Field(
         default="en",
         description=(
@@ -146,6 +140,14 @@ class OpenVikingConfig(BaseModel):
         ),
     )
 
+    allow_private_networks: bool = Field(
+        default=False,
+        description=(
+            "Allow fetching resources from private/non-public network addresses. "
+            "When disabled (default), only public IP addresses and hostnames are allowed."
+        ),
+    )
+
     log: LogConfig = Field(default_factory=lambda: LogConfig(), description="Logging configuration")
 
     memory: MemoryConfig = Field(

From bf5b3232f778302cb9798be43065c90769bcaa11 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 13:56:39 +0800
Subject: [PATCH 06/15] fix: zip optimization for gitlab

---
 openviking/parse/accessors/git_accessor.py | 58 +++++++++++++++++++---
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/openviking/parse/accessors/git_accessor.py b/openviking/parse/accessors/git_accessor.py
index 7f2595159..8828e4aea 100644
--- a/openviking/parse/accessors/git_accessor.py
+++ b/openviking/parse/accessors/git_accessor.py
@@ -119,15 +119,57 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
             elif source_str.startswith(("http://", "https://", "git://", "ssh://")):
                 repo_url, branch, commit = self._parse_repo_source(source_str, **kwargs)
                 if self._is_github_url(repo_url):
-                    # Use GitHub ZIP API
-                    local_dir, repo_name = await self._github_zip_download(
-                        repo_url, branch or commit, temp_local_dir
-                    )
+                    # Try GitHub ZIP API first, fall back to git clone
+                    try:
+                        local_dir, repo_name = await self._github_zip_download(
+                            repo_url, branch or commit, temp_local_dir
+                        )
+                    except Exception as zip_exc:
+                        logger.warning(
+                            f"[GitAccessor] GitHub ZIP download failed, falling back to git clone: {zip_exc}"
+                        )
+
+                        # Clean up any partial content before cloning
+                        def _cleanup():
+                            for p in Path(temp_local_dir).iterdir():
+                                if p.is_file():
+                                    p.unlink(missing_ok=True)
+                                elif p.is_dir():
+                                    shutil.rmtree(p, ignore_errors=True)
+
+                        await asyncio.to_thread(_cleanup)
+                        repo_name = await self._git_clone(
+                            repo_url,
+                            temp_local_dir,
+                            branch=branch,
+                            commit=commit,
+                        )
                 elif self._is_gitlab_url(repo_url):
-                    # Use GitLab ZIP API
-                    local_dir, repo_name = await self._gitlab_zip_download(
-                        repo_url, branch or commit, temp_local_dir
-                    )
+                    # Try GitLab ZIP API first, fall back to git clone
+                    try:
+                        local_dir, repo_name = await self._gitlab_zip_download(
+                            repo_url, branch or commit, temp_local_dir
+                        )
+                    except Exception as zip_exc:
+                        logger.warning(
+                            f"[GitAccessor] GitLab ZIP download failed, falling back to git clone: {zip_exc}"
+                        )
+
+                        # Clean up any partial content before cloning
+                        def _cleanup():
+                            for p in Path(temp_local_dir).iterdir():
+                                if p.is_file():
+                                    p.unlink(missing_ok=True)
+                                elif p.is_dir():
+                                    shutil.rmtree(p, ignore_errors=True)
+
+                        await asyncio.to_thread(_cleanup)
+                        repo_name = await self._git_clone(
+                            repo_url,
+                            temp_local_dir,
+                            branch=branch,
+                            commit=commit,
+                        )
                 else:
                     # Non-GitHub/GitLab URL: use git clone
                     repo_name = await self._git_clone(

From 52438286c700d7fa3b09576fb19ba04eab91b18b Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 14:37:39 +0800
Subject: [PATCH 07/15] fix: zip docs

---
 docs/design/parser-two-layer-refactor-plan.md | 440 +++++++-----------
 1 file changed, 176 insertions(+), 264 deletions(-)

diff --git a/docs/design/parser-two-layer-refactor-plan.md b/docs/design/parser-two-layer-refactor-plan.md
index 1d99a7cc7..de67d469d 100644
--- a/docs/design/parser-two-layer-refactor-plan.md
+++ b/docs/design/parser-two-layer-refactor-plan.md
@@ -1,355 +1,267 @@
-# OpenViking 解析器两层架构重构计划
+# OpenViking 解析器两层架构重构
 
 | 项目 | 信息 |
 |-----|------|
-| 状态 | `规划中` |
+| 状态 | `已完成` |
 | 创建日期 | 2026-04-13 |
-| 目标版本 | v6.0 |
+| 完成日期 | 2026-04-14 |
 
 ---
 
-## 一、问题分析
+## 概述
+
+将原有的单一层 Parser 架构拆分为 **Accessor（数据访问层）** 和 **Parser（数据解析层）** 两层，实现职责分离和代码复用。
+
+---
+
+## 目录
+
+- [背景与问题](#背景与问题)
+- [架构设计](#架构设计)
+- [核心抽象](#核心抽象)
+- [实现进度](#实现进度)
+- [文件结构](#文件结构)
+
+---
+
+## 背景与问题
 
 ### 当前架构的问题
 
-1. **平铺式注册**：所有 Parser 在同一层级，职责不清晰
-2. **后缀冲突**：`.zip` 可被 `CodeRepositoryParser` 和 `ZipParser` 同时处理
-3. **URL 处理逻辑分散**：`UnifiedResourceProcessor._process_url()` 和 `ParserRegistry.parse()` 都有 URL 检测
-4. **职责混合**：部分 Parser 既负责下载又负责解析（如 `CodeRepositoryParser`, `HTMLParser`, `FeishuParser`）
+| 问题 | 说明 |
+|-----|------|
+| 平铺式注册 | 所有 Parser 在同一层级，职责不清晰 |
+| 后缀冲突 | `.zip` 可被 `CodeRepositoryParser` 和 `ZipParser` 同时处理 |
+| URL 处理逻辑分散 | `UnifiedResourceProcessor._process_url()` 和 `ParserRegistry.parse()` 都有 URL 检测 |
+| 职责混合 | 部分 Parser 既负责下载又负责解析 |
+
+### 重构目标
+
+1. **职责分离**：数据获取 ≠ 数据解析
+2. **代码复用**：HTTP/Git 下载逻辑可被多个 Parser 复用
+3. **易于扩展**：新增数据源只需添加 Accessor
+4. **解决冲突**：通过优先级机制解决 `.zip` 等后缀冲突
 
 ---
 
-## 二、新架构设计
+## 架构设计
 
-### 核心概念
+### 两层架构
 
 | 层级 | 抽象接口 | 职责 | 示例 |
 |-----|---------|------|------|
-| **L1** | `DataAccessor` | 获取数据：将远程 URL / 特殊路径 → 本地文件/目录 | GitAccessor, HTTPAccessor, FeishuAccessor |
-| **L2** | `DataParser` | 解析数据：本地文件/目录 → `ParseResult` | MarkdownParser, PDFParser, DirectoryParser |
-| **混合** | `HybridParser` | 同时实现两个接口（简化插件开发） | （按需使用） |
+| **L1: Accessor** | `DataAccessor` | 获取数据：远程 URL / 特殊路径 → 本地文件/目录 | `GitAccessor`, `HTTPAccessor`, `FeishuAccessor` |
+| **L2: Parser** | `BaseParser` | 解析数据：本地文件/目录 → `ParseResult` | `MarkdownParser`, `PDFParser`, `ZipParser` |
 
-### 新的调用流程
+### 调用流程
 
 ```
 add_resource(path)
     ↓
 ResourceProcessor.process_resource()
     ↓
-UnifiedResourceProcessor.process()  [重构]
+UnifiedResourceProcessor.process()
     ↓
-【第一阶段：数据访问】
-AccessorRegistry.route(source)
-    ├─→ 是 URL/远程资源?
-    │     ├─→ GitAccessor (git@, git://, github.com, ...)
-    │     ├─→ FeishuAccessor (.feishu.cn, .larksuite.com)
-    │     ├─→ HTTPAccessor (http://, https://)
-    │     └─→ 其他 → 下一步
-    └─→ 返回: LocalResource (本地路径 + 元数据)
-           ↓
-【第二阶段：数据解析】
-ParserRegistry.route(local_resource)
-    ├─→ 是目录? → DirectoryParser
-    ├─→ 是文件? → 按扩展名匹配
-    │     ├─→ .md → MarkdownParser
-    │     ├─→ .pdf → PDFParser
-    │     ├─→ .zip → ZipParser
-    │     └─→ ...
-    └─→ 返回: ParseResult
-           ↓
+┌─────────────────────────────────────────┐
+│  第一阶段：数据访问 (Accessor Layer)     │
+├─────────────────────────────────────────┤
+│  AccessorRegistry.route(source)         │
+│    ├─→ GitAccessor    (priority: 80)   │
+│    ├─→ FeishuAccessor (priority: 100)  │
+│    ├─→ HTTPAccessor   (priority: 50)   │
+│    └─→ LocalAccessor  (priority: 10)   │
+│         ↓                                │
+│  返回: LocalResource                     │
+│         (本地路径 + 元数据)              │
+└─────────────────────────────────────────┘
+                    ↓
+┌─────────────────────────────────────────┐
+│  第二阶段：数据解析 (Parser Layer)       │
+├─────────────────────────────────────────┤
+│  ParserRegistry.route(local_resource)   │
+│    ├─→ 是目录? → DirectoryParser        │
+│    ├─→ 是文件? → 按扩展名匹配           │
+│    │     ├─→ .md  → MarkdownParser     │
+│    │     ├─→ .pdf → PDFParser          │
+│    │     ├─→ .zip → ZipParser          │
+│    │     └─→ ...                        │
+│    └─→ 返回: ParseResult                │
+└─────────────────────────────────────────┘
+                    ↓
 TreeBuilder + SemanticQueue (保持不变)
 ```
 
 ---
 
-## 三、详细设计
+## 核心抽象
 
-### 3.1 核心抽象接口
+### 1. LocalResource（数据类）
 
-#### 文件 1: `openviking/parse/accessors/base.py`
+位置：`openviking/parse/accessors/base.py`
 
-```python
-from abc import ABC, abstractmethod
-from pathlib import Path
-from typing import Optional, Dict, Any, Union
-from dataclasses import dataclass
+表示一个可在本地访问的资源，是 Accessor 层的输出。
 
+```python
 @dataclass
 class LocalResource:
-    """数据访问层的输出：表示可访问的本地资源"""
     path: Path                    # 本地文件/目录路径
-    source_type: str              # 原始来源类型: "git", "http", "feishu", "local", ...
+    source_type: str              # 原始来源类型 (SourceType.GIT/HTTP/FEISHU/LOCAL)
     original_source: str           # 原始 source 字符串
-    meta: Dict[str, Any]          # 元数据（repo_name, branch, 等）
+    meta: Dict[str, Any]          # 元数据（repo_name, branch, content_type 等）
     is_temporary: bool = True      # 是否为临时文件，解析后可清理
 
-class DataAccessor(ABC):
-    """数据访问器：负责获取数据到本地"""
+    def cleanup(self) -> None     # 清理临时资源
+    def __enter__/__exit__         # 支持上下文管理器
+```
 
+### 2. DataAccessor（抽象基类）
+
+位置：`openviking/parse/accessors/base.py`
+
+```python
+class DataAccessor(ABC):
     @abstractmethod
-    def can_handle(self, source: Union[str, Path]) -> bool:
+    def can_handle(self, source: Union[str, Path]) -> bool
         """判断是否能处理该来源"""
-        pass
 
     @abstractmethod
-    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
-        """
-        获取数据到本地
-
-        返回: LocalResource，包含本地路径和元数据
-        """
-        pass
+    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource
+        """获取数据到本地，返回 LocalResource"""
 
     @property
     @abstractmethod
-    def priority(self) -> int:
+    def priority(self) -> int
+        """优先级：数字越大优先级越高
+           - 100: 特定服务 (Feishu)
+           - 80: 版本控制 (Git)
+           - 50: 通用协议 (HTTP)
+           - 10: 兜底 (Local)
         """
-        优先级，数字越大优先级越高
-        用于解决冲突：多个 Accessor 都 can_handle 时，选优先级高的
-        """
-        pass
 
-    def cleanup(self, resource: LocalResource) -> None:
-        """
-        清理临时资源（可选）
-        默认：如果是临时资源，删除本地文件/目录
-        """
-        if resource.is_temporary:
-            # 实现清理逻辑
-            pass
+    def cleanup(self, resource: LocalResource) -> None
+        """清理资源（默认调用 resource.cleanup()）"""
 ```
 
-#### 文件 2: `openviking/parse/parsers/base_parser.py` (重构)
-
-```python
-# 保持现有 BaseParser 接口，但明确其职责为 DataParser
-# 可以考虑重命名为 DataParser 或保留 BaseParser 作为别名
-```
+### 3. AccessorRegistry
 
-#### 文件 3: `openviking/parse/accessors/registry.py`
+位置：`openviking/parse/accessors/registry.py`
 
 ```python
-from typing import Union, Path, Optional, List
-from .base import DataAccessor, LocalResource
-
 class AccessorRegistry:
-    """数据访问器注册表"""
-
-    def __init__(self):
-        self._accessors: List[DataAccessor] = []
-
-    def register(self, accessor: DataAccessor) -> None:
-        """注册访问器（按优先级插入）"""
-        # 按 priority 降序插入
-        idx = 0
-        for i, a in enumerate(self._accessors):
-            if accessor.priority > a.priority:
-                idx = i
-                break
-        else:
-            idx = len(self._accessors)
-        self._accessors.insert(idx, accessor)
-
-    def get_accessor(self, source: Union[str, Path]) -> Optional[DataAccessor]:
-        """获取能处理该 source 的访问器（按优先级）"""
-        for accessor in self._accessors:
-            if accessor.can_handle(source):
-                return accessor
-        return None
-
-    async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
-        """
-        路由到合适的访问器获取数据
-
-        如果没有访问器能处理，视为本地文件返回
-        """
-        accessor = self.get_accessor(source)
-        if accessor:
-            return await accessor.access(source, **kwargs)
-
-        # 默认：视为本地文件
-        path = Path(source)
-        return LocalResource(
-            path=path,
-            source_type="local",
-            original_source=str(source),
-            meta={},
-            is_temporary=False
-        )
-```
-
-### 3.2 现有 Parser 拆分方案
-
-| 当前 Parser | 拆分后 | 说明 |
-|-----------|--------|------|
-| `CodeRepositoryParser` | `GitAccessor` + `DirectoryParser` | Git clone 逻辑移到 Accessor |
-| `HTMLParser` | `HTTPAccessor` + `HTMLParser` | HTTP 下载移到 Accessor |
-| `FeishuParser` | `FeishuAccessor` + (新) `FeishuDocumentParser` | 飞书 API 调用移到 Accessor |
-| `ZipParser` | 保持为 `DataParser` | 只处理本地 .zip 文件 |
-| `DirectoryParser` | 保持为 `DataParser` | 只处理本地目录 |
-| 其他 (Markdown, PDF, ...) | 保持为 `DataParser` | 无需变动 |
-
-### 3.3 Accessor 实现示例
-
-#### `GitAccessor` (`openviking/parse/accessors/git.py`)
-
-```python
-# 从 CodeRepositoryParser 提取 git clone / GitHub ZIP 下载逻辑
-```
+    def __init__(self, register_default: bool = True)
+        """初始化注册表，可选是否注册默认 Accessor"""
 
-#### `HTTPAccessor` (`openviking/parse/accessors/http.py`)
+    def register(self, accessor: DataAccessor) -> None
+        """注册 Accessor（按优先级降序插入）"""
 
-```python
-# 从 HTMLParser 提取下载逻辑
-# 支持内容类型检测，下载到临时文件
-```
+    def unregister(self, accessor_name: str) -> bool
+        """注销 Accessor"""
 
-#### `FeishuAccessor` (`openviking/parse/accessors/feishu.py`)
+    def get_accessor(self, source) -> Optional[DataAccessor]
+        """获取能处理该 source 的最高优先级 Accessor"""
 
-```python
-# 从 FeishuParser 提取 API 调用逻辑
+    async def access(self, source, **kwargs) -> LocalResource
+        """路由到合适的 Accessor 获取数据"""
 ```
 
-### 3.4 HybridParser（简化插件开发）
-
-对于简单的插件，允许同时实现两个接口：
-
-```python
-class HybridParser(DataAccessor, DataParser):
-    """
-    混合解析器：同时实现访问和解析
-    适用于简单的自定义解析器，不需要拆分
-    """
-    # 同时实现 DataAccessor 和 DataParser 的接口
-    pass
-```
+默认注册的 Accessor（按优先级）：
+1. `FeishuAccessor` (100) - 处理飞书/ Lark 文档
+2. `GitAccessor` (80) - 处理 Git 仓库
+3. `HTTPAccessor` (50) - 处理 HTTP/HTTPS URL
+4. `LocalAccessor` (10) - 处理本地文件（兜底）
 
 ---
 
-## 四、实施步骤
-
-### Phase 1: 基础设施（核心接口）
-
-**目标**：建立新架构的基础框架，不影响现有功能
-
-- [ ] 创建 `openviking/parse/accessors/` 目录结构
-- [ ] 实现 `DataAccessor` 抽象基类和 `LocalResource` 数据类
-- [ ] 实现 `AccessorRegistry`
-- [ ] 编写单元测试
-- [ ] 更新 `ParserRegistry`，保持向后兼容
-
-### Phase 2: 重构 CodeRepositoryParser
-
-**目标**：将 CodeRepositoryParser 拆分为 GitAccessor + DirectoryParser
+## 实现进度
 
-- [ ] 实现 `GitAccessor`（从 `CodeRepositoryParser` 提取 clone/download 逻辑）
-- [ ] 更新 `CodeRepositoryParser` 为混合模式（向后兼容）或标记为 deprecated
-- [ ] 在 `AccessorRegistry` 中注册 `GitAccessor`
-- [ ] 更新 `UnifiedResourceProcessor.process()` 使用新流程
-- [ ] 测试：Git URL, GitHub URL, 本地 .zip 等场景
+### ✅ 已完成
 
-### Phase 3: 重构 HTMLParser 和 FeishuParser
-
-- [ ] 实现 `HTTPAccessor`
-- [ ] 实现 `FeishuAccessor`
-- [ ] 重构 `HTMLParser` 只负责解析本地 HTML 文件
-- [ ] 重构 `FeishuParser` 只负责解析下载后的内容
-
-### Phase 4: 重构 UnifiedResourceProcessor
-
-**目标**：简化 `UnifiedResourceProcessor`，使用新的两层架构
-
-- [ ] 重构 `UnifiedResourceProcessor.process()`：
-  ```python
-  async def process(self, source, ...):
-      # 第一阶段：获取数据
-      local_resource = await accessor_registry.access(source, **kwargs)
-
-      # 第二阶段：解析数据
-      parse_result = await parser_registry.parse(local_resource, **kwargs)
-
-      # 清理（如果需要）
-      # ...
-
-      return parse_result
-  ```
-- [ ] 移除 `_process_url()`, `_process_file()`, `_process_directory()` 中的重复逻辑
-
-### Phase 5: 扩展和优化
-
-- [ ] 实现优先级机制解决冲突
-- [ ] 添加 `HybridParser` 支持
-- [ ] 编写迁移文档
-- [ ] 性能优化和测试覆盖
+- [x] 创建 `openviking/parse/accessors/` 目录结构
+- [x] 实现 `DataAccessor` 抽象基类和 `LocalResource` 数据类
+- [x] 实现 `AccessorRegistry`（含优先级机制）
+- [x] 实现 `GitAccessor` - 处理 Git 仓库
+- [x] 实现 `HTTPAccessor` - 处理 HTTP URL
+- [x] 实现 `FeishuAccessor` - 处理飞书文档
+- [x] 实现 `LocalAccessor` - 处理本地文件
+- [x] 全局注册表 `get_accessor_registry()`
+- [x] 更新 `PDFParser`、`resources.py`、`local_input_guard.py` 使用新架构
 
 ---
 
-## 五、兼容性策略
-
-### 5.1 向后兼容
-
-1. **保持现有 API 不变**：
-   - `ParserRegistry` 接口保持不变
-   - `registry.parse()` 仍然可以工作
-   - 自定义 Parser 注册方式不变
-
-2. **渐进式迁移**：
-   - 现有 Parser 可以继续使用
-   - 新 Parser 鼓励使用新架构
-   - 提供迁移指南
-
-### 5.2 废弃策略
-
-- 标记旧的 `CodeRepositoryParser` 等为 `@deprecated`
-- 在 v6.0 或未来版本移除
-
----
-
-## 六、文件结构变更
+## 文件结构
 
 ```
 openviking/parse/
-├── accessors/              [新增]
+├── accessors/                    # ✅ 新增：数据访问层
 │   ├── __init__.py
-│   ├── base.py            # DataAccessor, LocalResource
-│   ├── registry.py        # AccessorRegistry
-│   ├── git.py             # GitAccessor
-│   ├── http.py            # HTTPAccessor
-│   └── feishu.py          # FeishuAccessor
-├── parsers/
-│   ├── base_parser.py     # 明确为 DataParser
-│   ├── code/              # 保留但简化
+│   ├── base.py                  # DataAccessor, LocalResource, SourceType
+│   ├── registry.py              # AccessorRegistry
+│   ├── git_accessor.py          # GitAccessor
+│   ├── http_accessor.py         # HTTPAccessor
+│   ├── feishu_accessor.py       # FeishuAccessor
+│   └── local_accessor.py        # LocalAccessor
+├── parsers/                      # 数据解析层（保持不变）
+│   ├── base_parser.py
 │   ├── markdown.py
 │   ├── pdf.py
+│   ├── zip_parser.py
 │   └── ...
-├── registry.py            # ParserRegistry (重构)
-└── ...
+└── registry.py                   # ParserRegistry（保持不变）
 ```
 
 ---
 
-## 七、测试计划
+## 使用示例
 
-| 测试类型 | 测试内容 |
-|---------|---------|
-| 单元测试 | AccessorRegistry, 各 Accessor, 各 Parser |
-| 集成测试 | 完整流程：add_resource → Accessor → Parser → TreeBuilder |
-| 回归测试 | 确保现有功能不被破坏 |
-| 冲突测试 | 测试优先级机制解决 .zip, URL 等冲突场景 |
+### 使用 AccessorRegistry
 
----
+```python
+from openviking.parse.accessors import get_accessor_registry
+
+# 获取全局注册表
+registry = get_accessor_registry()
+
+# 访问资源（自动路由）
+async with await registry.access("https://github.com/user/repo") as resource:
+    print(f"本地路径: {resource.path}")
+    print(f"来源类型: {resource.source_type}")
+    # 使用 resource.path 进行解析...
+    # 退出 with 块时自动清理临时资源
+```
 
-## 八、风险与应对
+### 自定义 Accessor
 
-| 风险 | 影响 | 概率 | 应对措施 |
-|-----|------|------|---------|
-| 重构范围过大 | 高 | 中 | 分阶段实施，每阶段可独立发布 |
-| 性能回退 | 中 | 低 | 保持缓存机制，性能基准测试 |
-| 自定义插件受影响 | 高 | 低 | 保持向后兼容，提供迁移工具 |
+```python
+from openviking.parse.accessors import DataAccessor, LocalResource
+
+class MyAccessor(DataAccessor):
+    @property
+    def priority(self) -> int:
+        return 90
+
+    def can_handle(self, source: str) -> bool:
+        return source.startswith("myprotocol://")
+
+    async def access(self, source: str, **kwargs) -> LocalResource:
+        # 获取数据到本地...
+        temp_path = self._create_temp_dir()
+        # ... 下载逻辑 ...
+        return LocalResource(
+            path=temp_path,
+            source_type="myprotocol",
+            original_source=source,
+            meta={},
+            is_temporary=True
+        )
+
+# 注册
+registry = get_accessor_registry()
+registry.register(MyAccessor())
+```
 
 ---
 
-## 九、相关文档
+## 相关文档
 
 - [解析系统 README](../../openviking/parse/parsers/README.md)
 - [OpenViking 整体架构](../zh/concepts/01-architecture.md)

From a3d2a13a7dc704b8c695d4f777c7b5724e726496 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 14:59:12 +0800
Subject: [PATCH 08/15] fix: download http files

---
 openviking/parse/accessors/http_accessor.py | 175 ++++++++++++++++++--
 openviking/parse/parsers/html.py            |  29 ++++
 2 files changed, 193 insertions(+), 11 deletions(-)

diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py
index 1d228889a..f4ed4f7fe 100644
--- a/openviking/parse/accessors/http_accessor.py
+++ b/openviking/parse/accessors/http_accessor.py
@@ -5,14 +5,25 @@
 
 Fetches HTTP/HTTPS URLs and makes them available as local files.
 This is the DataAccessor layer extracted from HTMLParser.
+
+Features:
+- Downloads web pages to local HTML files
+- Downloads files (PDF, Markdown, etc.) to local files
+- Supports GitHub/GitLab blob to raw URL conversion
+- Follows redirects
+- Network guard integration
+- Detailed error classification (network, timeout, auth, etc.)
+- Content-Type detection for URLs without file extensions
 """
 
 import tempfile
+from enum import Enum
 from pathlib import Path
-from typing import Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
 from urllib.parse import unquote, urlparse
 
 from openviking.parse.base import lazy_import
+from openviking.parse.parsers.constants import CODE_EXTENSIONS
 from openviking.utils.network_guard import build_httpx_request_validation_hooks
 from openviking_cli.utils.logger import get_logger
 
@@ -21,6 +32,134 @@
 logger = get_logger(__name__)
 
 
+class URLType(Enum):
+    """URL content types."""
+
+    WEBPAGE = "webpage"  # HTML webpage to parse
+    DOWNLOAD_PDF = "download_pdf"  # PDF file download link
+    DOWNLOAD_MD = "download_md"  # Markdown file download link
+    DOWNLOAD_TXT = "download_txt"  # Text file download link
+    DOWNLOAD_HTML = "download_html"  # HTML file download link
+    UNKNOWN = "unknown"  # Unknown or unsupported type
+
+
+class URLTypeDetector:
+    """
+    Detector for URL content types.
+
+    Uses extension and HTTP HEAD request to determine if a URL is:
+    - A webpage to scrape
+    - A file download link (and what type)
+    """
+
+    # Extension to URL type mapping
+    # CODE_EXTENSIONS spread comes first so explicit entries below override
+    # (e.g., .html/.htm -> DOWNLOAD_HTML instead of DOWNLOAD_TXT)
+    EXTENSION_MAP: Dict[str, URLType] = {
+        **dict.fromkeys(CODE_EXTENSIONS, URLType.DOWNLOAD_TXT),
+        ".pdf": URLType.DOWNLOAD_PDF,
+        ".md": URLType.DOWNLOAD_MD,
+        ".markdown": URLType.DOWNLOAD_MD,
+        ".txt": URLType.DOWNLOAD_TXT,
+        ".text": URLType.DOWNLOAD_TXT,
+        ".html": URLType.DOWNLOAD_HTML,
+        ".htm": URLType.DOWNLOAD_HTML,
+    }
+
+    # Content-Type to URL type mapping
+    CONTENT_TYPE_MAP: Dict[str, URLType] = {
+        "application/pdf": URLType.DOWNLOAD_PDF,
+        "text/markdown": URLType.DOWNLOAD_MD,
+        "text/plain": URLType.DOWNLOAD_TXT,
+        "text/html": URLType.WEBPAGE,
+        "application/xhtml+xml": URLType.WEBPAGE,
+    }
+
+    # URLType to file extension mapping
+    URL_TYPE_TO_EXT: Dict[URLType, str] = {
+        URLType.WEBPAGE: ".html",
+        URLType.DOWNLOAD_PDF: ".pdf",
+        URLType.DOWNLOAD_MD: ".md",
+        URLType.DOWNLOAD_TXT: ".txt",
+        URLType.DOWNLOAD_HTML: ".html",
+        URLType.UNKNOWN: ".html",
+    }
+
+    def __init__(self, timeout: float = 10.0):
+        """Initialize URL type detector."""
+        self.timeout = timeout
+
+    async def detect(
+        self,
+        url: str,
+        request_validator=None,
+    ) -> Tuple[URLType, Dict[str, Any]]:
+        """
+        Detect URL content type.
+
+        Args:
+            url: URL to detect
+            request_validator: Optional network request validator
+
+        Returns:
+            (URLType, metadata dict)
+        """
+        meta = {"url": url, "detected_by": "unknown"}
+        parsed = urlparse(url)
+        path_lower = parsed.path.lower()
+
+        # 1. Check extension first
+        for ext, url_type in self.EXTENSION_MAP.items():
+            if path_lower.endswith(ext):
+                meta["detected_by"] = "extension"
+                meta["extension"] = ext
+                return url_type, meta
+
+        # 2. Send HEAD request to check Content-Type
+        try:
+            httpx = lazy_import("httpx")
+            client_kwargs = {
+                "timeout": self.timeout,
+                "follow_redirects": True,
+            }
+            event_hooks = build_httpx_request_validation_hooks(request_validator)
+            if event_hooks:
+                client_kwargs["event_hooks"] = event_hooks
+                client_kwargs["trust_env"] = False
+
+            async with httpx.AsyncClient(**client_kwargs) as client:
+                response = await client.head(url)
+                content_type = response.headers.get("content-type", "").lower()
+
+                # Remove charset info
+                if ";" in content_type:
+                    content_type = content_type.split(";")[0].strip()
+
+                meta["content_type"] = content_type
+                meta["detected_by"] = "content_type"
+                meta["status_code"] = response.status_code
+
+                # Map content type
+                for ct_prefix, url_type in self.CONTENT_TYPE_MAP.items():
+                    if content_type.startswith(ct_prefix):
+                        return url_type, meta
+
+                # Default to webpage for HTML-like content
+                if "html" in content_type or "xml" in content_type:
+                    return URLType.WEBPAGE, meta
+
+        except Exception as e:
+            meta["detection_error"] = str(e)
+            logger.debug(f"[URLTypeDetector] HEAD request failed: {e}, falling back to default")
+
+        # 3. Default: assume webpage
+        return URLType.WEBPAGE, meta
+
+    def get_extension_for_type(self, url_type: URLType) -> str:
+        """Get file extension for URL type."""
+        return self.URL_TYPE_TO_EXT.get(url_type, ".html")
+
+
 class HTTPAccessor(DataAccessor):
     """
     Accessor for HTTP/HTTPS URLs.
@@ -50,6 +189,7 @@ def __init__(
         """Initialize HTTP accessor."""
         self.timeout = timeout
         self.user_agent = user_agent or self.DEFAULT_USER_AGENT
+        self._url_detector = URLTypeDetector(timeout=min(timeout, 10.0))
 
     @property
     def priority(self) -> int:
@@ -83,16 +223,19 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
         request_validator = kwargs.get("request_validator")
 
         # Download the URL
-        temp_path = await self._download_url(
+        temp_path, url_type, meta = await self._download_url(
             source_str,
             request_validator=request_validator,
         )
 
         # Build metadata
-        meta = {
-            "url": source_str,
-            "downloaded": True,
-        }
+        meta.update(
+            {
+                "url": source_str,
+                "downloaded": True,
+                "url_type": url_type.value,
+            }
+        )
 
         return LocalResource(
             path=Path(temp_path),
@@ -124,7 +267,7 @@ async def _download_url(
         self,
         url: str,
         request_validator=None,
-    ) -> str:
+    ) -> Tuple[str, URLType, Dict[str, Any]]:
         """
         Download URL content to a temporary file.
 
@@ -133,23 +276,33 @@ async def _download_url(
             request_validator: Optional network request validator
 
         Returns:
-            Path to the temporary file
+            Tuple of (path to temporary file, URLType, metadata dict)
         """
         httpx = lazy_import("httpx")
 
         # Convert GitHub/GitLab blob URLs to raw
         url = self._convert_to_raw_url(url)
 
-        # Determine file extension from URL (decode first to handle encoded paths)
+        # Detect URL type first to get proper extension
+        url_type, detect_meta = await self._url_detector.detect(
+            url,
+            request_validator=request_validator,
+        )
+
+        # Determine file extension
         parsed = urlparse(url)
         decoded_path = unquote(parsed.path)
-        ext = Path(decoded_path).suffix or ".html"
+        ext = Path(decoded_path).suffix
+        if not ext:
+            ext = self._url_detector.get_extension_for_type(url_type)
 
         # Create temp file
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
         temp_path = temp_file.name
         temp_file.close()
 
+        meta = {**detect_meta, "extension": ext}
+
         try:
             # Download content
             client_kwargs = {
@@ -190,7 +343,7 @@ async def _download_url(
                 # Write to temp file
                 Path(temp_path).write_bytes(response.content)
 
-            return temp_path
+            return temp_path, url_type, meta
         except Exception:
             # Clean up on error
             try:
diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py
index 24331113c..3acf32b1d 100644
--- a/openviking/parse/parsers/html.py
+++ b/openviking/parse/parsers/html.py
@@ -25,6 +25,14 @@
 logger = __import__("openviking_cli.utils.logger").utils.logger.get_logger(__name__)
 
 
+# Backward compatibility: Re-export URLType and URLTypeDetector from http_accessor
+try:
+    from openviking.parse.accessors.http_accessor import URLType, URLTypeDetector
+except ImportError:
+    URLType = None
+    URLTypeDetector = None
+
+
 class HTMLParser(BaseParser):
     """
     Parser for local HTML files.
@@ -43,6 +51,27 @@ def __init__(self):
         """Initialize HTML parser."""
         pass
 
+    @staticmethod
+    def _extract_filename_from_url(url: str) -> str:
+        """
+        Extract and URL-decode the original filename from a URL.
+
+        Args:
+            url: URL to extract filename from
+
+        Returns:
+            Decoded filename (e.g., "schemas.py" from ".../schemas.py")
+            Falls back to "download" if no filename can be extracted.
+        """
+        from pathlib import Path
+        from urllib.parse import unquote, urlparse
+
+        parsed = urlparse(url)
+        # URL-decode path to handle encoded characters (e.g., %E7%99%BE -> Chinese chars)
+        decoded_path = unquote(parsed.path)
+        basename = Path(decoded_path).name
+        return basename if basename else "download"
+
     def _get_readabilipy(self):
         """Lazy import of readabilipy."""
         if not hasattr(self, "_readabilipy") or self._readabilipy is None:

From abf19d5028fdd8b9b021f02e7346054ce2fe878c Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 15:34:09 +0800
Subject: [PATCH 09/15] fix: download pdf files, and fit IANA standards

---
 openviking/parse/accessors/http_accessor.py | 269 +++++++++++---
 openviking/parse/accessors/mime_types.py    | 391 ++++++++++++++++++++
 2 files changed, 617 insertions(+), 43 deletions(-)
 create mode 100644 openviking/parse/accessors/mime_types.py

diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py
index f4ed4f7fe..258200fec 100644
--- a/openviking/parse/accessors/http_accessor.py
+++ b/openviking/parse/accessors/http_accessor.py
@@ -13,7 +13,7 @@
 - Follows redirects
 - Network guard integration
 - Detailed error classification (network, timeout, auth, etc.)
-- Content-Type detection for URLs without file extensions
+- IANA Media Type (MIME) based content detection for URLs without file extensions
 """
 
 import tempfile
@@ -28,12 +28,13 @@
 from openviking_cli.utils.logger import get_logger
 
 from .base import DataAccessor, LocalResource, SourceType
+from .mime_types import MEDIA_TYPE_ALIASES, IANAMediaType, get_preferred_extension
 
 logger = get_logger(__name__)
 
 
 class URLType(Enum):
-    """URL content types."""
+    """URL content types for routing to appropriate parsers."""
 
     WEBPAGE = "webpage"  # HTML webpage to parse
     DOWNLOAD_PDF = "download_pdf"  # PDF file download link
@@ -47,12 +48,18 @@ class URLTypeDetector:
     """
     Detector for URL content types.
 
-    Uses extension and HTTP HEAD request to determine if a URL is:
-    - A webpage to scrape
-    - A file download link (and what type)
+    Uses IANA Media Type (MIME) standards for robust content detection:
+    1. Check file extension (fast path)
+    2. Check Content-Disposition header for filename (most reliable)
+    3. Check Content-Type header (IANA standard media types)
+    4. Fall back to default behavior
+
+    References:
+        - RFC 6838: Media Type Specifications and Registration Procedures
+        - RFC 6266: Use of the Content-Disposition Header Field in HTTP
     """
 
-    # Extension to URL type mapping
+    # === Extension to URL type mapping ===
     # CODE_EXTENSIONS spread comes first so explicit entries below override
     # (e.g., .html/.htm -> DOWNLOAD_HTML instead of DOWNLOAD_TXT)
     EXTENSION_MAP: Dict[str, URLType] = {
@@ -66,13 +73,27 @@ class URLTypeDetector:
         ".htm": URLType.DOWNLOAD_HTML,
     }
 
-    # Content-Type to URL type mapping
-    CONTENT_TYPE_MAP: Dict[str, URLType] = {
+    # === IANA Media Type to URL type mapping ===
+    # Maps IANA registered media types to our internal URLType
+    # Patterns can be:
+    #   - Exact match: "application/pdf"
+    #   - Wildcard: "text/*"
+    #   - Type only: "image" (treated as "image/*")
+    # NOTE: .html/.htm extensions are mapped to DOWNLOAD_HTML via EXTENSION_MAP,
+    #       while text/html Content-Type is mapped to WEBPAGE here for URLs
+    #       without extensions (like https://example.com/page)
+    MEDIA_TYPE_MAP: Dict[str, URLType] = {
+        # PDF
         "application/pdf": URLType.DOWNLOAD_PDF,
+        # Markdown
         "text/markdown": URLType.DOWNLOAD_MD,
-        "text/plain": URLType.DOWNLOAD_TXT,
+        "text/x-markdown": URLType.DOWNLOAD_MD,
+        # HTML/webpage (for URLs without .html extension)
         "text/html": URLType.WEBPAGE,
         "application/xhtml+xml": URLType.WEBPAGE,
+        # Plain text
+        "text/plain": URLType.DOWNLOAD_TXT,
+        "text/*": URLType.DOWNLOAD_TXT,
     }
 
     # URLType to file extension mapping
@@ -95,27 +116,39 @@ async def detect(
         request_validator=None,
     ) -> Tuple[URLType, Dict[str, Any]]:
         """
-        Detect URL content type.
+        Detect URL content type using IANA standards.
+
+        Detection order (most reliable to least reliable):
+        1. File extension from URL path (if valid and recognized)
+        2. Filename from Content-Disposition header (RFC 6266)
+        3. IANA Media Type from Content-Type header (RFC 6838)
+        4. Default to WEBPAGE
 
         Args:
             url: URL to detect
             request_validator: Optional network request validator
 
         Returns:
-            (URLType, metadata dict)
+            (URLType, metadata dict with detection details)
         """
-        meta = {"url": url, "detected_by": "unknown"}
+        meta = {
+            "url": url,
+            "detected_by": "unknown",
+        }
         parsed = urlparse(url)
         path_lower = parsed.path.lower()
-
-        # 1. Check extension first
-        for ext, url_type in self.EXTENSION_MAP.items():
-            if path_lower.endswith(ext):
-                meta["detected_by"] = "extension"
-                meta["extension"] = ext
-                return url_type, meta
-
-        # 2. Send HEAD request to check Content-Type
+        valid_extensions = set(self.EXTENSION_MAP.keys())
+
+        # === Step 1: Check extension from URL path ===
+        path_ext = Path(path_lower).suffix
+        if path_ext and path_ext in valid_extensions:
+            for ext, url_type in self.EXTENSION_MAP.items():
+                if path_lower.endswith(ext):
+                    meta["detected_by"] = "extension"
+                    meta["extension"] = ext
+                    return url_type, meta
+
+        # === Step 2: Send HEAD request for headers ===
         try:
             httpx = lazy_import("httpx")
             client_kwargs = {
@@ -129,32 +162,138 @@ async def detect(
 
             async with httpx.AsyncClient(**client_kwargs) as client:
                 response = await client.head(url)
-                content_type = response.headers.get("content-type", "").lower()
 
-                # Remove charset info
-                if ";" in content_type:
-                    content_type = content_type.split(";")[0].strip()
+                # Get raw headers for debugging
+                content_type_raw = response.headers.get("content-type", "")
+                content_disposition = response.headers.get("content-disposition", "")
 
-                meta["content_type"] = content_type
-                meta["detected_by"] = "content_type"
+                meta["content_type_raw"] = content_type_raw
+                meta["content_disposition_raw"] = content_disposition
                 meta["status_code"] = response.status_code
 
-                # Map content type
-                for ct_prefix, url_type in self.CONTENT_TYPE_MAP.items():
-                    if content_type.startswith(ct_prefix):
+                # === Step 2a: Check Content-Disposition for filename (RFC 6266) ===
+                filename_from_disposition = self._extract_filename_from_disposition(
+                    content_disposition
+                )
+                if filename_from_disposition:
+                    meta["filename_from_disposition"] = filename_from_disposition
+                    filename_lower = filename_from_disposition.lower()
+                    for ext, url_type in self.EXTENSION_MAP.items():
+                        if filename_lower.endswith(ext):
+                            meta["detected_by"] = "content_disposition"
+                            meta["extension"] = ext
+                            return url_type, meta
+
+                # === Step 2b: Check Content-Type (RFC 6838) ===
+                if content_type_raw:
+                    url_type = self._detect_from_media_type(content_type_raw, meta)
+                    if url_type != URLType.UNKNOWN:
                         return url_type, meta
 
-                # Default to webpage for HTML-like content
-                if "html" in content_type or "xml" in content_type:
-                    return URLType.WEBPAGE, meta
-
         except Exception as e:
             meta["detection_error"] = str(e)
             logger.debug(f"[URLTypeDetector] HEAD request failed: {e}, falling back to default")
 
-        # 3. Default: assume webpage
+        # === Step 3: Default behavior ===
+        meta["detected_by"] = "default"
         return URLType.WEBPAGE, meta
 
+    def _detect_from_media_type(self, content_type: str, meta: Dict[str, Any]) -> URLType:
+        """
+        Detect URL type from IANA media type.
+
+        Args:
+            content_type: Content-Type header value
+            meta: Metadata dict to update
+
+        Returns:
+            Detected URLType, or URLType.UNKNOWN if no match
+        """
+        # Normalize and parse according to IANA standards
+        media_type_str = content_type.lower().strip()
+
+        # Handle common aliases
+        if media_type_str in MEDIA_TYPE_ALIASES:
+            meta["media_type_alias"] = media_type_str
+            media_type_str = MEDIA_TYPE_ALIASES[media_type_str]
+
+        # Parse into structured IANAMediaType
+        try:
+            media_type = IANAMediaType.parse(media_type_str)
+            meta["media_type"] = str(media_type)
+            meta["media_type_type"] = media_type.type
+            meta["media_type_subtype"] = media_type.subtype
+            if media_type.suffix:
+                meta["media_type_suffix"] = media_type.suffix
+        except Exception as e:
+            logger.debug(f"[URLTypeDetector] Failed to parse media type: {e}")
+            meta["media_type_parse_error"] = str(e)
+            return URLType.UNKNOWN
+
+        # Check for exact match first
+        media_type_key = f"{media_type.type}/{media_type.subtype}"
+        if media_type.suffix:
+            media_type_with_suffix = f"{media_type_key}+{media_type.suffix}"
+            if media_type_with_suffix in self.MEDIA_TYPE_MAP:
+                meta["detected_by"] = "media_type_suffix"
+                return self.MEDIA_TYPE_MAP[media_type_with_suffix]
+
+        if media_type_key in self.MEDIA_TYPE_MAP:
+            meta["detected_by"] = "media_type"
+            return self.MEDIA_TYPE_MAP[media_type_key]
+
+        # Check for wildcard matches
+        for pattern, url_type in self.MEDIA_TYPE_MAP.items():
+            if media_type.matches(pattern):
+                meta["detected_by"] = "media_type_pattern"
+                meta["media_type_pattern"] = pattern
+                return url_type
+
+        return URLType.UNKNOWN
+
+    @staticmethod
+    def _extract_filename_from_disposition(content_disposition: str) -> Optional[str]:
+        """
+        Extract filename from Content-Disposition header per RFC 6266.
+
+        Handles formats:
+            - inline; filename="2601.00014v1.pdf"
+            - attachment; filename=document.pdf
+            - attachment; filename*=UTF-8''encoded.pdf
+            - attachment; filename="foo.pdf"; size=12345
+
+        Args:
+            content_disposition: Content-Disposition header value
+
+        Returns:
+            Extracted filename, or None if not found
+        """
+        if not content_disposition:
+            return None
+
+        import re
+
+        content_disposition = content_disposition.strip()
+
+        # Try filename*=UTF-8''... format first (RFC 5987)
+        utf8_match = re.search(r"filename\*=UTF-8''([^;]+)", content_disposition, re.I)
+        if utf8_match:
+            from urllib.parse import unquote
+
+            return unquote(utf8_match.group(1))
+
+        # Try filename="..." format (quoted-string)
+        quoted_match = re.search(r'filename="([^"]+)"', content_disposition, re.I)
+        if quoted_match:
+            return quoted_match.group(1)
+
+        # Try filename=... format (token)
+        simple_match = re.search(r"filename=([^;]+)", content_disposition, re.I)
+        if simple_match:
+            return simple_match.group(1).strip()
+
+        return None
+
     def get_extension_for_type(self, url_type: URLType) -> str:
         """Get file extension for URL type."""
         return self.URL_TYPE_TO_EXT.get(url_type, ".html")
@@ -171,6 +310,7 @@ class HTTPAccessor(DataAccessor):
     - Follows redirects
     - Network guard integration
     - Detailed error classification (network, timeout, auth, etc.)
+    - IANA Media Type based detection for URLs without extensions
     """
 
     PRIORITY = 50  # Lower than GitAccessor, higher than fallback
@@ -204,8 +344,6 @@ def can_handle(self, source: Union[str, Path]) -> bool:
         and will be checked first for their specific URL types.
         """
         source_str = str(source)
-
-        # Only handle http/https URLs
         return source_str.startswith(("http://", "https://"))
 
     async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
@@ -289,12 +427,8 @@ async def _download_url(
             request_validator=request_validator,
         )
 
-        # Determine file extension
-        parsed = urlparse(url)
-        decoded_path = unquote(parsed.path)
-        ext = Path(decoded_path).suffix
-        if not ext:
-            ext = self._url_detector.get_extension_for_type(url_type)
+        # Determine file extension using IANA standards
+        ext = self._determine_file_extension(url, url_type, detect_meta)
 
         # Create temp file
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
@@ -354,6 +488,55 @@ async def _download_url(
                 pass
             raise
 
+    def _determine_file_extension(
+        self,
+        url: str,
+        url_type: URLType,
+        detect_meta: Dict[str, Any],
+    ) -> str:
+        """
+        Determine appropriate file extension using multiple strategies.
+
+        Strategy order (most reliable first):
+        1. Extension from Content-Disposition filename
+        2. Extension from URL path (if valid)
+        3. Use IANA media type mapping
+        4. Use URL type based extension
+
+        Args:
+            url: Original URL
+            url_type: Detected URL type
+            detect_meta: Detection metadata
+
+        Returns:
+            File extension including dot (e.g., ".pdf")
+        """
+        valid_extensions = set(URLTypeDetector.EXTENSION_MAP.keys())
+
+        # 1. Try extension from Content-Disposition filename
+        filename_from_disposition = detect_meta.get("filename_from_disposition")
+        if filename_from_disposition:
+            ext = Path(filename_from_disposition.lower()).suffix
+            if ext and ext in valid_extensions:
+                return ext
+
+        # 2. Try extension from URL path (if valid)
+        parsed = urlparse(url)
+        decoded_path = unquote(parsed.path)
+        ext = Path(decoded_path).suffix
+        if ext and ext.lower() in valid_extensions:
+            return ext.lower()
+
+        # 3. Try IANA media type to extension mapping
+        media_type_str = detect_meta.get("media_type") or detect_meta.get("content_type_raw")
+        if media_type_str:
+            iana_ext = get_preferred_extension(media_type_str)
+            if iana_ext:
+                return iana_ext
+
+        # 4. Fall back to URL type based extension
+        return self._url_detector.get_extension_for_type(url_type)
+
     def _convert_to_raw_url(self, url: str) -> str:
         """Convert GitHub/GitLab blob URL to raw URL."""
         parsed = urlparse(url)
diff --git a/openviking/parse/accessors/mime_types.py b/openviking/parse/accessors/mime_types.py
new file mode 100644
index 000000000..da9fe119e
--- /dev/null
+++ b/openviking/parse/accessors/mime_types.py
@@ -0,0 +1,391 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""
+IANA Media Type (MIME Type) handling.
+
+References:
+- RFC 6838: Media Type Specifications and Registration Procedures
+- https://www.iana.org/assignments/media-types/media-types.xhtml
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set
+
+
+@dataclass(frozen=True)
+class IANAMediaType:
+    """
+    Represents an IANA media type.
+
+    Format: type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
+
+    Examples:
+        - text/html
+        - application/pdf
+        - application/vnd.openxmlformats-officedocument.wordprocessingml.document
+        - text/plain; charset=utf-8
+    """
+
+    type: str  # The top-level type (text, application, image, etc.)
+    subtype: str  # The subtype (may include tree prefix)
+    suffix: Optional[str] = None  # Optional structured syntax suffix (e.g., +xml, +json)
+    parameters: Dict[str, str] = None  # Optional parameters (e.g., charset=utf-8)
+
+    def __post_init__(self):
+        if self.parameters is None:
+            object.__setattr__(self, "parameters", {})
+
+    @classmethod
+    def parse(cls, media_type: str) -> "IANAMediaType":
+        """
+        Parse a media type string into an IANAMediaType object.
+
+        Args:
+            media_type: Media type string (e.g., "text/plain; charset=utf-8")
+
+        Returns:
+            Parsed IANAMediaType object
+        """
+        media_type = media_type.strip().lower()
+
+        # Split into type/subtype and parameters
+        parts = media_type.split(";", 1)
+        type_subtype = parts[0].strip()
+        params_str = parts[1].strip() if len(parts) > 1 else ""
+
+        # Parse type and subtype
+        if "/" not in type_subtype:
+            # Invalid format, treat as unknown
+            return cls(type="application", subtype="octet-stream")
+
+        type_part, subtype_part = type_subtype.split("/", 1)
+        type_part = type_part.strip()
+        subtype_part = subtype_part.strip()
+
+        # Check for suffix (e.g., +xml, +json)
+        suffix = None
+        if "+" in subtype_part:
+            subtype_parts = subtype_part.rsplit("+", 1)
+            # Only recognize standard suffixes
+            if subtype_parts[1] in {
+                "xml",
+                "json",
+                "yaml",
+                "ber",
+                "fastinfoset",
+                "wbxml",
+                "zip",
+                "cbor",
+            }:
+                subtype_part = subtype_parts[0]
+                suffix = subtype_parts[1]
+
+        # Parse parameters
+        parameters = {}
+        if params_str:
+            for param in params_str.split(";"):
+                param = param.strip()
+                if not param:
+                    continue
+                if "=" in param:
+                    key, value = param.split("=", 1)
+                    parameters[key.strip()] = value.strip()
+                else:
+                    parameters[param] = ""
+
+        return cls(
+            type=type_part,
+            subtype=subtype_part,
+            suffix=suffix,
+            parameters=parameters,
+        )
+
+    def matches(self, pattern: str) -> bool:
+        """
+        Check if this media type matches a pattern.
+
+        Patterns can include wildcards:
+            - "text/*" matches any text type
+            - "application/*" matches any application type
+            - "*/*" matches any type
+            - "text/plain" matches exactly
+            - "text" is treated as "text/*"
+
+        Args:
+            pattern: Pattern to match against
+
+        Returns:
+            True if matches, False otherwise
+        """
+        pattern = pattern.lower().strip()
+
+        if pattern == "*/*":
+            return True
+
+        if "/" not in pattern:
+            # Treat as type/*
+            return self.type == pattern
+
+        pattern_type, pattern_subtype = pattern.split("/", 1)
+        pattern_type = pattern_type.strip()
+        pattern_subtype = pattern_subtype.strip()
+
+        if pattern_type != "*" and self.type != pattern_type:
+            return False
+
+        if pattern_subtype == "*":
+            return True
+
+        return self.subtype == pattern_subtype
+
+    def __str__(self) -> str:
+        """Return the canonical string representation."""
+        parts = [f"{self.type}/{self.subtype}"]
+        if self.suffix:
+            parts[-1] += f"+{self.suffix}"
+        for key, value in self.parameters.items():
+            if value:
+                parts.append(f"; {key}={value}")
+            else:
+                parts.append(f"; {key}")
+        return "".join(parts)
+
+
+# IANA registered top-level media types
+IANA_TOP_LEVEL_TYPES: Set[str] = {
+    "application",
+    "audio",
+    "example",
+    "font",
+    "image",
+    "message",
+    "model",
+    "multipart",
+    "text",
+    "video",
+}
+
+# Common media type aliases (non-standard but widely used)
+MEDIA_TYPE_ALIASES: Dict[str, str] = {
+    "application/x-pdf": "application/pdf",
+    "application/x-zip-compressed": "application/zip",
+    "application/x-gzip": "application/gzip",
+    "application/x-tar": "application/x-tar",
+    "text/x-markdown": "text/markdown",
+    "text/x-c": "text/plain",
+    "text/x-c++": "text/plain",
+    "text/x-python": "text/plain",
+    "text/x-java": "text/plain",
+    "text/x-javascript": "text/plain",
+    "text/x-script.python": "text/plain",
+    "image/jpg": "image/jpeg",
+    "audio/mp3": "audio/mpeg",
+    "video/mp4": "video/mp4",  # Keep as is (standard)
+}
+
+# Comprehensive IANA media type to file extension mapping
+# Based on IANA registry and common usage
+IANA_MEDIA_TYPE_TO_EXTENSION: Dict[str, List[str]] = {
+    # === Text types ===
+    "text/plain": [".txt", ".text"],
+    "text/html": [".html", ".htm"],
+    "text/css": [".css"],
+    "text/csv": [".csv"],
+    "text/markdown": [".md", ".markdown"],
+    "text/calendar": [".ics"],
+    "text/vcard": [".vcf"],
+    "text/xml": [".xml"],
+    "text/yaml": [".yaml", ".yml"],
+    "text/tab-separated-values": [".tsv"],
+    "text/sgml": [".sgml"],
+    "text/rtf": [".rtf"],
+    "text/jinja2": [".j2", ".jinja2"],
+    "text/x-yaml": [".yaml", ".yml"],  # Non-standard but common
+    "text/x-markdown": [".md", ".markdown"],  # Non-standard but common
+    # === Application types - Documents ===
+    "application/pdf": [".pdf"],
+    "application/rtf": [".rtf"],
+    "application/epub+zip": [".epub"],
+    "application/x-mobipocket-ebook": [".mobi"],
+    # === Application types - Office ===
+    "application/msword": [".doc"],
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [".docx"],
+    "application/vnd.ms-excel": [".xls"],
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [".xlsx"],
+    "application/vnd.ms-powerpoint": [".ppt"],
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"],
+    "application/vnd.oasis.opendocument.text": [".odt"],
+    "application/vnd.oasis.opendocument.spreadsheet": [".ods"],
+    "application/vnd.oasis.opendocument.presentation": [".odp"],
+    # === Application types - Code ===
+    "application/javascript": [".js"],
+    "application/json": [".json"],
+    "application/ld+json": [".jsonld"],
+    "application/xml": [".xml"],
+    "application/yaml": [".yaml", ".yml"],
+    "application/x-yaml": [".yaml", ".yml"],
+    "application/xhtml+xml": [".xhtml"],
+    "application/rss+xml": [".rss"],
+    "application/atom+xml": [".atom"],
+    "application/soap+xml": [".wsdl"],
+    # === Application types - Archives ===
+    "application/zip": [".zip"],
+    "application/x-zip-compressed": [".zip"],
+    "application/gzip": [".gz"],
+    "application/x-gzip": [".gz"],
+    "application/x-tar": [".tar"],
+    "application/x-7z-compressed": [".7z"],
+    "application/x-rar-compressed": [".rar"],
+    "application/x-bzip2": [".bz2"],
+    "application/x-lzip": [".lz"],
+    "application/x-xz": [".xz"],
+    # === Application types - Other ===
+    "application/octet-stream": [".bin"],
+    "application/pgp-encrypted": [".pgp"],
+    "application/pgp-signature": [".sig"],
+    "application/pkcs12": [".p12", ".pfx"],
+    "application/pkcs8": [".p8"],
+    "application/x-pem-file": [".pem"],
+    "application/wasm": [".wasm"],
+    # === Image types ===
+    "image/jpeg": [".jpg", ".jpeg"],
+    "image/png": [".png"],
+    "image/gif": [".gif"],
+    "image/svg+xml": [".svg"],
+    "image/webp": [".webp"],
+    "image/bmp": [".bmp"],
+    "image/tiff": [".tiff", ".tif"],
+    "image/vnd.microsoft.icon": [".ico"],
+    "image/x-icon": [".ico"],
+    "image/heic": [".heic"],
+    "image/heif": [".heif"],
+    "image/avif": [".avif"],
+    # === Audio types ===
+    "audio/mpeg": [".mp3", ".mpeg"],
+    "audio/mp4": [".m4a", ".mp4a"],
+    "audio/ogg": [".ogg", ".oga"],
+    "audio/wav": [".wav"],
+    "audio/webm": [".webm"],
+    "audio/flac": [".flac"],
+    "audio/aac": [".aac"],
+    "audio/x-wav": [".wav"],
+    # === Video types ===
+    "video/mp4": [".mp4"],
+    "video/mpeg": [".mpeg", ".mpg"],
+    "video/quicktime": [".mov"],
+    "video/webm": [".webm"],
+    "video/x-msvideo": [".avi"],
+    "video/x-flv": [".flv"],
+    "video/x-matroska": [".mkv"],
+    "video/ogg": [".ogv"],
+    # === Font types ===
+    "font/otf": [".otf"],
+    "font/ttf": [".ttf"],
+    "font/woff": [".woff"],
+    "font/woff2": [".woff2"],
+    "application/vnd.ms-fontobject": [".eot"],
+    "application/font-woff": [".woff"],
+    "application/x-font-ttf": [".ttf"],
+    "application/x-font-otf": [".otf"],
+    # === Model types ===
+    "model/gltf+json": [".gltf"],
+    "model/gltf-binary": [".glb"],
+    "model/obj": [".obj"],
+    "model/stl": [".stl"],
+    # === Message types ===
+    "message/rfc822": [".eml", ".mht"],
+    # === Multipart types ===
+    "multipart/mixed": [],
+    "multipart/alternative": [],
+    "multipart/related": [],
+    "multipart/form-data": [],
+}
+
+
+def get_preferred_extension(media_type: str) -> Optional[str]:
+    """
+    Get the preferred file extension for a media type.
+
+    Args:
+        media_type: IANA media type string
+
+    Returns:
+        Preferred extension (with dot), or None if unknown
+    """
+    if not media_type:
+        return None
+
+    # Normalize and parse
+    media_type = media_type.lower().strip()
+
+    # Handle aliases first
+    if media_type in MEDIA_TYPE_ALIASES:
+        media_type = MEDIA_TYPE_ALIASES[media_type]
+
+    # Strip parameters
+    if ";" in media_type:
+        media_type = media_type.split(";", 1)[0].strip()
+
+    # Exact match
+    if media_type in IANA_MEDIA_TYPE_TO_EXTENSION:
+        exts = IANA_MEDIA_TYPE_TO_EXTENSION[media_type]
+        if exts:
+            return exts[0]
+
+    # Try parsing for suffix matching
+    try:
+        parsed = IANAMediaType.parse(media_type)
+        # Try with suffix
+        if parsed.suffix:
+            full_type = f"{parsed.type}/{parsed.subtype}+{parsed.suffix}"
+            if full_type in IANA_MEDIA_TYPE_TO_EXTENSION:
+                exts = IANA_MEDIA_TYPE_TO_EXTENSION[full_type]
+                if exts:
+                    return exts[0]
+        # Try without tree prefix if it's a vendor type
+        if parsed.subtype.startswith("vnd."):
+            base_subtype = parsed.subtype[4:]  # Remove "vnd."
+            if base_subtype in IANA_MEDIA_TYPE_TO_EXTENSION:
+                exts = IANA_MEDIA_TYPE_TO_EXTENSION[base_subtype]
+                if exts:
+                    return exts[0]
+    except Exception:
+        pass
+
+    # Try wildcard matches
+    if media_type.startswith("text/"):
+        return ".txt"
+    if media_type.startswith("image/"):
+        return ".png"
+    if media_type.startswith("audio/"):
+        return ".mp3"
+    if media_type.startswith("video/"):
+        return ".mp4"
+
+    return None
+
+
+def get_all_extensions(media_type: str) -> List[str]:
+    """
+    Get all known file extensions for a media type.
+
+    Args:
+        media_type: IANA media type string
+
+    Returns:
+        List of extensions (with dots), empty list if unknown
+    """
+    if not media_type:
+        return []
+
+    media_type = media_type.lower().strip()
+
+    # Handle aliases first
+    if media_type in MEDIA_TYPE_ALIASES:
+        media_type = MEDIA_TYPE_ALIASES[media_type]
+
+    # Strip parameters
+    if ";" in media_type:
+        media_type = media_type.split(";", 1)[0].strip()
+
+    return IANA_MEDIA_TYPE_TO_EXTENSION.get(media_type, [])

From bf2dd288b677d8ac9a391fb61e2e3297e56ba07a Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 15:39:10 +0800
Subject: [PATCH 10/15] fix: read github/gitlab domains from CodeConfig instead
 of HTMLConfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix config lookup: github_domains/gitlab_domains are in CodeConfig now
- Add debug logging instead of swallowing exceptions
- Add comment about config location

🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 openviking/parse/accessors/http_accessor.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py
index 258200fec..f8a96d8ac 100644
--- a/openviking/parse/accessors/http_accessor.py
+++ b/openviking/parse/accessors/http_accessor.py
@@ -544,8 +544,9 @@ def _convert_to_raw_url(self, url: str) -> str:
             from openviking_cli.utils.config import get_openviking_config
 
             config = get_openviking_config()
-            github_domains = config.html.github_domains
-            gitlab_domains = config.html.gitlab_domains
+            # NOTE: github_domains/gitlab_domains are in CodeConfig, not HTMLConfig
+            github_domains = config.code.github_domains
+            gitlab_domains = config.code.gitlab_domains
             github_raw_domain = config.code.github_raw_domain
 
             if parsed.netloc in github_domains:
@@ -558,7 +559,10 @@ def _convert_to_raw_url(self, url: str) -> str:
             if parsed.netloc in gitlab_domains and "/blob/" in parsed.path:
                 return url.replace("/blob/", "/raw/")
 
-        except Exception:
-            pass
+        except Exception as e:
+            logger.debug(
+                f"[HTTPAccessor] Failed to convert blob URL to raw: {e}, "
+                f"falling back to original URL: {url}"
+            )
 
         return url

From 9b7d0cc9eb1fb57f0d65cf1838298e3d59f7713e Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 15:43:24 +0800
Subject: [PATCH 11/15] fix: preserve original filename from URL downloads in
 two-layer flow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- HTTPAccessor now stores original_filename in LocalResource.meta
  - Priority: Content-Disposition filename > URL path filename
- UnifiedResourceProcessor now prefers original_filename from meta
  - Uses original_filename for resource_name and source_name
  - Falls back to temp path stem only if no original filename available

🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 openviking/parse/accessors/http_accessor.py | 7 ++++++-
 openviking/utils/media_processor.py         | 8 +++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py
index f8a96d8ac..9d54c6d9c 100644
--- a/openviking/parse/accessors/http_accessor.py
+++ b/openviking/parse/accessors/http_accessor.py
@@ -435,7 +435,12 @@ async def _download_url(
         temp_path = temp_file.name
         temp_file.close()
 
-        meta = {**detect_meta, "extension": ext}
+        # Get original filename from URL or Content-Disposition
+        original_filename = detect_meta.get("filename_from_disposition")
+        if not original_filename:
+            original_filename = self._extract_filename_from_url(url)
+
+        meta = {**detect_meta, "extension": ext, "original_filename": original_filename}
 
         try:
             # Download content
diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py
index 346912273..9bbf7ad6e 100644
--- a/openviking/utils/media_processor.py
+++ b/openviking/utils/media_processor.py
@@ -122,7 +122,13 @@ async def process(
                     # Use the last part of repo_name as the resource_name (e.g., "OpenViking" from "volcengine/OpenViking")
                     parse_kwargs["resource_name"] = repo_name.split("/")[-1]
                 else:
-                    parse_kwargs.setdefault("resource_name", local_resource.path.stem)
+                    # Prefer original_filename from meta for HTTP downloads
+                    original_filename = local_resource.meta.get("original_filename")
+                    if original_filename:
+                        parse_kwargs.setdefault("resource_name", Path(original_filename).stem)
+                        parse_kwargs.setdefault("source_name", original_filename)
+                    else:
+                        parse_kwargs.setdefault("resource_name", local_resource.path.stem)
 
             # If it's a directory, use DirectoryParser which will delegate to CodeRepositoryParser if it's a git repo
             if local_resource.path.is_dir():

From 8705331a721a981272528418534462d0c3680621 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 16:38:17 +0800
Subject: [PATCH 12/15] fix: review comments about accessors

---
 openviking/parse/accessors/git_accessor.py   |  8 ++--
 openviking/parse/accessors/http_accessor.py  |  4 +-
 openviking/parse/accessors/local_accessor.py | 39 +++++++++++++++-----
 openviking/parse/parsers/html.py             | 11 +++++-
 openviking/server/routers/pack.py            |  2 +-
 5 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/openviking/parse/accessors/git_accessor.py b/openviking/parse/accessors/git_accessor.py
index 8828e4aea..5bb27db1c 100644
--- a/openviking/parse/accessors/git_accessor.py
+++ b/openviking/parse/accessors/git_accessor.py
@@ -57,7 +57,7 @@ def can_handle(self, source: Union[str, Path]) -> bool:
         - git@ SSH URLs
         - git://, ssh:// URLs
         - GitHub/GitLab repository URLs (http/https)
-        - Local paths ending with .git or .zip
+        - Local paths ending with .git (NOT .zip - those go to ZipParser)
         """
         source_str = str(source)
 
@@ -74,14 +74,14 @@ def can_handle(self, source: Union[str, Path]) -> bool:
         if source_str.startswith(("http://", "https://")):
             return is_git_repo_url(source_str)
 
-        # Local .git or .zip files
+        # Local .git files (NOT .zip - .zip goes to ZipParser via LocalAccessor)
         if isinstance(source, Path):
             path = source
         else:
             path = Path(source_str)
 
         suffix = path.suffix.lower()
-        return suffix in (".git", ".zip")
+        return suffix == ".git"
 
     async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
         """
@@ -178,8 +178,6 @@ def _cleanup():
                         branch=branch,
                         commit=commit,
                     )
-            elif str(source).endswith(".zip"):
-                repo_name = await self._extract_zip(source_str, temp_local_dir)
             else:
                 raise ValueError(f"Unsupported source for GitAccessor: {source}")
 
diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py
index 9d54c6d9c..19434705e 100644
--- a/openviking/parse/accessors/http_accessor.py
+++ b/openviking/parse/accessors/http_accessor.py
@@ -113,6 +113,7 @@ def __init__(self, timeout: float = 10.0):
     async def detect(
         self,
         url: str,
+        timeout: Optional[float] = None,
         request_validator=None,
     ) -> Tuple[URLType, Dict[str, Any]]:
         """
@@ -126,6 +127,7 @@ async def detect(
 
         Args:
             url: URL to detect
+            timeout: HTTP request timeout in seconds (optional, overrides detector's default)
             request_validator: Optional network request validator
 
         Returns:
@@ -152,7 +154,7 @@ async def detect(
         try:
             httpx = lazy_import("httpx")
             client_kwargs = {
-                "timeout": self.timeout,
+                "timeout": timeout if timeout is not None else self.timeout,
                 "follow_redirects": True,
             }
             event_hooks = build_httpx_request_validation_hooks(request_validator)
diff --git a/openviking/parse/accessors/local_accessor.py b/openviking/parse/accessors/local_accessor.py
index 48305121b..a5f71acd1 100644
--- a/openviking/parse/accessors/local_accessor.py
+++ b/openviking/parse/accessors/local_accessor.py
@@ -32,22 +32,34 @@ def can_handle(self, source: Union[str, Path]) -> bool:
         """
         Check if this accessor can handle the source.
 
-        LocalAccessor accepts any source that:
-        1. Is a Path object, OR
-        2. Is a string that looks like a local path and exists, OR
-        3. Is a string that doesn't look like a URL (for fallback)
+        LocalAccessor accepts:
+        1. Any Path object, OR
+        2. Any string that looks like a local path (exists or not - we'll
+           validate in access())
 
-        Since this is a fallback accessor, it returns True for most sources.
-        The priority system ensures other accessors get a chance first.
+        Since this is a fallback accessor, it only returns True for sources
+        that appear to be local paths (not URLs). The priority system ensures
+        other accessors get a chance first.
 
         Args:
             source: Source string or Path object
 
         Returns:
-            True - this is a fallback accessor that can handle any source
+            True if source appears to be a local path
         """
-        # As the fallback accessor, we can handle anything
-        # The registry will try higher-priority accessors first
+        from openviking.server.local_input_guard import is_remote_resource_source
+
+        if isinstance(source, Path):
+            return True
+
+        source_str = str(source)
+
+        # Don't handle remote URLs - those go to HTTPAccessor/GitAccessor
+        if is_remote_resource_source(source_str):
+            return False
+
+        # For strings, accept anything that could be a local path
+        # (we'll validate existence in access())
         return True
 
     async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
@@ -63,9 +75,16 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
 
         Returns:
             LocalResource pointing to the local path
+
+        Raises:
+            FileNotFoundError: If the path does not exist
         """
         path = Path(source)
 
+        # Validate that the path exists - preserve original behavior
+        if not path.exists():
+            raise FileNotFoundError(f"Path does not exist: {path}")
+
         return LocalResource(
             path=path,
             source_type=SourceType.LOCAL,
@@ -73,7 +92,7 @@ async def access(self, source: Union[str, Path], **kwargs) -> LocalResource:
             meta={
                 "filename": path.name,
                 "suffix": path.suffix.lower() if path.suffix else None,
-                "is_dir": path.is_dir() if path.exists() else None,
+                "is_dir": path.is_dir(),
             },
             is_temporary=False,
         )
diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py
index 3acf32b1d..898acacae 100644
--- a/openviking/parse/parsers/html.py
+++ b/openviking/parse/parsers/html.py
@@ -47,8 +47,15 @@ class HTMLParser(BaseParser):
     in the new two-layer architecture. This parser only handles local files.
     """
 
-    def __init__(self):
-        """Initialize HTML parser."""
+    def __init__(self, timeout: float = 30.0, **kwargs):
+        """
+        Initialize HTML parser.
+
+        Args:
+            timeout: [DEPRECATED] Kept for backward compatibility.
+                URL downloading has been moved to HTTPAccessor.
+            **kwargs: Additional arguments (kept for backward compatibility)
+        """
         pass
 
     @staticmethod
diff --git a/openviking/server/routers/pack.py b/openviking/server/routers/pack.py
index 7984183be..ca5ab16ee 100644
--- a/openviking/server/routers/pack.py
+++ b/openviking/server/routers/pack.py
@@ -94,7 +94,7 @@ async def import_ovpack(
     service = get_service()
 
     upload_temp_dir = get_openviking_config().storage.get_upload_temp_dir()
-    file_path = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir)
+    file_path, _ = resolve_uploaded_temp_file_id(request.temp_file_id, upload_temp_dir)
 
     result = await service.pack.import_ovpack(
         file_path,

From 0c7d353149497530e95bc532ed181f590089f2d9 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 17:19:34 +0800
Subject: [PATCH 13/15] fix: review comments about accessors

---
 tests/misc/test_media_processor_zip_root.py | 101 +++++++++++---------
 1 file changed, 58 insertions(+), 43 deletions(-)

diff --git a/tests/misc/test_media_processor_zip_root.py b/tests/misc/test_media_processor_zip_root.py
index 567ca8fb5..204a0ada2 100644
--- a/tests/misc/test_media_processor_zip_root.py
+++ b/tests/misc/test_media_processor_zip_root.py
@@ -1,14 +1,20 @@
 #!/usr/bin/env python3
 # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
 # SPDX-License-Identifier: AGPL-3.0
+"""Tests for ZipParser single-root directory handling.
+
+Verifies that ZipParser correctly handles:
+1. ZIP with single top-level directory -> uses that directory name
+2. ZIP with multiple top-level entries -> uses the extract directory
+"""
 
 import zipfile
 from pathlib import Path
-from unittest.mock import AsyncMock
+from unittest.mock import AsyncMock, patch
 
 import pytest
 
-from openviking.utils.media_processor import UnifiedResourceProcessor
+from openviking.parse.parsers.zip_parser import ZipParser
 
 
 @pytest.mark.asyncio
@@ -17,15 +23,20 @@ async def test_zip_single_top_level_dir_uses_real_root(tmp_path: Path):
     with zipfile.ZipFile(zip_path, "w") as zf:
         zf.writestr("tt_b/bb/readme.md", "# hello\n")
 
-    processor = UnifiedResourceProcessor()
-    processor._process_directory = AsyncMock(return_value="ok")
+    parser = ZipParser()
+
+    # Mock DirectoryParser.parse to capture what directory it's called with
+    with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse:
+        mock_result = AsyncMock()
+        mock_result.temp_dir_path = None
+        mock_dir_parse.return_value = mock_result
 
-    result = await processor._process_file(zip_path, instruction="")
+        await parser.parse(zip_path, instruction="")
 
-    assert result == "ok"
-    called_dir = processor._process_directory.await_args.args[0]
-    assert isinstance(called_dir, Path)
-    assert called_dir.name == "tt_b"
+        # Verify DirectoryParser was called with the real root dir "tt_b"
+        assert mock_dir_parse.called
+        called_path = Path(mock_dir_parse.await_args.args[0])
+        assert called_path.name == "tt_b"
 
 
 @pytest.mark.asyncio
@@ -34,20 +45,20 @@ async def test_zip_single_top_level_dir_ignores_zip_source_name(tmp_path: Path):
     with zipfile.ZipFile(zip_path, "w") as zf:
         zf.writestr("tt_b/bb/readme.md", "# hello\n")
 
-    processor = UnifiedResourceProcessor()
-    processor._process_directory = AsyncMock(return_value="ok")
+    parser = ZipParser()
 
-    result = await processor._process_file(
-        zip_path,
-        instruction="",
-        source_name="tt_b.zip",
-    )
+    with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse:
+        mock_result = AsyncMock()
+        mock_result.temp_dir_path = None
+        mock_dir_parse.return_value = mock_result
 
-    assert result == "ok"
-    called_dir = processor._process_directory.await_args.args[0]
-    assert isinstance(called_dir, Path)
-    assert called_dir.name == "tt_b"
-    assert "source_name" not in processor._process_directory.await_args.kwargs
+        await parser.parse(zip_path, instruction="", source_name="tt_b.zip")
+
+        # Verify DirectoryParser was called with the real root dir "tt_b"
+        called_path = Path(mock_dir_parse.await_args.args[0])
+        assert called_path.name == "tt_b"
+        # source_name should NOT be passed to DirectoryParser in this case
+        assert "source_name" not in mock_dir_parse.await_args.kwargs
 
 
 @pytest.mark.asyncio
@@ -57,35 +68,39 @@ async def test_zip_multiple_top_level_entries_keeps_extract_root(tmp_path: Path)
         zf.writestr("a/readme.md", "# a\n")
         zf.writestr("b/readme.md", "# b\n")
 
-    processor = UnifiedResourceProcessor()
-    processor._process_directory = AsyncMock(return_value="ok")
+    parser = ZipParser()
+
+    with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse:
+        mock_result = AsyncMock()
+        mock_result.temp_dir_path = None
+        mock_dir_parse.return_value = mock_result
 
-    result = await processor._process_file(zip_path, instruction="")
+        await parser.parse(zip_path, instruction="")
 
-    assert result == "ok"
-    called_dir = processor._process_directory.await_args.args[0]
-    assert isinstance(called_dir, Path)
-    assert called_dir.name != "a"
-    assert called_dir.name != "b"
+        # Verify DirectoryParser was called with the extract dir, not "a" or "b"
+        called_path = Path(mock_dir_parse.await_args.args[0])
+        assert called_path.name != "a"
+        assert called_path.name != "b"
+        # Should have the ov_zip_ prefix
+        assert called_path.name.startswith("ov_zip_")
 
 
 @pytest.mark.asyncio
 async def test_single_file_uses_source_name_for_resource_name(tmp_path: Path):
-    file_path = tmp_path / "upload_123.txt"
-    file_path.write_text("hello\n")
+    """Test that source_name is passed through correctly when needed."""
+    zip_path = tmp_path / "mixed.zip"
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        zf.writestr("a/readme.md", "# a\n")
+        zf.writestr("b/readme.md", "# b\n")
 
-    processor = UnifiedResourceProcessor()
+    parser = ZipParser()
 
-    with pytest.MonkeyPatch.context() as mp:
-        parse_mock = AsyncMock(return_value="ok")
-        mp.setattr("openviking.utils.media_processor.parse", parse_mock)
+    with patch("openviking.parse.parsers.directory.DirectoryParser.parse") as mock_dir_parse:
+        mock_result = AsyncMock()
+        mock_result.temp_dir_path = None
+        mock_dir_parse.return_value = mock_result
 
-        result = await processor._process_file(
-            file_path,
-            instruction="",
-            source_name="aa.txt",
-        )
+        await parser.parse(zip_path, instruction="", source_name="aa.txt")
 
-    assert result == "ok"
-    assert parse_mock.await_args.kwargs["resource_name"] == "aa"
-    assert parse_mock.await_args.kwargs["source_name"] == "aa.txt"
+        # Verify source_name is passed when we use the extract root
+        assert mock_dir_parse.await_args.kwargs.get("source_name") == "aa.txt"

From 59f3f057042b87f633eedd13eb8ac2c38a186adb Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Tue, 14 Apr 2026 19:32:41 +0800
Subject: [PATCH 14/15] fix: multiple improvements for filename handling,
 network access, and ragfs exceptions

---
 crates/ragfs-python/src/lib.rs          |  51 +++++++++++-
 openviking/parse/parsers/html.py        |  18 -----
 openviking/parse/parsers/markdown.py    |  49 ++++++++++--
 openviking/pyagfs/__init__.py           |  34 ++++++++
 openviking/pyagfs/exceptions.py         | 102 ++++++++++++++++++++++++
 openviking/server/routers/content.py    |  46 ++++++++++-
 openviking/server/routers/filesystem.py |  92 +++++++++++++++------
 openviking/server/routers/search.py     |  44 +++++++---
 openviking/storage/viking_fs.py         |  34 +++++++-
 openviking/utils/media_processor.py     |  46 ++++++++++-
 openviking_cli/utils/uri.py             |  14 ++--
 11 files changed, 452 insertions(+), 78 deletions(-)

diff --git a/crates/ragfs-python/src/lib.rs b/crates/ragfs-python/src/lib.rs
index 249257e33..20f073119 100644
--- a/crates/ragfs-python/src/lib.rs
+++ b/crates/ragfs-python/src/lib.rs
@@ -6,7 +6,7 @@
 
 use pyo3::exceptions::PyRuntimeError;
 use pyo3::prelude::*;
-use pyo3::types::{PyBytes, PyDict, PyList};
+use pyo3::types::{PyBytes, PyDict, PyList, PyType};
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
@@ -18,9 +18,54 @@ use ragfs::plugins::{
     KVFSPlugin, LocalFSPlugin, MemFSPlugin, QueueFSPlugin, SQLFSPlugin, ServerInfoFSPlugin,
 };
 
-/// Convert a ragfs error into a Python RuntimeError
+/// Get a Python exception class from the pyagfs module
+fn get_exception<'py>(py: Python<'py>, name: &str) -> PyResult<Bound<'py, PyType>> {
+    let pyagfs = PyModule::import(py, "openviking.pyagfs")?;
+    let exc = pyagfs.getattr(name)?;
+    Ok(exc.cast_into()?)
+}
+
+/// Create a PyErr from an exception type name and message
+fn new_py_err(name: &str, msg: String) -> PyErr {
+    Python::attach(|py| {
+        if let Ok(exc) = get_exception(py, name) {
+            PyErr::from_type(exc, msg)
+        } else {
+            PyRuntimeError::new_err(msg)
+        }
+    })
+}
+
+/// Convert a ragfs error into the appropriate Python exception
 fn to_py_err(e: ragfs::core::Error) -> PyErr {
-    PyRuntimeError::new_err(e.to_string())
+    let msg = e.to_string();
+    match e {
+        ragfs::core::Error::NotFound(_) => new_py_err("AGFSNotFoundError", msg),
+        ragfs::core::Error::AlreadyExists(_) => new_py_err("AGFSAlreadyExistsError", msg),
+        ragfs::core::Error::PermissionDenied(_) => new_py_err("AGFSPermissionDeniedError", msg),
+        ragfs::core::Error::InvalidPath(_) => new_py_err("AGFSInvalidPathError", msg),
+        ragfs::core::Error::NotADirectory(_) => new_py_err("AGFSNotADirectoryError", msg),
+        ragfs::core::Error::IsADirectory(_) => new_py_err("AGFSIsADirectoryError", msg),
+        ragfs::core::Error::DirectoryNotEmpty(_) => new_py_err("AGFSDirectoryNotEmptyError", msg),
+        ragfs::core::Error::InvalidOperation(_) => new_py_err("AGFSInvalidOperationError", msg),
+        ragfs::core::Error::Io(_) => new_py_err("AGFSIoError", msg),
+        ragfs::core::Error::Plugin(_) => {
+            // Check if the plugin error message contains known patterns
+            let err_msg = msg.to_lowercase();
+            if err_msg.contains("directory not empty") {
+                new_py_err("AGFSDirectoryNotEmptyError", msg)
+            } else {
+                new_py_err("AGFSPluginError", msg)
+            }
+        }
+        ragfs::core::Error::Config(_) => new_py_err("AGFSConfigError", msg),
+        ragfs::core::Error::MountPointNotFound(_) => new_py_err("AGFSMountPointNotFoundError", msg),
+        ragfs::core::Error::MountPointExists(_) => new_py_err("AGFSMountPointExistsError", msg),
+        ragfs::core::Error::Serialization(_) => new_py_err("AGFSSerializationError", msg),
+        ragfs::core::Error::Network(_) => new_py_err("AGFSNetworkError", msg),
+        ragfs::core::Error::Timeout(_) => new_py_err("AGFSTimeoutError", msg),
+        ragfs::core::Error::Internal(_) => new_py_err("AGFSInternalError", msg),
+    }
 }
 
 /// Convert FileInfo to a Python dict matching the Go binding JSON format:
diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py
index 898acacae..f21c8d8d0 100644
--- a/openviking/parse/parsers/html.py
+++ b/openviking/parse/parsers/html.py
@@ -8,8 +8,6 @@
 For URL downloading, use HTTPAccessor in the new two-layer architecture.
 """
 
-import hashlib
-import re
 import time
 from pathlib import Path
 from typing import List, Optional, Union
@@ -231,19 +229,3 @@ async def parse_content(
         result.parser_name = "HTMLParser"
 
         return result
-
-    def _sanitize_for_path(self, text: str, max_length: int = 50) -> str:
-        """Sanitize text for use in file path, hash & shorten if too long."""
-        safe = re.sub(
-            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]",
-            "",
-            text,
-        )
-        safe = re.sub(r"\s+", "_", safe)
-        safe = safe.strip("_")
-        if not safe:
-            return "section"
-        if len(safe) > max_length:
-            hash_suffix = hashlib.sha256(text.encode()).hexdigest()[:8]
-            return f"{safe[: max_length - 9]}_{hash_suffix}"
-        return safe
diff --git a/openviking/parse/parsers/markdown.py b/openviking/parse/parsers/markdown.py
index a812ba1d1..d84285925 100644
--- a/openviking/parse/parsers/markdown.py
+++ b/openviking/parse/parsers/markdown.py
@@ -23,11 +23,48 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
+from openviking.parse.accessors.mime_types import IANA_MEDIA_TYPE_TO_EXTENSION
 from openviking.parse.base import NodeType, ParseResult, ResourceNode, create_parse_result
 from openviking.parse.parsers.base_parser import BaseParser
+from openviking.parse.parsers.constants import (
+    CODE_EXTENSIONS,
+    DOCUMENTATION_EXTENSIONS,
+    IGNORE_EXTENSIONS,
+)
 from openviking_cli.utils.config.parser_config import ParserConfig
 from openviking_cli.utils.logger import get_logger
 
+# All known valid extensions - only these should be stripped when getting stem
+KNOWN_EXTENSIONS: set[str] = set()
+for extensions in IANA_MEDIA_TYPE_TO_EXTENSION.values():
+    KNOWN_EXTENSIONS.update(extensions)
+KNOWN_EXTENSIONS.update(CODE_EXTENSIONS)
+KNOWN_EXTENSIONS.update(DOCUMENTATION_EXTENSIONS)
+KNOWN_EXTENSIONS.update(IGNORE_EXTENSIONS)
+
+
+def _smart_stem(path_or_name: str | Path) -> str:
+    """Get the stem of a filename, but only strip known valid extensions.
+
+    For filenames like "2601.00014" where ".00014" is not a valid extension,
+    returns the full name instead of just "2601".
+
+    Args:
+        path_or_name: Path object or string filename
+
+    Returns:
+        Stem with only known extensions stripped
+    """
+    path = Path(path_or_name)
+    suffix = path.suffix.lower()
+
+    if suffix in KNOWN_EXTENSIONS:
+        return path.stem
+
+    # If the suffix is not a known extension, treat the whole name as the stem
+    return path.name
+
+
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
@@ -180,9 +217,9 @@ async def parse_content(
             # the temp upload name (e.g. upload_<uuid>.txt).
             doc_title = meta.get("frontmatter", {}).get(
                 "title",
-                Path(explicit_name).stem
+                _smart_stem(explicit_name)
                 if explicit_name
-                else Path(source_path).stem
+                else _smart_stem(source_path)
                 if source_path
                 else "Document",
             )
@@ -200,7 +237,7 @@ async def parse_content(
                 headings,
                 root_dir,
                 source_path,
-                doc_name=self._sanitize_for_path(Path(doc_title).stem),
+                doc_name=self._sanitize_for_path(doc_title),
             )
 
             parse_time = time.time() - start_time
@@ -358,12 +395,12 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]:
 
     def _sanitize_for_path(self, text: str, max_length: int = 50) -> str:
         safe = re.sub(
-            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s-]",
+            r"[^\w\u0080-\u02af\u0400-\u052f\u0600-\u077f\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\s.-]",
             "",
             text,
         )
         safe = re.sub(r"\s+", "_", safe)
-        safe = safe.strip("_")
+        safe = safe.strip("._")
         if not safe:
             return "section"
         if len(safe) > max_length:
@@ -411,7 +448,7 @@ async def _parse_and_create_structure(
 
         # Get document name
         doc_name = doc_name or self._sanitize_for_path(
-            Path(source_path).stem if source_path else "content"
+            _smart_stem(source_path) if source_path else "content"
         )
 
         # Small document: save as single file (check both token and char limits)
diff --git a/openviking/pyagfs/__init__.py b/openviking/pyagfs/__init__.py
index 7b487a27e..6ae6bc8cc 100644
--- a/openviking/pyagfs/__init__.py
+++ b/openviking/pyagfs/__init__.py
@@ -11,10 +11,27 @@
 
 from .client import AGFSClient, FileHandle
 from .exceptions import (
+    AGFSAlreadyExistsError,
     AGFSClientError,
+    AGFSConfigError,
     AGFSConnectionError,
+    AGFSDirectoryNotEmptyError,
+    AGFSFileExistsError,
     AGFSHTTPError,
+    AGFSInternalError,
+    AGFSInvalidOperationError,
+    AGFSInvalidPathError,
+    AGFSIoError,
+    AGFSIsADirectoryError,
+    AGFSMountPointExistsError,
+    AGFSMountPointNotFoundError,
+    AGFSNetworkError,
+    AGFSNotADirectoryError,
+    AGFSNotFoundError,
     AGFSNotSupportedError,
+    AGFSPermissionDeniedError,
+    AGFSPluginError,
+    AGFSSerializationError,
     AGFSTimeoutError,
 )
 from .helpers import cp, download, upload
@@ -109,6 +126,23 @@ def get_binding_client():
     "AGFSTimeoutError",
     "AGFSHTTPError",
     "AGFSNotSupportedError",
+    "AGFSNotFoundError",
+    "AGFSAlreadyExistsError",
+    "AGFSFileExistsError",
+    "AGFSPermissionDeniedError",
+    "AGFSInvalidPathError",
+    "AGFSNotADirectoryError",
+    "AGFSIsADirectoryError",
+    "AGFSDirectoryNotEmptyError",
+    "AGFSInvalidOperationError",
+    "AGFSIoError",
+    "AGFSConfigError",
+    "AGFSMountPointNotFoundError",
+    "AGFSMountPointExistsError",
+    "AGFSSerializationError",
+    "AGFSNetworkError",
+    "AGFSInternalError",
+    "AGFSPluginError",
     "cp",
     "upload",
     "download",
diff --git a/openviking/pyagfs/exceptions.py b/openviking/pyagfs/exceptions.py
index 7247215c9..73fd756a3 100644
--- a/openviking/pyagfs/exceptions.py
+++ b/openviking/pyagfs/exceptions.py
@@ -31,3 +31,105 @@ class AGFSNotSupportedError(AGFSClientError):
     """Operation not supported by the server or filesystem (HTTP 501)"""
 
     pass
+
+
+class AGFSNotFoundError(AGFSClientError):
+    """File or directory not found"""
+
+    pass
+
+
+class AGFSAlreadyExistsError(AGFSClientError):
+    """File or directory already exists"""
+
+    pass
+
+
+class AGFSFileExistsError(AGFSAlreadyExistsError):
+    """File already exists (alias for AGFSAlreadyExistsError)"""
+
+    pass
+
+
+class AGFSPermissionDeniedError(AGFSClientError):
+    """Permission denied"""
+
+    pass
+
+
+class AGFSInvalidPathError(AGFSClientError):
+    """Invalid path"""
+
+    pass
+
+
+class AGFSNotADirectoryError(AGFSClientError):
+    """Not a directory"""
+
+    pass
+
+
+class AGFSIsADirectoryError(AGFSClientError):
+    """Is a directory (when file operation expected)"""
+
+    pass
+
+
+class AGFSDirectoryNotEmptyError(AGFSClientError):
+    """Directory not empty"""
+
+    pass
+
+
+class AGFSInvalidOperationError(AGFSClientError):
+    """Invalid operation"""
+
+    pass
+
+
+class AGFSIoError(AGFSClientError):
+    """I/O error"""
+
+    pass
+
+
+class AGFSConfigError(AGFSClientError):
+    """Configuration error"""
+
+    pass
+
+
+class AGFSMountPointNotFoundError(AGFSClientError):
+    """Mount point not found"""
+
+    pass
+
+
+class AGFSMountPointExistsError(AGFSClientError):
+    """Mount point already exists"""
+
+    pass
+
+
+class AGFSSerializationError(AGFSClientError):
+    """Serialization error"""
+
+    pass
+
+
+class AGFSNetworkError(AGFSClientError):
+    """Network error"""
+
+    pass
+
+
+class AGFSInternalError(AGFSClientError):
+    """Internal error"""
+
+    pass
+
+
+class AGFSPluginError(AGFSClientError):
+    """Plugin error"""
+
+    pass
diff --git a/openviking/server/routers/content.py b/openviking/server/routers/content.py
index 7801546d0..188166814 100644
--- a/openviking/server/routers/content.py
+++ b/openviking/server/routers/content.py
@@ -9,12 +9,14 @@
 from fastapi.responses import Response as FastAPIResponse
 from pydantic import BaseModel, ConfigDict
 
+from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError
 from openviking.server.auth import get_request_context
 from openviking.server.dependencies import get_service
 from openviking.server.identity import RequestContext
 from openviking.server.models import ErrorInfo, Response
 from openviking.server.telemetry import run_operation
 from openviking.telemetry import TelemetryRequest
+from openviking_cli.exceptions import NotFoundError
 from openviking_cli.utils import get_logger
 
 logger = get_logger(__name__)
@@ -55,7 +57,16 @@ async def read(
 ):
     """Read file content (L2)."""
     service = get_service()
-    result = await service.fs.read(uri, ctx=_ctx, offset=offset, limit=limit)
+    try:
+        result = await service.fs.read(uri, ctx=_ctx, offset=offset, limit=limit)
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
 
     # 清理MEMORY_FIELDS隐藏注释（v2记忆加工过程中的临时内部数据，不暴露给外部用户）
     if isinstance(result, bytes):
@@ -80,7 +91,16 @@ async def abstract(
 ):
     """Read L0 abstract."""
     service = get_service()
-    result = await service.fs.abstract(uri, ctx=_ctx)
+    try:
+        result = await service.fs.abstract(uri, ctx=_ctx)
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
     return Response(status="ok", result=result)
 
 
@@ -91,7 +111,16 @@ async def overview(
 ):
     """Read L1 overview."""
     service = get_service()
-    result = await service.fs.overview(uri, ctx=_ctx)
+    try:
+        result = await service.fs.overview(uri, ctx=_ctx)
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
     return Response(status="ok", result=result)
 
 
@@ -102,7 +131,16 @@ async def download(
 ):
     """Download file as raw bytes (for images, binaries, etc.)."""
     service = get_service()
-    content = await service.fs.read_file_bytes(uri, ctx=_ctx)
+    try:
+        content = await service.fs.read_file_bytes(uri, ctx=_ctx)
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
 
     # Try to get filename from stat
     filename = "download"
diff --git a/openviking/server/routers/filesystem.py b/openviking/server/routers/filesystem.py
index b5dcdb379..4c27e0f88 100644
--- a/openviking/server/routers/filesystem.py
+++ b/openviking/server/routers/filesystem.py
@@ -7,7 +7,7 @@
 from fastapi import APIRouter, Depends, Query
 from pydantic import BaseModel
 
-from openviking.pyagfs.exceptions import AGFSClientError
+from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError
 from openviking.server.auth import get_request_context
 from openviking.server.dependencies import get_service
 from openviking.server.identity import RequestContext
@@ -32,16 +32,25 @@ async def ls(
     """List directory contents."""
     service = get_service()
     actual_node_limit = limit if limit is not None else node_limit
-    result = await service.fs.ls(
-        uri,
-        ctx=_ctx,
-        recursive=recursive,
-        simple=simple,
-        output=output,
-        abs_limit=abs_limit,
-        show_all_hidden=show_all_hidden,
-        node_limit=actual_node_limit,
-    )
+    try:
+        result = await service.fs.ls(
+            uri,
+            ctx=_ctx,
+            recursive=recursive,
+            simple=simple,
+            output=output,
+            abs_limit=abs_limit,
+            show_all_hidden=show_all_hidden,
+            node_limit=actual_node_limit,
+        )
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
     return Response(status="ok", result=result)
 
 
@@ -59,15 +68,24 @@ async def tree(
     """Get directory tree."""
     service = get_service()
     actual_node_limit = limit if limit is not None else node_limit
-    result = await service.fs.tree(
-        uri,
-        ctx=_ctx,
-        output=output,
-        abs_limit=abs_limit,
-        show_all_hidden=show_all_hidden,
-        node_limit=actual_node_limit,
-        level_limit=level_limit,
-    )
+    try:
+        result = await service.fs.tree(
+            uri,
+            ctx=_ctx,
+            output=output,
+            abs_limit=abs_limit,
+            show_all_hidden=show_all_hidden,
+            node_limit=actual_node_limit,
+            level_limit=level_limit,
+        )
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
     return Response(status="ok", result=result)
 
 
@@ -81,7 +99,10 @@ async def stat(
     try:
         result = await service.fs.stat(uri, ctx=_ctx)
         return Response(status="ok", result=result)
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
     except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
         err_msg = str(e).lower()
         if "not found" in err_msg or "no such file or directory" in err_msg:
             raise NotFoundError(uri, "file")
@@ -101,7 +122,14 @@ async def mkdir(
 ):
     """Create directory."""
     service = get_service()
-    await service.fs.mkdir(request.uri, ctx=_ctx)
+    try:
+        await service.fs.mkdir(request.uri, ctx=_ctx)
+    except AGFSClientError as e:
+        # Handle common AGFS errors
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(request.uri, "file")
+        raise
     return Response(status="ok", result={"uri": request.uri})
 
 
@@ -113,7 +141,16 @@ async def rm(
 ):
     """Remove resource."""
     service = get_service()
-    await service.fs.rm(uri, ctx=_ctx, recursive=recursive)
+    try:
+        await service.fs.rm(uri, ctx=_ctx, recursive=recursive)
+    except AGFSNotFoundError:
+        raise NotFoundError(uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(uri, "file")
+        raise
     return Response(status="ok", result={"uri": uri})
 
 
@@ -131,5 +168,14 @@ async def mv(
 ):
     """Move resource."""
     service = get_service()
-    await service.fs.mv(request.from_uri, request.to_uri, ctx=_ctx)
+    try:
+        await service.fs.mv(request.from_uri, request.to_uri, ctx=_ctx)
+    except AGFSNotFoundError:
+        raise NotFoundError(request.from_uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(request.from_uri, "file")
+        raise
     return Response(status="ok", result={"from": request.from_uri, "to": request.to_uri})
diff --git a/openviking/server/routers/search.py b/openviking/server/routers/search.py
index f8c14ead8..001351fca 100644
--- a/openviking/server/routers/search.py
+++ b/openviking/server/routers/search.py
@@ -8,12 +8,14 @@
 from fastapi import APIRouter, Depends
 from pydantic import BaseModel
 
+from openviking.pyagfs.exceptions import AGFSClientError, AGFSNotFoundError
 from openviking.server.auth import get_request_context
 from openviking.server.dependencies import get_service
 from openviking.server.identity import RequestContext
 from openviking.server.models import Response
 from openviking.server.telemetry import run_operation
 from openviking.telemetry import TelemetryRequest
+from openviking_cli.exceptions import NotFoundError
 
 
 def _sanitize_floats(obj: Any) -> Any:
@@ -156,15 +158,24 @@ async def grep(
 ):
     """Content search with pattern."""
     service = get_service()
-    result = await service.fs.grep(
-        request.uri,
-        request.pattern,
-        ctx=_ctx,
-        exclude_uri=request.exclude_uri,
-        case_insensitive=request.case_insensitive,
-        node_limit=request.node_limit,
-        level_limit=request.level_limit,
-    )
+    try:
+        result = await service.fs.grep(
+            request.uri,
+            request.pattern,
+            ctx=_ctx,
+            exclude_uri=request.exclude_uri,
+            case_insensitive=request.case_insensitive,
+            node_limit=request.node_limit,
+            level_limit=request.level_limit,
+        )
+    except AGFSNotFoundError:
+        raise NotFoundError(request.uri, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(request.uri, "file")
+        raise
     return Response(status="ok", result=result)
 
 
@@ -175,7 +186,16 @@ async def glob(
 ):
     """File pattern matching."""
     service = get_service()
-    result = await service.fs.glob(
-        request.pattern, ctx=_ctx, uri=request.uri, node_limit=request.node_limit
-    )
+    try:
+        result = await service.fs.glob(
+            request.pattern, ctx=_ctx, uri=request.uri, node_limit=request.node_limit
+        )
+    except AGFSNotFoundError:
+        raise NotFoundError(request.uri or request.pattern, "file")
+    except AGFSClientError as e:
+        # Fallback for older versions without typed exceptions
+        err_msg = str(e).lower()
+        if "not found" in err_msg or "no such file or directory" in err_msg:
+            raise NotFoundError(request.uri or request.pattern, "file")
+        raise
     return Response(status="ok", result=result)
diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py
index f1bbaf906..c83cc67cd 100644
--- a/openviking/storage/viking_fs.py
+++ b/openviking/storage/viking_fs.py
@@ -23,12 +23,12 @@
 from pathlib import PurePath
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
-from openviking.pyagfs.exceptions import AGFSClientError, AGFSHTTPError
+from openviking.pyagfs.exceptions import AGFSClientError, AGFSDirectoryNotEmptyError, AGFSHTTPError
 from openviking.resource.watch_storage import is_watch_task_control_uri
 from openviking.server.identity import RequestContext, Role
 from openviking.telemetry import get_current_telemetry
 from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime
-from openviking_cli.exceptions import NotFoundError
+from openviking_cli.exceptions import FailedPreconditionError, NotFoundError
 from openviking_cli.session.user_id import UserIdentifier
 from openviking_cli.utils.logger import get_logger
 from openviking_cli.utils.uri import VikingURI
@@ -41,6 +41,22 @@
 logger = get_logger(__name__)
 
 
+def _is_directory_not_empty_error(message: str) -> bool:
+    """Check if an error message indicates a directory not empty error.
+
+    Handles multiple possible error message formats from different backends.
+    """
+    msg = message.lower()
+    return any(
+        pattern in msg
+        for pattern in [
+            "directory not empty",
+            "dir not empty",
+            "directory is not empty",
+        ]
+    )
+
+
 # ========== Dataclass ==========
 
 
@@ -410,7 +426,19 @@ async def rm(
                 uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx)
                 uris_to_delete.append(target_uri)
                 await self._delete_from_vector_store(uris_to_delete, ctx=ctx)
-                result = self.agfs.rm(path, recursive=recursive)
+                try:
+                    result = self.agfs.rm(path, recursive=recursive)
+                except AGFSDirectoryNotEmptyError:
+                    raise FailedPreconditionError(
+                        f"Directory not empty: {uri}. Use recursive=True to delete non-empty directories."
+                    )
+                except RuntimeError as e:
+                    # Fallback for older versions without typed exceptions
+                    if _is_directory_not_empty_error(str(e)):
+                        raise FailedPreconditionError(
+                            f"Directory not empty: {uri}. Use recursive=True to delete non-empty directories."
+                        )
+                    raise
                 return result
         except LockAcquisitionError:
             raise ResourceBusyError(f"Resource is being processed: {uri}")
diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py
index 9bbf7ad6e..105f5fb77 100644
--- a/openviking/utils/media_processor.py
+++ b/openviking/utils/media_processor.py
@@ -7,7 +7,13 @@
 
 from openviking.parse import DocumentConverter, parse
 from openviking.parse.accessors.base import SourceType
+from openviking.parse.accessors.mime_types import IANA_MEDIA_TYPE_TO_EXTENSION
 from openviking.parse.base import ParseResult
+from openviking.parse.parsers.constants import (
+    CODE_EXTENSIONS,
+    DOCUMENTATION_EXTENSIONS,
+    IGNORE_EXTENSIONS,
+)
 from openviking.server.local_input_guard import (
     is_remote_resource_source,
     looks_like_local_path,
@@ -15,6 +21,40 @@
 from openviking_cli.exceptions import PermissionDeniedError
 from openviking_cli.utils.logger import get_logger
 
+# All known valid extensions - only these should be stripped when getting stem
+# Build from multiple sources:
+# 1. IANA media type mappings (comprehensive)
+# 2. Code/documentation/ignore extensions (parser-specific)
+KNOWN_EXTENSIONS: set[str] = set()
+for extensions in IANA_MEDIA_TYPE_TO_EXTENSION.values():
+    KNOWN_EXTENSIONS.update(extensions)
+KNOWN_EXTENSIONS.update(CODE_EXTENSIONS)
+KNOWN_EXTENSIONS.update(DOCUMENTATION_EXTENSIONS)
+KNOWN_EXTENSIONS.update(IGNORE_EXTENSIONS)
+
+
+def _smart_stem(path_or_name: str | Path) -> str:
+    """Get the stem of a filename, but only strip known valid extensions.
+
+    For filenames like "2601.00014" where ".00014" is not a valid extension,
+    returns the full name instead of just "2601".
+
+    Args:
+        path_or_name: Path object or string filename
+
+    Returns:
+        Stem with only known extensions stripped
+    """
+    path = Path(path_or_name)
+    suffix = path.suffix.lower()
+
+    if suffix in KNOWN_EXTENSIONS:
+        return path.stem
+
+    # If the suffix is not a known extension, treat the whole name as the stem
+    return path.name
+
+
 if TYPE_CHECKING:
     from openviking.parse.vlm import VLMProcessor
     from openviking_cli.utils.storage import StoragePath
@@ -113,7 +153,7 @@ async def process(
             # Set resource_name from source_name or path
             source_name = kwargs.get("source_name")
             if source_name:
-                parse_kwargs["resource_name"] = Path(source_name).stem
+                parse_kwargs["resource_name"] = _smart_stem(source_name)
                 parse_kwargs.setdefault("source_name", source_name)
             else:
                 # For git repositories, use repo_name from meta if available
@@ -125,10 +165,10 @@ async def process(
                     # Prefer original_filename from meta for HTTP downloads
                     original_filename = local_resource.meta.get("original_filename")
                     if original_filename:
-                        parse_kwargs.setdefault("resource_name", Path(original_filename).stem)
+                        parse_kwargs.setdefault("resource_name", _smart_stem(original_filename))
                         parse_kwargs.setdefault("source_name", original_filename)
                     else:
-                        parse_kwargs.setdefault("resource_name", local_resource.path.stem)
+                        parse_kwargs.setdefault("resource_name", _smart_stem(local_resource.path))
 
             # If it's a directory, use DirectoryParser which will delegate to CodeRepositoryParser if it's a git repo
             if local_resource.path.is_dir():
diff --git a/openviking_cli/utils/uri.py b/openviking_cli/utils/uri.py
index 61ef4d1c7..d4ab5af9b 100644
--- a/openviking_cli/utils/uri.py
+++ b/openviking_cli/utils/uri.py
@@ -227,21 +227,23 @@ def sanitize_segment(text: str) -> str:
             URI-safe string
         """
         # Preserve:
-        # - Letters, numbers, underscores, hyphens (\w includes [a-zA-Z0-9_])
+        # - Letters, numbers, underscores, hyphens, dots (\w includes [a-zA-Z0-9_])
+        # - Latin-1 Supplement, Latin Extended-A/B (Western European languages: Spanish, Portuguese, French, German, etc.)
+        # - Cyrillic (Russian, Bulgarian, Serbian, etc.)
+        # - Arabic (Arabic, Persian, Urdu, etc.)
         # - CJK Unified Ideographs (Chinese, Japanese Kanji, Korean Hanja)
         # - Hiragana and Katakana (Japanese)
         # - Hangul Syllables (Korean)
-        # - CJK Unified Ideographs Extension A
-        # - CJK Unified Ideographs Extension B
+        # - CJK Unified Ideographs Extension A/B
         safe = re.sub(
-            r"[^\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-]",
+            r"[^\w\u0080-\u02af\u0400-\u052f\u0600-\u077f\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3400-\u4dbf\U00020000-\U0002a6df\-.]",
             "_",
             text,
         )
         # Merge consecutive underscores
         safe = re.sub(r"_+", "_", safe)
-        # Strip leading/trailing underscores and limit length
-        safe = safe.strip("_")[:50]
+        # Strip leading/trailing underscores and dots, limit length
+        safe = safe.strip("_.")[:50]
         return safe or "unnamed"
 
     def __str__(self) -> str:

From c4550e733ffb9a06cc1a91980432dbc8553132b1 Mon Sep 17 00:00:00 2001
From: openviking <openviking@example.com>
Date: Wed, 15 Apr 2026 10:25:09 +0800
Subject: [PATCH 15/15] fix: multiple improvements for filename handling,
 network access, and ragfs exceptions

---
 openviking/parse/accessors/http_accessor.py   |  3 +++
 tests/server/test_api_local_input_security.py | 12 +-----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/openviking/parse/accessors/http_accessor.py b/openviking/parse/accessors/http_accessor.py
index 19434705e..6c9941145 100644
--- a/openviking/parse/accessors/http_accessor.py
+++ b/openviking/parse/accessors/http_accessor.py
@@ -25,6 +25,7 @@
 from openviking.parse.base import lazy_import
 from openviking.parse.parsers.constants import CODE_EXTENSIONS
 from openviking.utils.network_guard import build_httpx_request_validation_hooks
+from openviking_cli.exceptions import PermissionDeniedError
 from openviking_cli.utils.logger import get_logger
 
 from .base import DataAccessor, LocalResource, SourceType
@@ -192,6 +193,8 @@ async def detect(
                     if url_type != URLType.UNKNOWN:
                         return url_type, meta
 
+        except PermissionDeniedError:
+            raise
         except Exception as e:
             meta["detection_error"] = str(e)
             logger.debug(f"[URLTypeDetector] HEAD request failed: {e}, falling back to default")
diff --git a/tests/server/test_api_local_input_security.py b/tests/server/test_api_local_input_security.py
index 351f2be39..9564ea163 100644
--- a/tests/server/test_api_local_input_security.py
+++ b/tests/server/test_api_local_input_security.py
@@ -11,7 +11,7 @@
 import httpx
 import pytest
 
-from openviking.parse.parsers.html import HTMLParser, URLTypeDetector
+from openviking.parse.parsers.html import URLTypeDetector
 from openviking.utils.network_guard import ensure_public_remote_target
 from openviking_cli.exceptions import PermissionDeniedError
 
@@ -227,13 +227,3 @@ async def test_url_detector_request_validator_blocks_loopback_head(loopback_http
             timeout=2.0,
             request_validator=ensure_public_remote_target,
         )
-
-
-async def test_html_parser_request_validator_blocks_loopback_fetch(loopback_http_url: str):
-    parser = HTMLParser(timeout=2.0)
-
-    with pytest.raises(PermissionDeniedError):
-        await parser._fetch_html(
-            loopback_http_url,
-            request_validator=ensure_public_remote_target,
-        )