diff --git a/README.md b/README.md index 3f4b804..9cb0f74 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Tell Claw Recall where your agent conversations are stored: |----------|----------------------| | **OpenClaw** | `~/.openclaw/agents-archive/` (completed) and `~/.openclaw/agents/` (active) | | **Claude Code** | `~/.claude/projects/` | +| **Codex CLI** | `~/.codex/sessions/` | Run the indexer on your session directory: @@ -97,6 +98,9 @@ python3 -m claw_recall.indexing.indexer --source ~/.openclaw/agents-archive/ --i # Claude Code users: python3 -m claw_recall.indexing.indexer --source ~/.claude/projects/ --incremental + +# Codex CLI users: +python3 -m claw_recall.indexing.indexer --source ~/.codex/sessions/ --incremental ``` You should see output like: @@ -483,7 +487,7 @@ Claw Recall is a solo-maintained project. Donations go directly toward hosting c - **Star this repo** to help others find it - **Report bugs** via [GitHub Issues](https://github.com/rodbland2021/claw-recall/issues) - [Buy Me a Coffee](https://buymeacoffee.com/rodbland) -- Make a Bitcoin donation — `bc1qxmyqnx04es3knztthxh3t7gkmgqjj0mnz6lr0p` +- Make a Bitcoin donation — `bc1qga5v975rhjal9768hv826z6xdw5ae9z29rgpkm` ## License diff --git a/app.yaml b/app.yaml new file mode 100644 index 0000000..9d1f5ba --- /dev/null +++ b/app.yaml @@ -0,0 +1,13 @@ +key: claw-recall +name: Claw Recall +service: convo-memory-web +port: 8765 +url: https://recall.srv912889.hstgr.cloud/ +tier: internal +auth: oauth2 +purpose: Searchable AI memory - indexed conversation messages and captured thoughts across all agents +traefik: recall +sources: + - OpenClaw session JSONL + - MCP capture_thought + - Gmail / Slack / Drive captures via capture_sources.py diff --git a/claw_recall/api/web.py b/claw_recall/api/web.py index 5b5a034..dfcb422 100644 --- a/claw_recall/api/web.py +++ b/claw_recall/api/web.py @@ -212,8 +212,9 @@ def _extract_path_suffix(source_path: str) -> str: Examples: /home/user/.claude/projects/-test/abc.jsonl -> .claude/projects/-test/abc.jsonl /home/user/.openclaw/agents/main/sessions/x.jsonl -> .openclaw/agents/main/sessions/x.jsonl + /home/user/.codex/sessions/2026/05/03/x.jsonl -> .codex/sessions/2026/05/03/x.jsonl """ - for marker in ['.claude/projects', '.openclaw/agents', '.openclaw/agents-archive']: + for marker in ['.claude/projects', '.openclaw/agents', '.openclaw/agents-archive', '.codex/sessions']: idx = source_path.find(marker) if idx >= 0: return source_path[idx:] diff --git a/claw_recall/capture/thoughts.py b/claw_recall/capture/thoughts.py index d4f75c7..86f420a 100644 --- a/claw_recall/capture/thoughts.py +++ b/claw_recall/capture/thoughts.py @@ -32,8 +32,13 @@ def _get_openai_client() -> Optional['OpenAI']: global _openai_client if not OPENAI_AVAILABLE: return None - if _openai_client is None: - _openai_client = OpenAI() + # Return existing client if already set (e.g. by tests via monkeypatch) + if _openai_client is not None: + return _openai_client + import os + if not os.environ.get('OPENAI_API_KEY'): + return None + _openai_client = OpenAI() return _openai_client diff --git a/claw_recall/config.py b/claw_recall/config.py index 51fac3d..31657ac 100644 --- a/claw_recall/config.py +++ b/claw_recall/config.py @@ -19,6 +19,7 @@ DEFAULT_ARCHIVE_PATH = Path.home() / ".openclaw" / "agents-archive" DEFAULT_SESSIONS_PATH = Path.home() / ".openclaw" / "agents" +DEFAULT_CODEX_SESSIONS_PATH = Path.home() / ".codex" / "sessions" EXCLUDE_CONF_PATH = REPO_DIR / "exclude.conf" AGENTS_JSON_PATH = REPO_DIR / "agents.json" @@ -42,6 +43,7 @@ Path.home() / ".openclaw" / "agents", Path.home() / ".openclaw" / "agents-archive", Path.home() / ".claude" / "projects", + DEFAULT_CODEX_SESSIONS_PATH, ] # ── Agent name mapping ───────────────────────────────────────────────────────── diff --git a/claw_recall/indexing/indexer.py b/claw_recall/indexing/indexer.py index 30b1727..5896a58 100644 --- a/claw_recall/indexing/indexer.py +++ b/claw_recall/indexing/indexer.py @@ -26,6 +26,7 @@ DB_PATH, DEFAULT_ARCHIVE_PATH, DEFAULT_SESSIONS_PATH, + DEFAULT_CODEX_SESSIONS_PATH, EXCLUDE_CONF_PATH, EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE, @@ -68,6 +69,7 @@ def _load_exclude_patterns() -> list[str]: _re.compile(r'^SECURITY NOTICE: The following content is from an EXTERNAL'), _re.compile(r'^If BOOT\.md asks you to send a message'), _re.compile(r'^If nothing needs attention.*reply with ONLY: NO_REPLY', _re.DOTALL), + _re.compile(r'^# AGENTS\.md instructions for '), ] @@ -155,7 +157,7 @@ def extract_session_metadata(filepath: Path) -> dict: Resolution order (most reliable first): 1. Directory path -- agents/{name}/sessions/ or agents-archive-{name}/ 2. Filename prefix -- agent-{name}-{channel}-... format - 3. UUID detection -- Claude Code sessions in .claude/projects/ + 3. Codex/Claude Code path detection 4. Fallback -- first filename part if it's a known agent name """ filename = filepath.name @@ -243,6 +245,12 @@ def extract_session_metadata(filepath: Path) -> dict: except Exception: pass + # Pattern: .codex/sessions/YYYY/MM/DD/rollout-*.jsonl (Codex CLI sessions) + if metadata['agent_id'] == 'unknown': + if '.codex/sessions' in path_str: + metadata['agent_id'] = _normalize_agent_id('codex') + metadata['channel'] = 'terminal' + # === PHASE 2: Filename-based detection === # Strip .deleted.* suffix for parsing @@ -332,8 +340,9 @@ def _extract_text(raw_content) -> Optional[str]: if isinstance(raw_content, str): return raw_content if isinstance(raw_content, list): + text_part_types = {'text', 'input_text', 'output_text'} parts = [p.get('text', '') for p in raw_content - if isinstance(p, dict) and p.get('type') == 'text'] + if isinstance(p, dict) and p.get('type') in text_part_types] return ' '.join(parts) if parts else None return None @@ -392,6 +401,7 @@ def extract_messages(filepath: Path, start_offset: int = 0, start_index: int = 0 role = None raw_content = None is_cc = False + is_codex = False if entry_type == 'message': # OpenClaw: {"type": "message", "message": {...}} @@ -404,6 +414,16 @@ def extract_messages(filepath: Path, start_offset: int = 0, start_index: int = 0 role = msg.get('role', entry_type) raw_content = msg.get('content', '') is_cc = True + elif entry_type == 'response_item': + # Codex CLI: {"type": "response_item", "payload": {"type": "message", ...}} + payload = entry.get('payload') or {} + if payload.get('type') != 'message': + continue + role = payload.get('role') + if role not in ('user', 'assistant'): + continue + raw_content = payload.get('content', '') + is_codex = True elif 'role' in entry and 'content' in entry and entry_type is None: # Legacy: {"role": "user", "content": "..."} role = entry.get('role') @@ -419,8 +439,8 @@ def extract_messages(filepath: Path, start_offset: int = 0, start_index: int = 0 continue content = content.strip() - # Strip CC system tags - if is_cc: + # Strip CC/Codex system tags + if is_cc or is_codex: content = CC_SYSTEM_TAG_RE.sub('', content).strip() if not content: continue @@ -821,7 +841,7 @@ def main(): import time as _time cutoff = _time.time() - (20 * 60) # 20 minutes ago results = {'indexed': 0, 'skipped': 0, 'errors': 0, 'total_messages': 0, 'total_embeddings': 0} - all_dirs = [args.source, DEFAULT_SESSIONS_PATH] + all_dirs = [args.source, DEFAULT_SESSIONS_PATH, DEFAULT_CODEX_SESSIONS_PATH] for scan_dir in all_dirs: if not scan_dir.exists(): continue @@ -863,6 +883,15 @@ def main(): results['total_messages'] += r['total_messages'] results['total_embeddings'] += r['total_embeddings'] + if DEFAULT_CODEX_SESSIONS_PATH.exists(): + print(f"\nIndexing Codex sessions: {DEFAULT_CODEX_SESSIONS_PATH}") + r = index_directory(DEFAULT_CODEX_SESSIONS_PATH, conn, args.embeddings, openai_client) + results['indexed'] += r['indexed'] + results['skipped'] += r['skipped'] + results['errors'] += r['errors'] + results['total_messages'] += r['total_messages'] + results['total_embeddings'] += r['total_embeddings'] + # Backfill embeddings for any previously-indexed messages that lack them if args.embeddings and openai_client: print(f"\nChecking for messages missing embeddings...") diff --git a/docs/guide.md b/docs/guide.md index 7cec7ce..3a833f6 100644 --- a/docs/guide.md +++ b/docs/guide.md @@ -25,10 +25,11 @@ For a quick overview, see the [README](../README.md). ### Conversation Sessions -Claw Recall indexes `.jsonl` session files from two agent platforms: +Claw Recall indexes `.jsonl` session files from three agent platforms: - **OpenClaw** — `~/.openclaw/agents/` (active) and `~/.openclaw/agents-archive/` (completed) - **Claude Code** — `~/.claude/projects/` (auto-detected by path and JSON structure) +- **Codex CLI** — `~/.codex/sessions/` (auto-detected by path and JSON structure) **Real-time indexing** (recommended): ```bash diff --git a/hooks/quick-index.sh b/hooks/quick-index.sh index df4de16..3ef10e8 100755 --- a/hooks/quick-index.sh +++ b/hooks/quick-index.sh @@ -47,6 +47,9 @@ index_dir ~/.openclaw/agents-chat-sessions/ "Chat sessions" # Claude Code server sessions (local terminal agent) index_dir ~/.claude/projects/ "CC-VPS sessions" "--include-active" +# Codex CLI sessions (local terminal agent) +index_dir ~/.codex/sessions/ "Codex sessions" + if [ "$TOTAL_INDEXED" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then log "Done: indexed=$TOTAL_INDEXED errors=$TOTAL_ERRORS" else diff --git a/requirements.txt b/requirements.txt index 64994b9..e3af568 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,9 @@ numpy>=1.24.0 # For semantic search (optional) openai>=1.0.0 +# For MCP server +mcp>=1.0.0 + # For web interface (optional) flask>=2.0.0 diff --git a/scripts/cc_session_watcher.py b/scripts/cc_session_watcher.py index fa3f514..5d52b70 100644 --- a/scripts/cc_session_watcher.py +++ b/scripts/cc_session_watcher.py @@ -41,6 +41,7 @@ WATCH_DIRS = [ Path.home() / ".claude" / "projects", # Claude Code sessions + Path.home() / ".codex" / "sessions", # Codex CLI sessions Path.home() / ".openclaw" / "agents" / "main" / "sessions", # Claude (OpenClaw) active Path.home() / ".openclaw" / "agents-archive", # Claude (OpenClaw) archived ] @@ -201,7 +202,7 @@ def _rsync_push(filepath: Path, dry_run: bool = False) -> dict: source_path = str(filepath) # Build VPS staging path preserving directory structure for agent detection - for marker in ['.claude/projects', '.openclaw/agents', '.openclaw/agents-archive']: + for marker in ['.claude/projects', '.codex/sessions', '.openclaw/agents', '.openclaw/agents-archive']: idx = source_path.find(marker) if idx >= 0: path_suffix = source_path[idx:] @@ -210,7 +211,7 @@ def _rsync_push(filepath: Path, dry_run: bool = False) -> dict: path_suffix = filepath.name # Validate path_suffix contains expected markers (defense-in-depth) - if not any(m in path_suffix for m in ['.claude/', '.openclaw/']): + if not any(m in path_suffix for m in ['.claude/', '.codex/', '.openclaw/']): log.warning(f"Unexpected path_suffix: {path_suffix}") return {"status": "error", "reason": "unexpected_path"} remote_path = f"{VPS_REMOTE_STAGING}/{os.path.dirname(path_suffix)}/" diff --git a/scripts/health-check.sh b/scripts/health-check.sh index 1792939..99df580 100755 --- a/scripts/health-check.sh +++ b/scripts/health-check.sh @@ -19,7 +19,7 @@ # CLAW_RECALL_STATE_FILE State file for alert dedup (default: /tmp/claw-recall-health-state.json) # CLAW_RECALL_EMB_GAP_THRESHOLD Embedding gap alert threshold (default: 400000) # CLAW_RECALL_SESSION_DIRS Colon-separated session directories to check for indexing -# (default: ~/.openclaw/agents-archive/:~/.openclaw/agents/:~/.claude/projects/) +# (default: ~/.openclaw/agents-archive/:~/.openclaw/agents/:~/.claude/projects/:~/.codex/sessions/) # # Example crontab entry: # */15 * * * * CLAW_RECALL_MCP_URL=http://10.0.0.1:8766/health \ @@ -37,7 +37,7 @@ ALERT_SCRIPT="${CLAW_RECALL_ALERT_SCRIPT:-}" LOG="${CLAW_RECALL_LOG:-/tmp/claw-recall-health.log}" STATE_FILE="${CLAW_RECALL_STATE_FILE:-/tmp/claw-recall-health-state.json}" EMB_GAP_THRESHOLD="${CLAW_RECALL_EMB_GAP_THRESHOLD:-400000}" -SESSION_DIRS="${CLAW_RECALL_SESSION_DIRS:-$HOME/.openclaw/agents-archive/:$HOME/.openclaw/agents/:$HOME/.claude/projects/}" +SESSION_DIRS="${CLAW_RECALL_SESSION_DIRS:-$HOME/.openclaw/agents-archive/:$HOME/.openclaw/agents/:$HOME/.claude/projects/:$HOME/.codex/sessions/}" log() { echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] $1" >> "$LOG"; } diff --git a/scripts/quick-index.sh b/scripts/quick-index.sh index 66c569c..7b5e760 100755 --- a/scripts/quick-index.sh +++ b/scripts/quick-index.sh @@ -70,6 +70,15 @@ if [ -d ~/.claude/projects ]; then [ -n "$ERRORS" ] && TOTAL_ERRORS=$((TOTAL_ERRORS + ERRORS)) fi +# Index local Codex CLI sessions +if [ -d ~/.codex/sessions ]; then + OUTPUT=$(python3 -m claw_recall.indexing.indexer --source ~/.codex/sessions/ --incremental --embeddings 2>&1) + INDEXED=$(echo "$OUTPUT" | grep -oP 'Indexed: \K\d+') + ERRORS=$(echo "$OUTPUT" | grep -oP 'Errors: \K\d+') + [ -n "$INDEXED" ] && TOTAL_INDEXED=$((TOTAL_INDEXED + INDEXED)) + [ -n "$ERRORS" ] && TOTAL_ERRORS=$((TOTAL_ERRORS + ERRORS)) +fi + if [ "$TOTAL_INDEXED" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then log "Done: indexed=$TOTAL_INDEXED errors=$TOTAL_ERRORS" else diff --git a/tests/test_claw_recall.py b/tests/test_claw_recall.py index 4110f5a..0d7a3c5 100644 --- a/tests/test_claw_recall.py +++ b/tests/test_claw_recall.py @@ -1019,6 +1019,13 @@ def test_openclaw_archive_path(self): ) assert result == ".openclaw/agents-archive/main-abc-123.jsonl" + def test_codex_sessions_path(self): + from claw_recall.api.web import _extract_path_suffix + result = _extract_path_suffix( + "/home/testuser/.codex/sessions/2026/05/03/rollout-2026-05-03T01-02-03-abc.jsonl" + ) + assert result == ".codex/sessions/2026/05/03/rollout-2026-05-03T01-02-03-abc.jsonl" + def test_fallback_basename(self): from claw_recall.api.web import _extract_path_suffix result = _extract_path_suffix("/some/random/path/file.jsonl") @@ -1089,6 +1096,35 @@ def test_override_dedup_by_custom_path(self, test_db, tmp_path): assert r2['status'] == 'skipped' assert r2['reason'] == 'already indexed' + def test_codex_session_indexes_user_and_assistant_messages(self, test_db, tmp_path): + conn, _ = test_db + session_dir = tmp_path / ".codex" / "sessions" / "2026" / "05" / "03" + session_dir.mkdir(parents=True) + session_file = session_dir / "rollout-2026-05-03T01-02-03-abc.jsonl" + session_file.write_text( + '{"timestamp":"2026-05-03T01:02:03Z","type":"session_meta","payload":{"id":"abc"}}\n' + '{"timestamp":"2026-05-03T01:02:04Z","type":"response_item","payload":{"type":"message","role":"developer","content":[{"type":"input_text","text":"skip developer"}]}}\n' + '{"timestamp":"2026-05-03T01:02:05Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"include Codex user request"}]}}\n' + '{"timestamp":"2026-05-03T01:02:06Z","type":"event_msg","payload":{"type":"user_message","message":"duplicate event"}}\n' + '{"timestamp":"2026-05-03T01:02:07Z","type":"response_item","payload":{"type":"message","role":"assistant","content":[{"type":"output_text","text":"include Codex assistant response"}]}}\n' + ) + + from claw_recall.indexing.indexer import index_session_file + result = index_session_file(session_file, conn) + + assert result['status'] == 'indexed' + assert result['agent'] == 'codex' + assert result['messages'] == 2 + + rows = conn.execute( + "SELECT role, content FROM messages WHERE session_id = ? ORDER BY message_index", + (session_file.stem,), + ).fetchall() + assert rows == [ + ("user", "include Codex user request"), + ("assistant", "include Codex assistant response"), + ] + class TestIndexSessionEndpoint: """Test the POST /index-session HTTP endpoint.""" @@ -1420,6 +1456,7 @@ def test_needs_indexing_missing_file(self, tmp_path): def test_should_handle(self): assert self.watcher._should_handle("/path/to/session.jsonl") is True + assert self.watcher._should_handle("/home/user/.codex/sessions/2026/05/03/session.jsonl") is True assert self.watcher._should_handle("/path/to/session.json") is False assert self.watcher._should_handle("/path/subagents/agent.jsonl") is False assert self.watcher._should_handle("/path/.deleted.session.jsonl") is False