Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Tell Claw Recall where your agent conversations are stored:
|----------|----------------------|
| **OpenClaw** | `~/.openclaw/agents-archive/` (completed) and `~/.openclaw/agents/` (active) |
| **Claude Code** | `~/.claude/projects/` |
| **Codex CLI** | `~/.codex/sessions/` |

Run the indexer on your session directory:

Expand All @@ -97,6 +98,9 @@ python3 -m claw_recall.indexing.indexer --source ~/.openclaw/agents-archive/ --i

# Claude Code users:
python3 -m claw_recall.indexing.indexer --source ~/.claude/projects/ --incremental

# Codex CLI users:
python3 -m claw_recall.indexing.indexer --source ~/.codex/sessions/ --incremental
```

You should see output like:
Expand Down Expand Up @@ -483,7 +487,7 @@ Claw Recall is a solo-maintained project. Donations go directly toward hosting c
- **Star this repo** to help others find it
- **Report bugs** via [GitHub Issues](https://github.com/rodbland2021/claw-recall/issues)
- [Buy Me a Coffee](https://buymeacoffee.com/rodbland)
- Make a Bitcoin donation — `bc1qxmyqnx04es3knztthxh3t7gkmgqjj0mnz6lr0p`
- Make a Bitcoin donation — `bc1qga5v975rhjal9768hv826z6xdw5ae9z29rgpkm`

## License

Expand Down
13 changes: 13 additions & 0 deletions app.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# app.yaml: descriptor for the Claw Recall web service.
# NOTE(review): the tool that consumes this file is not visible here; confirm
# the exact meaning of each field against that tool's schema.
key: claw-recall
name: Claw Recall
service: convo-memory-web
port: 8765
url: https://recall.srv912889.hstgr.cloud/
tier: internal
auth: oauth2
purpose: Searchable AI memory - indexed conversation messages and captured thoughts across all agents
traefik: recall
# Inputs that feed the searchable index described in `purpose`.
sources:
- OpenClaw session JSONL
- MCP capture_thought
- Gmail / Slack / Drive captures via capture_sources.py
3 changes: 2 additions & 1 deletion claw_recall/api/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,9 @@ def _extract_path_suffix(source_path: str) -> str:
Examples:
/home/user/.claude/projects/-test/abc.jsonl -> .claude/projects/-test/abc.jsonl
/home/user/.openclaw/agents/main/sessions/x.jsonl -> .openclaw/agents/main/sessions/x.jsonl
/home/user/.codex/sessions/2026/05/03/x.jsonl -> .codex/sessions/2026/05/03/x.jsonl
"""
for marker in ['.claude/projects', '.openclaw/agents', '.openclaw/agents-archive']:
for marker in ['.claude/projects', '.openclaw/agents', '.openclaw/agents-archive', '.codex/sessions']:
idx = source_path.find(marker)
if idx >= 0:
return source_path[idx:]
Expand Down
9 changes: 7 additions & 2 deletions claw_recall/capture/thoughts.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ def _get_openai_client() -> Optional['OpenAI']:
global _openai_client
if not OPENAI_AVAILABLE:
return None
if _openai_client is None:
_openai_client = OpenAI()
# Return existing client if already set (e.g. by tests via monkeypatch)
if _openai_client is not None:
return _openai_client
import os
if not os.environ.get('OPENAI_API_KEY'):
return None
_openai_client = OpenAI()
return _openai_client


Expand Down
2 changes: 2 additions & 0 deletions claw_recall/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

DEFAULT_ARCHIVE_PATH = Path.home() / ".openclaw" / "agents-archive"
DEFAULT_SESSIONS_PATH = Path.home() / ".openclaw" / "agents"
DEFAULT_CODEX_SESSIONS_PATH = Path.home() / ".codex" / "sessions"

EXCLUDE_CONF_PATH = REPO_DIR / "exclude.conf"
AGENTS_JSON_PATH = REPO_DIR / "agents.json"
Expand All @@ -42,6 +43,7 @@
Path.home() / ".openclaw" / "agents",
Path.home() / ".openclaw" / "agents-archive",
Path.home() / ".claude" / "projects",
DEFAULT_CODEX_SESSIONS_PATH,
]

# ── Agent name mapping ─────────────────────────────────────────────────────────
Expand Down
39 changes: 34 additions & 5 deletions claw_recall/indexing/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
DB_PATH,
DEFAULT_ARCHIVE_PATH,
DEFAULT_SESSIONS_PATH,
DEFAULT_CODEX_SESSIONS_PATH,
EXCLUDE_CONF_PATH,
EMBEDDING_MODEL,
EMBEDDING_BATCH_SIZE,
Expand Down Expand Up @@ -68,6 +69,7 @@ def _load_exclude_patterns() -> list[str]:
_re.compile(r'^SECURITY NOTICE: The following content is from an EXTERNAL'),
_re.compile(r'^If BOOT\.md asks you to send a message'),
_re.compile(r'^If nothing needs attention.*reply with ONLY: NO_REPLY', _re.DOTALL),
_re.compile(r'^# AGENTS\.md instructions for '),
]


Expand Down Expand Up @@ -155,7 +157,7 @@ def extract_session_metadata(filepath: Path) -> dict:
Resolution order (most reliable first):
1. Directory path -- agents/{name}/sessions/ or agents-archive-{name}/
2. Filename prefix -- agent-{name}-{channel}-... format
3. UUID detection -- Claude Code sessions in .claude/projects/
3. Codex/Claude Code path detection
4. Fallback -- first filename part if it's a known agent name
"""
filename = filepath.name
Expand Down Expand Up @@ -243,6 +245,12 @@ def extract_session_metadata(filepath: Path) -> dict:
except Exception:
pass

# Pattern: .codex/sessions/YYYY/MM/DD/rollout-*.jsonl (Codex CLI sessions)
if metadata['agent_id'] == 'unknown':
if '.codex/sessions' in path_str:
metadata['agent_id'] = _normalize_agent_id('codex')
metadata['channel'] = 'terminal'

# === PHASE 2: Filename-based detection ===

# Strip .deleted.* suffix for parsing
Expand Down Expand Up @@ -332,8 +340,9 @@ def _extract_text(raw_content) -> Optional[str]:
if isinstance(raw_content, str):
return raw_content
if isinstance(raw_content, list):
text_part_types = {'text', 'input_text', 'output_text'}
parts = [p.get('text', '') for p in raw_content
if isinstance(p, dict) and p.get('type') == 'text']
if isinstance(p, dict) and p.get('type') in text_part_types]
return ' '.join(parts) if parts else None
return None

Expand Down Expand Up @@ -392,6 +401,7 @@ def extract_messages(filepath: Path, start_offset: int = 0, start_index: int = 0
role = None
raw_content = None
is_cc = False
is_codex = False

if entry_type == 'message':
# OpenClaw: {"type": "message", "message": {...}}
Expand All @@ -404,6 +414,16 @@ def extract_messages(filepath: Path, start_offset: int = 0, start_index: int = 0
role = msg.get('role', entry_type)
raw_content = msg.get('content', '')
is_cc = True
elif entry_type == 'response_item':
# Codex CLI: {"type": "response_item", "payload": {"type": "message", ...}}
payload = entry.get('payload') or {}
if payload.get('type') != 'message':
continue
role = payload.get('role')
if role not in ('user', 'assistant'):
continue
raw_content = payload.get('content', '')
is_codex = True
elif 'role' in entry and 'content' in entry and entry_type is None:
# Legacy: {"role": "user", "content": "..."}
role = entry.get('role')
Expand All @@ -419,8 +439,8 @@ def extract_messages(filepath: Path, start_offset: int = 0, start_index: int = 0
continue
content = content.strip()

# Strip CC system tags
if is_cc:
# Strip CC/Codex system tags
if is_cc or is_codex:
content = CC_SYSTEM_TAG_RE.sub('', content).strip()
if not content:
continue
Expand Down Expand Up @@ -821,7 +841,7 @@ def main():
import time as _time
cutoff = _time.time() - (20 * 60) # 20 minutes ago
results = {'indexed': 0, 'skipped': 0, 'errors': 0, 'total_messages': 0, 'total_embeddings': 0}
all_dirs = [args.source, DEFAULT_SESSIONS_PATH]
all_dirs = [args.source, DEFAULT_SESSIONS_PATH, DEFAULT_CODEX_SESSIONS_PATH]
for scan_dir in all_dirs:
if not scan_dir.exists():
continue
Expand Down Expand Up @@ -863,6 +883,15 @@ def main():
results['total_messages'] += r['total_messages']
results['total_embeddings'] += r['total_embeddings']

if DEFAULT_CODEX_SESSIONS_PATH.exists():
print(f"\nIndexing Codex sessions: {DEFAULT_CODEX_SESSIONS_PATH}")
r = index_directory(DEFAULT_CODEX_SESSIONS_PATH, conn, args.embeddings, openai_client)
results['indexed'] += r['indexed']
results['skipped'] += r['skipped']
results['errors'] += r['errors']
results['total_messages'] += r['total_messages']
results['total_embeddings'] += r['total_embeddings']

# Backfill embeddings for any previously-indexed messages that lack them
if args.embeddings and openai_client:
print(f"\nChecking for messages missing embeddings...")
Expand Down
3 changes: 2 additions & 1 deletion docs/guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ For a quick overview, see the [README](../README.md).

### Conversation Sessions

Claw Recall indexes `.jsonl` session files from two agent platforms:
Claw Recall indexes `.jsonl` session files from three agent platforms:

- **OpenClaw** — `~/.openclaw/agents/` (active) and `~/.openclaw/agents-archive/` (completed)
- **Claude Code** — `~/.claude/projects/` (auto-detected by path and JSON structure)
- **Codex CLI** — `~/.codex/sessions/` (auto-detected by path and JSON structure)

**Real-time indexing** (recommended):
```bash
Expand Down
3 changes: 3 additions & 0 deletions hooks/quick-index.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ index_dir ~/.openclaw/agents-chat-sessions/ "Chat sessions"
# Claude Code server sessions (local terminal agent)
index_dir ~/.claude/projects/ "CC-VPS sessions" "--include-active"

# Codex CLI sessions (local terminal agent)
index_dir ~/.codex/sessions/ "Codex sessions"

if [ "$TOTAL_INDEXED" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
log "Done: indexed=$TOTAL_INDEXED errors=$TOTAL_ERRORS"
else
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ numpy>=1.24.0
# For semantic search (optional)
openai>=1.0.0

# For MCP server
mcp>=1.0.0

# For web interface (optional)
flask>=2.0.0

Expand Down
5 changes: 3 additions & 2 deletions scripts/cc_session_watcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

WATCH_DIRS = [
Path.home() / ".claude" / "projects", # Claude Code sessions
Path.home() / ".codex" / "sessions", # Codex CLI sessions
Path.home() / ".openclaw" / "agents" / "main" / "sessions", # Claude (OpenClaw) active
Path.home() / ".openclaw" / "agents-archive", # Claude (OpenClaw) archived
]
Expand Down Expand Up @@ -201,7 +202,7 @@ def _rsync_push(filepath: Path, dry_run: bool = False) -> dict:

source_path = str(filepath)
# Build VPS staging path preserving directory structure for agent detection
for marker in ['.claude/projects', '.openclaw/agents', '.openclaw/agents-archive']:
for marker in ['.claude/projects', '.codex/sessions', '.openclaw/agents', '.openclaw/agents-archive']:
idx = source_path.find(marker)
if idx >= 0:
path_suffix = source_path[idx:]
Expand All @@ -210,7 +211,7 @@ def _rsync_push(filepath: Path, dry_run: bool = False) -> dict:
path_suffix = filepath.name

# Validate path_suffix contains expected markers (defense-in-depth)
if not any(m in path_suffix for m in ['.claude/', '.openclaw/']):
if not any(m in path_suffix for m in ['.claude/', '.codex/', '.openclaw/']):
log.warning(f"Unexpected path_suffix: {path_suffix}")
return {"status": "error", "reason": "unexpected_path"}
remote_path = f"{VPS_REMOTE_STAGING}/{os.path.dirname(path_suffix)}/"
Expand Down
4 changes: 2 additions & 2 deletions scripts/health-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# CLAW_RECALL_STATE_FILE State file for alert dedup (default: /tmp/claw-recall-health-state.json)
# CLAW_RECALL_EMB_GAP_THRESHOLD Embedding gap alert threshold (default: 400000)
# CLAW_RECALL_SESSION_DIRS Colon-separated session directories to check for indexing
# (default: ~/.openclaw/agents-archive/:~/.openclaw/agents/:~/.claude/projects/)
# (default: ~/.openclaw/agents-archive/:~/.openclaw/agents/:~/.claude/projects/:~/.codex/sessions/)
#
# Example crontab entry:
# */15 * * * * CLAW_RECALL_MCP_URL=http://10.0.0.1:8766/health \
Expand All @@ -37,7 +37,7 @@ ALERT_SCRIPT="${CLAW_RECALL_ALERT_SCRIPT:-}"
LOG="${CLAW_RECALL_LOG:-/tmp/claw-recall-health.log}"
STATE_FILE="${CLAW_RECALL_STATE_FILE:-/tmp/claw-recall-health-state.json}"
EMB_GAP_THRESHOLD="${CLAW_RECALL_EMB_GAP_THRESHOLD:-400000}"
SESSION_DIRS="${CLAW_RECALL_SESSION_DIRS:-$HOME/.openclaw/agents-archive/:$HOME/.openclaw/agents/:$HOME/.claude/projects/}"
SESSION_DIRS="${CLAW_RECALL_SESSION_DIRS:-$HOME/.openclaw/agents-archive/:$HOME/.openclaw/agents/:$HOME/.claude/projects/:$HOME/.codex/sessions/}"

log() { echo "[$(date -u '+%Y-%m-%d %H:%M:%S UTC')] $1" >> "$LOG"; }

Expand Down
9 changes: 9 additions & 0 deletions scripts/quick-index.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,15 @@ if [ -d ~/.claude/projects ]; then
[ -n "$ERRORS" ] && TOTAL_ERRORS=$((TOTAL_ERRORS + ERRORS))
fi

# Index local Codex CLI sessions (skipped entirely when the directory is absent)
if [ -d ~/.codex/sessions ]; then
# Run the indexer incrementally with embeddings; stderr is merged into stdout
# so the whole run's output is captured in OUTPUT.
OUTPUT=$(python3 -m claw_recall.indexing.indexer --source ~/.codex/sessions/ --incremental --embeddings 2>&1)
# Pull the digits that follow "Indexed: " and "Errors: " out of the captured output.
INDEXED=$(echo "$OUTPUT" | grep -oP 'Indexed: \K\d+')
ERRORS=$(echo "$OUTPUT" | grep -oP 'Errors: \K\d+')
# Only add to the running totals when a count was actually found.
[ -n "$INDEXED" ] && TOTAL_INDEXED=$((TOTAL_INDEXED + INDEXED))
[ -n "$ERRORS" ] && TOTAL_ERRORS=$((TOTAL_ERRORS + ERRORS))
fi

if [ "$TOTAL_INDEXED" -gt 0 ] || [ "$TOTAL_ERRORS" -gt 0 ]; then
log "Done: indexed=$TOTAL_INDEXED errors=$TOTAL_ERRORS"
else
Expand Down
37 changes: 37 additions & 0 deletions tests/test_claw_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,13 @@ def test_openclaw_archive_path(self):
)
assert result == ".openclaw/agents-archive/main-abc-123.jsonl"

def test_codex_sessions_path(self):
    """A Codex CLI session path is trimmed to start at ``.codex/sessions``."""
    from claw_recall.api.web import _extract_path_suffix

    source_path = "/home/testuser/.codex/sessions/2026/05/03/rollout-2026-05-03T01-02-03-abc.jsonl"
    expected = ".codex/sessions/2026/05/03/rollout-2026-05-03T01-02-03-abc.jsonl"
    assert _extract_path_suffix(source_path) == expected

def test_fallback_basename(self):
from claw_recall.api.web import _extract_path_suffix
result = _extract_path_suffix("/some/random/path/file.jsonl")
Expand Down Expand Up @@ -1089,6 +1096,35 @@ def test_override_dedup_by_custom_path(self, test_db, tmp_path):
assert r2['status'] == 'skipped'
assert r2['reason'] == 'already indexed'

def test_codex_session_indexes_user_and_assistant_messages(self, test_db, tmp_path):
    """Index a Codex rollout file and keep only the user/assistant messages.

    The fixture mixes a ``session_meta`` record, a ``developer`` message, an
    ``event_msg`` record, and one user plus one assistant ``response_item``.
    Only the last two are expected to be stored, in file order.
    """
    conn, _ = test_db

    # Mirror the on-disk layout .codex/sessions/YYYY/MM/DD/rollout-*.jsonl
    rollout = (
        tmp_path / ".codex" / "sessions" / "2026" / "05" / "03"
        / "rollout-2026-05-03T01-02-03-abc.jsonl"
    )
    rollout.parent.mkdir(parents=True)
    rollout.write_text(
        '{"timestamp":"2026-05-03T01:02:03Z","type":"session_meta","payload":{"id":"abc"}}\n'
        '{"timestamp":"2026-05-03T01:02:04Z","type":"response_item","payload":{"type":"message","role":"developer","content":[{"type":"input_text","text":"skip developer"}]}}\n'
        '{"timestamp":"2026-05-03T01:02:05Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"include Codex user request"}]}}\n'
        '{"timestamp":"2026-05-03T01:02:06Z","type":"event_msg","payload":{"type":"user_message","message":"duplicate event"}}\n'
        '{"timestamp":"2026-05-03T01:02:07Z","type":"response_item","payload":{"type":"message","role":"assistant","content":[{"type":"output_text","text":"include Codex assistant response"}]}}\n'
    )

    from claw_recall.indexing.indexer import index_session_file
    outcome = index_session_file(rollout, conn)

    assert outcome['status'] == 'indexed'
    assert outcome['agent'] == 'codex'
    assert outcome['messages'] == 2

    # The session id used for lookup is the file name without its extension.
    stored = conn.execute(
        "SELECT role, content FROM messages WHERE session_id = ? ORDER BY message_index",
        (rollout.stem,),
    ).fetchall()
    expected_rows = [
        ("user", "include Codex user request"),
        ("assistant", "include Codex assistant response"),
    ]
    assert stored == expected_rows


class TestIndexSessionEndpoint:
"""Test the POST /index-session HTTP endpoint."""
Expand Down Expand Up @@ -1420,6 +1456,7 @@ def test_needs_indexing_missing_file(self, tmp_path):

def test_should_handle(self):
assert self.watcher._should_handle("/path/to/session.jsonl") is True
assert self.watcher._should_handle("/home/user/.codex/sessions/2026/05/03/session.jsonl") is True
assert self.watcher._should_handle("/path/to/session.json") is False
assert self.watcher._should_handle("/path/subagents/agent.jsonl") is False
assert self.watcher._should_handle("/path/.deleted.session.jsonl") is False
Expand Down
Loading