Skip to content

Commit 5a96c80

Browse files
Merge pull request google#1709 from google:my-feature-branch-llmstxt
PiperOrigin-RevId: 778239103
2 parents 83092f5 + c237af0 commit 5a96c80

File tree

3 files changed

+33638
-0
lines changed

3 files changed

+33638
-0
lines changed
Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
#!/usr/bin/env python3
2+
"""
3+
build_llms_txt.py – produce llms.txt and llms-full.txt
4+
– skips ```java``` blocks
5+
– README can be next to docs/ or inside docs/
6+
– includes Python API reference from HTML files
7+
– includes adk-python repository README
8+
"""
9+
from __future__ import annotations
10+
11+
import argparse
12+
from pathlib import Path
13+
import re
14+
import sys
15+
import textwrap
16+
from typing import List
17+
from typing import Tuple
18+
import urllib.error
19+
import urllib.request
20+
21+
# Matches fenced ```java code blocks (case-insensitive, multiline) so they can
# be stripped from Markdown before inclusion in the Python-oriented output.
RE_JAVA = re.compile(r"```java[ \t\r\n][\s\S]*?```", re.I | re.M)
# Matches mkdocs-style snippet include lines: --8<-- "path/to/file.py[:section]".
# group(1) = leading indent, group(2) = file path, group(3) = optional section.
RE_SNIPPET = re.compile(r"^(\s*)--8<--\s+\"([^\"]+?)(?::([^\"]+))?\"$", re.M)
23+
24+
25+
def fetch_adk_python_readme() -> str:
  """Download the adk-python repository README; return "" on network failure."""
  url = "https://raw.githubusercontent.com/google/adk-python/main/README.md"
  try:
    with urllib.request.urlopen(url) as response:
      body = response.read()
    return body.decode("utf-8")
  except (urllib.error.URLError, urllib.error.HTTPError) as e:
    # Best-effort: the generated files simply omit the README section.
    print(f"Warning: Could not fetch adk-python README: {e}")
    return ""
34+
35+
36+
def strip_java(md: str) -> str:
  """Drop every fenced ```java code block from the Markdown text."""
  cleaned = RE_JAVA.sub("", md)
  return cleaned
38+
39+
40+
def first_heading(md: str) -> str | None:
  """Return the text of the first Markdown heading line, or None if absent."""
  headings = (
      line.lstrip("#").strip()
      for line in md.splitlines()
      if line.startswith("#")
  )
  return next(headings, None)
45+
46+
47+
def md_to_text(md: str) -> str:
  """Render Markdown to plain text by way of an HTML intermediate."""
  import bs4
  import markdown

  rendered = markdown.markdown(
      md, extensions=["fenced_code", "tables", "attr_list"]
  )
  soup = bs4.BeautifulSoup(rendered, "html.parser")
  return soup.get_text("\n")
55+
56+
57+
def html_to_text(html_file: Path) -> str:
  """Return the visible text of *html_file*, one chunk per line; "" on error."""
  import bs4

  try:
    markup = html_file.read_text(encoding="utf-8")
    soup = bs4.BeautifulSoup(markup, "html.parser")

    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style"]):
      tag.decompose()

    raw = soup.get_text()
    stripped = (line.strip() for line in raw.splitlines())
    pieces = (piece.strip() for line in stripped for piece in line.split(" "))
    return "\n".join(piece for piece in pieces if piece)
  except Exception as e:
    # Best-effort: a broken HTML page is skipped with a warning.
    print(f"Warning: Could not process {html_file}: {e}")
    return ""
79+
80+
81+
def count_tokens(text: str, model: str = "cl100k_base") -> int:
  """Count tokens with tiktoken when available; fall back to a word count."""
  try:
    import tiktoken

    encoding = tiktoken.get_encoding(model)
    return len(encoding.encode(text))
  except Exception:
    # tiktoken missing or the encoding name is unknown: approximate by words.
    return len(text.split())
88+
89+
90+
def expand_code_snippets(content: str, project_root: Path) -> str:
  """Expand mkdocs-style snippet includes into the surrounding Markdown.

  Lines of the form --8<-- "path/to/file.py" or
  --8<-- "path/to/file.py:section_name" are replaced with the referenced
  file (or only the named section of it), indented to match the include
  line.  Paths resolve against *project_root* first, then against an
  "adk-docs" checkout next to this script.  Unresolvable includes are
  left unchanged with a warning printed.

  Args:
    content: Markdown text possibly containing snippet include lines.
    project_root: Directory snippet paths are resolved against first.

  Returns:
    The content with every resolvable snippet include expanded in place.
  """

  def replace_snippet(match):
    indent = match.group(1)  # Leading whitespace of the include line.
    snippet_path_str = match.group(2)  # e.g. "examples/python/snippets/file.py"
    section_name = match.group(3)  # Optional section name, e.g. "init".

    snippet_full_path = project_root / snippet_path_str
    # If not found under the project root, try the adk-docs directory
    # that sits next to this script.
    if not snippet_full_path.exists():
      script_dir = Path(__file__).resolve().parent
      adk_docs_path = script_dir / "adk-docs" / snippet_path_str
      if adk_docs_path.exists():
        snippet_full_path = adk_docs_path

    if not snippet_full_path.exists():
      print(f"Warning: Snippet file not found: {snippet_full_path}")
      return match.group(0)

    try:
      file_content = snippet_full_path.read_text(encoding="utf-8")
      if not section_name:
        # No section requested: include the whole file.
        return textwrap.indent(file_content, indent)

      # Section markers may be written with either "#" or "##".
      name = section_name.strip()
      start_marker_patterns = [
          f"# --8<-- [start:{name}]",
          f"## --8<-- [start:{name}]",
      ]
      end_marker_patterns = [
          f"# --8<-- [end:{name}]",
          f"## --8<-- [end:{name}]",
      ]

      start_index = -1
      start_marker = ""
      for pattern in start_marker_patterns:
        start_index = file_content.find(pattern)
        if start_index != -1:
          start_marker = pattern
          break

      # Search for the end marker only AFTER the start marker, so a stray
      # earlier occurrence cannot trigger a bogus "malformed" result.
      search_from = start_index + len(start_marker) if start_index != -1 else 0
      end_index = -1
      for pattern in end_marker_patterns:
        end_index = file_content.find(pattern, search_from)
        if end_index != -1:
          break

      if start_index == -1 or end_index == -1 or start_index >= end_index:
        print(
            f"Warning: Section '{section_name}' not found or markers"
            f" malformed in {snippet_full_path}"
        )
        return match.group(0)

      # Keep only real code lines between the markers (skip blank lines
      # and nested snippet-marker lines).
      start_of_code = start_index + len(start_marker)
      temp_content = file_content[start_of_code:end_index]
      extracted_lines = [
          line
          for line in temp_content.splitlines(keepends=True)
          if line.strip()
          and not line.strip().startswith("# --8<--")
          and not line.strip().startswith("## --8<--")
      ]
      extracted_content = "".join(extracted_lines).strip("\n")
      return textwrap.indent(extracted_content, indent)
    except Exception as e:
      print(f"Warning: Could not read snippet file {snippet_full_path}: {e}")
      return match.group(0)

  return RE_SNIPPET.sub(replace_snippet, content)
181+
182+
183+
# ---------- index (llms.txt) ----------
184+
def build_index(docs: Path) -> str:
  """Build the llms.txt index: title, summary, ADK README, and link lists."""
  # Locate the README either inside docs/ or next to it.
  readme = None
  for cand in (docs / "README.md", docs.parent / "README.md"):
    if cand.exists():
      readme = cand.read_text(encoding="utf-8")
      break
  if readme is None:
    sys.exit("README.md not found in docs/ or its parent")

  title = first_heading(readme) or "Documentation"
  summary = md_to_text(readme).split("\n\n")[0]
  lines = [f"# {title}", "", f"> {summary}", ""]

  # Prepend the adk-python repository README content, if reachable.
  adk_readme = fetch_adk_python_readme()
  if adk_readme:
    lines.append("## ADK Python Repository")
    lines.append("")
    lines.append(md_to_text(strip_java(adk_readme)))
    lines.append("")
    lines.append(
        "**Source:** [adk-python"
        " repository](https://github.com/google/adk-python)"
    )
    lines.append("")

  primary: List[Tuple[str, str]] = []
  secondary: List[Tuple[str, str]] = []

  # Collect (heading, url) pairs for every Markdown doc.
  for md in sorted(docs.rglob("*.md")):
    # The Java API reference is excluded from this Python-focused index.
    if "api-reference" in md.parts and "java" in md.parts:
      continue
    rel = md.relative_to(docs)
    url = f"https://github.com/google/adk-docs/blob/main/docs/{rel}".replace(
        " ", "%20"
    )
    heading = (
        first_heading(strip_java(md.read_text(encoding="utf-8"))) or rel.stem
    )
    bucket = (
        secondary
        if "sample" in rel.parts or "tutorial" in rel.parts
        else primary
    )
    bucket.append((heading, url))

  # The Python API reference lives in generated HTML, linked as one entry.
  python_api_dir = docs / "api-reference" / "python"
  if python_api_dir.exists():
    primary.append((
        "Python API Reference",
        "https://github.com/google/adk-docs/blob/main/docs/api-reference/python/",
    ))

  def emit(name: str, items: List[Tuple[str, str]]) -> None:
    # Append one "## name" section of bullet links, if non-empty.
    if items:
      lines.append(f"## {name}")
      lines.extend(f"- [{h}]({u})" for h, u in items)
      lines.append("")

  emit("Documentation", primary)
  emit("Optional", secondary)
  return "\n".join(lines)
251+
252+
253+
# ---------- full corpus ----------
254+
def build_full(docs: Path) -> str:
  """Build llms-full.txt: ADK README, all Markdown docs, and Python API text."""
  out = []

  script_dir = Path(__file__).resolve().parent
  # The script lives two directories below the project root.
  project_root = script_dir.parents[2]
  print(f"DEBUG: Project Root: {project_root}")
  print(f"DEBUG: Docs Dir: {docs}")

  # Lead with the adk-python repository README, snippets expanded.
  adk_readme = fetch_adk_python_readme()
  if adk_readme:
    expanded_adk_readme = expand_code_snippets(
        strip_java(adk_readme), project_root
    )
    out.extend(["# ADK Python Repository", "", expanded_adk_readme, "",
                "---", ""])

  # Every Markdown doc except the Java API reference.
  for md in sorted(docs.rglob("*.md")):
    if "api-reference" in md.parts and "java" in md.parts:
      continue
    md_content = md.read_text(encoding="utf-8")
    print(f"DEBUG: Processing markdown file: {md.relative_to(docs)}")
    out.append(expand_code_snippets(strip_java(md_content), project_root))

  # Python API reference, converted from the generated HTML pages.
  python_api_dir = docs / "api-reference" / "python"
  if python_api_dir.exists():
    out.append("\n\n# Python API Reference\n")
    # Only the main pages; static assets and per-module files are skipped.
    for name in (
        "index.html",
        "google-adk.html",
        "genindex.html",
        "py-modindex.html",
    ):
      html_file = python_api_dir / name
      if html_file.exists():
        text = html_to_text(html_file)
        if text.strip():
          out.append(f"\n## {html_file.stem}\n")
          out.append(text)

  return "\n\n".join(out)
311+
312+
313+
def main() -> None:
  """CLI entry point: write llms.txt and llms-full.txt under --out-root."""
  parser = argparse.ArgumentParser(
      description="Generate llms.txt / llms-full.txt",
      formatter_class=argparse.RawDescriptionHelpFormatter,
  )
  parser.add_argument("--docs-dir", required=True, type=Path)
  parser.add_argument("--out-root", default=Path("."), type=Path)
  parser.add_argument("--index-limit", type=int, default=50_000)
  parser.add_argument("--full-limit", type=int, default=500_000)
  args = parser.parse_args()

  idx = build_index(args.docs_dir)
  full = build_full(args.docs_dir)

  # Abort (non-zero exit) if either artifact exceeds its token budget.
  for text, limit, label in (
      (idx, args.index_limit, "Index"),
      (full, args.full_limit, "Full text"),
  ):
    tok = count_tokens(text)
    if tok > limit:
      sys.exit(f"{label} too big: {tok:,}")

  (args.out_root / "llms.txt").write_text(idx, encoding="utf-8")
  (args.out_root / "llms-full.txt").write_text(full, encoding="utf-8")
  print("✅ Generated llms.txt and llms-full.txt successfully")
  print(f"llms.txt tokens: {count_tokens(idx)}")
  print(f"llms-full.txt tokens: {count_tokens(full)}")
335+
336+
337+
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
  main()

0 commit comments

Comments
 (0)