Skip to content

Commit 0cb5542

Browse files
author
async
committed
add selective TOC item extraction feature
1 parent 6f8f63d commit 0cb5542

File tree

5 files changed

+1128
-16
lines changed

5 files changed

+1128
-16
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ A command-line tool to scrape and structure GitBook documentation into a single,
1111
- 🛠️ Configurable output format and structure
1212
- 🔄 Automatic retry on failed requests
1313
- 📋 Table of contents generation
14+
- 🎯 Selective TOC item extraction
1415

1516
## Installation
1617

@@ -32,6 +33,9 @@ gitbook-scraper https://your-gitbook-url.io --toc
3233

3334
# Custom rate limiting
3435
gitbook-scraper https://your-gitbook-url.io --delay 1.0
36+
37+
# Extract specific TOC items
38+
gitbook-scraper https://your-gitbook-url.io -t "Getting Started" -t "Advanced Topics"
3539
```
3640

3741
## Advanced Usage
@@ -47,6 +51,7 @@ Options:
4751
--timeout INTEGER Request timeout in seconds [default: 10]
4852
--debug Enable debug logging [default: False]
4953
--no-cleanup Keep intermediate files [default: False]
54+
-t, --toc-items TEXT Specific TOC items to extract (can be specified multiple times)
5055
--help Show this message and exit
5156
```
5257

@@ -55,13 +60,22 @@ Options:
5560
```python
5661
from gitbook_scraper import GitbookScraper
5762

63+
# Basic usage
5864
scraper = GitbookScraper(
5965
base_url="https://your-gitbook-url.io",
6066
output_file="documentation.md",
6167
generate_toc=True,
6268
delay=0.5
6369
)
6470

71+
# Extract specific TOC items
72+
scraper = GitbookScraper(
73+
base_url="https://your-gitbook-url.io",
74+
output_file="documentation.md",
75+
generate_toc=True,
76+
toc_items=["Getting Started", "Advanced Topics"]
77+
)
78+
6579
scraper.scrape()
6680
```
6781

gitbook_scraper/cli.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@
2626
help='Enable debug logging [default: False]')
2727
@click.option('--selector-file', type=click.Path(exists=True, dir_okay=False),
2828
help='Path to JSON file with custom CSS selectors')
29+
@click.option('--toc-items', '-t', multiple=True,
30+
help='Specific TOC items to extract (can be specified multiple times)')
2931
def main(url: str, output: str, toc: bool, delay: float, retries: int,
30-
timeout: int, debug: bool, selector_file: Optional[str]) -> int:
32+
timeout: int, debug: bool, selector_file: Optional[str], toc_items: tuple) -> int:
3133
"""Scrape and structure GitBook documentation into a single markdown file."""
3234
try:
3335
# Validate URL
@@ -43,7 +45,8 @@ def main(url: str, output: str, toc: bool, delay: float, retries: int,
4345
retries=retries,
4446
timeout=timeout,
4547
debug=debug,
46-
selector_file=selector_file
48+
selector_file=selector_file,
49+
toc_items=list(toc_items) if toc_items else None
4750
)
4851

4952
with console.status("[bold green]Scraping documentation..."):

gitbook_scraper/scraper.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ def __init__(
2222
retries: int = 3,
2323
timeout: int = 10,
2424
debug: bool = False,
25-
selector_file: Optional[str] = None
25+
selector_file: Optional[str] = None,
26+
toc_items: Optional[List[str]] = None
2627
):
2728
"""
2829
Initialize the GitBook scraper.
@@ -36,6 +37,7 @@ def __init__(
3637
timeout: Request timeout in seconds
3738
debug: Enable debug logging
3839
selector_file: Optional JSON file with custom CSS selectors
40+
toc_items: Optional list of TOC item titles to extract. If None or empty, extracts all items.
3941
"""
4042
self.base_url = self.normalize_url(base_url)
4143
self.domain = urlparse(self.base_url).netloc
@@ -44,6 +46,7 @@ def __init__(
4446
self.delay = delay
4547
self.retries = retries
4648
self.timeout = timeout
49+
self.toc_items = set(toc_items) if toc_items else None
4750

4851
# Setup logging
4952
log_level = logging.DEBUG if debug else logging.INFO
@@ -369,6 +372,23 @@ def generate_markdown(self, structure: List[Dict], level: int = 1) -> str:
369372

370373
return "\n".join(content)
371374

375+
def filter_nav_structure(self, structure: List[Dict]) -> List[Dict]:
    """Filter navigation structure to only include specified TOC items and their children.

    Args:
        structure: Navigation items. Each item is a dict that is read here
            for its 'title' and, when present, its 'children' list of
            nested items of the same form.

    Returns:
        ``structure`` itself when no TOC items were requested. Otherwise a
        new list holding every item whose title is in ``self.toc_items``
        (kept whole, with its full subtree) plus shallow copies of ancestor
        items whose 'children' are narrowed to the matching descendants.
    """
    if not self.toc_items:
        return structure

    filtered = []
    for item in structure:
        # .get() so an item lacking a key is a non-match rather than a KeyError.
        if item.get('title') in self.toc_items:
            filtered.append(item)
            continue

        children = item.get('children')
        if not children:
            continue

        filtered_children = self.filter_nav_structure(children)
        if filtered_children:
            # Shallow copy so the caller's item keeps its complete child list.
            item_copy = item.copy()
            item_copy['children'] = filtered_children
            filtered.append(item_copy)
    return filtered
391+
372392
def scrape(self):
373393
"""Main scraping method."""
374394
with Progress(
@@ -381,6 +401,12 @@ def scrape(self):
381401

382402
if not self.nav_structure:
383403
raise NavigationExtractionError("Empty navigation structure")
404+
405+
# Filter navigation structure if specific TOC items are requested
406+
if self.toc_items:
407+
self.nav_structure = self.filter_nav_structure(self.nav_structure)
408+
if not self.nav_structure:
409+
raise NavigationExtractionError("No matching TOC items found")
384410

385411
progress.add_task("Generating documentation...", total=None)
386412
content = self.generate_markdown(self.nav_structure)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "gitbook-scraper"
7-
version = "0.1.0"
7+
version = "0.1.1"
88
authors = [
99
{ name="Async", email="[email protected]" },
1010
]

0 commit comments

Comments
 (0)