Skip to content

Commit ba4aae4

Browse files
petyosi and Kludex
authored
Improve Algolia indexing (#809)
Co-authored-by: Marcelo Trylesinski <[email protected]>
1 parent 6b263c0 commit ba4aae4

File tree

3 files changed

+35
-4
lines changed

3 files changed

+35
-4
lines changed

docs/plugins/build_index.py renamed to docs/plugins/algolia.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
ALGOLIA_INDEX_NAME = 'logfire-docs'
1414
ALGOLIA_APP_ID = 'KPPUDTIAVX'
1515
ALGOLIA_WRITE_API_KEY = os.environ.get('ALGOLIA_WRITE_API_KEY')
16+
# Algolia accepts 100k, leaving some room for other fields
17+
MAX_CONTENT_SIZE = 90_000
1618

1719

1820
def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
@@ -24,6 +26,30 @@ def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
2426

2527
soup = BeautifulSoup(html, 'html.parser')
2628

29+
# Clean up presentational and UI elements
30+
for element in soup.find_all(['autoref']):
31+
element.decompose()
32+
33+
# This removes the large source code embeds from GitHub
34+
for element in soup.find_all('details'):
35+
element.decompose()
36+
37+
for el_with_class in soup.find_all(class_=['doc-section-item', 'doc-section-title', 'doc-md-description', 'doc']):
38+
# delete the class attribute
39+
del el_with_class['class']
40+
41+
# Cleanup code examples
42+
for extra in soup.find_all('div', attrs={'class': ['language-py highlight', 'language-python highlight']}):
43+
extra.replace_with(BeautifulSoup(f'<pre>{extra.find("code").get_text()}</pre>', 'html.parser'))
44+
45+
# Cleanup code examples, part 2
46+
for extra in soup.find_all('div', attrs={'class': 'language-python doc-signature highlight'}):
47+
extra.replace_with(BeautifulSoup(f'<pre>{extra.find("code").get_text()}</pre>', 'html.parser'))
48+
49+
# The API reference generates HTML tables with line numbers; this strips the line-number cell and goes back to a plain code block
50+
for extra in soup.find_all('table', attrs={'class': 'highlighttable'}):
51+
extra.replace_with(BeautifulSoup(f'<pre>{extra.find("code").get_text()}</pre>', 'html.parser'))
52+
2753
# Find all h1 and h2 headings
2854
headings = soup.find_all(['h1', 'h2'])
2955

@@ -65,7 +91,12 @@ def on_post_build(config: Config) -> None:
6591

6692
client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_WRITE_API_KEY)
6793
index = client.init_index(ALGOLIA_INDEX_NAME)
68-
# temporary filter the records from the index if the content is bigger than 10k characters
69-
filtered_records = list(filter(lambda record: len(record['content']) < 9000, records))
94+
95+
for large_record in list(filter(lambda record: len(record['content']) >= MAX_CONTENT_SIZE, records)):
96+
print(f'Content for {large_record["abs_url"]} is too large to be indexed. Skipping...')
97+
print(f'Content : {large_record["content"]} characters')
98+
99+
# keep only the records whose content length is below MAX_CONTENT_SIZE
100+
filtered_records = list(filter(lambda record: len(record['content']) < MAX_CONTENT_SIZE, records))
70101
print(f'Uploading {len(filtered_records)} out of {len(records)} records to Algolia...')
71102
index.replace_all_objects(filtered_records, {'createIfNotExists': True}).wait() # type: ignore[reportUnknownMemberType]

docs/plugins/build_llms_txt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111

1212
def on_config(config: MkDocsConfig):
13-
os.mkdir(config.site_dir)
13+
os.makedirs(config.site_dir, exist_ok=True)
1414
llms_path = os.path.join(config.site_dir, 'llms.txt')
1515
with open(llms_path, 'w') as f:
1616
f.write('')

mkdocs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,4 +309,4 @@ plugins:
309309
hooks:
310310
- docs/plugins/main.py
311311
- docs/plugins/build_llms_txt.py
312-
- docs/plugins/build_index.py
312+
- docs/plugins/algolia.py

0 commit comments

Comments
 (0)