13
13
ALGOLIA_INDEX_NAME = 'logfire-docs'
14
14
ALGOLIA_APP_ID = 'KPPUDTIAVX'
15
15
ALGOLIA_WRITE_API_KEY = os .environ .get ('ALGOLIA_WRITE_API_KEY' )
16
+ # Algolia accepts 100k, leaving some room for other fields
17
+ MAX_CONTENT_SIZE = 90_000
16
18
17
19
18
20
def on_page_content (html : str , page : Page , config : Config , files : Files ) -> str :
@@ -24,6 +26,30 @@ def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
24
26
25
27
soup = BeautifulSoup (html , 'html.parser' )
26
28
29
+ # Clean up presentational and UI elements
30
+ for element in soup .find_all (['autoref' ]):
31
+ element .decompose ()
32
+
33
+ # this removes the large source code embeds from Github
34
+ for element in soup .find_all ('details' ):
35
+ element .decompose ()
36
+
37
+ for el_with_class in soup .find_all (class_ = ['doc-section-item' , 'doc-section-title' , 'doc-md-description' , 'doc' ]):
38
+ # delete the class attribute
39
+ del el_with_class ['class' ]
40
+
41
+ # Cleanup code examples
42
+ for extra in soup .find_all ('div' , attrs = {'class' : ['language-py highlight' , 'language-python highlight' ]}):
43
+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ("code" ).get_text ()} </pre>' , 'html.parser' ))
44
+
45
+ # Cleanup code examples, part 2
46
+ for extra in soup .find_all ('div' , attrs = {'class' : 'language-python doc-signature highlight' }):
47
+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ("code" ).get_text ()} </pre>' , 'html.parser' ))
48
+
49
+ # The API reference generates HTML tables with line numbers, this strips the line numbers cell and goes back to a code block
50
+ for extra in soup .find_all ('table' , attrs = {'class' : 'highlighttable' }):
51
+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ("code" ).get_text ()} </pre>' , 'html.parser' ))
52
+
27
53
# Find all h1 and h2 headings
28
54
headings = soup .find_all (['h1' , 'h2' ])
29
55
@@ -65,7 +91,12 @@ def on_post_build(config: Config) -> None:
65
91
66
92
client = SearchClient .create (ALGOLIA_APP_ID , ALGOLIA_WRITE_API_KEY )
67
93
index = client .init_index (ALGOLIA_INDEX_NAME )
68
- # temporary filter the records from the index if the content is bigger than 10k characters
69
- filtered_records = list (filter (lambda record : len (record ['content' ]) < 9000 , records ))
94
+
95
+ for large_record in list (filter (lambda record : len (record ['content' ]) >= MAX_CONTENT_SIZE , records )):
96
+ print (f'Content for { large_record ["abs_url" ]} is too large to be indexed. Skipping...' )
97
+ print (f'Content : { large_record ["content" ]} characters' )
98
+
99
+ # filter out of the upload any record whose content is MAX_CONTENT_SIZE characters or longer
100
+ filtered_records = list (filter (lambda record : len (record ['content' ]) < MAX_CONTENT_SIZE , records ))
70
101
print (f'Uploading { len (filtered_records )} out of { len (records )} records to Algolia...' )
71
102
index .replace_all_objects (filtered_records , {'createIfNotExists' : True }).wait () # type: ignore[reportUnknownMemberType]
0 commit comments