Skip to content

Commit ba009fb

Browse files
heiskr and Copilot authored
Strip GitHub alert markers from search index content (#59709)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 5ed9369 commit ba009fb

File tree

2 files changed

+37
-1
lines changed

2 files changed

+37
-1
lines changed

src/search/scripts/scrape/lib/build-records-from-api.ts

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,12 @@ import type {
3535
Redirects,
3636
} from '@/search/scripts/scrape/types'
3737

38+
// GitHub-style alert markers (> [!NOTE], > [!TIP], etc.) that appear in
39+
// markdown returned by the Article API. The rehype alerts plugin only runs
40+
// in the HTML pipeline, so these leak through as literal text when we index
41+
// the markdown-only output. Strip them so they don't appear in search results.
42+
const ALERT_MARKER_REGEXP = /\[!(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]\n?/gi // g: match every marker; i: case-insensitive; \n? also consumes one newline directly after the marker
43+
3844
// Same ignored headings as the HTML scraping approach
3945
const IGNORED_HEADING_SLUGS = new Set(['in-this-article', 'further-reading', 'prerequisites'])
4046

@@ -190,7 +196,7 @@ export function extractFromMarkdown(markdown: string): { headings: string; conte
190196

191197
// 2. Convert full AST to plain text (code blocks are kept so that terms
192198
// appearing only in code examples remain searchable).
193-
const content = astToPlainText(ast)
199+
const content = astToPlainText(ast).replace(ALERT_MARKER_REGEXP, '')
194200

195201
return { headings: headings.join('\n'), content }
196202
}

src/search/tests/build-records-from-api.ts

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,36 @@ More text.
212212
expect(text).toContain('Second paragraph in blockquote.')
213213
})
214214

215+
test('strips GitHub alert markers from plain text', () => {
216+
const markdown = `> [!NOTE]
217+
> This is a note.
218+
219+
> [!TIP]
220+
> This is a tip.
221+
222+
> [!WARNING]
223+
> This is a warning.
224+
225+
> [!IMPORTANT]
226+
> This is important.
227+
228+
> [!CAUTION]
229+
> This is a caution.
230+
`
231+
const text = markdownToPlainText(markdown)
232+
expect(text).not.toContain('[!NOTE]')
233+
expect(text).not.toContain('[!TIP]')
234+
expect(text).not.toContain('[!WARNING]')
235+
expect(text).not.toContain('[!IMPORTANT]')
236+
expect(text).not.toContain('[!CAUTION]')
237+
// The alert body text should still be present
238+
expect(text).toContain('This is a note.')
239+
expect(text).toContain('This is a tip.')
240+
expect(text).toContain('This is a warning.')
241+
expect(text).toContain('This is important.')
242+
expect(text).toContain('This is a caution.')
243+
})
244+
215245
test('handles GFM tables cleanly', () => {
216246
const markdown = `Some intro.
217247

0 commit comments

Comments
 (0)