Skip to content

Commit 13f69bd

Browse files
heiskrCopilot
andauthored
Report invalid search records as failures instead of crashing (#59776)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 2316ae8 commit 13f69bd

File tree

3 files changed

+91
-50
lines changed

3 files changed

+91
-50
lines changed

src/search/scripts/scrape/lib/scrape-into-index-json.ts

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,42 @@ export default async function scrapeIntoIndexJson({
7979
})
8080
}
8181

82-
const fileWritten = await writeIndexRecords(indexName, records, outDirectory)
83-
console.log(`wrote records to ${fileWritten}`)
82+
const { filePath: fileWritten, skippedRecords } = await writeIndexRecords(
83+
indexName,
84+
records,
85+
outDirectory,
86+
)
87+
if (fileWritten) {
88+
console.log(`wrote records to ${fileWritten}`)
89+
} else {
90+
totalFailedPages += 1
91+
allFailures.push({
92+
indexName,
93+
languageCode,
94+
indexVersion,
95+
failures: [
96+
{
97+
error: `No valid records to write for ${indexName}`,
98+
errorType: 'no-valid-records',
99+
},
100+
],
101+
})
102+
}
103+
104+
if (skippedRecords.length > 0) {
105+
const skippedFailures = skippedRecords.map(({ objectID, reason }) => ({
106+
url: objectID,
107+
error: `Record skipped: ${reason}`,
108+
errorType: 'invalid-record',
109+
}))
110+
totalFailedPages += skippedRecords.length
111+
allFailures.push({
112+
indexName,
113+
languageCode,
114+
indexVersion,
115+
failures: skippedFailures,
116+
})
117+
}
84118
}
85119
}
86120
const t1 = new Date()

src/search/scripts/scrape/lib/search-index-records.ts

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,26 @@ import { isArray, isString } from 'lodash-es'
66

77
import type { Record } from '@/search/scripts/scrape/types'
88

9+
export interface SkippedRecord {
10+
objectID: string
11+
reason: string
12+
}
13+
14+
export interface WriteIndexRecordsResult {
15+
filePath: string
16+
skippedRecords: SkippedRecord[]
17+
}
18+
919
export async function writeIndexRecords(
1020
name: string,
1121
records: Record[],
1222
outDirectory: string,
13-
): Promise<string> {
14-
const validRecords = validateRecords(name, records)
23+
): Promise<WriteIndexRecordsResult> {
24+
const { validRecords, skippedRecords } = validateRecords(name, records)
25+
26+
if (validRecords.length === 0) {
27+
return { filePath: '', skippedRecords }
28+
}
1529

1630
const recordsObject = Object.fromEntries(validRecords.map((record) => [record.objectID, record]))
1731
const content = JSON.stringify(recordsObject, undefined, 0)
@@ -24,15 +38,26 @@ export async function writeIndexRecords(
2438
const filePath = path.join(outDirectory, `${name}-records.json`)
2539
await fs.writeFile(filePath, content, { flag: 'w' })
2640

27-
return filePath
41+
return { filePath, skippedRecords }
42+
}
43+
44+
interface ValidateResult {
45+
validRecords: Record[]
46+
skippedRecords: SkippedRecord[]
2847
}
2948

30-
function validateRecords(name: string, records: Record[]): Record[] {
49+
function validateRecords(name: string, records: Record[]): ValidateResult {
3150
if (!isString(name) || !name.length) {
32-
throw new Error('`name` is required')
51+
return {
52+
validRecords: [],
53+
skippedRecords: [{ objectID: '(unknown)', reason: 'name is required' }],
54+
}
3355
}
3456
if (!isArray(records) || !records.length) {
35-
throw new Error('`records` must be a non-empty array')
57+
return {
58+
validRecords: [],
59+
skippedRecords: [{ objectID: '(unknown)', reason: 'records array is empty' }],
60+
}
3661
}
3762

3863
// each ID is unique — deduplicate rather than crash
@@ -48,19 +73,16 @@ function validateRecords(name: string, records: Record[]): Record[] {
4873

4974
const seen = new Set<string>()
5075
const validRecords: Record[] = []
76+
const skippedRecords: SkippedRecord[] = []
5177

5278
for (const record of records) {
5379
if (!isString(record.objectID) || !record.objectID.length) {
54-
console.warn(
55-
chalk.yellow(
56-
`⚠ Skipping record with invalid objectID: ${JSON.stringify({ objectID: record.objectID, title: record.title })}`,
57-
),
58-
)
80+
skippedRecords.push({ objectID: '(unknown)', reason: 'invalid objectID' })
5981
continue
6082
}
6183

6284
if (!isString(record.title) || !record.title.length) {
63-
console.warn(chalk.yellow(`⚠ Skipping record with empty title: ${record.objectID}`))
85+
skippedRecords.push({ objectID: record.objectID, reason: 'empty title' })
6486
continue
6587
}
6688

@@ -72,7 +94,7 @@ function validateRecords(name: string, records: Record[]): Record[] {
7294
validRecords.push(record)
7395
}
7496

75-
return validRecords
97+
return { validRecords, skippedRecords }
7698
}
7799

78100
function countArrayValues(arr: string[]) {

src/search/scripts/scrape/tests/search-index-records.ts

Lines changed: 20 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -33,37 +33,34 @@ describe('writeIndexRecords', () => {
3333

3434
const result = await writeIndexRecords('test-index', records, '/tmp/out')
3535

36-
expect(result).toBe('/tmp/out/test-index-records.json')
36+
expect(result.filePath).toBe('/tmp/out/test-index-records.json')
37+
expect(result.skippedRecords).toEqual([])
3738
expect(fs.writeFile).toHaveBeenCalledOnce()
3839
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
3940
const parsed = JSON.parse(writtenJson)
4041
expect(Object.keys(parsed)).toEqual(['/en/test-page', '/en/other-page'])
4142
})
4243

43-
test('filters out records with empty titles', async () => {
44-
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
44+
test('filters out records with empty titles and returns them as skipped', async () => {
4545
const records = [makeRecord(), makeRecord({ objectID: '/en/bad-page', title: '' })]
4646

47-
await writeIndexRecords('test-index', records, '/tmp/out')
47+
const result = await writeIndexRecords('test-index', records, '/tmp/out')
4848

4949
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
5050
const parsed = JSON.parse(writtenJson)
5151
expect(Object.keys(parsed)).toEqual(['/en/test-page'])
52-
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('empty title'))
53-
warnSpy.mockRestore()
52+
expect(result.skippedRecords).toEqual([{ objectID: '/en/bad-page', reason: 'empty title' }])
5453
})
5554

56-
test('filters out records with missing objectID', async () => {
57-
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
55+
test('filters out records with missing objectID and returns them as skipped', async () => {
5856
const records = [makeRecord(), makeRecord({ objectID: '', title: 'No ID' })]
5957

60-
await writeIndexRecords('test-index', records, '/tmp/out')
58+
const result = await writeIndexRecords('test-index', records, '/tmp/out')
6159

6260
const writtenJson = vi.mocked(fs.writeFile).mock.calls[0][1] as string
6361
const parsed = JSON.parse(writtenJson)
6462
expect(Object.keys(parsed)).toEqual(['/en/test-page'])
65-
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('invalid objectID'))
66-
warnSpy.mockRestore()
63+
expect(result.skippedRecords).toEqual([{ objectID: '(unknown)', reason: 'invalid objectID' }])
6764
})
6865

6966
test('deduplicates records with the same objectID', async () => {
@@ -82,34 +79,22 @@ describe('writeIndexRecords', () => {
8279
warnSpy.mockRestore()
8380
})
8481

85-
test('does not log full record content for invalid objectID', async () => {
86-
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
87-
const records = [
88-
makeRecord(),
89-
makeRecord({
90-
objectID: '',
91-
title: 'No ID',
92-
content: 'A very long content string that should not appear in logs',
93-
}),
94-
]
82+
test('returns empty filePath and skipped record when name is empty', async () => {
83+
const result = await writeIndexRecords('', [makeRecord()], '/tmp/out')
9584

96-
await writeIndexRecords('test-index', records, '/tmp/out')
97-
98-
const warnMessage = warnSpy.mock.calls[0][0] as string
99-
expect(warnMessage).not.toContain('very long content string')
100-
warnSpy.mockRestore()
85+
expect(result.filePath).toBe('')
86+
expect(result.skippedRecords).toEqual([{ objectID: '(unknown)', reason: 'name is required' }])
87+
expect(fs.writeFile).not.toHaveBeenCalled()
10188
})
10289

103-
test('throws when name is empty', async () => {
104-
await expect(writeIndexRecords('', [makeRecord()], '/tmp/out')).rejects.toThrow(
105-
'`name` is required',
106-
)
107-
})
90+
test('returns empty filePath and skipped record when records array is empty', async () => {
91+
const result = await writeIndexRecords('test-index', [], '/tmp/out')
10892

109-
test('throws when records array is empty', async () => {
110-
await expect(writeIndexRecords('test-index', [], '/tmp/out')).rejects.toThrow(
111-
'`records` must be a non-empty array',
112-
)
93+
expect(result.filePath).toBe('')
94+
expect(result.skippedRecords).toEqual([
95+
{ objectID: '(unknown)', reason: 'records array is empty' },
96+
])
97+
expect(fs.writeFile).not.toHaveBeenCalled()
11398
})
11499

115100
test('creates output directory if it does not exist', async () => {

0 commit comments

Comments
 (0)