Skip to content

Commit 5739cb1

Browse files
committed
fix(graphrag): add retries for batch AI operations
- Implement exponential backoff for batch LLM requests
- Include response parsing in the retry logic
- Remove failed nodes from current index to allow deferral
- Enhance error reporting for batch processing failures
1 parent d1469db commit 5739cb1

2 files changed

Lines changed: 62 additions & 25 deletions

File tree

src/indexer/graphrag/ai.rs

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use anyhow::Result;
2121
use serde::Deserialize;
2222
use serde_json::json;
2323
use std::collections::HashMap;
24+
use std::time::Duration;
2425

2526
pub struct AIEnhancements {
2627
config: Config,
@@ -355,6 +356,9 @@ impl AIEnhancements {
355356
sample
356357
}
357358

359+
/// Maximum retries for batch AI operations (covers both LLM call and response parsing)
360+
const MAX_BATCH_RETRIES: u32 = 3;
361+
358362
// Extract AI-powered descriptions for multiple files in a single batch call
359363
pub async fn extract_ai_descriptions_batch(
360364
&self,
@@ -364,33 +368,54 @@ impl AIEnhancements {
364368
return Ok(HashMap::new());
365369
}
366370

367-
// Build batch user message with all files
368-
let user_message = self.build_batch_user_message(files);
369-
370-
// Create JSON schema for structured response
371371
let json_schema = self.create_batch_response_schema();
372+
let mut last_error = None;
372373

373-
// Single API call for multiple files
374-
// LLM call includes retry with exponential backoff (in LlmClient).
375-
// If it still fails after retries, propagate error to stop indexing.
376-
let response = self
377-
.call_llm(
378-
&self.config.graphrag.llm.description_model,
379-
self.config.graphrag.llm.description_system_prompt.clone(),
380-
user_message,
381-
Some(json_schema),
382-
)
383-
.await
384-
.map_err(|e| {
385-
anyhow::anyhow!(
386-
"GraphRAG AI description failed for {} files after retries: {}. \
387-
Stopping indexing to prevent storing data without LLM descriptions.",
388-
files.len(),
389-
e
374+
for attempt in 0..=Self::MAX_BATCH_RETRIES {
375+
if attempt > 0 {
376+
let delay = Duration::from_secs(5 * (1 << (attempt - 1))); // 5s, 10s, 20s
377+
if !self.quiet {
378+
eprintln!(
379+
"⚠️ AI batch attempt {}/{} failed, retrying in {:?}...",
380+
attempt,
381+
Self::MAX_BATCH_RETRIES + 1,
382+
delay
383+
);
384+
}
385+
tokio::time::sleep(delay).await;
386+
}
387+
388+
// Build fresh message each attempt
389+
let user_message = self.build_batch_user_message(files);
390+
391+
match self
392+
.call_llm(
393+
&self.config.graphrag.llm.description_model,
394+
self.config.graphrag.llm.description_system_prompt.clone(),
395+
user_message,
396+
Some(json_schema.clone()),
390397
)
391-
})?;
398+
.await
399+
{
400+
Ok(response) => match self.parse_batch_response(&response, files) {
401+
Ok(results) => return Ok(results),
402+
Err(e) => {
403+
last_error = Some(e);
404+
}
405+
},
406+
Err(e) => {
407+
last_error = Some(e);
408+
}
409+
}
410+
}
392411

393-
self.parse_batch_response(&response, files)
412+
Err(last_error.unwrap_or_else(|| {
413+
anyhow::anyhow!(
414+
"AI batch description failed for {} files after {} retries",
415+
files.len(),
416+
Self::MAX_BATCH_RETRIES
417+
)
418+
}))
394419
}
395420

396421
// Build user message for batch processing

src/indexer/graphrag/builder.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,14 @@ impl GraphBuilder {
324324
}
325325
Err(e) => {
326326
if !self.quiet {
327-
eprintln!("⚠️ AI batch processing failed: {}", e);
327+
eprintln!(
328+
"⚠️ AI batch failed after retries: {}. Deferring {} files to next run.",
329+
e, ai_batch_queue.len()
330+
);
328331
}
332+
let failed_ids: HashSet<String> =
333+
ai_batch_queue.iter().map(|f| f.file_id.clone()).collect();
334+
new_nodes.retain(|n| !failed_ids.contains(&n.id));
329335
}
330336
}
331337
}
@@ -455,8 +461,14 @@ impl GraphBuilder {
455461
}
456462
Err(e) => {
457463
if !self.quiet {
458-
eprintln!("⚠️ Final batch AI processing failed: {}", e);
464+
eprintln!(
465+
"⚠️ Final AI batch failed after retries: {}. Deferring {} files to next run.",
466+
e, ai_batch_queue.len()
467+
);
459468
}
469+
let failed_ids: HashSet<String> =
470+
ai_batch_queue.iter().map(|f| f.file_id.clone()).collect();
471+
new_nodes.retain(|n| !failed_ids.contains(&n.id));
460472
}
461473
}
462474
}

0 commit comments

Comments (0)