Skip to content

Commit 5739cb1

Browse files
committed
fix(graphrag): add retries for batch AI operations
- Implement exponential backoff for batch LLM requests
- Include response parsing in the retry logic
- Remove failed nodes from current index to allow deferral
- Enhance error reporting for batch processing failures
1 parent d1469db commit 5739cb1

2 files changed

Lines changed: 62 additions & 25 deletions

File tree

src/indexer/graphrag/ai.rs

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use anyhow::Result;
2121
use serde::Deserialize;
2222
use serde_json::json;
2323
use std::collections::HashMap;
24+
use std::time::Duration;
2425

2526
pub struct AIEnhancements {
2627
config: Config,
@@ -355,6 +356,9 @@ impl AIEnhancements {
355356
sample
356357
}
357358

359+
/// Maximum retries for batch AI operations (covers both LLM call and response parsing)
360+
const MAX_BATCH_RETRIES: u32 = 3;
361+
358362
// Extract AI-powered descriptions for multiple files in a single batch call
359363
pub async fn extract_ai_descriptions_batch(
360364
&self,
@@ -364,33 +368,54 @@ impl AIEnhancements {
364368
return Ok(HashMap::new());
365369
}
366370

367-
// Build batch user message with all files
368-
let user_message = self.build_batch_user_message(files);
369-
370-
// Create JSON schema for structured response
371371
let json_schema = self.create_batch_response_schema();
372+
let mut last_error = None;
372373

373-
// Single API call for multiple files
374-
// LLM call includes retry with exponential backoff (in LlmClient).
375-
// If it still fails after retries, propagate error to stop indexing.
376-
let response = self
377-
.call_llm(
378-
&self.config.graphrag.llm.description_model,
379-
self.config.graphrag.llm.description_system_prompt.clone(),
380-
user_message,
381-
Some(json_schema),
382-
)
383-
.await
384-
.map_err(|e| {
385-
anyhow::anyhow!(
386-
"GraphRAG AI description failed for {} files after retries: {}. \
387-
Stopping indexing to prevent storing data without LLM descriptions.",
388-
files.len(),
389-
e
374+
for attempt in 0..=Self::MAX_BATCH_RETRIES {
375+
if attempt > 0 {
376+
let delay = Duration::from_secs(5 * (1 << (attempt - 1))); // 5s, 10s, 20s
377+
if !self.quiet {
378+
eprintln!(
379+
"⚠️ AI batch attempt {}/{} failed, retrying in {:?}...",
380+
attempt,
381+
Self::MAX_BATCH_RETRIES + 1,
382+
delay
383+
);
384+
}
385+
tokio::time::sleep(delay).await;
386+
}
387+
388+
// Build fresh message each attempt
389+
let user_message = self.build_batch_user_message(files);
390+
391+
match self
392+
.call_llm(
393+
&self.config.graphrag.llm.description_model,
394+
self.config.graphrag.llm.description_system_prompt.clone(),
395+
user_message,
396+
Some(json_schema.clone()),
390397
)
391-
})?;
398+
.await
399+
{
400+
Ok(response) => match self.parse_batch_response(&response, files) {
401+
Ok(results) => return Ok(results),
402+
Err(e) => {
403+
last_error = Some(e);
404+
}
405+
},
406+
Err(e) => {
407+
last_error = Some(e);
408+
}
409+
}
410+
}
392411

393-
self.parse_batch_response(&response, files)
412+
Err(last_error.unwrap_or_else(|| {
413+
anyhow::anyhow!(
414+
"AI batch description failed for {} files after {} retries",
415+
files.len(),
416+
Self::MAX_BATCH_RETRIES
417+
)
418+
}))
394419
}
395420

396421
// Build user message for batch processing

src/indexer/graphrag/builder.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,14 @@ impl GraphBuilder {
324324
}
325325
Err(e) => {
326326
if !self.quiet {
327-
eprintln!("⚠️ AI batch processing failed: {}", e);
327+
eprintln!(
328+
"⚠️ AI batch failed after retries: {}. Deferring {} files to next run.",
329+
e, ai_batch_queue.len()
330+
);
328331
}
332+
let failed_ids: HashSet<String> =
333+
ai_batch_queue.iter().map(|f| f.file_id.clone()).collect();
334+
new_nodes.retain(|n| !failed_ids.contains(&n.id));
329335
}
330336
}
331337
}
@@ -455,8 +461,14 @@ impl GraphBuilder {
455461
}
456462
Err(e) => {
457463
if !self.quiet {
458-
eprintln!("⚠️ Final batch AI processing failed: {}", e);
464+
eprintln!(
465+
"⚠️ Final AI batch failed after retries: {}. Deferring {} files to next run.",
466+
e, ai_batch_queue.len()
467+
);
459468
}
469+
let failed_ids: HashSet<String> =
470+
ai_batch_queue.iter().map(|f| f.file_id.clone()).collect();
471+
new_nodes.retain(|n| !failed_ids.contains(&n.id));
460472
}
461473
}
462474
}

0 commit comments

Comments (0)