Commit af49ba2

add: enable mid-auto save in searcher
1 parent 8143fff commit af49ba2

3 files changed: +120 −5 lines changed

graphgen/graphgen.py

Lines changed: 37 additions & 4 deletions
@@ -1,3 +1,4 @@
+import hashlib
 import os
 import time
 from typing import Dict
@@ -173,20 +174,52 @@ async def search(self, search_config: Dict):
         if len(seeds) == 0:
             logger.warning("All documents are already been searched")
             return
+
+        # Get save_interval from config (default: 1000, 0 to disable)
+        save_interval = search_config.get("save_interval", 1000)
+
         search_results = await search_all(
             seed_data=seeds,
             search_config=search_config,
+            search_storage=self.search_storage if save_interval > 0 else None,
+            save_interval=save_interval,
         )
 
-        _add_search_keys = self.search_storage.filter_keys(list(search_results.keys()))
+        # Convert search_results from {data_source: [results]} to {key: result}
+        # This maintains backward compatibility
+        flattened_results = {}
+        for data_source, result_list in search_results.items():
+            if not isinstance(result_list, list):
+                continue
+            for result in result_list:
+                if result is None:
+                    continue
+                # Use _search_query as key if available, otherwise generate a key
+                if isinstance(result, dict) and "_search_query" in result:
+                    query = result["_search_query"]
+                    key = f"{data_source}:{query}"
+                else:
+                    # Generate a unique key
+                    result_str = str(result)
+                    key_hash = hashlib.md5(result_str.encode()).hexdigest()[:8]
+                    key = f"{data_source}:{key_hash}"
+                flattened_results[key] = result
+
+        _add_search_keys = self.search_storage.filter_keys(list(flattened_results.keys()))
         search_results = {
-            k: v for k, v in search_results.items() if k in _add_search_keys
+            k: v for k, v in flattened_results.items() if k in _add_search_keys
        }
         if len(search_results) == 0:
             logger.warning("All search results are already in the storage")
             return
-        self.search_storage.upsert(search_results)
-        self.search_storage.index_done_callback()
+
+        # Only save if not using periodic saving (to avoid duplicate saves)
+        if save_interval == 0:
+            self.search_storage.upsert(search_results)
+            self.search_storage.index_done_callback()
+        else:
+            # Results were already saved periodically, just update index
+            self.search_storage.index_done_callback()
 
     @async_to_sync_method
     async def quiz_and_judge(self, quiz_and_judge_config: Dict):
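
The net effect of this hunk: periodic saving is controlled by a single save_interval key in the search config, and results are flattened to {data_source}:{query} keys so the existing storage dedup keeps working. A minimal caller-side sketch, assuming only what the diff shows (the graph_gen instance name is an assumption based on the method signature above; only save_interval is introduced by this commit):

# Hypothetical usage sketch. Only "save_interval" is defined by this commit;
# the instance name is an assumption from the method signature above.
search_config = {
    "save_interval": 500,  # checkpoint to search_storage every 500 results
    # "save_interval": 0 would disable checkpointing (single upsert at the end)
}
graph_gen.search(search_config)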

graphgen/operators/search/search_all.py

Lines changed: 46 additions & 1 deletion
@@ -15,12 +15,16 @@
 async def search_all(
     seed_data: dict,
     search_config: dict,
+    search_storage=None,
+    save_interval: int = 1000,
 ) -> dict:
     """
     Perform searches across multiple search types and aggregate the results.
     :param seed_data: A dictionary containing seed data with entity names.
     :param search_config: A dictionary specifying which data sources to use for searching.
-    :return: A dictionary with
+    :param search_storage: Optional storage instance for periodic saving of results.
+    :param save_interval: Number of search results to accumulate before saving (default: 1000, 0 to disable).
+    :return: A dictionary with search results
     """
 
     results = {}
@@ -31,6 +35,41 @@ async def search_all(
         data = [d["content"] for d in data if "content" in d]
         data = list(set(data))  # Remove duplicates
 
+        # Prepare save callback for this data source
+        def make_save_callback(source_name):
+            def save_callback(intermediate_results, completed_count):
+                """Save intermediate search results."""
+                if search_storage is None:
+                    return
+
+                # Convert results list to dict format
+                # Results are tuples of (query, result_dict) or just result_dict
+                batch_results = {}
+                for result in intermediate_results:
+                    if result is None:
+                        continue
+                    # Check if result is a dict with _search_query key
+                    if isinstance(result, dict) and "_search_query" in result:
+                        query = result["_search_query"]
+                        # Create a key for the result (using query as key)
+                        key = f"{source_name}:{query}"
+                        batch_results[key] = result
+                    elif isinstance(result, dict):
+                        # If no _search_query, use a generated key
+                        key = f"{source_name}:{completed_count}"
+                        batch_results[key] = result
+
+                if batch_results:
+                    # Filter out already existing keys
+                    new_keys = search_storage.filter_keys(list(batch_results.keys()))
+                    new_results = {k: v for k, v in batch_results.items() if k in new_keys}
+                    if new_results:
+                        search_storage.upsert(new_results)
+                        search_storage.index_done_callback()
+                        logger.debug("Saved %d intermediate results for %s", len(new_results), source_name)
+
+            return save_callback
+
         if data_source == "uniprot":
             from graphgen.models import UniProtSearch
 
@@ -43,6 +82,8 @@ async def search_all(
                 data,
                 desc="Searching UniProt database",
                 unit="keyword",
+                save_interval=save_interval if save_interval > 0 else 0,
+                save_callback=make_save_callback("uniprot") if search_storage and save_interval > 0 else None,
             )
             results[data_source] = uniprot_results
 
@@ -58,6 +99,8 @@ async def search_all(
                 data,
                 desc="Searching NCBI database",
                 unit="keyword",
+                save_interval=save_interval if save_interval > 0 else 0,
+                save_callback=make_save_callback("ncbi") if search_storage and save_interval > 0 else None,
             )
             results[data_source] = ncbi_results
 
@@ -73,6 +116,8 @@ async def search_all(
                 data,
                 desc="Searching RNAcentral database",
                 unit="keyword",
+                save_interval=save_interval if save_interval > 0 else 0,
+                save_callback=make_save_callback("rnacentral") if search_storage and save_interval > 0 else None,
             )
             results[data_source] = rnacentral_results
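
The nested save_callback above touches the storage only through filter_keys, upsert, and index_done_callback, and re-filters keys on every batch so repeated saves stay idempotent. The real search_storage class is not part of this diff; a toy in-memory stand-in sketching the assumed contract:

# Toy stand-in for the storage contract assumed by make_save_callback.
# The actual search_storage implementation lives elsewhere in the repo.
class InMemoryKVStorage:
    def __init__(self):
        self._data = {}

    def filter_keys(self, keys):
        # Return only the keys not yet stored (this is what makes
        # periodic re-saving idempotent).
        return {k for k in keys if k not in self._data}

    def upsert(self, data):
        self._data.update(data)

    def index_done_callback(self):
        pass  # a real backend would flush its index to disk here


storage = InMemoryKVStorage()
storage.upsert({"uniprot:P53": {"_search_query": "P53"}})
print(storage.filter_keys(["uniprot:P53", "ncbi:BRCA1"]))  # {'ncbi:BRCA1'}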

graphgen/utils/run_concurrent.py

Lines changed: 37 additions & 0 deletions
@@ -17,11 +17,26 @@ async def run_concurrent(
     desc: str = "processing",
     unit: str = "item",
     progress_bar: Optional[gr.Progress] = None,
+    save_interval: int = 0,
+    save_callback: Optional[Callable[[List[R], int], None]] = None,
 ) -> List[R]:
+    """
+    Run coroutines concurrently with optional periodic saving.
+
+    :param coro_fn: Coroutine function to run for each item
+    :param items: List of items to process
+    :param desc: Description for progress bar
+    :param unit: Unit name for progress bar
+    :param progress_bar: Optional Gradio progress bar
+    :param save_interval: Number of completed tasks before calling save_callback (0 to disable)
+    :param save_callback: Callback function to save intermediate results (results, completed_count)
+    :return: List of results
+    """
     tasks = [asyncio.create_task(coro_fn(it)) for it in items]
 
     completed_count = 0
     results = []
+    pending_save_results = []
 
     pbar = tqdm_async(total=len(items), desc=desc, unit=unit)
 
@@ -32,6 +47,8 @@ async def run_concurrent(
         try:
             result = await future
             results.append(result)
+            if save_interval > 0 and save_callback is not None:
+                pending_save_results.append(result)
         except Exception as e:  # pylint: disable=broad-except
             logger.exception("Task failed: %s", e)
             # even if failed, record it to keep results consistent with tasks
@@ -44,11 +61,31 @@ async def run_concurrent(
             progress = completed_count / len(items)
             progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})")
 
+        # Periodic save
+        if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0:
+            try:
+                # Filter out exceptions before saving
+                valid_results = [res for res in pending_save_results if not isinstance(res, Exception)]
+                save_callback(valid_results, completed_count)
+                pending_save_results = []  # Clear after saving
+                logger.info("Saved intermediate results: %d/%d completed", completed_count, len(items))
+            except Exception as e:
+                logger.warning("Failed to save intermediate results: %s", e)
+
     pbar.close()
 
     if progress_bar is not None:
         progress_bar(1.0, desc=f"{desc} (completed)")
 
+    # Save remaining results if any
+    if save_interval > 0 and save_callback is not None and pending_save_results:
+        try:
+            valid_results = [res for res in pending_save_results if not isinstance(res, Exception)]
+            save_callback(valid_results, completed_count)
+            logger.info("Saved final intermediate results: %d completed", completed_count)
+        except Exception as e:
+            logger.warning("Failed to save final intermediate results: %s", e)
+
     # filter out exceptions
     results = [res for res in results if not isinstance(res, Exception)]
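
Putting the pieces together, run_concurrent now flushes a batch every save_interval completions and once more at the end for any remainder. A self-contained usage sketch (the import path is inferred from the file location in this commit; the coroutine and callback are placeholders):

import asyncio

from graphgen.utils.run_concurrent import run_concurrent  # assumed import path


async def fake_search(keyword: str) -> dict:
    await asyncio.sleep(0)  # stand-in for a real network request
    return {"_search_query": keyword}


def checkpoint(batch, completed_count):
    # In the searcher, this is where filter_keys/upsert would run.
    print(f"checkpoint at {completed_count}: {len(batch)} pending results")


async def main():
    results = await run_concurrent(
        fake_search,
        [f"keyword-{i}" for i in range(25)],
        desc="demo search",
        unit="keyword",
        save_interval=10,          # fires checkpoint() at 10 and 20 completions
        save_callback=checkpoint,  # called once more at the end for the last 5
    )
    print(len(results))  # 25


asyncio.run(main())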
