 async def search_all(
     seed_data: dict,
     search_config: dict,
+    search_storage=None,
+    save_interval: int = 1000,
 ) -> dict:
     """
     Perform searches across multiple search types and aggregate the results.
     :param seed_data: A dictionary containing seed data with entity names.
     :param search_config: A dictionary specifying which data sources to use for searching.
-    :return: A dictionary with
+    :param search_storage: Optional storage instance for periodic saving of results.
+    :param save_interval: Number of search results to accumulate before saving (default: 1000, 0 to disable).
+    :return: A dictionary of search results keyed by data source.
     """

     results = {}
@@ -31,6 +35,41 @@ async def search_all(
         data = [d["content"] for d in data if "content" in d]
         data = list(set(data))  # Remove duplicates

+        # Prepare save callback for this data source
+        def make_save_callback(source_name):
+            def save_callback(intermediate_results, completed_count):
+                """Save intermediate search results."""
+                if search_storage is None:
+                    return
+
+                # Convert results list to dict format
+                # Results are tuples of (query, result_dict) or just result_dict
+                batch_results = {}
+                for result in intermediate_results:
+                    if result is None:
+                        continue
+                    # Check if result is a dict with _search_query key
+                    if isinstance(result, dict) and "_search_query" in result:
+                        query = result["_search_query"]
+                        # Create a key for the result (using query as key)
+                        key = f"{source_name}:{query}"
+                        batch_results[key] = result
+                    elif isinstance(result, dict):
+                        # If no _search_query, use a generated key
+                        key = f"{source_name}:{completed_count}"
+                        batch_results[key] = result
+
+                if batch_results:
+                    # Filter out already existing keys
+                    new_keys = search_storage.filter_keys(list(batch_results.keys()))
+                    new_results = {k: v for k, v in batch_results.items() if k in new_keys}
+                    if new_results:
+                        search_storage.upsert(new_results)
+                        search_storage.index_done_callback()
+                        logger.debug("Saved %d intermediate results for %s", len(new_results), source_name)
+
+            return save_callback
+
         if data_source == "uniprot":
             from graphgen.models import UniProtSearch

@@ -43,6 +82,8 @@ async def search_all(
                 data,
                 desc="Searching UniProt database",
                 unit="keyword",
+                save_interval=save_interval if save_interval > 0 else 0,
+                save_callback=make_save_callback("uniprot") if search_storage and save_interval > 0 else None,
             )
             results[data_source] = uniprot_results

@@ -58,6 +99,8 @@ async def search_all(
                 data,
                 desc="Searching NCBI database",
                 unit="keyword",
+                save_interval=save_interval if save_interval > 0 else 0,
+                save_callback=make_save_callback("ncbi") if search_storage and save_interval > 0 else None,
             )
             results[data_source] = ncbi_results

@@ -73,6 +116,8 @@ async def search_all(
                 data,
                 desc="Searching RNAcentral database",
                 unit="keyword",
+                save_interval=save_interval if save_interval > 0 else 0,
+                save_callback=make_save_callback("rnacentral") if search_storage and save_interval > 0 else None,
             )
             results[data_source] = rnacentral_results

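For reference, a minimal usage sketch of the new parameters (not part of this diff): the storage argument only needs the `filter_keys`, `upsert`, and `index_done_callback` methods that the save callback above relies on. The `DictSearchStorage` class, the import path, and the input shapes below are illustrative assumptions, not code from this repository.

```python
# Hypothetical sketch: an in-memory storage exposing the interface used by the
# save callback in search_all. Names and import path are assumptions.
import asyncio

from graphgen.operators import search_all  # assumed import path


class DictSearchStorage:
    """Stand-in storage; a real backend would persist to disk or a database."""

    def __init__(self):
        self._data = {}

    def filter_keys(self, keys):
        # Keep only keys not already stored, matching the dedup step in the callback.
        return [k for k in keys if k not in self._data]

    def upsert(self, records):
        self._data.update(records)

    def index_done_callback(self):
        # A real backend would flush/commit here; nothing to do in memory.
        pass


async def main():
    seed_data = {}      # entity names per data source; shape depends on the pipeline
    search_config = {}  # e.g. enable "uniprot", "ncbi", "rnacentral"
    results = await search_all(
        seed_data,
        search_config,
        search_storage=DictSearchStorage(),
        save_interval=500,  # persist every 500 completed searches; 0 disables periodic saving
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```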