@@ -109,7 +109,7 @@ def import_archive(
     if not (merge_extras[0] in ['k', 'n'] and merge_extras[1] in ['c', 'n'] and merge_extras[2] in ['l', 'u', 'd']):
         raise ValueError('merge_extras contains invalid values')
     if merge_comments not in ('leave', 'newest', 'overwrite'):
-        raise ValueError(f"merge_comments not in {('leave', 'newest', 'overwrite')!r}")
+        raise ValueError(f'merge_comments not in {("leave", "newest", "overwrite")!r}')
     type_check(group, orm.Group, allow_none=True)
     type_check(test_run, bool)
     backend = backend or get_manager().get_profile_storage()
@@ -232,23 +232,26 @@ def _add_new_entities(
     for (ufield,) in query.distinct().iterall(batch_size=batch_size):
         if ufield not in backend_unique_id:
             ufields.append(ufield)
-
-    breakpoint()
-    with get_progress_reporter()(desc=f'Adding new {etype.value}(s)', total=total) as progress:
-        rows = [
-            transform(row)
-            for row in QueryBuilder(backend=backend_from)
-            .append(
+
+    with get_progress_reporter()(desc=f'Importing new {etype.value}(s)', total=total) as progress:
+        # For UX: batch large ID lists so queries start returning results faster.
+        # Even though the improved IN clause handles any size, query planning for 500k+ IDs can be slow.
+        query_batch_size = 50_000
+
+        # Batch the IDs for querying (UX optimization, not a technical requirement)
+        for _, ufields_batch in batch_iter(ufields, query_batch_size):
+            query = QueryBuilder(backend=backend_from).append(
                 entity_type_to_orm[etype],
-                filters={unique_field: {'in': ufields}},
+                filters={unique_field: {'in': ufields_batch}},
                 project=['**'],
                 tag='entity',
             )
-            .dict(batch_size=batch_size)
-        ]
-        new_ids = backend_to.bulk_insert(etype, rows)
-        backend_unique_id.update({row[unique_field]: pk for pk, row in zip(new_ids, rows)})
-        progress.update(len(rows))
+
+            # Batch the results processing for progress updates and memory efficiency
+            for nrows, rows_batch in batch_iter(query.dict(batch_size=batch_size), batch_size, transform):
+                new_ids = backend_to.bulk_insert(etype, rows_batch)
+                backend_unique_id.update({row[unique_field]: pk for pk, row in zip(new_ids, rows_batch)})
+                progress.update(nrows)
 
 
 def _import_users(
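
The new loops rely on a `batch_iter` helper that chunks an iterable into fixed-size batches and yields `(count, batch)` pairs, optionally applying `transform` to each item. Below is a minimal sketch of that assumed behaviour; the exact signature of the helper in `aiida.tools.archive` may differ, so treat this as illustrative only.

```python
from itertools import islice
from typing import Any, Callable, Iterable, Iterator, Optional

def batch_iter(
    iterable: Iterable[Any],
    size: int,
    transform: Optional[Callable[[Any], Any]] = None,
) -> Iterator[tuple[int, list]]:
    """Yield (number_of_items, batch) tuples of at most `size` items each.

    This is an assumed re-implementation for illustration, not the actual helper.
    """
    transform = transform or (lambda item: item)
    iterator = iter(iterable)
    while True:
        batch = [transform(item) for item in islice(iterator, size)]
        if not batch:
            return
        yield len(batch), batch

# The query-side loop above discards the count (`for _, ufields_batch in ...`),
# while the insert-side loop uses it to drive the progress bar.
for nrows, rows in batch_iter(range(10), 4):
    print(nrows, rows)
# 4 [0, 1, 2, 3]
# 4 [4, 5, 6, 7]
# 2 [8, 9]
```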