
Commit a0a7665

restore and improve progress-bar updates when adding nodes during archive import
1 parent 103615d commit a0a7665

1 file changed (+17, -14 lines)

src/aiida/tools/archive/imports.py

Lines changed: 17 additions & 14 deletions
@@ -109,7 +109,7 @@ def import_archive(
     if not (merge_extras[0] in ['k', 'n'] and merge_extras[1] in ['c', 'n'] and merge_extras[2] in ['l', 'u', 'd']):
         raise ValueError('merge_extras contains invalid values')
     if merge_comments not in ('leave', 'newest', 'overwrite'):
-        raise ValueError(f"merge_comments not in {('leave', 'newest', 'overwrite')!r}")
+        raise ValueError(f'merge_comments not in {("leave", "newest", "overwrite")!r}')
     type_check(group, orm.Group, allow_none=True)
     type_check(test_run, bool)
     backend = backend or get_manager().get_profile_storage()
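
Aside: the validation in this hunk pins down the accepted import options. Each position of merge_extras must come from ('k'|'n'), ('c'|'n'), ('l'|'u'|'d'), and merge_comments must be one of 'leave', 'newest', or 'overwrite'. A minimal, hypothetical call sketch, with values drawn from exactly the sets the checks above permit; the import path is inferred from the file location and the archive filename is illustrative:

    from aiida.tools.archive import import_archive

    import_archive(
        'export.aiida',                # hypothetical archive path
        merge_extras=('k', 'c', 'l'),  # one letter per position: {k,n} x {c,n} x {l,u,d}
        merge_comments='newest',       # one of: 'leave', 'newest', 'overwrite'
    )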
@@ -232,23 +232,26 @@ def _add_new_entities(
     for (ufield,) in query.distinct().iterall(batch_size=batch_size):
         if ufield not in backend_unique_id:
             ufields.append(ufield)
-
-    breakpoint()
-    with get_progress_reporter()(desc=f'Adding new {etype.value}(s)', total=total) as progress:
-        rows = [
-            transform(row)
-            for row in QueryBuilder(backend=backend_from)
-            .append(
+
+    with get_progress_reporter()(desc=f'Importing new {etype.value}(s)', total=total) as progress:
+        # For UX: batch large ID lists so queries start returning results faster.
+        # Even though the improved IN clause handles any size, query planning for 500k+ IDs can be slow.
+        query_batch_size = 50_000
+
+        # Batch the IDs for querying (UX optimization, not a technical requirement)
+        for _, ufields_batch in batch_iter(ufields, query_batch_size):
+            query = QueryBuilder(backend=backend_from).append(
                 entity_type_to_orm[etype],
-                filters={unique_field: {'in': ufields}},
+                filters={unique_field: {'in': ufields_batch}},
                 project=['**'],
                 tag='entity',
             )
-            .dict(batch_size=batch_size)
-        ]
-        new_ids = backend_to.bulk_insert(etype, rows)
-        backend_unique_id.update({row[unique_field]: pk for pk, row in zip(new_ids, rows)})
-        progress.update(len(rows))
+
+            # Batch the results processing for progress updates and memory efficiency
+            for nrows, rows_batch in batch_iter(query.dict(batch_size=batch_size), batch_size, transform):
+                new_ids = backend_to.bulk_insert(etype, rows_batch)
+                backend_unique_id.update({row[unique_field]: pk for pk, row in zip(new_ids, rows_batch)})
+                progress.update(nrows)
 
 
 def _import_users(
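
The core of the second hunk is double batching with batch_iter: the collected unique IDs are chunked before querying, so query planning for huge IN clauses stays fast, and each query's results are chunked again so bulk_insert and progress.update fire once per batch instead of once at the very end (this is what restores live progress-bar updates). A minimal sketch of a batch_iter-style helper consistent with its usage in the diff, yielding (count, batch) pairs with an optional per-item transform; the real helper in AiiDA's archive tooling may differ in detail:

    from typing import Any, Callable, Iterable, Iterator, Optional

    def batch_iter(
        iterable: Iterable[Any],
        size: int,
        transform: Optional[Callable[[Any], Any]] = None,
    ) -> Iterator[tuple[int, list[Any]]]:
        """Yield (count, batch) pairs of at most `size` items, applying `transform` to each."""
        transform = transform or (lambda x: x)
        batch: list[Any] = []
        for item in iterable:
            batch.append(transform(item))
            if len(batch) >= size:
                yield len(batch), batch
                batch = []
        if batch:  # final, possibly short, batch
            yield len(batch), batch

Under that shape, batch_iter(ufields, query_batch_size) discards the count (hence "for _, ufields_batch in ..."), while batch_iter(query.dict(batch_size=batch_size), batch_size, transform) lazily turns raw result rows into insert-ready dicts, and progress.update(nrows) advances the bar by the exact size of each inserted batch.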
