Semi-solution with comments describing why it's not perfect

tw4l · tw4l · commit 3ae056557090 · 2025-11-24T16:45:53.000-05:00
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
@@ -1539,16 +1539,28 @@ async def update_crawl_state(
         print(f"status.stopReason: {status.stopReason}", flush=True)
 
         print(f"stats.size initial: {stats.size}", flush=True)
+        print(f"status.filesAdded: {status.filesAdded}", flush=True)
         print(f"status.filesAddedSize: {status.filesAddedSize}", flush=True)
 
         # need to add size of previously completed WACZ files as well!
         # TODO: This sometimes results in the crawl's stats.size being
         # twice as large as expected when pausing crawls, as stats.size
-        # is not necessarily decremented once WACZ files are uploaded
-        # This then can have a downstream effects on the storage quota check
-        stats.size += status.filesAddedSize
-
-        print(f"stats.size after adding filesAddedSize: {stats.size}", flush=True)
+        # isn't decremented once WACZ files are uploaded, so there's a
+        # period of time where uploaded WACZs can be counted twice during
+        # pausing
+        if status.stopReason not in PAUSED_STATES:
+            # This is close to a solution, except that every pause after
+            # the first shows a smaller-than-expected size, because it
+            # no longer counts files added before the crawl was resumed.
+            # It seems that what we need here is a way of distinguishing
+            # files added prior to previous pauses (which we want to continue
+            # to add) from those that were just added
+            stats.size += status.filesAddedSize
+            print(f"stats.size after adding filesAddedSize: {stats.size}", flush=True)
+        else:
+            print(
+                "not adding filesAddedSize to stats.size, crawl is pausing", flush=True
+            )
 
         # update status
         status.pagesDone = stats.done