@@ -1539,16 +1539,28 @@ async def update_crawl_state(
15391539 print (f"status.stopReason: { status .stopReason } " , flush = True )
15401540
15411541 print (f"stats.size initial: { stats .size } " , flush = True )
1542+ print (f"status.filesAdded: { status .filesAdded } " , flush = True )
15421543 print (f"status.filesAddedSize: { status .filesAddedSize } " , flush = True )
15431544
15441545 # need to add size of previously completed WACZ files as well!
15451546 # TODO: This sometimes results in the crawl's stats.size being
15461547 # twice as large as expected when pausing crawls, as stats.size
1547- # is not necessarily decremented once WACZ files are uploaded
1548- # This can then have downstream effects on the storage quota check
1549- stats .size += status .filesAddedSize
1550-
1551- print (f"stats.size after adding filesAddedSize: { stats .size } " , flush = True )
1548+ # isn't decremented once WACZ files are uploaded, so there's a
1549+ # period of time where uploaded WACZs can be counted twice during
1550+ # pausing
1551+ if status .stopReason not in PAUSED_STATES :
1552+ # This is close to a solution, except that pauses after the
1553+ # first show a smaller-than-expected size, because it no
1554+ # longer counts files added before the crawl was resumed.
1555+ # What we seem to need here is a way of distinguishing files
1556+ # added prior to previous pauses (which we want to continue
1557+ # to add) from those that were just added.
1558+ stats .size += status .filesAddedSize
1559+ print (f"stats.size after adding filesAddedSize: { stats .size } " , flush = True )
1560+ else :
1561+ print (
1562+ "not adding filesAddedSize to stats.size, crawl is pausing" , flush = True
1563+ )
15521564
15531565 # update status
15541566 status .pagesDone = stats .done
0 commit comments