Skip to content

Commit 48678da

Browse files
Keep WARC download running on individual connection errors
1 parent d43a466 commit 48678da

File tree

1 file changed

+14
-5
lines changed
  • archive_query_log/downloaders

1 file changed

+14
-5
lines changed

archive_query_log/downloaders/warc.py

+14 -5
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,13 @@
22
from itertools import chain
33
from typing import Iterable, Iterator, TypeVar, Generic, Type, Callable
44
from uuid import uuid5
5+
from warnings import warn
56

67
from click import echo
78
from elasticsearch_dsl import Search
89
from elasticsearch_dsl.function import RandomScore
910
from elasticsearch_dsl.query import Exists, FunctionScore, Term, RankFeature
11+
from requests import ConnectionError as RequestsConnectionError
1012
from tqdm.auto import tqdm
1113
from warc_s3 import WarcS3Record
1214
from warcio.recordloader import ArcWarcRecord
@@ -71,11 +73,18 @@ def _download_serp_warc(
7173
api_url=serp.archive.memento_api_url,
7274
session=config.http.session,
7375
)
74-
records = memento_api.load_url_warc(
75-
url=serp.capture.url,
76-
timestamp=serp.capture.timestamp,
77-
raw=True,
78-
)
76+
try:
77+
records = memento_api.load_url_warc(
78+
url=serp.capture.url,
79+
timestamp=serp.capture.timestamp,
80+
raw=True,
81+
)
82+
except RequestsConnectionError:
83+
warn(RuntimeWarning(
84+
f"Connection error while downloading WARC "
85+
f"for capture URL {serp.capture.url} at {serp.capture.timestamp}."
86+
))
87+
return
7988
for record in records:
8089
yield _SerpArcWarcRecord(serp, record)
8190

0 commit comments

Comments
 (0)