Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions scrapy_wayback_machine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ class UnhandledIgnoreRequest(IgnoreRequest):

class WaybackMachineMiddleware:
cdx_url_template = ('https://web.archive.org/cdx/search/cdx?url={url}'
'&output=json&fl=timestamp,original,statuscode,digest')
'&output=json&fl={fields}')
cdx_fields = ['timestamp','original','statuscode','digest']
snapshot_url_template = 'https://web.archive.org/web/{timestamp}id_/{original}'
robots_txt = 'https://web.archive.org/robots.txt'
timestamp_format = '%Y%m%d%H%M%S'
Expand All @@ -29,6 +30,13 @@ def __init__(self, crawler):
raise NotConfigured
self.set_time_range(time_range)

# If the WAYBACK_MACHINE_FIELDS setting has not been defined,
# fall back to the default CDX field list
extra_fields = crawler.settings.get('WAYBACK_MACHINE_FIELDS', self.cdx_fields)

# Merge in any configured fields that are not already present
self.cdx_fields.extend(f for f in extra_fields if f not in self.cdx_fields)

def set_time_range(self, time_range):
# allow a single time to be passed in place of a range
if type(time_range) not in [tuple, list]:
Expand Down Expand Up @@ -80,7 +88,7 @@ def process_response(self, request, response, spider):

# schedule all of the snapshots
for snapshot_request in snapshot_requests:
self.crawler.engine.schedule(snapshot_request, spider)
self.crawler.engine.crawl(snapshot_request)

# abort this request
raise UnhandledIgnoreRequest
Expand All @@ -93,9 +101,16 @@ def process_response(self, request, response, spider):

def build_cdx_request(self, request):
if os.name == 'nt':
cdx_url = self.cdx_url_template.format(url=pathname2url(request.url.split('://')[1]))
cdx_url = pathname2url(request.url.split('://')[1])
else:
cdx_url = self.cdx_url_template.format(url=pathname2url(request.url))
cdx_url = pathname2url(request.url)


cdx_url = self.cdx_url_template.format(
url = cdx_url,
fields = ','.join(self.cdx_fields)
)

cdx_request = Request(cdx_url)
cdx_request.meta['wayback_machine_original_request'] = request
cdx_request.meta['wayback_machine_cdx_request'] = True
Expand Down Expand Up @@ -142,6 +157,15 @@ def build_dict(row):
'wayback_machine_time': snapshot['datetime'],
})

# Add snapshot metadata into the request object
for k,v in snapshot.items():
if k in ['datetime', 'timestamp', 'statuscode']:
continue

snapshot_request.meta.update({
'wayback_machine_' + k: v
})

snapshot_requests.append(snapshot_request)

return snapshot_requests
Expand Down