Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions scrapy_wayback_machine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ class UnhandledIgnoreRequest(IgnoreRequest):

class WaybackMachineMiddleware:
cdx_url_template = ('https://web.archive.org/cdx/search/cdx?url={url}'
'&output=json&fl=timestamp,original,statuscode,digest')
'&output=json&fl={fields}')
cdx_fields = ['timestamp','original','statuscode','digest']
snapshot_url_template = 'https://web.archive.org/web/{timestamp}id_/{original}'
robots_txt = 'https://web.archive.org/robots.txt'
timestamp_format = '%Y%m%d%H%M%S'
Expand All @@ -29,6 +30,13 @@ def __init__(self, crawler):
raise NotConfigured
self.set_time_range(time_range)

# If the WAYBACK_MACHINE_FIELDS setting has not been defined,
# fall back to the default CDX field list
extra_fields = crawler.settings.get('WAYBACK_MACHINE_FIELDS', self.cdx_fields)

# Merge in any configured fields that are not already present
self.cdx_fields.extend(f for f in extra_fields if f not in self.cdx_fields)

def set_time_range(self, time_range):
# allow a single time to be passed in place of a range
if type(time_range) not in [tuple, list]:
Expand Down Expand Up @@ -80,7 +88,7 @@ def process_response(self, request, response, spider):

# schedule all of the snapshots
for snapshot_request in snapshot_requests:
self.crawler.engine.schedule(snapshot_request, spider)
self.crawler.engine.crawl(snapshot_request)

# abort this request
raise UnhandledIgnoreRequest
Expand All @@ -93,9 +101,16 @@ def process_response(self, request, response, spider):

def build_cdx_request(self, request):
if os.name == 'nt':
cdx_url = self.cdx_url_template.format(url=pathname2url(request.url.split('://')[1]))
cdx_url = pathname2url(request.url.split('://')[1])
else:
cdx_url = self.cdx_url_template.format(url=pathname2url(request.url))
cdx_url = pathname2url(request.url)


cdx_url = self.cdx_url_template.format(
url = cdx_url,
fields = ','.join(self.cdx_fields)
)

cdx_request = Request(cdx_url)
cdx_request.meta['wayback_machine_original_request'] = request
cdx_request.meta['wayback_machine_cdx_request'] = True
Expand Down Expand Up @@ -142,6 +157,15 @@ def build_dict(row):
'wayback_machine_time': snapshot['datetime'],
})

# Add snapshot metadata into the request object
for k,v in snapshot.items():
if k in ['datetime', 'timestamp', 'statuscode']:
continue

snapshot_request.meta.update({
'wayback_machine_' + k: v
})

snapshot_requests.append(snapshot_request)

return snapshot_requests
Expand Down