Refactor WET extractor to use identified charset

sebastian-nagel · sebastian-nagel · commit b01de464c183 · 2025-11-01T12:03:14.000+01:00
and language from the metadata record

Transfer information from warcinfo record
diff --git a/.gitignore b/.gitignore
@@ -16,4 +16,5 @@ spark-warehouse/
 
 # get-data.sh puts data into
 crawl-data/
-input/
+input/
+/.pytest_cache/
diff --git a/sparkcc.py b/sparkcc.py
@@ -421,11 +421,23 @@ def get_warc_header(record: ArcWarcRecord, header: str, default: str=None):
     def get_http_headers(record: ArcWarcRecord):
         return record.http_headers.headers
 
+    @staticmethod
+    def is_warcinfo_record(record: ArcWarcRecord):
+        """Return true if WARC record is a WARC info record"""
+        return (record.rec_type == 'warcinfo' and
+                record.content_type == 'application/warc-fields')
+
     @staticmethod
     def is_response_record(record: ArcWarcRecord):
         """Return true if WARC record is a WARC response record"""
         return record.rec_type == 'response'
 
+    @staticmethod
+    def is_metadata_record(record: ArcWarcRecord):
+        """Return true if WARC record is a WARC metadata record"""
+        return (record.record_type == 'metadata' and
+                record.content_type == 'application/warc-fields')
+
     @staticmethod
     def is_wet_text_record(record: ArcWarcRecord):
         """Return true if WARC record is a WET text/plain record"""
diff --git a/sparkcc_fastwarc.py b/sparkcc_fastwarc.py
@@ -49,11 +49,23 @@ def get_warc_header(record: WarcRecord, header: str, default: str=None):
     def get_http_headers(record: WarcRecord):
         return record.http_headers.astuples()
 
+    @staticmethod
+    def is_warcinfo_record(record: WarcRecord):
+        """Return true if WARC record is a WARC info record"""
+        return (record.record_type == WarcRecordType.warcinfo and
+                record.headers.get('Content-Type') == 'application/warc-fields')
+
     @staticmethod
     def is_response_record(record: WarcRecord):
         """Return true if WARC record is a WARC response record"""
         return record.record_type == WarcRecordType.response
 
+    @staticmethod
+    def is_metadata_record(record: WarcRecord):
+        """Return true if WARC record is a WARC metadata record"""
+        return (record.record_type == WarcRecordType.metadata and
+                record.headers.get('Content-Type') == 'application/warc-fields')
+
     @staticmethod
     def is_wet_text_record(record: WarcRecord):
         """Return true if WARC record is a WET text/plain record"""
diff --git a/wet_extractor.py b/wet_extractor.py
@@ -1,6 +1,10 @@
+import json
+
+from datetime import datetime
 from io import BytesIO
 from tempfile import SpooledTemporaryFile
 
+from fastwarc.warc import WarcRecordType
 from pyspark.sql.types import IntegerType, StructType, StructField, StringType
 from resiliparse_parser import HTMLParser
 from warcio import WARCWriter
@@ -29,6 +33,8 @@ class WETExtractor(CCFastWarcSparkJob, CCFileProcessorSparkJob):
         ]), True)
     ])
 
+    fastwarc_record_filter = WarcRecordType.warcinfo | WarcRecordType.response | WarcRecordType.metadata
+
     def __init__(self):
         self.html_parser = HTMLParser()
 
@@ -50,20 +56,88 @@ def log_accumulators(self, session):
         self.log_accumulator(session, self.wet_records_written,
                              'WET records written = {}')
 
+    def create_warcinfo_record(self, wet_writer, wet_file_name, record, content):
+        wet_headers = {
+            'Software-Info':
+                'cc-pyspark wet_extractor.py based on FastWARC, Resiliparse, warcio',
+            'Extracted-Date':
+                datetime.utcnow()
+        }
+        wet_headers_from_warc = {
+            'robots', 'ispartof', 'operator', 'description', 'publisher'}
+        if self.is_warcinfo_record(record):
+            # Add warcinfo fields from source WARC's warcinfo record
+            try:
+                content = content.decode('utf-8')
+                for line in content.split('\r\n'):
+                    if ':' in line:
+                        name, value = line.split(':', 1)
+                        if name.lower() in wet_headers_from_warc:
+                            wet_headers[name] = value.strip()
+            except Exception as e:
+                self.get_logger().error('Error parsing warcinfo: %s', e)
+        return wet_writer.create_warcinfo_record(wet_file_name, wet_headers)
+
+    def create_and_write_wet_record(self, wet_writer, cache):
+        """Write WET record using the data in cache.
+
+        If there is a metadata record:
+        - pass the identified character encoding to the HTML parser
+        - extract identified languages metadata record and add as
+          WARC header field "WARC-Identified-Content-Language
+        """
+        self.get_logger().debug('Converting %s', cache['uri'])
+        if 'response' not in cache:
+            self.get_logger().error('No response record for %s', cache['uri'])
+            return
+        elif cache['response'][1] is None:
+            # no content because not HTML
+            return
+        encoding = None
+        languages = None
+        if 'metadata' in cache:
+            try:
+                content = cache['metadata'][1].decode('utf-8')
+                for line in content.split('\r\n'):
+                    if line.startswith('charset-detected:'):
+                        _, value = line.split(':', 1)
+                        encoding = value.strip()
+                    elif line.startswith('languages-cld2:'):
+                        _, value = line.split(':', 1)
+                        lang_cld2 = json.loads(value.strip())
+                        if 'languages' in lang_cld2:
+                            languages = ','.join(map(lambda l: l['code-iso-639-3'],
+                                                     lang_cld2['languages']))
+            except Exception as e:
+                self.get_logger().error('Error parsing metadata: %s', e)
+        html_tree = self.html_parser.get_html_tree(cache['response'][1],
+                                                   encoding=encoding)
+        text = self.html_parser.html_to_text(html_tree)
+        wet_headers_dict = {
+            'WARC-Date': cache['date'],
+            'WARC-Refers-To': cache['id'],
+            'Content-Type': 'text/plain',
+        }
+        if languages:
+            wet_headers_dict['WARC-Identified-Content-Language'] = languages
+        wet_record = wet_writer.create_warc_record(
+            cache['uri'], 'conversion',
+            payload=BytesIO(str.encode(text, 'UTF-8')),
+            warc_headers_dict=wet_headers_dict)
+        wet_writer.write_record(wet_record)
+
     def process_record(self, record):
+        if self.is_warcinfo_record(record) or self.is_metadata_record(record):
+            yield record, self.get_payload_stream(record).read()
         if not self.is_response_record(record):
-            # skip over WARC request or metadata records
-            return ''
+            # skip over WARC request records
+            return
         if not self.is_html(record):
+            # non-HTML record: yield without content
             self.records_non_html.add(1)
-            return ''
-        self.get_logger().debug('Converting %s',
-                                self.get_warc_header(record, 'WARC-Target-URI'))
-        page = self.get_payload_stream(record).read()
-        encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
-        html_tree = self.html_parser.get_html_tree(page, encoding=encoding)
-        text = self.html_parser.html_to_text(html_tree)
-        yield record, text
+            yield record, None
+            return
+        yield record, self.get_payload_stream(record).read()
 
     def process_warc(self, uri, stream):
         wet_file_name = uri.split('/')[-1].replace('.warc.gz', '.warc.wet.gz')
@@ -74,31 +148,37 @@ def process_warc(self, uri, stream):
                                   mode='w+b',
                                   dir=self.args.local_temp_dir) as wet_temp:
             wet_writer = WARCWriter(wet_temp, gzip=True)
-            wet_writer.create_warcinfo_record(wet_file_name, {
-                'Software-Info':
-                    'cc-pyspark wet_extractor.py based on FastWARC, Resiliparse, warcio',
-                # TODO: Add more warcinfo fields from source WARC's warcinfo record
-            })
             offset = wet_temp.tell()
+            cache = {'uri': None, 'date': None}
             for res in super(WETExtractor, self).process_warc(uri, stream):
-                warc_target_uri = self.get_warc_header(res[0], 'WARC-Target-URI')
-                warc_date = self.get_warc_header(res[0], 'WARC-Date')
-                warc_record_id = self.get_warc_header(res[0], 'WARC-Record-ID')
-                # TODO: extract language from following metadata record and add as
-                #       WARC header field "WARC-Identified-Content-Language"
-                wet_record = wet_writer.create_warc_record(
-                    warc_target_uri, 'conversion',
-                    payload=BytesIO(str.encode(res[1], 'UTF-8')),
-                    warc_headers_dict={
-                        'WARC-Date': warc_date,
-                        'WARC-Refers-To': warc_record_id,
-                        'Content-Type': 'text/plain',
-                    })
-                wet_writer.write_record(wet_record)
-                end_offset = wet_temp.tell()
-                yield warc_target_uri, (wet_file_name, offset, (end_offset-offset))
-                offset = end_offset
-                self.wet_records_written.add(1)
+                (record, content) = res
+                if offset == 0:
+                    wet_writer.write_record(
+                        self.create_warcinfo_record(wet_writer, wet_file_name, record, content))
+                    offset = wet_temp.tell()
+                    if self.is_warcinfo_record(record):
+                        continue
+                warc_target_uri = self.get_warc_header(record, 'WARC-Target-URI')
+                warc_date = self.get_warc_header(record, 'WARC-Date')
+                if cache['uri'] and (cache['uri'] != warc_target_uri
+                                     or cache['date'] != warc_date):
+                    self.create_and_write_wet_record(wet_writer, cache)
+                    end_offset = wet_temp.tell()
+                    yield cache['uri'], (wet_file_name, offset, (end_offset-offset))
+                    offset = end_offset
+                    self.wet_records_written.add(1)
+                    cache = {'uri': None, 'date': None}
+                if cache['uri'] is None and cache['date'] is None:
+                    cache['uri'] = warc_target_uri
+                    cache['date'] = warc_date
+                if self.is_response_record(record):
+                    cache['response'] = (record, content)
+                    cache['id'] = self.get_warc_header(record, 'WARC-Record-ID')
+                elif self.is_metadata_record(record):
+                    cache['metadata'] = (record, content)
+            self.create_and_write_wet_record(wet_writer, cache)
+            end_offset = wet_temp.tell()
+            yield cache['uri'], (wet_file_name, offset, (end_offset-offset))
             wet_temp.seek(0)
             self.write_output_file(wet_file_name, wet_temp, self.args.output_base_url)