From b37b7feb0dd87ff91b0b0c0dd2271e52aeecf70a Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 14 Feb 2025 10:05:46 -0800 Subject: [PATCH 1/5] first --- python/dolma/cli/deduper.py | 1 - python/dolma/cli/mixer.py | 5 +++- python/dolma/cli/tagger.py | 5 ++++ python/dolma/core/runtime.py | 3 +- python/dolma/warc/processor.py | 15 +++++++--- src/shard.rs | 18 ++++++++--- tests/python/test_mixer.py | 32 ++++++++++++++++++++ tests/python/test_paths.py | 2 -- tests/python/test_warc.py | 51 +++++++++++++++++++++++++++++++ tests/python/utils.py | 55 +++++++++++++++++----------------- 10 files changed, 147 insertions(+), 40 deletions(-) diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index de6a43d5..d263d4ca 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -192,7 +192,6 @@ def run(cls, parsed_config: DeduperConfig): # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in parsed_config.documents: - if not any( fnmatch.fnmatch(dict_config["dedupe"]["document_dir"], part) for part in document.split(os.sep) ): diff --git a/python/dolma/cli/mixer.py b/python/dolma/cli/mixer.py index 2ac6c5c5..41f632c6 100644 --- a/python/dolma/cli/mixer.py +++ b/python/dolma/cli/mixer.py @@ -66,6 +66,9 @@ class StreamConfig: "from the file extension." ), ) + document_dir: str = field( + default="documents", help="Folder in source path to replace with 'attributes' when looking for attributes" + ) @dataclass @@ -145,7 +148,6 @@ def run(cls, parsed_config: MixerConfig): # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in stream_config.documents: - current_matching_documents = sum(1 for _ in glob_path(document)) if current_matching_documents == 0: # only raise a warning if no documents are found for a single path @@ -159,6 +161,7 @@ def run(cls, parsed_config: MixerConfig): # populate the stream config dict stream_config_dict["name"] = stream_config.name stream_config_dict["documents"] = [str(d) for d in stream_config.documents] + stream_config_dict["document_dir"] = stream_config.document_dir stream_config_dict["attributes"] = [str(a) for a in list(stream_config.attributes)] stream_config_dict["output"] = { "path": str(stream_config.output.path), diff --git a/python/dolma/cli/tagger.py b/python/dolma/cli/tagger.py index 9982ec05..34df31ba 100644 --- a/python/dolma/cli/tagger.py +++ b/python/dolma/cli/tagger.py @@ -91,6 +91,10 @@ class TaggerConfig: default=False, help="If true, only print the configuration and exit without running the taggers.", ) + document_dir: Optional[str] = field( + default="documents", + help="The folder in source paths to replace with 'attributes' to store results, if not 'documents'", + ) class TaggerCli(BaseCli): @@ -140,6 +144,7 @@ def run(cls, parsed_config: TaggerConfig): profile_output=parsed_config.profile.output, profile_steps=parsed_config.profile.steps, profile_sort_key=parsed_config.profile.sort_key, + document_dir=parsed_config.document_dir, ) diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py index ac5e2a23..320ca901 100644 --- a/python/dolma/core/runtime.py +++ b/python/dolma/core/runtime.py @@ -392,6 +392,7 @@ def create_and_run_tagger( profile_steps: Optional[int] = None, profile_sort_key: str = "tottime", profile_lines: int = 100, + document_dir: Optional[str] = "documents", ): """This function creates a tagger and runs it on a list of documents. @@ -444,7 +445,7 @@ def create_and_run_tagger( if destination is None: try: - destination = _make_paths_from_substitution(documents, "documents", f"attributes/{experiment}") + destination = _make_paths_from_substitution(documents, document_dir, f"attributes/{experiment}") except Exception as exp: raise RuntimeError("Could not make destination paths from documents paths") from exp elif isinstance(destination, str): diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index 474c6ca9..a20e0263 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -107,6 +107,7 @@ def process_single( pre_taggers_names: List[str] = kwargs.get("pre_taggers") or [] pre_taggers = {make_variable_name(name): TaggerRegistry.get(name)() for name in pre_taggers_names} + # create the html extractor linearizer_name: str = kwargs.get("linearizer_name") or "resiliparse" linearizer = LinearizerRegistry.get(linearizer_name)() @@ -127,6 +128,7 @@ def process_single( # whether to skip this document if post-taggers find nothing skip_no_post_taggers: bool = kwargs.get("skip_no_post_taggers") or False + skip_linearization: bool = kwargs.get("skip_linearization") or False # derive the destination path if it is not provided by splitting out all the # extensions, removing gz and warc, and adding jsonl.gz if not destination_path.endswith(".jsonl.gz"): @@ -192,12 +194,15 @@ def process_single( continue # extract text - doc.text = linearizer.linearize(content=decoded_content) + if skip_linearization: + doc.text = decoded_content + else: + doc.text = linearizer.linearize(content=decoded_content) # these are the properties extracted from the HTML content - post_attributes = {name: tagger.tag(doc) for name, tagger in post_taggers.items()} - if skip_no_post_taggers and not sum(map(len, post_attributes.values())): - continue + # post_attributes = {name: tagger.tag(doc) for name, tagger in post_taggers.items()} + # if skip_no_post_taggers and not sum(map(len, post_attributes.values())): + # continue doc.attributes = { f"{t_name}__{t_name}__{make_variable_name(a_name)}": attr_values @@ -247,6 +252,7 @@ def create_and_run_warc_pipeline( store_html_in_metadata: bool = False, skip_no_pre_taggers: bool = False, skip_no_post_taggers: bool = False, + skip_linearization: bool = False, ): with ExitStack() as stack: if metadata is None: @@ -302,4 +308,5 @@ def create_and_run_warc_pipeline( skip_no_pre_taggers=skip_no_pre_taggers, skip_no_post_taggers=skip_no_post_taggers, source_name=source_name, + skip_linearization=skip_linearization ) diff --git a/src/shard.rs b/src/shard.rs index 226ba194..c66d6511 100644 --- a/src/shard.rs +++ b/src/shard.rs @@ -40,6 +40,10 @@ impl Shard { pub fn split_streams(streams: &Vec) -> Result, IoError> { let mut shards: Vec = Vec::new(); for stream_config in streams { + let document_dir = format!( + "/{}/", + stream_config.document_dir.as_deref().unwrap_or("documents") + ); let mut stream_shard_count = 0; log::info!("Computing shards for stream {}...", stream_config.name); let stream_inputs = find_objects_matching_patterns(&stream_config.documents)?; @@ -50,7 +54,7 @@ impl Shard { let mut attr_paths = Vec::new(); for prefix in stream_config.attributes.iter() { let attr_prefix = format!("/attributes/{}/", prefix); - let attr_path = input.replace("/documents/", &attr_prefix); + let attr_path = input.replace(&document_dir, &attr_prefix); attr_paths.push(attr_path); } ( @@ -135,13 +139,17 @@ impl Shard { // dataset is a strict subset of the original and is intended to be unshuffled and unsharded. let mut shards: Vec = Vec::new(); for stream_config in streams { + let document_dir = format!( + "/{}/", + stream_config.document_dir.as_deref().unwrap_or("documents") + ); let stream_inputs = find_objects_matching_patterns(&stream_config.documents)?; let input_count = stream_inputs.len(); let inputs = stream_inputs.into_iter().map(|input| { let mut attr_paths = Vec::new(); for prefix in stream_config.attributes.iter() { let attr_prefix = format!("/attributes/{}/", prefix); - let attr_path = input.replace("/documents/", &attr_prefix); + let attr_path = input.replace(&document_dir, &attr_prefix); attr_paths.push(attr_path); } DocumentPaths { @@ -152,10 +160,11 @@ impl Shard { for input in inputs { let doc_path_clone = input.doc_path.clone(); - let output_suffix = doc_path_clone.split("/documents/").last().unwrap(); + let output_suffix = doc_path_clone.split(&document_dir).last().unwrap(); let output = format!( - "{}/documents/{}", + "{}{}{}", stream_config.output.path.clone(), + document_dir, output_suffix ); log::info!("Creating shard for {}", output); @@ -543,6 +552,7 @@ pub mod shard_config { pub span_replacement: Option>, pub output: StreamOutputConfig, pub compression: Option, + pub document_dir: Option, } #[derive(Serialize, Deserialize, Clone)] diff --git a/tests/python/test_mixer.py b/tests/python/test_mixer.py index 68ea1721..952e4c20 100644 --- a/tests/python/test_mixer.py +++ b/tests/python/test_mixer.py @@ -22,6 +22,8 @@ EMAIL_SPANS_JQ = Path(__file__).parent.parent / "config/email-spans-jq.yaml" FILTER_BY_SPANS = Path(__file__).parent.parent / "config/filter-by-spans.json" MIXER = Path(__file__).parent.parent / "config/mixer.json" +ALT_DOC_PATH_MIXER = Path(__file__).parent.parent / "config/alt-path-mixer.json" + PARAGRAPH_SPANS = Path(__file__).parent.parent / "config/paragraph-spans.json" @@ -150,6 +152,36 @@ def test_remote_input_remote_output(self): provided = self.checkAndRemoveProvenance(provided) self.assertEqual(expected, provided) + def test_alt_doc_path_mixer(self): + if self.remote_test_prefix is None: + return self.skipTest("Skipping AWS tests") + + with open(ALT_DOC_PATH_MIXER, mode="r", encoding="utf8") as f: + config = json.load(f) + + # keep track of local output path + local_input = config["streams"][0]["documents"][0] + local_output = config["streams"][0]["output"]["path"] + + # replace results path with s3 path + config["streams"][0]["output"]["path"] = f"{self.remote_test_prefix}/{local_output}" + + # upload local input to s3, replace local input with s3 path + config["streams"][0]["documents"][0] = f"{self.remote_test_prefix}/{local_input}" + + with NamedTemporaryFile("w") as f: + json.dump(config, f) + f.flush() + + main(argv=["-c", f.name, "mix"]) + + download_s3_prefix(f"{self.remote_test_prefix}/tests/work", "tests/work/remote") + expected = load_jsonl("tests/data/expected/mixer.json.gz") + provided = load_jsonl("tests/work/remote/output/mixer/mixer-test-0000.json.gz") + provided = self.checkAndRemoveProvenance(provided) + self.assertEqual(expected, provided) + + def test_remote_input_local_output(self): if self.remote_test_prefix is None: return self.skipTest("Skipping AWS tests") diff --git a/tests/python/test_paths.py b/tests/python/test_paths.py index e920af74..df758e22 100644 --- a/tests/python/test_paths.py +++ b/tests/python/test_paths.py @@ -295,7 +295,6 @@ def test_split_glob(self): class TestSplitExt(TestCase): def test_file(self): - prot, parts, ext = split_ext("file.txt") self.assertEqual(prot, "") @@ -318,7 +317,6 @@ def test_file(self): self.assertEqual(ext, ".") def test_path(self): - prot, parts, ext = split_ext("path/to/file.txt") self.assertEqual(prot, "") diff --git a/tests/python/test_warc.py b/tests/python/test_warc.py index 04f0e9a7..4e37a843 100644 --- a/tests/python/test_warc.py +++ b/tests/python/test_warc.py @@ -103,3 +103,54 @@ def test_pretag_html(self): {"by_4_0", "by_3_0"}, ) self.assertIn("cc_re__cc_re__cc_by_4_0", sample1[2]["attributes"]) + + def test_skip_linearization(self): + """Test that when skip_linearization is True, the raw HTML content is preserved.""" + outputs = self._run_pipeline_with_skip_linearization() + self.assertEqual(len(outputs), 2) + self.assertIn("sample-0000.jsonl.gz", outputs) + self.assertIn("sample-0001.jsonl.gz", outputs) + + sample0 = outputs["sample-0000.jsonl.gz"] + sample1 = outputs["sample-0001.jsonl.gz"] + + # Check that we got some documents + self.assertGreater(len(sample0), 0) + self.assertGreater(len(sample1), 0) + + # For all documents, verify they contain raw HTML instead of linearized text + for sample in chain(sample0, sample1): + # HTML content should be in the text field + self.assertIn("<", sample["text"]) + self.assertIn(">", sample["text"]) + + # Common HTML tags that should be present in raw HTML + html_indicators = [" Dict[str, List[dict]]: + """Helper method to run pipeline with skip_linearization=True.""" + create_and_run_warc_pipeline( + documents=[f"{DATA_PATH}/*.warc.gz"], + destination=[self.tempdir], + num_processes=1, + ignore_existing=False, + debug=True, + source_name="test", + skip_no_pre_taggers=False, + skip_no_post_taggers=False, + store_html_in_metadata=False, + linearizer_name="resiliparse", + skip_linearization=True, + pre_taggers=["cc_re"], + post_taggers=["lingua_1e2"], + ) diff --git a/tests/python/utils.py b/tests/python/utils.py index 9813f2d3..ca194f9c 100644 --- a/tests/python/utils.py +++ b/tests/python/utils.py @@ -70,33 +70,33 @@ def skip_aws_tests() -> bool: return (dolma_tests_skip or "false").lower() == "true" -def upload_test_documents(local_input: str, test_prefix: str) -> Tuple[str, str]: - remote_input = f"{test_prefix}/input/documents" - remote_output = f"{test_prefix}/output/documents" +# def upload_test_documents(local_input: str, test_prefix: str, document_dir: str = "documents") -> Tuple[str, str]: +# remote_input = f"{test_prefix}/input/{document_dir}" +# remote_output = f"{test_prefix}/output/{document_dir}" - for i, local_fp in enumerate(glob_path(local_input)): - remote_fp = f"{remote_input}/{i:05d}.json.gz" +# for i, local_fp in enumerate(glob_path(local_input)): +# remote_fp = f"{remote_input}/{i:05d}.json.gz" - with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: - g.write(f.read()) +# with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: +# g.write(f.read()) - return remote_input, remote_output +# return remote_input, remote_output -def upload_test_attributes(local_attributes: str, test_prefix: str): - remote_attributes = f"{test_prefix}/input/attributes" +# def upload_test_attributes(local_attributes: str, test_prefix: str): +# remote_attributes = f"{test_prefix}/input/attributes" - for i, local_fp in enumerate(glob_path(local_attributes)): - matched = re.match(r"^(attributes|duplicate)-(\w+)", local_fp) - if not matched: - raise RuntimeError(f"Unexpected filename: {local_fp}") +# for i, local_fp in enumerate(glob_path(local_attributes)): +# matched = re.match(r"^(attributes|duplicate)-(\w+)", local_fp) +# if not matched: +# raise RuntimeError(f"Unexpected filename: {local_fp}") - _, name = matched.groups() +# _, name = matched.groups() - remote_fp = f"{remote_attributes}/{name}/{i:05d}.json.gz" +# remote_fp = f"{remote_attributes}/{name}/{i:05d}.json.gz" - with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: - g.write(f.read()) +# with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: +# g.write(f.read()) def clean_test_data(test_prefix: str): @@ -127,6 +127,7 @@ def upload_s3_prefix(s3_prefix: str, local_prefix: str): bucket_name, prefix = parse_s3_path(s3_prefix) for local_fp in glob_path(local_prefix): + print(f"LOCAL_FP {local_fp}") name = local_fp.replace(local_prefix, "").lstrip("/") s3.upload_file(Bucket=bucket_name, Key=f"{prefix}/{name}", Filename=local_fp) @@ -167,9 +168,9 @@ def writeUnits( return [str(p) for p in file_paths] - def writeDocs(self, docs: List[str], partitions: int = 1, ext_dir: Optional[Path] = None) -> List[str]: + def writeDocs(self, docs: List[str], partitions: int = 1, ext_dir: Optional[Path] = None,unit_type: str = "documents") -> List[str]: encoded_docs = [{"id": str(i), "text": d, "source": __file__} for i, d in enumerate(docs)] - return self.writeUnits(units=encoded_docs, unit_type="documents", partitions=partitions, ext_dir=ext_dir) + return self.writeUnits(units=encoded_docs, unit_type=unit_type, partitions=partitions, ext_dir=ext_dir) def writeAttributes( self, @@ -199,10 +200,10 @@ def writeConfig(self, config: dict, ext_dir: Optional[Path] = None) -> str: def combineIntoDoc(self, *lines: str, join: str = "\n") -> str: return join.join(lines) - def makeDocsCopy(self, path: Union[str, Path]) -> str: - path = Path(path) - dest = Path(self.makeUniquePath()) / "documents" - dest.mkdir(parents=True) - for fp in path.iterdir(): - shutil.copy(fp, dest / fp.name) - return str(dest) + # def makeDocsCopy(self, path: Union[str, Path]) -> str: + # path = Path(path) + # dest = Path(self.makeUniquePath()) / "documents" + # dest.mkdir(parents=True) + # for fp in path.iterdir(): + # shutil.copy(fp, dest / fp.name) + # return str(dest) From 19089da15b01d800217f4574560073e10c307655 Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 14 Feb 2025 11:08:09 -0800 Subject: [PATCH 2/5] . --- python/dolma/warc/linearizers.py | 8 ++++++++ python/dolma/warc/processor.py | 14 ++++---------- tests/python/test_mixer.py | 1 - tests/python/test_warc.py | 4 ++-- tests/python/utils.py | 4 +++- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/python/dolma/warc/linearizers.py b/python/dolma/warc/linearizers.py index a99c0775..9ebd594e 100644 --- a/python/dolma/warc/linearizers.py +++ b/python/dolma/warc/linearizers.py @@ -143,3 +143,11 @@ def linearize(self, content: Union[str, bytes]) -> str: ) self._flush() return output or "" + + + + +@LinearizerRegistry.add("no-op") +class NoOpLinearizer(BaseLinearizer): + def linearize(self, content: Union[str, bytes]) -> str: + return content \ No newline at end of file diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index a20e0263..1f9744ca 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -107,7 +107,6 @@ def process_single( pre_taggers_names: List[str] = kwargs.get("pre_taggers") or [] pre_taggers = {make_variable_name(name): TaggerRegistry.get(name)() for name in pre_taggers_names} - # create the html extractor linearizer_name: str = kwargs.get("linearizer_name") or "resiliparse" linearizer = LinearizerRegistry.get(linearizer_name)() @@ -128,7 +127,6 @@ def process_single( # whether to skip this document if post-taggers find nothing skip_no_post_taggers: bool = kwargs.get("skip_no_post_taggers") or False - skip_linearization: bool = kwargs.get("skip_linearization") or False # derive the destination path if it is not provided by splitting out all the # extensions, removing gz and warc, and adding jsonl.gz if not destination_path.endswith(".jsonl.gz"): @@ -194,14 +192,11 @@ def process_single( continue # extract text - if skip_linearization: - doc.text = decoded_content - else: - doc.text = linearizer.linearize(content=decoded_content) + doc.text = linearizer.linearize(content=decoded_content) # these are the properties extracted from the HTML content - # post_attributes = {name: tagger.tag(doc) for name, tagger in post_taggers.items()} - # if skip_no_post_taggers and not sum(map(len, post_attributes.values())): + # post_attributes = {name: tagger.tag(doc) for name, tagger in post_taggers.items()} + # if skip_no_post_taggers and not sum(map(len, post_attributes.values())): # continue doc.attributes = { @@ -307,6 +302,5 @@ def create_and_run_warc_pipeline( post_taggers=post_taggers, skip_no_pre_taggers=skip_no_pre_taggers, skip_no_post_taggers=skip_no_post_taggers, - source_name=source_name, - skip_linearization=skip_linearization + source_name=source_name ) diff --git a/tests/python/test_mixer.py b/tests/python/test_mixer.py index 952e4c20..5c6d4718 100644 --- a/tests/python/test_mixer.py +++ b/tests/python/test_mixer.py @@ -181,7 +181,6 @@ def test_alt_doc_path_mixer(self): provided = self.checkAndRemoveProvenance(provided) self.assertEqual(expected, provided) - def test_remote_input_local_output(self): if self.remote_test_prefix is None: return self.skipTest("Skipping AWS tests") diff --git a/tests/python/test_warc.py b/tests/python/test_warc.py index 4e37a843..75b10fa4 100644 --- a/tests/python/test_warc.py +++ b/tests/python/test_warc.py @@ -123,11 +123,11 @@ def test_skip_linearization(self): # HTML content should be in the text field self.assertIn("<", sample["text"]) self.assertIn(">", sample["text"]) - + # Common HTML tags that should be present in raw HTML html_indicators = [" List[str]: + def writeDocs( + self, docs: List[str], partitions: int = 1, ext_dir: Optional[Path] = None, unit_type: str = "documents" + ) -> List[str]: encoded_docs = [{"id": str(i), "text": d, "source": __file__} for i, d in enumerate(docs)] return self.writeUnits(units=encoded_docs, unit_type=unit_type, partitions=partitions, ext_dir=ext_dir) From 41bf80e66549356690e8c3690416d414f13afe84 Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 14 Feb 2025 13:17:18 -0800 Subject: [PATCH 3/5] style --- python/dolma/cli/tagger.py | 2 +- python/dolma/core/runtime.py | 2 +- python/dolma/warc/linearizers.py | 4 +-- python/dolma/warc/processor.py | 8 ++--- tests/python/test_warc.py | 6 ++++ tests/python/utils.py | 50 ++++++++++++++++---------------- 6 files changed, 38 insertions(+), 34 deletions(-) diff --git a/python/dolma/cli/tagger.py b/python/dolma/cli/tagger.py index 34df31ba..9d29eafe 100644 --- a/python/dolma/cli/tagger.py +++ b/python/dolma/cli/tagger.py @@ -91,7 +91,7 @@ class TaggerConfig: default=False, help="If true, only print the configuration and exit without running the taggers.", ) - document_dir: Optional[str] = field( + document_dir: str = field( default="documents", help="The folder in source paths to replace with 'attributes' to store results, if not 'documents'", ) diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py index 320ca901..8ebfa0d3 100644 --- a/python/dolma/core/runtime.py +++ b/python/dolma/core/runtime.py @@ -392,7 +392,7 @@ def create_and_run_tagger( profile_steps: Optional[int] = None, profile_sort_key: str = "tottime", profile_lines: int = 100, - document_dir: Optional[str] = "documents", + document_dir: str = "documents", ): """This function creates a tagger and runs it on a list of documents. diff --git a/python/dolma/warc/linearizers.py b/python/dolma/warc/linearizers.py index 9ebd594e..c4d588cd 100644 --- a/python/dolma/warc/linearizers.py +++ b/python/dolma/warc/linearizers.py @@ -145,9 +145,7 @@ def linearize(self, content: Union[str, bytes]) -> str: return output or "" - - @LinearizerRegistry.add("no-op") class NoOpLinearizer(BaseLinearizer): def linearize(self, content: Union[str, bytes]) -> str: - return content \ No newline at end of file + return str(content) diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index 1f9744ca..a3d949d1 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -195,9 +195,9 @@ def process_single( doc.text = linearizer.linearize(content=decoded_content) # these are the properties extracted from the HTML content - # post_attributes = {name: tagger.tag(doc) for name, tagger in post_taggers.items()} - # if skip_no_post_taggers and not sum(map(len, post_attributes.values())): - # continue + post_attributes = {name: tagger.tag(doc) for name, tagger in post_taggers.items()} + if skip_no_post_taggers and not sum(map(len, post_attributes.values())): + continue doc.attributes = { f"{t_name}__{t_name}__{make_variable_name(a_name)}": attr_values @@ -302,5 +302,5 @@ def create_and_run_warc_pipeline( post_taggers=post_taggers, skip_no_pre_taggers=skip_no_pre_taggers, skip_no_post_taggers=skip_no_post_taggers, - source_name=source_name + source_name=source_name, ) diff --git a/tests/python/test_warc.py b/tests/python/test_warc.py index 75b10fa4..e69d3852 100644 --- a/tests/python/test_warc.py +++ b/tests/python/test_warc.py @@ -154,3 +154,9 @@ def _run_pipeline_with_skip_linearization(self) -> Dict[str, List[dict]]: pre_taggers=["cc_re"], post_taggers=["lingua_1e2"], ) + outputs: Dict[str, List[dict]] = {} + for fn in os.listdir(self.tempdir): + with smart_open.open(os.path.join(self.tempdir, fn), mode="rt", encoding="utf-8") as f: + for ln in f: + outputs.setdefault(fn, []).append(json.loads(ln)) + return outputs diff --git a/tests/python/utils.py b/tests/python/utils.py index 6c336c48..a96c24df 100644 --- a/tests/python/utils.py +++ b/tests/python/utils.py @@ -70,33 +70,33 @@ def skip_aws_tests() -> bool: return (dolma_tests_skip or "false").lower() == "true" -# def upload_test_documents(local_input: str, test_prefix: str, document_dir: str = "documents") -> Tuple[str, str]: -# remote_input = f"{test_prefix}/input/{document_dir}" -# remote_output = f"{test_prefix}/output/{document_dir}" +def upload_test_documents(local_input: str, test_prefix: str, document_dir: str = "documents") -> Tuple[str, str]: + remote_input = f"{test_prefix}/input/{document_dir}" + remote_output = f"{test_prefix}/output/{document_dir}" -# for i, local_fp in enumerate(glob_path(local_input)): -# remote_fp = f"{remote_input}/{i:05d}.json.gz" + for i, local_fp in enumerate(glob_path(local_input)): + remote_fp = f"{remote_input}/{i:05d}.json.gz" -# with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: -# g.write(f.read()) + with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: + g.write(f.read()) -# return remote_input, remote_output + return remote_input, remote_output -# def upload_test_attributes(local_attributes: str, test_prefix: str): -# remote_attributes = f"{test_prefix}/input/attributes" +def upload_test_attributes(local_attributes: str, test_prefix: str): + remote_attributes = f"{test_prefix}/input/attributes" -# for i, local_fp in enumerate(glob_path(local_attributes)): -# matched = re.match(r"^(attributes|duplicate)-(\w+)", local_fp) -# if not matched: -# raise RuntimeError(f"Unexpected filename: {local_fp}") + for i, local_fp in enumerate(glob_path(local_attributes)): + matched = re.match(r"^(attributes|duplicate)-(\w+)", local_fp) + if not matched: + raise RuntimeError(f"Unexpected filename: {local_fp}") -# _, name = matched.groups() + _, name = matched.groups() -# remote_fp = f"{remote_attributes}/{name}/{i:05d}.json.gz" + remote_fp = f"{remote_attributes}/{name}/{i:05d}.json.gz" -# with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: -# g.write(f.read()) + with open(local_fp, "rb") as f, open(remote_fp, "wb") as g: + g.write(f.read()) def clean_test_data(test_prefix: str): @@ -202,10 +202,10 @@ def writeConfig(self, config: dict, ext_dir: Optional[Path] = None) -> str: def combineIntoDoc(self, *lines: str, join: str = "\n") -> str: return join.join(lines) - # def makeDocsCopy(self, path: Union[str, Path]) -> str: - # path = Path(path) - # dest = Path(self.makeUniquePath()) / "documents" - # dest.mkdir(parents=True) - # for fp in path.iterdir(): - # shutil.copy(fp, dest / fp.name) - # return str(dest) + def makeDocsCopy(self, path: Union[str, Path]) -> str: + path = Path(path) + dest = Path(self.makeUniquePath()) / "documents" + dest.mkdir(parents=True) + for fp in path.iterdir(): + shutil.copy(fp, dest / fp.name) + return str(dest) From f62f6287dccd3c3cee382f1c6afdca8ca5dca2fa Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 14 Feb 2025 14:22:10 -0800 Subject: [PATCH 4/5] test --- .../data/provided/alternative_term/000.json.gz | Bin 0 -> 25985 bytes tests/python/test_warc.py | 3 +-- 2 files changed, 1 insertion(+), 2 deletions(-) create mode 100644 tests/data/provided/alternative_term/000.json.gz diff --git a/tests/data/provided/alternative_term/000.json.gz b/tests/data/provided/alternative_term/000.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..f5419508fa4c3f7552b22a599130c7208c6e11eb GIT binary patch literal 25985 zcmY(qV{9(Y7p`5~-L-Aowr$&XyKCF#Q`@#}8@smMzQ6N7U*3~U?v-4#Rwgsa`m%Bl zK{OOphMl=75U8nxiJP67y{ikowTpv2&{dC&3yD-KdFxu;-Gz;MZUrpy{onc;RSd8J zQflZVF~a2B6|b5ZV9o$a2;%vBLR2rC>B;7K#Cb~=^PMdw9GvUs1gXuIAE|GY+*{mN z&w%Oai;7-=zN!vJB2>H2+tJ(o(dm7z|EC;o>B8XGiRQOCvVEJ8V{^n)@Nah1M{L{h zl3dbNoI&^e)Td)1D(WTT3Zr(P;Xk;5_lVYI5%$gygqNj{!GK3iFCV!Q4!*lncu`ZM2p06&>qdjbOrgYfV3l?p-=7gE~v=@*1BxUc*DB>w#&L_|>+eQEMyoUW|Fzaw~=c4?3?_W&A7U1qTmnd-Mzz z>K>RS!$|g4N&yaQj7-vvsv3;&OFzNO1G@4L<;+(efhWcXAC(c}a=MxbeG^TcYo+*w zL`?^|&>ytY%h-?ev0u6C1uBLE=*>{4s~Nvwjv6`RGS)P&?ll{R`92k)jMOqQMHvR> z=;1w@n>&gcdcm zAtcqDN$mS-X$&vfXvXcp4E6(&RSV*I)z9s^eiSirAherJP|3%E@~(hNt6k3JEQ|^< zOfXh*MWmikeF9WE?L)Q82-lP`^F*Q0?g0QoD-sJM=24o!5#~F>Eg) zcYDCww*va>3XOo~u_j_$==V)IxGuA={V-b!&oBUb$v=ez_@p@Rz7n zAlT*|nx-;6Z2sY~TT5enORuztm(VL4p5t4PJS7FF88p|+4_A0PsjvoN7GEv@`-FC zGY=9gm|&Mt(1@BrLM$8#fD+@GEaH>pP2AimaoLOJSkC5z7D`^%i1V3{4$QDJ!AbZE zTMc@9i*o;yh>++mjI+knlQD)oj@3@fEUhn}HO**v^JfGr~Lx5iPS)5^o8s5|8;CZ`&8jm#+k9=;0 zT+gJlO_c9-I?rnGUClCYimu1(nJbz09+z9&W4LKPFD_!=7ql0Hf!HCgtrD%lm_V2K z-VuV&b0REn^rYvG`p`$H`x&L%QmRW~Da}(H>@nwLQD<{RUr~vOtTO$&E(&R>EdCM( zIYR<>_(7lJXurhJMpV$Ze6xZ+DId~)j^ZP#wcMw?EvVR5o0dZUjL-SLGUFRH zSQ5ApO}Ug>B3w#GVsCeeVbO36H90yae#Q>FrykYFr^C3nrC?j0QUY={qBod065Avg znBYsF1@+E>Y?54mg&aPCvO`xf7hyb;kGZh9@LT8ejw2|yXl4Yt0KHKPgo*s0^j|jA z*797crA%&)O^VLXK!mVoY}3q{K5yqA0TJc*K6&Erfkzr^sKEh=<}yE7SSAhveoyWK z;vvU|`yR)xvND_xy1Sbp$-X+nptJ+_H2*6`K)!{g=eilpt;;j6AEC|~guSwnwD|t_^qw-1-vMuXArSFX zvIV=stenv4pC)XJw6r-%l=wo;lcgc1OfIkVm^*NBD}s8-qUb@W`tw~Y&`z-)19bPm z=IENQ>OFBcY;hM7Zf-#&SCHEfl4`Rm^{XmWdg2M5 zuiZtQua|2%Cb*k(>4zvc?)+Gd*`KSI3PqA;?QeIZ+b*=4LhBM=1Ifd*DVUD607CKd zyp>}yF`|E-tJj8EWVg~UrXSmyW!u#|7DeiBBebiQ(tS1OF8IDak~%StzvL@(e2{Ek zF0}e&_UI+mqx^bVDs*z^nI5Y{1XbT>i*UIm9Fov1004Y_}O*~H~P*;VnoO|35{`!Gxt*PeXAp}#B70F z^bEx!-wUt53J%NIY+1#p7dUt+1UZWc&KghrZr5Jti}VV$=u#O1mi%SuS4qLl!LD*k z#(hU(6@7CwzR4&XQ*AFUMZ13bj`h*Jdw;L@Idsz+qe79}!E&K42==>Ent< zd&#m#T~)yWM*NzFEK|c58dJ=}i43BL5-38Anmy&Chb*0wN0i2s2Pw0*rLBAOI~cTBga4R{ECx9mgBmoLzXEo zZo!BVwz#o}oW-W>2(Q_A;v()UPMEP0(-rrYn9mXvV0;znVM_kK!s7s6BsU-Oal&G7 zjxg^;u9c~Ostn00-^g4m1T`Uk%D5ue$Uc>Ljb&M~K2}IApf9RE!)n&N3Bp4y`~njf z%Wvh=WaY3)){-Eb-amP!Xz%XVmq6AhL3c*c0`FUyTIhd-gXwkz&{ZEME0Nc#G0U{8 zA6G63W8kaO2$h5>U#qooD{sNG@g!Y~qb)%|!FCGaT@(1Fcz~<$ZD;iUERIm2TUm>x z+HH|aTquK{v0RE2G?}{XA#~cQ1`7U+&ntz7hS1N3UDJ{lv&LJ`0L1;&FL608#$w%D z)@++Udi-lw;%wOd=ie#} zf4H7HX*-Rbm4j51T-Kq$sa&*Tw|?RaC?qPqy{7j=?E&k-IRR?+nYU=A@`{{$sZ`Ma zSV83d?}0|mW34PeF=%xpGIBA74i-!qdwG)*-SpR$@1woTwl3VYYyM_y3b(9z-y0?= zTX}ESyl#D~Q+#e(-mPiZEZAwB&o-1Rmt1^+dhff+`cFe_)?g5vJ}%rGL8g2F+C&Eg znyD-Glyajgml&QQzSoL>&#ZFs*Y0yps&@guAB=Jf{;IV?$Jna1z%FR1Hi{h(|4Al! zs+IEpU%q#X-Lq6n#muz-AGvNW*NvWRDc8-OY^nIag40vtY)2j5Vyn`@o_eX6iSWH! zxW|L}zrg+NKjP?CFK?A@=p}oXJ(w&o3fnoq_^92Yu<}U_ zZyQU0E0#$48}awR|ELkh#E*j>{?=kGu zat@ipeH=?}CsJBr;#f-vJ+ ztn7~IQ}Y{cn6<>+#q#tD1z4%{M?N`X1w;f{R)UW+)o!Ztrja`>OQ!}VSWW5rp!!-x zBKvUfKeRN=-k1yx8eX-;xOWfoN@jq$>M8!5;;2p*^Dz-)H1Bp4pRIuh&n-R7gY}B2 z&KNo%uf)1Y7Tns?;J=7sAG%K;)s6pl-{*mAS+|bY`yp7W)rJKuGh%cJBDOtI)z)VR z+;OkQa_fFmvZz}(;2I%!PsR#cOd_--O;EqQ0Duys;3Tz+PIM)Xkastc?^q1}A8*!N zhoKb8(bo!JW02Nzs>FOt@`ZbttXw-nhZ@%$+?tR~EaF)&>;~NzAej-6&vsU*Zj?`E z=F8!Ig8*$t{J>{*4ddYPCg&?*l&)G~G^~HE-Aa4OknY@6qjn%qmDSICZ9K z3v+}9?3pG_qiMpr4Zbu2^+d@KN0@f%u6f=+%sXnHfQ6rop=vBKAB(|Jp8xuwfleVN zkyVoW$?;;IdU^(C75MVRco(&SVm5B<8$8o*mD8i3X12yCmNj6YLbY5}$Hh32=aROCC@_Dv-`NWsIyBGI27Zj>DcBu`&&Y-<*YZ|M^xjZk z+v^dkSMteQb2DPkGkTI_p=$6_(Yu}s|KX;p9)i1vARnIqprLu3=K7Thq+9(K>v21B z;3FX1f#$=NsvX!?AfspqsjlSTw12jaz7xgA&WSz@uelSYo8D_b#6yhm0T~ks3{&5) zf?S0Q^KE_hu`>T$Rd}f(vQ`&XYXGm;hu!&?&GrA;pn%iM)d&1)iD+E6FL2ei{C;9~ zMltX(y6Sb~_u$5mQ!wx(-)g5PzpOaEwKCb|A(Crg^!0Urx0;?=zZx1-VRmNu|98Y) z&_}$@)`e?+tKEvtPG6N>mF3z6-g;}4d(G_LM(auodrel04O@=h<>^+Vc6e*Gc%RPo zH?z(vuig$rrMHsYt-j^nX&nP8R7vO7RqxM|hAV@9ad%r^$mn}j1pS}a_69UM-6bAG zC6O`zjDCl2JMDnjJNzD2eKdwTtDr8>~o)_IK1;=7VNna&eJUt`SD zfx*W;aCX zRvzQb`P_B)TJmkb>t;njC^p@)WuZ7Mn|Ara|Bwzj6 zBpY}6f>11iwW}f^`2Qjdw$Abeu~>R*S0z9+w%+ntk@)|RNG!hflOiA-n{WB7P#lrX zyL{n)NC^;+?YDde5GP>!C=X=2HJSO;eB`g;k}_udsyI>r_0M`+8q8<^alI>x7T`nP z((E*GN*XanzkhEXWI$uDY$US|V1aU7WbA#;2Lt>SPe)JNPh*1nU@s(=!Ey)3i{u|?M#SN=v-TE&CRErkZs^hNk$^dwv*w(~VTv&94UZAch z8p8dix`C*@d_bHW1rW^HK_Rj2g&5qVnXz*Z6L;;p!E(oK2GCm(zF9O*xj!J{8FLKTIP zdONg;Zs|9)*OG$|T**YPx0l6Gqw=Xx*f1-`doi9eik1XOzVVK~R-;9%i!2zS{Z^WvS^+K2#knp6KxWmQn&!GnfeC~ zPx#yo9YqL4%|P57o@UxgQW*^ull+=|+r~hS<70L{N2C}>dcU3!=%0gW6A{D+xiPz* zmy}rUndQxZRY2Yun2{F-R6ZPT9+=cT_dXSMFW~J~ul2*pXtzK~Qd3W98^Tz=@9u<= z8Bec28J3>9zUB_>5X@75^U)=l*U;pu@ugZKI&5L-1*J*`wQzQW5#!Umj(m+*>;IU(q^W22YuzURyDbcd&uaA1-@pBB zsr+$7>DA!tM2KUs?_W%sNybAC(j$b+$3rNUDmIbb`_Nnt_1)wjGDv6}k&BcYJxnv7 z&EwJKsdEB*`nYZmG>Zvfg%I1-ohWmvZD zW4W99V?i=}#|;D)>vpK|M?LN@Nv9DKc+s*ejRiz0qX=Jp;Lcr}$lBRz4!*5B9Kv%% ziWi2VdrttDwDP)>XuPFVf!6-2L^L@ZNDt6du@wSo(BHZH^Xvp@jr9X~7X)&O4(V*L z(6_6i)CC*dVpr}#Z{Iqr&{A!TIROtGR^KY!6Yof8on}_291z1`^gz1Vx3zy4GoGwn zowyYT`zNY1qfOBZ4c&$H?89%z*J^BheOY=jWlbh*k;_#9mIe~-NJALLAZd;Y$&!2n zy%g4U^Ezq1=@8lw2;o3^krU)_FV>>V-ZfnL zeh~T7cg7!>debt~xqw?YZvDmhGfLU>JHBDjHH#CLRk5t7jtRSB_%?hElWajM8@{>au z9do9=jmpBcHtr*W$XJ6OHEe`&DLv+)G_Z8 zm*FQq*m#VOOx^pI5gL*S9m&7Y{D!3w8|pvDs?j2M`kUp1y_5ZIQO&SJov7OM`OXN~ zbZYY3zVV;^cz{h^$ZweVr^i`8k+av`o9kYRaXorf`EB?4@8S7+qODD%P9pv2RzLUq zr_(ms%I!leTNW=Zd`vuZP*EBJ%VXf@^0(q|EBP0}imA#iTVCr1D>9jC~6!y!Ly5c!4;-a@95hhN=R5$Jd>KX4fe1$ zqjtB9q6OM#MO89JgmO#5}vWZvW<5?3=bxWqG~sBd6lA4&Y{m^W`}Wp%!Nd(-QXSuWY(|v!Ul2VV_YXj zW=%s~iSr^}3sWzA73Gt|D;yt?{%V9BiXDN7p1aI?o=(v9pIF4 zQO!%bK)FQbo?LeDmdMqz*TT1qBT;>>_wnSTBbTic7_C#-_8oKr`jOFiTuv4n;I|+Z zDSzD!zB{qz=H7sg#onLPxI%+oKAXAWDEGELiUX^LAx4Q{VLwgh^Ro2hEQ>NUCFK?g z+7NsM1RVz!%pQd{gtp}XJ12Hr7P&Dn@t#9!i;uKV;so#xcrgZir;19O?Ug^i`AxlA zI6YZR2JUsut=lti^!|-di7#>COj$I#rRjnAF(+mr$s3JGBM!`}q;=vMX2_UwYQj+nPNT&e{mqlWE7TQyh%2cH1+4eLq zsST}{LWrIr7Hh{rT3Tf`p}o5|3+Tltl}QsU5+pwmcWCpjkbl|<6J_5(Xm4L!=V=@&>l6YOkpW(6R_4GEm_M@1*ycRw1_7%3J zMKTF7-ZM%J>iyzNve-Bh0ZE*J(%!(E&W)fKQ$T_*!REfLutebFB6_*JjwL+Iu>7aX z6A`1-ab|Lrbn!eb5OqM>rD$Tbv7c^HMsr8A2xs>XXC1GHJ~Y{g+KQL>)!r;4UDWjP zn7lB`K2~5#A$}i*9p`6b;iHZggKW6oWxRbKC^s;9GuU$Bv`qmt!*M_?%?435R%OyG z{c5a|TMVV#75`i&xQCGt4*C!0MpA?KCN!kz6Fvd~B2Yj8DxBR!|A-h;SNf2}m}frw z^rH1GXTc;qHHLKa2R}Gb9k4A<$iga!qjrG>*da3}J!{1Dbn)q_kzmunD6#|##>wzo z>U4Gg!F03bGQiDL)(FF|n5CJ}OGDQepxU6>0Bo%Lo8ql`+#kv!kQG!kPa-YF5Sbej zBxhYjH!I5f$10FuY;^mZIJ1NmQP7yo+TiGr8jt!cADk-fMUrwokBEI_;QXLd*{FWs z&~N&Ld2Mi%S(xvYvmC5)4BlswaZZYaGh=h-sDIGPd~on3ir}^m2AvRC%@dC75|f#Q ziq)hjVwH6lCiR{x2|MhFoa{kBR-_8#?lMVVY+&VJcP4jnPqjY}@3= z7?shN!?YSHcX#Por3@jm#3?P;i^MXY+AVZDqbcz7xx zJJ8w>>jq_1HIB?us*}Wl1K2@=!zVtnQ?d_>r1hW8y_?A}yG<#Y)t{!W6UPTMQd1eX z_;oDDmD~tmfq-sQS0vidY2)zMDV;IlO!1&QLqumo^fGks{s&}Z!9fpW^n%PLOv`5= zJc6l}X{aK>Ow^Rk(+t<}RM;-N?oA8;09H@{wOZcWb6IZNPN19akxQM=NsJ|)r9;iKM}wV`3!aw25PzT7`>4`;$V zI?8!jU4>XibeWcon`*5vd2HI^9VJKMX4S#5&>f4ottiMZT_DE6enD^QZ*!s%4LDey zQ(TCR08`(3(e@^*bn&kFL08kk(Ds9)ePc}`s2^7if5yj}QL=l;a*8QdL2RA z*4WI6QjMQ(l^GM*vv}hHj`t^I!N?aOCnM`|5)a5vey?F6VyKqE{J-j^C#1AfQzy79 zgdI!4Ir*JfT8EX0XSj50%Q^a0fqsRsZ!*mBg61Y$+6w5K9?C!R!jD67CoYyVR&0H( zMA+}JVu~TudUDk}El5;}uhB0TgOU05(uC=Pno1h!7m#M6W$-$OeWud*-uxl&qAza( zE7?5=Eh{&>+soE1A6@%(zhbPo8(5Bo1TX7*nCtD$19M&(9K6$W)VRq^b^PbGSL^wK9h(FP22AZDa$^NR>oRBuW>TzwC&mza3EK6Z)c8r62C=rfGAqkL# zUtj$}vq$|w@-4qIEA_kKT-TB9(j)r)iXH)gCSf`o+df$5t;VI-zCpCcjmts ztTu)c(-NN;M@LSqSy$S=ue){mz8%p%xEm?=VV7gdH6ke55y{?Vx83Rz@ zdeiI^wV_S1Iap!E&^R_BB+3r6-v{!E;i^VQm@W?-I(Pw68o3w3GOsU#MOAk8IynO} z7=ZC1J%$AIz#ZVx^*ykUUE}MxR5jIR{ou`wU%7B>>&gPS<}LKEm>Mu;Ra{QmUuy&PWa}Xb@%2uIKH5^e!4rzc{p(Ml*<`FW=;- zvx30(BR=;3>&MrfRj_z<@eGfi-ZK#sTD@?QuN@a~Od+TNyo3_*bhk%`n}!ZeTMG1?W` zo8b@U*xa)#cL1huq`V)61C8M2S_5S*lBt@;$M9{c^~4ds*l_fH)}>6Hx|=>0 z$kXSmdgHUK)kMDF zO_&5W@TK-7pR*Iz;Fe|61sCHt;~&R$I!M+;7{#iE&mJ#_NQ5-cN0*z>8{SOk?G+4$ z0G36CxP(NZ8R+7SPRuhAx#Np8$06Ypj(6mWNG6Go6Y>O3Uq!UPe9|jUvt>Hq7FLCj zsLL(aA)P|MSGY*RWF6H&Ym4GV4*=#TM&n|Qb^s$)HeU6aT}M8TSrqLdjxz6f&+}no zAoRP}CHSXCZ?%O|pmrcn?;~WHE6*i{nk5JW$1TyyzOx?J+S{+nHFMpgh9^WXF4tA5 z14M0=-_V=U*UohGf+;h#L0k~mrBaW8f<0YnX<#{0*wnzGQq5PTg`Z3_o8l&@k)44N zV1ORltENs*d^t1~cN&#ig2jnHxS27|;;?PkUW^mG<5_E?CbP)v?~<%gy~6#cRQW9Z zl}1Him4$gd@TrmSDMntkEDM!N{8F?ILGc9i1F@VQzyw+5Qf?Qgeq z!y?nVEgCSY|7eQ&Pa!ylC*>c4>P3OqTJPx<)5nF-jnnhNExh5va>@t4QoV9WN{+H} z+dYd%B_Ra>0FC`K34JRT^hXLhl3@N;)QHw$$es}nZE~Yn`U;Xbwtf^u!PHjzKvWwrj?Y72xDjZVeI!q~7t00bnqEHLlT|kiD z5^zuoCi$IVsV3~Ga9wWIIFbnB6_*U^sCzvWlJPiV=LlB7ii=@ToY!KG_{45di8ZAm zsu@`(>ZY(7Md*o&Rtl~swF4b}jjq&sXgY({RD@iIw_Q9|^q&l-)nW4@I6w;oPTHfe zFrKq<%eib}>q!I_Wh&UYQnZ>i{Ppv!GWjTkwN`O&PVFFMhLMOSx28_!a*`vd-4rf& zw+5=IO31KDW`>#cp#Ll`b*Lud6f$f$lanZHsn8J?EjlfNO74Ow?V^boydR~YZSgzo zF%#jPC^Poq$f>AbM3{0+?&x5tjsqv=euCMJC~SWeaWNF!`mQjFC|fci>lLZf!KQKO z5%fBTiQV|ZB)wc%A>r}%0f#)bx>Vfx9S8~wnW*(OaF2Q}4)k5=vsURU?^9N=a2V02 zfhsbueu8iZ>t_Tm^d8^(jOZirDStiDSs>;)a!~8S0*J|MP$0G(?MY7Z?BwCSvwRq+ z#Bv(##xHmd$E!PoQXA4A7k+K4o8fC(kKdV)0EwpD@Pv9n zzY&eT`(thxk@2e%{<{t>N0YP)E%fd%vAeZ}TuUXqP!l{u?1OJ+E-a)Y^OiRF>;)w2 zw2msO!!L7hIgBsVs_=4NA|5sVzhdPno^~a}^wbykWaoN!3Yl@E6_#-ooX-g1EZe)N z@@8KcmgN1v#Dr#B6!ql|S-(+Cs$)+eFZaC`Bf}H=v0ouk{;2jr9s@LB2KqSv=z5Io zrGnTfgOby!!}tpDCdZPC#B`w`HdXLU)gn5Dr74o$mqwx1AgC8{%mX@b8!zN1__Tl|(#1RY>>U4)rNUf^|H` zY88=QK?swRFY*p3&e9mWKcc-~!Gqa}b-8F<&gq2nzexhhq95wt zg)uojXFQGK%)7G~BKK$y+9hnWJ?Lee>o~tJ zn`0tmbu8J)HZZUE4pG2#Q5cK7m@tnPyQ7tsG)N*FlL|4uE`c=KRytN!^Ji5g`<3P2$GC$3JQ zua3tY0cA0vhME5iKLB z^5nBN>3(RX7}uy*GsyH~+UA5HQxt;!G*$+dx*wD`cd(@luI8FIk35W5m1+v7;$iw^ z)corEB?C-p_GHeJ?jGtE&hQy0WJ`^skC}`(vV!gv@f0Cx?qrN71jlZK$8qNDc68wA z$As4{zn_IY7eZEfOF!Jug2Zt*JZZu`1J8R$=zJ!u_5H1!ijUT7)Q%0G^ieCv<4SaJ z43Qlyxz`IPlu@yRF8ld>I(>F!?^SKO|7Ysz!sN-OOd%?u=wqUi6Jywbsi!NrB*SAW zYfv|f$v)@?U8X2_N@>T_xq;_CX$)>wH&Y)uiPXf^XcKm!E$B*D(2sZTr5keRQ7tgjLqU%|C!3)h}8REI^X z0{2{DZnlP2Z-J}O4pX5Ms=zNOM@Lq5xm(8NU>lO9+b=<{SDxamAd6XM=C{VeX$gt; z4v5qq7MW>KbirMQ18oWL*Y4-1JIYIARG9KCGb>+0nLY;|v_jry9b=mzl4Idq0!gs~ z9&Ls*(hg#v6G=}ql95R$fpjbpD_+DcVU?hqnU{hcF9ug!WX89Gj%tPz(GDY|8;Hjz zj7JxeaOq#bwP78?qBD#^zZ099FCtT2U@o%6l4}kj^$sA`7Dbv_5OMi8569LVfT=AA zOGgr#CMZ1hR$yj3k3xS2OlpaYu>4$}_FR7AtvcoNe-@7*BHdHD`jt|RKM-}+YeerK zA5nv7LqMy<7sHLxjMKUHY>5q5dTnaEYt4AcxlDsZk8*qzZ4u+Wl45 z(MC&-#hP=T9?kugcA^ErQ(=Fk4HiYMSx$Qz%f5Ev+^V#1tNOp{7G{eYG)^13qw1EJ zrd3>vn&X_7G?!KF#Pn6^?q>Bv-4^20%2Y0mn*I)JtkX)foT?K}4Z2g^mK>KA>6|L{ zf4VIw|0Qs0)Qopno6V}xIM!;9{x6klR&DCPy5U-DF?ThZdYS?cG>39QzbNoA#bTKP zW@0O>!5pjy$$bI?7GI)}c@FO9^EHCM1+bXUnz#J7CoAV|TLKiZ#sI%zJh;uZyXyVa<)JJE#(FuT-Zn)=IiFY%gK$>Gp2!%A&G3oG!g9a~)?8XAvCj zG4MQN$=&J2A}pPhUA1A!sp4?Z>B627!?0U9@t=v3CCnPIopjmHHmG{udm@&&@$PN3 z?&fY|Ih`k05OF~BLpRI9H*o*A;?+s4KwHng(G#PTk#G^Cvusc5Z*fFy2hN;^x@pa(TvHLGKM1AQFfPvl%=JHSUEaP8es8#Pxdtb1~#h&9$q!mqjUgcG+m- zFzc`Y+fI*Kgs}N46 zI%@KdqqVQmAVMtYs58eDCG!-{Cv-i5UwhJb0K`dws>It+J+J^1pAr%U>m*86H$(Sj z+bvKTmZ0A&4{Qx=QXx?#9uxxg9HMKCd>8pNT&JJoEN_98Ops8ffePLykdrVfKlD&j zc$b+8-tHhrjdP`nhGd$%^_HS@A&)GU4sO91ILrncB*^ptj4iR;0d0VicW=x8k0ds~ z09jV*0N8jfR>RuCq(p>k%5&uPPxspT5}16XOlwNhy_oR=`fv~l1ohHQg7NJQxafFg zhSe}&%#QHYeH1Bc!y(mo689O*{CLqO=?P71UW6e*w)c+Sf`I_x4f&Tn;Eb3A;DFWe zrmpx~J-3g(t2a7QMk38|)CQhc-%@^E3+r@Z+itAF)_KOVD>5xXS{Lj_wz(bFn>S7jQt?-496LJd%`$<@YznVEd)W29DdE zq}9O0;3nLyRcJBpvJ|5xcZkAi9Qh>;RN^WM-x*T8$leD`4E$Y4M^ z+xN2irQXe&VuZ-7%{O(Rn$XA|%$0!V&8g$w4{0l3Ca>CPi)I|_^Udrp@QTg#r_s)y zcFvQJ4>?B64F-{J`iRSS;Vs*4HA=?4w|%1q9OHmH&Qf_xAhCef?+(}aVJ0cL@6jvg zUpRk0#nqA4-KSXl_iQLZ*TEFXzEog}gIQ3BfYvB9q}tpo|31&B0}QG$q+OboNLN=# z!oC=Ubd;A%9p?2i>86OWvGG4=LzZyK3jcsCN<5EYn_J^c&Z$Fij_Xd#G9=K}?=c8R z*>&RQKc_O3aLxm1JNueNtsQ}EVA$>)=(3idxRJA5A$c2xV3RjQ8+VgvqQKBXFL~(T zKw07QyTm?C59H*6t|rXyTm~dgdB>koJqmD;OXrsCF21$zO6(Npo!67nqxSd}Jv2D^}S z_x5kXlp18@MN>_aF6<#45nd2O2Q&m6ZK085@WJB2SqzQX+HKHzU-O(89%Gi!?S<^^ zEHi;oui2)EHO>Yn^sxy!jt2iumQgl2O%?Hb9Rh1xIIsYgxTB+kbR3+lLm9}wZNg=k zXr~LhTiP=pVqqL!dQuz8eo$v{rHMq*Z>SSow6IuO^ovm?aO`#H?e<@B*Xf(jbDK$I zQb4ZP_jq1cDxWtSmkyJtU6g8PPQ!FGrev=dr70$99DL`vj}0q0&eMjFNZX^l&~%$h zN~@)eTJ#l}3)nr4StCZ6X9wt&0hI3>GE#qKXc+Ujjs9X^>bnTsyNdJ%lH zqO^UJ5=v1c&ewJ=-9Ni8o>xkkD7re8K0Q|NYXf6si0v!5Dd*m(v?u|gXfhY3{!-!^ zCtl=1T)_1uZpan@kO5C+F?=W{LsNq*Y&&4F4C~y*jwc}*sGaTC{V_+;-ULw((L*m# zrUmQOwgBF^fCj zWs@jiZqU0lV?U_^LKw12X&IZ)6gPM&|rCLOjS)k!oAlNZjGKZauJ8;SZUhN z4iZU?L&q4Kz`YUn`fvVfyw4DFY$#X1JEDV5-Us4Qk80ecK|j;2GW2pnE2Sq_#FP{p zJY^2`sMy^4^b7*UzP_1=q!sYx+&!XS+098%Q;*wwaq!pLj9k6iiPt*S#C7M+Q6f>2 z<-`J}V9nfWk2apU2N$dX+{8qR6uLArcLsINSQL8dkuMx870|r@@2J6Pxp)b5`0ie# zN_BC_i5)w_#1LG*rW0g0eEUGFrQx_dc(|RgSCnyYMCXZQk+#*E67Fu3i7O3fR|3VkHsUFmA&h}O(ty+W&WL^b zG<~zZPvjmxP_;KSI>u&1Xu9un=sLuzH`v>I+{#et5(-7YUZ}o>T1lkY%_?-uo zm8B5!kWb`8%5C;Kz^%qdQ$9NK(dQ2st??m6V7kUfS3Y|3Atf79vLPj5z{ZC(){qJf zsnC!LY{#Y#d2LfF2oy@EC9SliNK1;eq)1EJY)Q$sytXYB+ESq{{cKBxw!F42ukA?5 zjTvn#{wcDc=-jG7ER%CoA2PCGN?UTRuASA=TSbvMoj0@>|>fCQY}cY3>VsND=$?j{Me@ zR=QH8D~)xfdRL0{q%S=w(vwztQlu}v?n{xrRPRf7`%<#c@2l6iLY-M2ebnWnB_AF6 zkdn;u`t^pCY}pSfDKk-*nP9wr`e;iLUXDJa_Txh;$h_2LUg|O-4Bd|psUY)G???w_ zUKq6>AJSM?UfY$DGB5R>6p?wU_vB$;1|}0ymkDVwLDNS=KAQ4jf9uFcPd>PnhRjk! zW~ot^dUdHMlhu&PYSg7(Q;Nt;G-M17S=$ElR-*N~e6;1GD<534DRbRyNJ-v6^dTjg z_t1w_;0@GoHl>2Rw%L@DO)1%wl1*Du-XL_{6uNH8GB#xyn?lgdmQ;{2G-WlLvW(5P zRFKtd3VAnW8JitxT2`|u#NO;ky^hqAC2k7ow`3+-vK%d0fR;>1t0BM1OtfSsS~3$Y znTb}zmh8y~kFOV`#}3TGIKJbiO5>ZwW=V zWQ(?hB3oVioAgBpvn7Pt>PefjeOp4NZ5eyJX1{T-+p_#^8FX7l+m?Z~Wu@CqsV58G zmhrV^tF>h)ZCQ@C45ckYY0Gl7WkhWmQCrrbE%ezI`fSTqYs>iBGO%`6O3F~$!dOJ4 z>Wid9jJ75p@{mtM`rru>NvAI&PG3ZvzQ{K8Aw^_WIx-ZIZRn#TA5v0A)R7U1Y||In zrZ2KhUt}Bluoa|f*@hyV&__c)xI$NUxQHY4ArFO!x$n%wkU>KYN6EF>puG#+yu8CF;SUQ{B#>u*fREGzkrb z4H;LyD7I$ei1est{B_~o19}i764oCl3o4b;b!p2Oaf8T6iqIVOg2o5G>}y{OO4A7| zjQJW~oF|kl+1O>OzGgHe>SB(B+7cwxM%}$1LA=l@*qGL3p|7{M~{|3{wB785-^$|wFKZ?auGtZiE^ zHp43~YN^vq?MyA5pXOsk|FT#<_1FI=%LJy*Xocp0m$+t(AE|NIkv^QBy4R2L`L&&oH}usO#NJj(U$5ulKVf2QWe9$N}oyFOD48Eqy(&nlz|}(fWnW5OZYH z*2cg4-ewv0mR}+AbXY$&Jq z2qQ@U=zzkZ!+X73gxhjv$l+9Cwa>&MewZa4=k{Pse(D!k3kvYvejk}oDE;xO@fuTwix{U zUbbU|c#DIs2jCDxSr&4+GxA#?2|uJ-Zd6-|4MhxuF-g$m!FN#aIl|cT)HO~M8yje=*SCW z$&bq5gBw7r9EI$NF0gN_Jm)DRqM|AGmSRXe_f^o*3Q!mj=-l_5hlq8Cv2dkckh>X1hT#W!t35s6>|qLbA$DHk z6kdfmg`c%0`Cj7-K9{mgY9Tx*qpqc3gqbX#%TtV}%AzJr3KiE5SdqnJr6c~X=&T0f z>jTC4p$U#4o`1F@$%%eN$os86fXp1 z77JCqTBEB~fr#msdP8xB7g;NwOKTqYb;P-nBrYpX9HJ_*UpF~>V{<-X&je2&j~RAc z;e^clZdotT6_@4Rf>@uYH)(;49RT#e`puEl|mQY6o!~F4&5H zo&CtBMp@8|!Mlu_1^mk5I0`%~#)1LuPWIZkaDLwca#_&Hk&YD&T9EGX?TNa5`5Y$h z!$oES{+n9ivpK`d6;5awHNxCJM%`f0@NPFnB*Vg?TLA6Fi;^NMKwt}ZID>BW*#W}g zU0Ppzo9PF_5W|xN+!_lkqt};wDq=VsZw~O*#O^MZ=PWTv98Q2-KD?WEZZ}(SPVNl5 zI57j?AsRsl0`1X0`qYLQiZnroz^B+i#1W};n~5orL;!iuj|FcGK$*i2a4~ndb`@f{ z0UYlubb$_FfUkfpF~W%jSw4=86u3y=)4OQjt}E&iZbtMY4E^aEBII;O zf(m`mSU&)N2{mSEbZ0E&D?0c3mB?~mJPH0&H`eW4|J19&S{tpJAEin*yvR+m8&zzxy`joLV9FfJ4Bd`WBw zPyV$jx%DFtTTzs?0|nVgYC!I(UD%apY|;z%8KiT9LyR3hvEY{jKaxjng^)rnBTzR& zSoECJJi+}IwBgyrn0yD2)NJv`4)u(*Z4!FkpZV*wMgF&`A-$b;dBLpAyCfMIUQ2QD zT;gONc7@PgTF952kd9Hp&%!8$5Hi)!e`O_3u^sc9p@Zy%+5%DFR<^)E=(N!mPSX$h z`d1S4%K6;vjle<&Z^e6)rl8x*MG8b7ioT#cU)Gh(pr<5r&S)j?yd<9(2%@Z1EnK}( z&sE5%m*a{61Pz8aHx3}0MV^Bdbqu0J+7%Yo@)ta@kROHicAd!u5)0r=CtDDCeAT38 zV+1iH>zU?^$4Nm}DDgFxf{#cHuDU+5gzDUV>Acb+R=Qp31roy(J5XpO!bv!Mo{uXD zdQM|G;3bmb;B!vpoeoP&>{q z*Rlnl4D4#Sb{XFU;P3}!vy>5AqVv>=G>E_&jHf3T)Ewk&HACv&Sas^^UuURx5LSv}NeHSz6Qui5`On)qh_xYhiu zCjRfM1pACqKJU_jQvU7u>g=1zX}i-LPg|qO?A5oo!`}~v(@A6U?davOtdwuDZ&@KL zS@%hmd}(AQUHpxe$1*ZMvIp%ysy-bG>L=9cEw-4JwCT079{t}(WByAJU%K^Qs<$O_ zPeOPPZvi9DWV^uWf~&|8Nj!+QDC4?}h&j$ehrE!$LlcblToTjp6FQ_J)oWPHfP(BA zgaMWtWc&%4AJ~!sgN>9%rv~1oHIezVip&^;0usOmGLh|@XnG`Gk1ySF0c9qr)fTy@ zlT;Fs2WBmTm$%(%PEulTiQXy~;Hbkctt%Y$IZZ2T-8lT4`%_gTxbBnK$8z30TmlS% zcwubcnhUJiR1%j38nPr+M$EB%7x(ehX1>@td;)tqacWFsGsnnj$uYxx6eFk%ec~bR z{JrEj2z_&=^hJ7O70!MH*eV_uWD+P_aiNLnzdCJRubbR+55cWgZ?4 z%cUo9X%sLc@g_@PizqIe#u{zgE0+@v%A@p(vIPr+2TI}8dIN-XzHO`OJbW;$G(%vB z&EJgx5jxM7sXg}c^^j!i^qY*pb|2SFH;rgVZUj=x>M`=zSolEJ=dm#@TzKH6M=9-dYk4hCqsTOG(BNx zN9W#Ho_&s=T-^CN!iF6yN7ddLv*TTpja+VVkEquID`ramB%!rDVHYd$vTU=E@qmmJaDbuHFbSN6+xRlSl1&;A zM8f{5g3ps`5J$wEsu1$SEk`ybhY0kgcij3^t zk9)#<9ht+0nLjR2TFeXBCw5RGoA=u3E~^~TK$YVyF6hYd5ou9o#aXPUls7rTiM#|_ zwx@UQGF&8KR9t02cR>muckQw_IY0i$sVo;i`|4%Jpc9vY8W?LXY4-w|#gM^s;W=($ z4^fFo&NiuL<&oQhbmlCeK}rzv9b{3E#j_;mz);3`HqBdZG+>RYxv+Q;q@U#oJ6RWjVJ?=_X(f`{HX{Ps@vhj(&mI zuO;gao&OT#<;1KdTF-*a+b4UddC}kyeX^qZmz+TZfH;pD8Zje9c9dy?3~$#Py$B7R zUNH>vF0&0)j6BW~J`AG(ux-OM0jv!5bsW>L2qi`83pKq%es#K9d5f>ixo9`D5|ldc{!Yz7_)^?~mpGh+~PXA$ohT6HEUl&6Z#cRGd%+X_^Q_Xa;f)U z)!Ich#>Bu;iuub3{!)g|JaARX6o$k_oOFk?cJuYEtu4E$`t0Yiqo@YTa~Ro;Tl4U%j23yy*^3+KuMh^Yinw^UKNgWtp`# zOCaG-s$L7jEOE3?5b8fF6RSV%$lUBbp?J-Bm{q_2Y@P&P;RBx3?pB+>j3-Ile%P&K$wUL&FGzl8*Ex7pE5|d2v9gXo_@ap>FHC`|O;RG>CKk9dC&l%f5>{)?$pb(dY zaJ$UBHb@Q@)E@C#$nJ+yjqVqb=FGuPF@;~OxlOK3%TUs~#l{>UF@297YxEc%OYqla zrHK^Z3Zg3AwQY5H2d29!WXqkAJA>#Wd``@3b9~1;0DIuQaI{5Iyf_Eh1%jkZkSG9Y zBt(82y^Av(ZFw2PQCaAliXghNvmN`*(xkc}2>cCn39MDlkro9(i4_~7zQ%O{cf6t^ zMfxJQcUFtiY>L&HH8d6mk?$tbYg#s^!Bn(-G#B)ws-_%xb?rm1eWsK6xRCk|u|~)# z0SE;Pg22EWY7lMd(grW5XLJjsTN*5d#53B@iz&rsSL)>TTfC=HRTst_-E#8C|A>FX z4{}Wt@@#N@MSlV@i^D_C0fb#y1`P+%{S@&TMIiUrhM;ra1o?*kz;R1Iv9H9MH z?TG%g3+P`sd}936>$CT@=gXZ7&p*=x-Q z_zmsWM;h$N;Kl*xp}>V5Jv;M(=8N``bWu$f7nC7Nj&Eb;>liIhyETe zS3IEFQxn=FUf6*{|Y~p2)hfmV~AZv09=#_%_9*S<)J3v+hc~xYi^j7K?nU2i>Q8Ks%nq z2NLvNre30DGv6Z7F3Ch|ge>l{iUDTyh5an1C3$g1oJq&~7YAvAdE+#neC6MKvT&vv z_>r(4x7)7CX~z?kr3;P)z)1cLFBnmRF0~+ljveEqe+jsrAVp@HN9O1|q|}UaP&QUU z7MjHme84;dOS$yk)20-lD@hVy&0Q!`VQb$QKF4B`T&2NtIP~c`tvIxe)_dY&A1rM) zUWT6e#@lZ9&~GGQ)Q1$$NWA(ziG3O{JwVXGsUltBol|0LBugMn#2^FnMqUShdS9LE z?M|R~pq5%9b>Uh|3pdm;-ggiya;YKHZos5nPMw8oB@%%_Lw#nImYjr`BV=zGo98Nb zlM}^$A`*{4Qi)C%8uz{WK4j;2tMfyJ+>MqM-=0gcD;`ou>~8dCqTQFXPOCP`X?g z88@>`Et+|u?~sqS`U8Wrfm9?OBR9%qdDL?^Uoiv~LsMAXK*oD`WM`myf+a}BqmT&! z-7NgdQV-kT#E&vZNn?P)2a&cIM%r63+Hx#*5sk#{39D{y0;n|BHpB$2y?mRd1EXVvo#2-h-?`}*0n}QGocU0LEMtM62BSwc8(ahfCWn+zKlX6*^+PMPN`jS* z$BzFD>i570?m8zfp-oG7w|is<#l1d;-nZf$MKRrT_OrL0P9XekZB`Qw(y+McN#0#xS%kueINN{no~5^xdmP`z_ysr}Sj-gm&RL8?A9$fJ zy$o78e|cz=)N8EV1!MDC;;J~uXlsk)&bU*Ee)H%71EV_Og*m4m-XCt5;^Gy8+i0Gz zqU>5Ksb1u8&ellWYZw9+jDe#C{hv-lz8}baxI7hl9kO@~kF~8J4H)as0NEP(O_G3Eih6cU<#8p9=HO%( z>Edp$F2%=U>KnR-J-2(0uS3HPN$e*G=#^n|r|l-eP_82{b6yI4J3hX2?Gl+P&t`y% z(`ZF(d+82P<~n=?dyc~uJ#@1hUh#^`(~HYe zn7X+a?&yViDl=)LyfQzl2_CRZcH{4J&m`pIfc*JXmm<~&WWjL7$oYld0aFQF!cN|e zRaM-wv`G&xa2$idJ`j=7be4Nf_i@kdtkT`Zg^)q?MVL37N#I(U`2%GD_fA>kyc?yP z6*xlKQ1&$N{*+2#w_U2VF@&Y%q)iK^Z2+Z~#bzF`(!ujQ9x@AOXLdZET+31=I0>zS@)R>cS#_M!v z7efaDV$8Dea3qMcj^&Zb`J(Ru@fjKt>-j0H(Sl~+onBnrwW5#P&UGAXL|A6gO|jh1 zaeHve`7=z3W*6@A;agBAU=Cxsm&j#@sC6Cb-G=o5sOEN-93EB;_rQXA=$8UoQJVdN zX~vstU!ynDBf3RZ)a&ZB zsu+b&of_zV>|zVLiJB3CP3ca+u>YITy*oRw^WNEcDfEb0mkYcNOX2KQmlu>e^mGZn z;@RbbxNf4Yb>Zg490N9hX404KzJfr@1i%q&!jqS*1_=%}h>$d<4=VTU`~pP~iB>a; zdxqhTl|Xw~d64zE0q0@#`M#2o<8$#>wSLr`=L-(>#`@xlYRDbg3m0iEbZK>M3<1I0 zF_Q-!9t0zfITpefm7I!-#s-!_x9sFDaiuZKIu+!av|`bCa(V)kGtei7gJg==A=IW9 zk{yh9=NTBw<@Jqd*gxj)mOHRzY2@oJjeI>%BhSyoGp=L+=4!cDTeIZV)+l+kHHuzs zjZ$KJ`kMEn;S$^PzobVE*P}+>qekANMyW@Q5`s^uNA*&V>ct+_%RQrpN1QO)(JmiMTZ_o!CtQLWgc z|F4dB!qM%RBZg06Vl_mp{rkL zNs8t0>7%TK9VSsTcXWEad5_p{33(jCy@P{W#nBN7LaExZ6(2E!P@?t{Q-o3`HIhJ$ zmZ?^#QP(!Dd6dI2S-jpS+l}^&PupYN^|#B|DAcGb_KKopo!%BUS|wJ%k>*L{GcU%u z{4acDqV&-L*|rz4DE>~NC_jGYC=z*A#x|*seZNU%B1bGE>O|^eg>g)wXl5m%*CDd@ dFti&=S>zL-8<`*)*g=%G`T%y4ljH*r0RXkMhs*!~ literal 0 HcmV?d00001 diff --git a/tests/python/test_warc.py b/tests/python/test_warc.py index e69d3852..bd70da91 100644 --- a/tests/python/test_warc.py +++ b/tests/python/test_warc.py @@ -149,8 +149,7 @@ def _run_pipeline_with_skip_linearization(self) -> Dict[str, List[dict]]: skip_no_pre_taggers=False, skip_no_post_taggers=False, store_html_in_metadata=False, - linearizer_name="resiliparse", - skip_linearization=True, + linearizer_name="no-op", pre_taggers=["cc_re"], post_taggers=["lingua_1e2"], ) From 80e3c96fd9bd5f06251156dfcc18443cb08623c7 Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 14 Feb 2025 14:29:55 -0800 Subject: [PATCH 5/5] . --- tests/config/alt-path-mixer.json | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/config/alt-path-mixer.json diff --git a/tests/config/alt-path-mixer.json b/tests/config/alt-path-mixer.json new file mode 100644 index 00000000..cdcbe596 --- /dev/null +++ b/tests/config/alt-path-mixer.json @@ -0,0 +1,34 @@ +{ + "streams": [ + { + "name": "mixer-test", + "documents": [ + "tests/data/provided/alternative_term/*.gz" + ], + "document_dir":"alternative_term", + "output": { + "path": "tests/work/output/mixer", + "max_size_in_bytes": 100000 + }, + "attributes": [ + "pii", + "toxicity" + ], + "filter": { + "include": [ + "$.metadata[?(@.length < 10000)]" + ], + "exclude": [ + "$.metadata[?(@.length < 500)]", + "$.attributes[?(@.pii.too_much_pii == true)]", + "$.attributes[?(@.toxicity > 0.8)]" + ] + } + } + ], + "work_dir": { + "input": "tests/work/temp/mixer/input", + "output": "tests/work/temp/mixer/output" + }, + "processes": 1 +}