diff --git a/Cargo.lock b/Cargo.lock index c94883ac..64752c2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -682,7 +682,7 @@ dependencies = [ [[package]] name = "dolma" -version = "0.9.0" +version = "0.9.1" dependencies = [ "ahash", "aws-config", diff --git a/configs/baselines/decontamination/c4.yaml b/configs/baselines/decontamination/c4.yaml new file mode 100644 index 00000000..c175c300 --- /dev/null +++ b/configs/baselines/decontamination/c4.yaml @@ -0,0 +1,20 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + +dedupe: + name: perplexity_suite_v3_option2_redo + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 33554432 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 224 + +work_dir: + input: /mnt/tank/dolma_tmp/c4_input + output: /mnt/tank/dolma_tmp/c4_output \ No newline at end of file diff --git a/configs/baselines/decontamination/falcon-refinedweb.yaml b/configs/baselines/decontamination/falcon-refinedweb.yaml new file mode 100644 index 00000000..cb7f00aa --- /dev/null +++ b/configs/baselines/decontamination/falcon-refinedweb.yaml @@ -0,0 +1,20 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/falcon-refinedweb/v0-0.05-heldout-complement/documents/*.gz + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 33554432 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 224 + +work_dir: + input: /mnt/tank/dolma_tmp/falcon_input + output: /mnt/tank/dolma_tmp/falcon_output \ No newline at end of file diff --git a/configs/baselines/decontamination/mc4.yaml 
b/configs/baselines/decontamination/mc4.yaml new file mode 100644 index 00000000..dace598e --- /dev/null +++ b/configs/baselines/decontamination/mc4.yaml @@ -0,0 +1,20 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/mc4/en_wimbd_splits/documents/train/*.gz + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 33554432 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 224 + +work_dir: + input: /mnt/tank/dolma_tmp/mc4_input + output: /mnt/tank/dolma_tmp/mc4_output \ No newline at end of file diff --git a/configs/baselines/decontamination/pile.yaml b/configs/baselines/decontamination/pile.yaml new file mode 100644 index 00000000..397bff22 --- /dev/null +++ b/configs/baselines/decontamination/pile.yaml @@ -0,0 +1,20 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/pile/v0/documents/train/*.gz + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 33554432 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 224 + +work_dir: + input: /mnt/tank/dolma_tmp/pile_input + output: /mnt/tank/dolma_tmp/pile_output \ No newline at end of file diff --git a/configs/baselines/decontamination/redpajama.yaml b/configs/baselines/decontamination/redpajama.yaml new file mode 100644 index 00000000..c2a7116e --- /dev/null +++ b/configs/baselines/decontamination/redpajama.yaml @@ -0,0 +1,25 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=arxiv/*.gz + - s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=book/*.gz + - 
s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=c4/*.gz + - s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=common_crawl/*.gz + - s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=stackexchange/*.gz + - s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=wikipedia/*.gz + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 33554432 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 224 + +work_dir: + input: /mnt/tank/dolma_tmp/rp_input + output: /mnt/tank/dolma_tmp/rp_output \ No newline at end of file diff --git a/configs/baselines/mixing/c4.json b/configs/baselines/mixing/c4.json new file mode 100644 index 00000000..bba3db5c --- /dev/null +++ b/configs/baselines/mixing/c4.json @@ -0,0 +1,27 @@ +{ + "streams": [ + { + "name": "c4", + "documents": [ + "s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz" + ], + "output": { + "path": "s3://ai2-llm/pretraining-data/sources/c4/v0_decon_ppl_suite_v3", + "max_size_in_bytes": 1000000000 + }, + "attributes": [ + "perplexity_suite_v3_option2_redo" + ], + "filter": { + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input" : "/mnt/tank/dolma_tmp/c4_input_mix", + "output" : "/mnt/tank/dolma_tmp/c4_output_mix" + }, + "processes": 1 + } \ No newline at end of file diff --git a/configs/baselines/mixing/falcon-refinedweb.json b/configs/baselines/mixing/falcon-refinedweb.json new file mode 100644 index 00000000..109b5863 --- /dev/null +++ 
b/configs/baselines/mixing/falcon-refinedweb.json @@ -0,0 +1,27 @@ +{ + "streams": [ + { + "name": "falcon-refinedweb", + "documents": [ + "s3://ai2-llm/pretraining-data/sources/falcon-refinedweb/v0-0.05-heldout-complement/documents/*.gz" + ], + "output": { + "path": "s3://ai2-llm/pretraining-data/sources/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3", + "max_size_in_bytes": 1000000000 + }, + "attributes": [ + "perplexity_suite_v3_option2" + ], + "filter": { + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input" : "/mnt/tank/dolma_tmp/falcon_input_mix", + "output" : "/mnt/tank/dolma_tmp/falcon_output_mix" + }, + "processes": 1 + } \ No newline at end of file diff --git a/configs/baselines/mixing/mc4.json b/configs/baselines/mixing/mc4.json new file mode 100644 index 00000000..02d38b9d --- /dev/null +++ b/configs/baselines/mixing/mc4.json @@ -0,0 +1,27 @@ +{ + "streams": [ + { + "name": "mc4", + "documents": [ + "s3://ai2-llm/pretraining-data/sources/mc4/en_wimbd_splits/documents/train/*.gz" + ], + "output": { + "path": "s3://ai2-llm/pretraining-data/sources/mc4/en_wimbd_splits_decon_ppl_suite_v3/", + "max_size_in_bytes": 1000000000 + }, + "attributes": [ + "perplexity_suite_v3_option2" + ], + "filter": { + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input" : "/mnt/tank/dolma_tmp/mc4_input_mix", + "output" : "/mnt/tank/dolma_tmp/mc4_output_mix" + }, + "processes": 1 + } \ No newline at end of file diff --git a/configs/baselines/mixing/pile.json b/configs/baselines/mixing/pile.json new file mode 100644 index 00000000..1f2a0552 --- /dev/null +++ 
b/configs/baselines/mixing/pile.json @@ -0,0 +1,27 @@ +{ + "streams": [ + { + "name": "pile", + "documents": [ + "s3://ai2-llm/pretraining-data/sources/pile/v0/documents/train/*.gz" + ], + "output": { + "path": "s3://ai2-llm/pretraining-data/sources/pile/v0_decon_ppl_suite_v3", + "max_size_in_bytes": 1000000000 + }, + "attributes": [ + "perplexity_suite_v3_option2" + ], + "filter": { + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input" : "/mnt/tank/dolma_tmp/pile_input_mix", + "output" : "/mnt/tank/dolma_tmp/pile_output_mix" + }, + "processes": 1 + } \ No newline at end of file diff --git a/configs/baselines/mixing/redpajama.json b/configs/baselines/mixing/redpajama.json new file mode 100644 index 00000000..d91857b8 --- /dev/null +++ b/configs/baselines/mixing/redpajama.json @@ -0,0 +1,32 @@ +{ + "streams": [ + { + "name": "redpajama", + "documents": [ + "s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=arxiv/*.gz", + "s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=book/*.gz", + "s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=c4/*.gz", + "s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=common_crawl/*.gz", + "s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=stackexchange/*.gz", + "s3://ai2-llm/pretraining-data/sources/redpajama/v1/documents/split=train/dataset=wikipedia/*.gz" + ], + "output": { + "path": "s3://ai2-llm/pretraining-data/sources/redpajama/v1_decon_ppl_suite_v3", + "max_size_in_bytes": 1000000000 + }, + "attributes": [ + "perplexity_suite_v3_option2" + ], + "filter": { + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] 
&& @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input" : "/mnt/tank/dolma_tmp/rp_input_mix", + "output" : "/mnt/tank/dolma_tmp/rp_output_mix" + }, + "processes": 1 + } \ No newline at end of file diff --git a/configs/baselines/tokenization/c4.yaml b/configs/baselines/tokenization/c4.yaml new file mode 100644 index 00000000..10ab2ea9 --- /dev/null +++ b/configs/baselines/tokenization/c4.yaml @@ -0,0 +1,9 @@ +destination: s3://ai2-llm/preprocessed/c4/v0_decon_ppl_suite_v3/gpt-neox-20b-pii-special +documents: +- s3://ai2-llm/pretraining-data/sources/c4/v0_decon_ppl_suite_v3/*.json.gz +processes: 224 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special +work_dir: + input: /mnt/tank/dolma_tmp/c4_input_tokenized + output: /mnt/tank/dolma_tmp/c4_output_tokenized \ No newline at end of file diff --git a/configs/baselines/tokenization/dolma_v1_5_cc_only.yaml b/configs/baselines/tokenization/dolma_v1_5_cc_only.yaml new file mode 100644 index 00000000..1fc4282d --- /dev/null +++ b/configs/baselines/tokenization/dolma_v1_5_cc_only.yaml @@ -0,0 +1,11 @@ +destination: s3://ai2-llm/preprocessed/olmo-mix/v1_5_cc_only/gpt-neox-20b-pii-special/ +documents: +- s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.json.gz +- s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.json.gz +- s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.json.gz +processes: 224 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special +work_dir: + input: /mnt/tank/dolma_tmp/v1_5_cc_only_input_tokenized + output: /mnt/tank/dolma_tmp/v1_5_cc_only_output_tokenized \ No newline at end of file diff --git a/configs/baselines/tokenization/falcon-refinedweb.yaml b/configs/baselines/tokenization/falcon-refinedweb.yaml new file mode 100644 index 00000000..f40d22bf --- /dev/null +++ b/configs/baselines/tokenization/falcon-refinedweb.yaml 
@@ -0,0 +1,9 @@ +destination: s3://ai2-llm/preprocessed/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/gpt-neox-20b-pii-special +documents: +- s3://ai2-llm/pretraining-data/sources/falcon-refinedweb/v0-0.05-heldout-complement_decon_ppl_suite_v3/*.json.gz +processes: 224 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special +work_dir: + input: /mnt/tank/dolma_tmp/falcon_input_tokenized + output: /mnt/tank/dolma_tmp/falcon_output_tokenized \ No newline at end of file diff --git a/configs/baselines/tokenization/mc4.yaml b/configs/baselines/tokenization/mc4.yaml new file mode 100644 index 00000000..1a28908d --- /dev/null +++ b/configs/baselines/tokenization/mc4.yaml @@ -0,0 +1,9 @@ +destination: s3://ai2-llm/preprocessed/mc4/en_wimbd_splits_decon_ppl_suite_v3/gpt-neox-20b-pii-special +documents: +- s3://ai2-llm/pretraining-data/sources/mc4/en_wimbd_splits_decon_ppl_suite_v3/*.json.gz +processes: 224 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special +work_dir: + input: /mnt/tank/dolma_tmp/mc4_input_tokenized + output: /mnt/tank/dolma_tmp/mc4_output_tokenized \ No newline at end of file diff --git a/configs/baselines/tokenization/pile.yaml b/configs/baselines/tokenization/pile.yaml new file mode 100644 index 00000000..c6c387cd --- /dev/null +++ b/configs/baselines/tokenization/pile.yaml @@ -0,0 +1,9 @@ +destination: s3://ai2-llm/preprocessed/pile/v0_decon_ppl_suite_v3_fixed/gpt-neox-20b-pii-special +documents: +- s3://ai2-llm/pretraining-data/sources/pile/v0_decon_ppl_suite_v3/*.json.gz +processes: 150 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special +work_dir: + input: /mnt/tank/tmp/pile_v0_decon_ppl_suite_v3_fixed_input + output: /mnt/tank/tmp/pile_v0_decon_ppl_suite_v3_fixed_output \ No newline at end of file diff --git a/configs/baselines/tokenization/redpajama.yaml b/configs/baselines/tokenization/redpajama.yaml new file mode 100644 index 00000000..f0379dce 
--- /dev/null +++ b/configs/baselines/tokenization/redpajama.yaml @@ -0,0 +1,9 @@ +destination: s3://ai2-llm/preprocessed/redpajama/v1_decon_ppl_suite_v3/gpt-neox-20b-pii-special +documents: +- s3://ai2-llm/pretraining-data/sources/redpajama/v1_decon_ppl_suite_v3/*.json.gz +processes: 224 +seed: 3920 +tokenizer_name_or_path: allenai/eleuther-ai-gpt-neox-20b-pii-special +work_dir: + input: /mnt/tank/dolma_tmp/rp_input_tokenized + output: /mnt/tank/dolma_tmp/rp_output_tokenized \ No newline at end of file diff --git a/configs/dolma-v1_5/README.md b/configs/dolma-v1_5/README.md new file mode 100644 index 00000000..aff3e5be --- /dev/null +++ b/configs/dolma-v1_5/README.md @@ -0,0 +1,3 @@ +# Dolma 1.5 + +This directory diff --git a/configs/dolma-v1_5/decontamination/README.md b/configs/dolma-v1_5/decontamination/README.md new file mode 100644 index 00000000..f45c9520 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/README.md @@ -0,0 +1,96 @@ +# Decontamination Runbook + +## Step 1: Create decontamination bloom filter + +> Okay I think every thing is ready for decon testing now. The finalized ppl suite v3 is in `s3://ai2-llm/eval-data/perplexity/v3/`. And here is my proposed plan for decon testing if you agree and it's not too much compute. The following is the sequence of things to try. At each step if the document removal rate is >0.1% or so we back off to the next step and hope the remove rate is lower: +> +> - **Option 1** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2 (`s3://ai2-llm/eval-data/perplexity/v2/`) for full backwards compatibility. +> - **Option 2** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2-small (`s3://ai2-llm/eval-data/perplexity/v2_small/`) for at least full backwards for the in-loop metrics the model team was using. 
+> - **Option 3** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + a subset of PPL Suite v2-small requested by Dirk and Iz (`s3://ai2-llm/eval-data/perplexity/v2_small/c4_en/`, `s3://ai2-llm/eval-data/perplexity/v2_small/pile/`, `s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc/`, `s3://ai2-llm/eval-data/perplexity/v2_small/ice/`) +> +> Let me know if you disagree with any of this or if there's any thing I can do to help run the decon trials! + + +### Step 1.1: copy data locally + +We copy data locally since the directory structure of the eval data in S3 is slightly different from the one we need. +In particular, we need all documents to be under `documents/` directory. + +```bash +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2 $HOME/perplexity/v2/documents +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small $HOME/perplexity/v2_small/documents +aws s3 sync s3://ai2-llm/eval-data/perplexity/v3 $HOME/perplexity/v3/documents + +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/c4_en $HOME/perplexity/v2_small_subset/documents/c4_en +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/pile $HOME/perplexity/v2_small_subset/documents/pile +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc $HOME/perplexity/v2_small_subset/documents/m2d2_s2orc +aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/ice $HOME/perplexity/v2_small_subset/documents/ice +``` + +### Step 1.1b: change type of IDs in v3 subset (TEMPORARY FIX) + +v3 accidentally contains ids that are integers instead of strings. Until that's fixed, run: + +```bash +python config/dolma-v1_5/decontamination/fix_ids_type.py +``` + +### Step 1.2: tag out paragraphs by uniseg length + +For dolma, we want to decontaminate against paragraphs that are at least 13 uniseg words long, +so we need to compute their length first. 
+ +```bash +dolma tag --documents "${HOME}/perplexity/v2/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +dolma tag --documents "${HOME}/perplexity/v2_small/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +dolma tag --documents "${HOME}/perplexity/v3/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +dolma tag --documents "${HOME}/perplexity/v2_small_subset/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188 +``` + +### Step 1.3: filter out paragraphs that are too short + +After tagging, we can filter out to make option 1/2/3. + +```bash + +dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml mix +dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml mix +dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml mix + +``` + +### Step 1.4: create bloom filter + +First, we cat the contents of each dataset to get number of documents: + +```bash +zcat $HOME/perplexity/option1/documents/* | jq '.text' -cr | wc -l +>>> 3681169 +zcat $HOME/perplexity/option2/documents/* | jq '.text' -cr | wc -l +>>> 2336120 +zcat $HOME/perplexity/option3/documents/* | jq '.text' -cr | wc -l +>>> 2020471 +``` + +We use this numbers in the config files at `bloom_filter.estimated_doc_count`. For all three options, we set a `bloom_filter.desired_false_positive_rate` of 0.00001. 
+ +```bash +dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml dedupe +``` + +## Step 2: Run decontamination + +Tag content for Dolma V1.5 for decontamination: + + +```bash +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml dedupe +dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/wiki.yaml dedupe +``` diff --git a/configs/dolma-v1_5/decontamination/fix_ids_type.py b/configs/dolma-v1_5/decontamination/fix_ids_type.py new file mode 100644 index 00000000..69bd4dbd --- /dev/null +++ b/configs/dolma-v1_5/decontamination/fix_ids_type.py @@ -0,0 +1,33 @@ +import argparse +import json +from dolma.core.paths import glob_path +import tqdm + +import smart_open + + +def fix_path(p: str): + with smart_open.open(p, 'rt') as f: + data = [json.loads(line) for line in f] + + with smart_open.open(p, 'wt') as f: + for d in data: + if 'id' in d: + d['id'] = str(d['id']) + f.write(json.dumps(d) + '\n') + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('path', nargs='+') + args = ap.parse_args() + + with tqdm.tqdm(desc='Files') as pbar: + for p in args.path: + for sp in glob_path(p): + fix_path(sp) + pbar.update() + + +if __name__ == '__main__': + main() diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml 
b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml new file mode 100644 index 00000000..2dcb5c1b --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml @@ -0,0 +1,86 @@ +streams: + - name: "v2" + documents: + - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option1/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: 
&span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - 
${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml new file mode 100644 index 00000000..37b7be6d --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml @@ -0,0 +1,86 @@ +streams: + - name: "v2_small" + documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - 
${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option2/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - 
${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml new file mode 100644 index 00000000..4f912a2b --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml @@ -0,0 +1,70 @@ +streams: + - name: "v2_small_subset" + documents: + - 
${oc.env:HOME}/perplexity/v2_small_subset/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small_subset/documents/pile/test/*.gz + + output: &output + path: ${oc.env:HOME}/perplexity/option3/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: &attributes + - uniseg_length_paragraphs_with_empty_v1 + - not_alphanum_paragraph_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" + - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct + min_score: 0.5 + replacement: "" + + - name: "v3" + documents: + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz + - 
${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz + + output: *output + attributes: *attributes + span_replacement: *span_replacement diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml new file mode 100644 index 00000000..9ef386d5 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml @@ -0,0 +1,41 @@ +streams: + - name: "v2_small" + documents: + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz 
+ - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz + - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz + + output: + path: ${oc.env:HOME}/perplexity/ppl_v2/documents + max_size_in_bytes: 500000000 + discard_fields: + - attributes + + attributes: + - uniseg_length_paragraphs_with_empty_v1 + + span_replacement: &span_replacement + - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph + min_score: -12 + replacement: "" diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml 
b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml new file mode 100644 index 00000000..f2b21ea5 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml @@ -0,0 +1,17 @@ +documents: + - ${oc.env:HOME}/perplexity/option1/documents/*.gz + +dedupe: + name: perplexity_suite_v3 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 3686676 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option1.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml new file mode 100644 index 00000000..9936ce75 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml @@ -0,0 +1,17 @@ +documents: + - ${oc.env:HOME}/perplexity/option2/documents/*.gz + +dedupe: + name: perplexity_suite_v3 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: false + estimated_doc_count: 2337305 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml new file mode 100644 index 00000000..9d617645 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml @@ -0,0 +1,17 @@ +documents: + - ${oc.env:HOME}/perplexity/option3/documents/*.gz + +dedupe: + name: perplexity_suite_v3 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: 
true + +bloom_filter: + read_only: false + estimated_doc_count: 2021613 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option3.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml new file mode 100644 index 00000000..c5d55bfc --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml @@ -0,0 +1,19 @@ +documents: + - ${oc.env:HOME}/perplexity/ppl_v2/documents/*.gz + +dedupe: + name: perplexity_suite_v2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + + +bloom_filter: + file: s3://ai2-llm/bloom-filters/perplexity-suite-v2-8M.bin + size_in_bytes: 8388608 + read_only: false + estimated_doc_count: 3898706 + desired_false_positive_rate: 0.001 + + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml new file mode 100644 index 00000000..af0eca22 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml new file mode 100644 index 00000000..1e2f2848 --- /dev/null +++ 
b/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml new file mode 100644 index 00000000..15d88a17 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml @@ -0,0 +1,20 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.gz + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.gz + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.gz + # - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/cc_en_middle-0954.json.gz + # - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-1690.json.gz + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml new file mode 100644 index 00000000..6381098c --- /dev/null +++ 
b/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml @@ -0,0 +1,36 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz + - 
s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml new file mode 100644 index 00000000..bea2e6d8 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml @@ -0,0 +1,17 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml new file mode 100644 index 00000000..2e9291cd --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml @@ -0,0 +1,367 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/abap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/actionscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ada/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ags-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/alloy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ampl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/antlr/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apacheconf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/api-blueprint/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/applescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arduino/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asciidoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/aspectj/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ats/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/augeas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autohotkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autoit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/awk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/batchfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/befunge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bison/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bitbake/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzmax/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bluespec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/boo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brainfuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brightscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c2hs-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cap'n-proto/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cartocss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ceylon/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chapel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cirru/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clarion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/click/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clips/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clojure/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cmake/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cobol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion-cfc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/common-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/component-pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coq/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/creole/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/crystal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/csound/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/css/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cucumber/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cuda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cycript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cython/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/d/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/darcs-patch/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dart/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/desktop/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/diff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/digital-command-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dns-zone/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dockerfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dogescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dylan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eagle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecere-projects/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/edn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eiffel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elixir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emacs-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emberscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/erlang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/f-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/factor/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fancy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fantom/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fish/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/flux/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/forth/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fortran/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/freemarker/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/g-code/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gams/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gdscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/genshi/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-ebuild/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-eclass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gettext-catalog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glyph/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gnuplot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/go/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/golo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gosu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grace/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grammatical-framework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphviz-(dot)/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haml/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/handlebars/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/harbour/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haxe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hlsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+django/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+eex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+erb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/http/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idris/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/igor-pro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inform-7/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ini/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inno-setup/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/io/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ioke/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/irc-log/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/isabelle/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jade/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jasmin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/javascript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jflex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jsx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/julia/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jupyter-notebook/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kicad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kotlin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/krl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/labview/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lasso/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/latte/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/less/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lfe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lilypond/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/linker-script/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/liquid/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/livescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/llvm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logos/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logtalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lolcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lookml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lua/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m4/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/makefile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mako/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maple/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/markdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mask/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mathematica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/matlab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/max/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maxscript/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mediawiki/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/metal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mirah/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/modelica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/module-management-system/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/monkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/moonscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/muf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mupad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/myghty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nesc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlinx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlogo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nginx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nimrod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ninja/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nix/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nsis/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/numpy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objdump/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ocaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/octave/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/omgrofl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ooc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opa/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opencl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/openscad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/org/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ox/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oxygene/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oz/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/papyrus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-assembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-internal-representation/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl6/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/piglatin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pike/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pogoscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pony/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/postscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pov-ray-sdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/powershell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/processing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/prolog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/propeller-spin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/protocol-buffer/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pure-data/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purebasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python-traceback/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/r/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/racket/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ragel-in-ruby-host/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/raml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rdoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/realbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rebol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/red/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/redcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ren'py/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/renderscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/restructuredtext/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rhtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rmarkdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/robotframework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rouge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ruby/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rust/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sage/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/saltstack/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scheme/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scilab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/self/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shellsession/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shen/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slash/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slim/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smali/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smalltalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smarty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/solidity/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sourcepawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sparql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sqf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/squirrel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/standard-ml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stata/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ston/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stylus/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/supercollider/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/swift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/systemverilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcsh/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tea/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/text/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/textile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/thrift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/toml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turtle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/twig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/txl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/typescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unified-parallel-c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unity3d-asset/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/uno/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unrealscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/urweb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vcl/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/verilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vhdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/viml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/visual-basic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/volt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vue/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/web-ontology-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webassembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webidl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/wisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/x10/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xbase/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xojo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xpages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xproc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xquery/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xs/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xslt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xtend/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yacc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yaml/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zephir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zimpl/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml new file mode 100644 index 00000000..5022d512 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml @@ -0,0 +1,132 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=af/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ak/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ang/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ar/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=as/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ast/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=az/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ba/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=be/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bn/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=bs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ca/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ch/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=co/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=cs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=cv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=cy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=da/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=de/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=el/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=eo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=es/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=et/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=eu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=fy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=gl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=gn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=got/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=gu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=he/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=hy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ia/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=id/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ie/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=is/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=it/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ja/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ka/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=kk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=km/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=kn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ko/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ku/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ky/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=la/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=lb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=li/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ln/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=lt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=lv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mi/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ml/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=mr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=my/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=na/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=nah/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=nds/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ne/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=nl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=no/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=oc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=pa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=pl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ps/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=pt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=qu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=rm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ro/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ru/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=se/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=shn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=si/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=su/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=sw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ta/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=te/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=th/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=tt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ug/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=uk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=ur/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=uz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=vi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=vo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=wa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=xh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=yo/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=za/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=zh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=zh_min_nan/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=zu/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml new file mode 100644 index 00000000..937e36ba --- /dev/null +++ b/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml @@ -0,0 +1,329 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ady/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=af/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ak/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=als/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=am/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ami/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=an/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ang/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ar/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=arc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ary/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=arz/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=as/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ast/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=atj/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=av/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=avk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=awa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ay/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=az/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=azb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ba/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ban/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bar/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bat_smg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bcl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=be/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bjn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=blk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bpy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=br/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bug/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=bxr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ca/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cbk_zam/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cdo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ce/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ceb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ch/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=chr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=chy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ckb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=co/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=crh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cs/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=csb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=cy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=da/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dag/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=de/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=din/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=diq/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dsb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dty/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=dz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ee/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=el/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=eml/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=eo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=es/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=et/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=eu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ext/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ff/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fiu_vro/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fj/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=frp/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=frr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fur/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=fy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ga/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gag/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gan/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gcr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=glk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gom/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gor/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=got/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=guw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=gv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ha/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hak/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=haw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=he/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hif/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hsb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ht/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=hyw/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ia/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=id/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ie/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ig/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ik/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ilo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=inh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=io/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=is/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=it/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=iu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ja/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=jam/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=jbo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=jv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ka/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kaa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kab/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kbd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kbp/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kcg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ki/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kl/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=km/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ko/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=koi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=krc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ks/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ksh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ku/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=kw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ky/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=la/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lad/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lbe/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lez/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lfn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=li/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lij/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lld/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lmo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ln/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lt/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ltg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=lv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mad/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mai/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=map_bms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mdf/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mhr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=min/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ml/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mni/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mnw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mrj/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mwl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=my/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=myv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=mzn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=na/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nah/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nap/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nds/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nds_nl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ne/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=new/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nia/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=no/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nov/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nqo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nrm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nso/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=nv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ny/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=oc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=olo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=om/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=or/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=os/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pag/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pam/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pap/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pcd/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pcm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pdc/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pfl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pih/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pms/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pnb/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pnt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ps/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=pwn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=qu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rmy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ro/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=roa_tara/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ru/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rue/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=rw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sah/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sat/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sc/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=scn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sco/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sd/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=se/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=shi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=shn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=si/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=skr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sm/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=smn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=so/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=srn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ss/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=st/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=stq/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=su/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sv/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=sw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=szl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=szy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ta/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tay/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tcy/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=te/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tet/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tg/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=th/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ti/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tl/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tn/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=to/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tpi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tr/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=trv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ts/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tt/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tum/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tw/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ty/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=tyv/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=udm/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ug/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=uk/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ur/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=uz/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=ve/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vec/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vep/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vls/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=vo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=wa/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=war/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=wo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=wuu/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=xal/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=xh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=xmf/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=yi/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=yo/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=za/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zea/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh_classical/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh_min_nan/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zh_yue/*.gz + - 
s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=zu/*.gz + + +dedupe: + name: perplexity_suite_v3_option2 + paragraphs: + attribute_name: bff_duplicate_paragraph_spans_decontamination + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 488541 + size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + +processes: 188 diff --git a/configs/dolma-v1_5/decontamination/tokenize_v3.sh b/configs/dolma-v1_5/decontamination/tokenize_v3.sh new file mode 100644 index 00000000..83dbc94e --- /dev/null +++ b/configs/dolma-v1_5/decontamination/tokenize_v3.sh @@ -0,0 +1,35 @@ +#! /usr/bin/env bash + +datasets=( + '4chan_meta_sep' + 'c4_100_domains' + 'c4_en' + 'dolma_100_subreddits' + 'dolma-v1_5' + 'falcon-refinedweb' + 'gab' + 'ice_fixed' + 'm2d2_s2orc_unsplit' + 'm2d2_wikipedia_unsplit' + 'manosphere_meta_sep' + 'mc4' + 'pile' + 'ptb' + 'redpajama' + 'twitterAAE_HELM_fixed' + 'wikitext_103' +) + +splits=( + 'test' + 'val' +) + +for dataset in "${datasets[@]}"; do + for split in "${splits[@]}"; do + dolma tokens \ + --documents "s3://ai2-llm/eval-data/perplexity/v3_small/${dataset}/${split}" \ + --destination "s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/${dataset}/${split}" \ + --tokenizer 'allenai/eleuther-ai-gpt-neox-20b-pii-special' + done +done diff --git a/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh b/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh new file mode 100644 index 00000000..cdcb36d3 --- /dev/null +++ b/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh @@ -0,0 +1,29 @@ +#! 
/usr/bin/env bash + +datasets=( + 'c4_en' + 'dolma_books' + 'dolma_common-crawl' + 'dolma_pes2o' + 'dolma_reddit' + 'dolma_stack' + 'dolma_wiki' + 'ice' + 'm2d2_s2orc' + 'pile' + 'wikitext_103' +) + +splits=( + 'test' + 'val' +) + +for dataset in "${datasets[@]}"; do + for split in "${splits[@]}"; do + dolma tokens \ + --documents "s3://ai2-llm/eval-data/perplexity/v3_small/${dataset}/${split}" \ + --destination "s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/${dataset}/${split}" \ + --tokenizer 'allenai/eleuther-ai-gpt-neox-20b-pii-special' + done +done diff --git a/configs/dolma-v1_5/eval-set.md b/configs/dolma-v1_5/eval-set.md new file mode 100644 index 00000000..d608a98b --- /dev/null +++ b/configs/dolma-v1_5/eval-set.md @@ -0,0 +1,135 @@ +# Dolma v1.5 Eval set + +We create the eval set by sampling documents in each subset. Some subsets already have an eval set (e.g. C4), so we use that. Also, for some subsets, creation of eval set was done using a different strategy (e.g., reddit; documented below), so we use other approaches. 
+ +For each subset, we aim for roughly 1M tokens + + +## CommonCrawl + +```bash +python scripts/hash_sample.py \ + -s 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz' 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz' 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz' \ + -p 0.0000005 \ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/common-crawl \ + -n 188 + +``` + +Output: + +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/common-crawl", + "dryrun": false, + "num_workers": 188, + "probability": 5e-07, + "source": [ + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz", + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz", + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz" + ] +} +Sampling with probability 5e-07 using MD5 suffixes ['ffffff', 'fffffe', 'fffffd', 'fffffc', 'fffffb', 'fffffa', 'fffff9', 'fffff8'] +Found 2,878 files to process +uniseg_words: 1.00Mu [19:23, 860u/s] +extracted: 1.91ke [19:23, 1.64e/s]] +documents: 4.60Gd [19:23, 3.95Md/s] +files: 2.88kf [19:23, 2.47f/s]59u/s] +``` + + +## PeS2o + +```bash +python scripts/hash_sample.py \ + -s s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/*/*.gz \ + s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/*/*.gz \ + -p 0.004 \ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/pes2o \ + -n 188 +``` + +Output: +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/pes2o", + "dryrun": false, + "num_workers": 188, + "probability": 0.004, + "source": [ + 
"s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/*/*.gz", + "s3://ai2-llm/pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/*/*.gz" + ] +} +Sampling with probability 0.004 using MD5 suffixes ['ff'] +Found 600 files to process +uniseg_words: 1.21Mu [00:06, 177ku/s] +extracted: 610e [00:06, 89.4e/s]s] +documents: 161kd [00:06, 23.6kd/s] +files: 600f [00:06, 87.9f/s] 77.4ku/s] +``` + +## Books + +```bash +python scripts/hash_sample.py \ + -s 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/books/*.gz'\ + -p 0.00035\ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/books \ + -n 188 +``` + +Output: + +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/books", + "dryrun": false, + "num_workers": 188, + "probability": 0.00038, + "source": [ + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/books/*.gz" + ] +} +Sampling with probability 0.00038 using MD5 suffixes ['fff', 'ffe'] +Found 3 files to process +uniseg_words: 1.73Mu [01:12, 23.7ku/s] +extracted: 30.0e [01:12, 2.42s/e] +documents: 52.1kd [01:12, 717d/s] +files: 3.00f [01:12, 24.2s/f]20.2ku/s] +``` + +## Wiki + +```bash +python scripts/hash_sample.py \ + -s 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/wiki/*.gz'\ + -p 0.00038\ + -d s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/wiki \ + -n 188 +``` + +Output: + +```plain-text +{ + "debug": false, + "destination": "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-eval/documents/wiki", + "dryrun": false, + "num_workers": 188, + "probability": 0.00038, + "source": [ + "s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/wiki/*.gz" + ] +} +Sampling with probability 0.00038 using MD5 suffixes ['fff', 'ffe'] +Found 2 files to process +uniseg_words: 1.43Mu [01:58, 12.0ku/s] +extracted: 2.94ke [01:58, 24.7e/s]] +documents: 6.11Md [01:58, 51.4kd/s] +files: 2.00f 
[01:58, 59.4s/f]7.85ku/s] +``` diff --git a/configs/dolma-v1_5/mixing/books.yaml b/configs/dolma-v1_5/mixing/books.yaml new file mode 100644 index 00000000..4283e905 --- /dev/null +++ b/configs/dolma-v1_5/mixing/books.yaml @@ -0,0 +1,31 @@ + +streams: +- name: books + + documents: + - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz + + attributes: + - perplexity_suite_v3_option2 + - olmo_mix_v1_taggers + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] + < 25)]" + - "$.attributes[?(@.olmo_mix_v1_taggers__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] + < 0.5)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/c4.yaml b/configs/dolma-v1_5/mixing/c4.yaml new file mode 100644 index 00000000..5973929a --- /dev/null +++ b/configs/dolma-v1_5/mixing/c4.yaml @@ -0,0 +1,119 @@ +streams: +- name: c4 + documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + + attributes: + - olmo_mix_v1_taggers + - perplexity_suite_v3_option2 + - dedupe_paragraphs + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4 + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + # filter: + # include: [] + # exclude: + # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + # && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + # span_replacement: [] + + filter: + include: [] + exclude: + - 
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && 
@.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git 
a/configs/dolma-v1_5/mixing/cc-head.yaml b/configs/dolma-v1_5/mixing/cc-head.yaml new file mode 100644 index 00000000..eaa0e909 --- /dev/null +++ b/configs/dolma-v1_5/mixing/cc-head.yaml @@ -0,0 +1,116 @@ +streams: + +- name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz + + attributes: + - perplexity_suite_v3_option2 + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - 
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: 
"$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/cc-middle.yaml b/configs/dolma-v1_5/mixing/cc-middle.yaml new file mode 100644 index 00000000..bae79c93 --- /dev/null +++ b/configs/dolma-v1_5/mixing/cc-middle.yaml @@ -0,0 +1,116 @@ +streams: + +- name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz + + attributes: + - perplexity_suite_v3_option2 + - dedupe_paragraphs + - gopher_rules + - hatespeech_nsfw_cc_v3 + - pii_detection + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - 
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - 
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " |||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/cc-tail.yaml b/configs/dolma-v1_5/mixing/cc-tail.yaml new file mode 100644 index 00000000..ad06d090 --- /dev/null +++ b/configs/dolma-v1_5/mixing/cc-tail.yaml @@ -0,0 +1,116 @@ +streams: + +- name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.json.gz + + attributes: + - perplexity_suite_v3_option2 + - dedupe_paragraphs + - gopher_rules 
+ - hatespeech_nsfw_cc_v3 + - pii_detection + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + + filter: + include: [] + exclude: + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] + && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] + && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] + && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character + && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] + < 0.8)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] + && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] + && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] + > 0.9)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis + && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && 
@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] + > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] + && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && + @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > + 0.3)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] + > 0.2)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] + > 0.18)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] + > 0.16)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] + > 0.15)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] + > 0.14)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams + && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] + > 0.13)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] + > 0.12)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] + > 0.11)]" + - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] + && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] + > 0.10)]" + - "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && + @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] + > 5)]" + span_replacement: + - span: "$.attributes.bff_duplicate_paragraph_spans" + min_score: 0.5 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" + min_score: 0.4 + replacement: '' + - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" + min_score: 0.4 + replacement: '' + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" + min_score: 0.5 + replacement: " |||EMAIL_ADDRESS||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" + min_score: 0.5 + replacement: " 
|||PHONE_NUMBER||| " + - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" + min_score: 0.5 + replacement: " |||IP_ADDRESS||| " + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/pes2o.yaml b/configs/dolma-v1_5/mixing/pes2o.yaml new file mode 100644 index 00000000..2208f5cb --- /dev/null +++ b/configs/dolma-v1_5/mixing/pes2o.yaml @@ -0,0 +1,43 @@ +--- +streams: +- name: pes2o_v2 + documents: + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz + - 
s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz + - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + attributes: + - perplexity_suite_v3_option2 + + filter: + include: [] + exclude: + - "$.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] + && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/reddit.yaml b/configs/dolma-v1_5/mixing/reddit.yaml new file mode 100644 index 00000000..86d0d157 --- /dev/null +++ b/configs/dolma-v1_5/mixing/reddit.yaml @@ -0,0 +1,26 @@ + +streams: +- name: reddit-v5-dedupe-pii-nsfw-toxic + + documents: + - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz + + attributes: + - perplexity_suite_v3_option2 + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git 
a/configs/dolma-v1_5/mixing/stack.yaml b/configs/dolma-v1_5/mixing/stack.yaml new file mode 100644 index 00000000..e67a6d6a --- /dev/null +++ b/configs/dolma-v1_5/mixing/stack.yaml @@ -0,0 +1,375 @@ +streams: +- name: stack-v4-train + documents: + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/abap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/actionscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ada/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ags-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/alloy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ampl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/antlr/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apacheconf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/api-blueprint/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/applescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arduino/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asciidoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/aspectj/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ats/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/augeas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autohotkey/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autoit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/awk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/batchfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/befunge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bison/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bitbake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzmax/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bluespec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/boo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brainfuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brightscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c2hs-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cap'n-proto/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cartocss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ceylon/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chapel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chuck/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cirru/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clarion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/click/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clips/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clojure/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cobol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion-cfc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/common-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/component-pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coq/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/creole/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/crystal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/csound/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/css/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cucumber/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cuda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cycript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cython/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/d/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/darcs-patch/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dart/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/desktop/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/diff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/digital-command-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dns-zone/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dockerfile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dogescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dylan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eagle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ec/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecere-projects/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/edn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eiffel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elixir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emacs-lisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emberscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/erlang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/f-sharp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/factor/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fancy/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fantom/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fish/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/flux/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/forth/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fortran/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/freemarker/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/g-code/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gams/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gap/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gdscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/genshi/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-ebuild/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-eclass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gettext-catalog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glyph/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gnuplot/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/go/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/golo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gosu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grace/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grammatical-framework/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphviz-(dot)/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groff/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/handlebars/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/harbour/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haxe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hlsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+django/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+eex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+erb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/http/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idris/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/igor-pro/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inform-7/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ini/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inno-setup/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/io/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ioke/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/irc-log/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/isabelle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jade/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jasmin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java-server-pages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/javascript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jflex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jsx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/julia/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jupyter-notebook/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kicad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kotlin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/krl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/labview/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lasso/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/latte/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lean/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/less/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lfe/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lilypond/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/linker-script/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/liquid/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-agda/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-coffeescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-haskell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/livescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/llvm/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logos/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logtalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lolcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lookml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lsl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lua/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m4/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/makefile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mako/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maple/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/markdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mask/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mathematica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/matlab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/max/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maxscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mediawiki/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/metal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mirah/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/modelica/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/module-management-system/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/monkey/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/moonscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/muf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mupad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/myghty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nesc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlinx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlogo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nginx/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nimrod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ninja/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nit/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nix/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nsis/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nu/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/numpy/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objdump/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-c++/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-j/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ocaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/octave/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/omgrofl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ooc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opa/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opencl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/openscad/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/org/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ox/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oxygene/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oz/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/papyrus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-assembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-internal-representation/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pascal/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl6/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/php/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/piglatin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pike/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pod/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pogoscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pony/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/postscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pov-ray-sdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/powershell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/processing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/prolog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/propeller-spin/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/protocol-buffer/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pure-data/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purebasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purescript/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python-traceback/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qmake/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/r/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/racket/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ragel-in-ruby-host/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/raml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rdoc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/realbasic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rebol/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/red/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/redcode/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ren'py/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/renderscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/restructuredtext/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rhtml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rmarkdown/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/robotframework/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rouge/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ruby/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rust/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sage/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/saltstack/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sas/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sass/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scheme/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scilab/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scss/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/self/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shell/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shellsession/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shen/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slash/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slim/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smali/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smalltalk/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smarty/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/solidity/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sourcepawn/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sparql/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sqf/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sql/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/squirrel/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stan/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/standard-ml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stata/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ston/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stylus/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/supercollider/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/swift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/systemverilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcsh/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tea/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tex/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/text/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/textile/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/thrift/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/toml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turing/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turtle/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/twig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/txl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/typescript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unified-parallel-c/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unity3d-asset/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/uno/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unrealscript/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/urweb/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vala/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vcl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/verilog/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vhdl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/viml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/visual-basic/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/volt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vue/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/web-ontology-language/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webassembly/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webidl/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/wisp/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/x10/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xbase/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xojo/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xpages/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xproc/*.gz + - 
s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xquery/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xs/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xslt/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xtend/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yacc/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yaml/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yang/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zephir/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zig/*.gz + - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zimpl/*.gz + + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + attributes: + - perplexity_suite_v3_option2 + + filter: + include: [] + exclude: + - "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] + && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/mixing/wiki.yaml b/configs/dolma-v1_5/mixing/wiki.yaml new file mode 100644 index 00000000..fe7ce101 --- /dev/null +++ b/configs/dolma-v1_5/mixing/wiki.yaml @@ -0,0 +1,28 @@ +--- +streams: +- name: en_simple_wiki_v0 + documents: + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=simple/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=en/*.gz + - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=simple/*.gz + attributes: + - perplexity_suite_v3_option2 + - olmo_mix_v1_taggers + output: + path: 
s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + + filter: + exclude: + - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] + < 25)]" + - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] + && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_5/para_dedupe/c4.yaml b/configs/dolma-v1_5/para_dedupe/c4.yaml new file mode 100644 index 00000000..e51fe0eb --- /dev/null +++ b/configs/dolma-v1_5/para_dedupe/c4.yaml @@ -0,0 +1,16 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz + +dedupe: + name: dedupe_paragraphs + paragraphs: + attribute_name: bff_duplicate_paragraph_spans + skip_empty: true + +bloom_filter: + file: /tmp/c4.bloom + read_only: false + estimated_doc_count: 30000000000 + desired_false_positive_rate: 1e-06 + +processes: 188 diff --git a/configs/dolma-v1_5/sample/cc-head.yaml b/configs/dolma-v1_5/sample/cc-head.yaml new file mode 100644 index 00000000..11b40b96 --- /dev/null +++ b/configs/dolma-v1_5/sample/cc-head.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_head + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_5/sample/cc-middle.yaml b/configs/dolma-v1_5/sample/cc-middle.yaml new file mode 100644 index 
00000000..0f4d1aff --- /dev/null +++ b/configs/dolma-v1_5/sample/cc-middle.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_middle + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_5/sample/cc-tail.yaml b/configs/dolma-v1_5/sample/cc-tail.yaml new file mode 100644 index 00000000..d07547a3 --- /dev/null +++ b/configs/dolma-v1_5/sample/cc-tail.yaml @@ -0,0 +1,18 @@ + +streams: +- name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz + output: + path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_tail + max_size_in_bytes: 3894967296 + attributes: + - random_number_v1 + filter: + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" + +work_dir: + input: "/tmp/cc-head-sample/mixer/input" + output: "/tmp/cc-head-sample/mixer/output" +processes: 188 diff --git a/configs/dolma-v1_5/train-set.md b/configs/dolma-v1_5/train-set.md new file mode 100644 index 00000000..09141258 --- /dev/null +++ b/configs/dolma-v1_5/train-set.md @@ -0,0 +1,47 @@ +# Dolma v1.5 + +Files in this directory are used to generate Dolma v1.5. + +## Tagging + +Tagging is largely the same as v1, but we report it here for completeness. 
+ +### C4 + +```bash +dolma tag --documents 's3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz' --taggers pii_regex_with_counts_v2 --processes 188 +``` + +### Common Crawl + +## Filtering + +## Sampling of CC + +```bash +dolma tag --documents 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz' 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz' --taggers random_number_v1 --processes 188 +``` + +dolma tag --documents 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz' --taggers random_number_v1 --processes 188 + +## Tokenization + +```bash +python -m dolma.tokenizer --sources 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/*/*' --destination $HOME/preprocessed/olmo-mix/v1_5/gpt-neox-20b-pii-special --num-writers 188 --max-size 17179869184 +``` + +```bash +python -m dolma.tokenizer \ + --sources 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_head/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_middle/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_tail/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*' \ + 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*' \ + --destination $HOME/preprocessed/olmo-mix/v1_5-sample/gpt-neox-20b-pii-special \ + --num-writers 188 \ + --max-size 5368709120 +``` diff --git a/python/dolma/core/taggers.py b/python/dolma/core/taggers.py index 8d51d9aa..f22422c8 100644 --- a/python/dolma/core/taggers.py +++ b/python/dolma/core/taggers.py @@ -6,6 +6,7 @@ """ from abc import abstractmethod 
+from functools import cached_property from typing import List from .data_types import DocResult, Document, InputSpec, TaggerOutputDictType diff --git a/python/dolma/tokenizer/executor.py b/python/dolma/tokenizer/executor.py index ee322ce3..3f9438a9 100644 --- a/python/dolma/tokenizer/executor.py +++ b/python/dolma/tokenizer/executor.py @@ -188,8 +188,6 @@ def tokenize_in_parallel( dtype: str = "uint16", debug: bool = False, ): - multiprocessing.set_start_method("spawn") - # variables for the nice debugging and tokenizers os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/scripts/eval_html_extraction.py b/scripts/eval_html_extraction.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/mm_tokenize.py b/scripts/mm_tokenize.py new file mode 100644 index 00000000..3b9bb62e --- /dev/null +++ b/scripts/mm_tokenize.py @@ -0,0 +1,683 @@ +""" +Use this to prepare a numpy memory-mapped language modeling dataset from raw *.json.gz +dataset files, such as those from c4. Each file is expected to be a gzipped JSON lines +file, in which each JSON line has a field named "text" that is a string representing a single +document from the dataset. 
+ +To test out this script, run: + +```bash +python scripts/prepare_memmap_dataset.py test_fixtures/*.json.gz -o /tmp/out.npy +``` +""" + +import concurrent.futures +from csv import writer +import csv +import functools +from io import BytesIO +import itertools +import json +import logging +import multiprocessing as mp +from enum import Enum +import os +import random +from contextlib import ExitStack +from pathlib import Path +import re +from tempfile import NamedTemporaryFile +from typing import IO, Any, Generator, List, NamedTuple, Optional, Sequence, TextIO, Tuple, TypeVar, Union + +import click +import msgspec +import numpy as np +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TaskProgressColumn, + TimeElapsedColumn, +) +from typing_extensions import TypeAlias +import smart_open + +from dolma.core.data_types import InputSpec +from dolma.core.paths import glob_path + +from tokenizers import Tokenizer as BaseTokenizer + + +PathOrStr: TypeAlias = Union[str, os.PathLike] + +log = logging.getLogger(__name__) + +T = TypeVar("T", bound=Sequence) + + +class StrEnum(str, Enum): + """ + This is equivalent to Python's :class:`enum.StrEnum` since version 3.11. + We include this here for compatibility with older version of Python. + """ + + def __str__(self) -> str: + return self.value + + def __repr__(self) -> str: + return f"'{str(self)}'" + + +class TruncationDirection(StrEnum): + right = "right" + left = "left" + + +class Tokenizer: + """ + A :class:`Tokenizer` is a light-weight wrapper around a HuggingFace :class:`tokenizers.Tokenizer`. + + :param base_tokenizer: The :class:`tokenizers.Tokenizer` to use. + :param eos_token_id: The token ID corresponding to the "end-of-sentence" token. + :param truncate_to: Truncate when tokenizing to this number of token IDs. + :param truncate_direction: The direction to truncate in. "right" means truncate the tokens + on the right. "left" means truncate the tokens on the left. 
If ``truncate_to`` is null, + this setting has no effect. + """ + + def __init__( + self, + base_tokenizer: BaseTokenizer, + eos_token_id: int, + pad_token_id: Optional[int] = None, + truncate_to: Optional[int] = None, + truncate_direction: Union[str, TruncationDirection] = TruncationDirection.right, + ): + self.base_tokenizer = base_tokenizer + self.base_tokenizer.no_truncation() + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id if pad_token_id is not None else eos_token_id + self.truncate_to = truncate_to + self.truncate_direction = TruncationDirection(truncate_direction) + + @property + def vocab_size(self) -> int: + return self.base_tokenizer.get_vocab_size() + + @classmethod + def from_pretrained(cls, identifier: str, **kwargs) -> 'Tokenizer': + """ + Initialize a tokenizer from a pretrained tokenizer on the HuggingFace Hub. + + :param identifier: The identifier of a model on the Hub that contains a + ``tokenizer.json`` file. + :param kwargs: Other key word arguments passed to :class:`Tokenizer`. + """ + base_tokenizer = BaseTokenizer.from_pretrained(identifier) + eos_token_id = kwargs.pop("eos_token_id", base_tokenizer.get_vocab_size() - 1) + return cls(base_tokenizer, eos_token_id, **kwargs) + + def add_special_tokens(self, input_ids: List[int]) -> List[int]: + """ + Add special tokens in-place (if not already present) to the given token IDs. 
+ """ + if not input_ids or input_ids[-1] != self.eos_token_id: + input_ids.append(self.eos_token_id) + return input_ids + + def num_special_tokens_to_add(self, is_pair: bool = False) -> int: + return 2 if is_pair else 1 + + def _truncate( + self, input_ids: List[int], truncate_to: Optional[int], direction: TruncationDirection + ) -> List[int]: + if truncate_to is None or len(input_ids) <= truncate_to: + return input_ids + elif direction == TruncationDirection.left: + return input_ids[len(input_ids) - truncate_to :] + else: + return input_ids[: -(len(input_ids) - truncate_to)] + + def encode(self, input: str, add_special_tokens: bool = True) -> List[int]: + """ + Encode a string into token IDs. + """ + return self.encode_batch([input], add_special_tokens=add_special_tokens)[0] + + def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> List[List[int]]: + """ + Encode a batch of strings into token IDs. + """ + truncate_to = self.truncate_to + if truncate_to is not None and add_special_tokens: + truncate_to -= self.num_special_tokens_to_add(False) + + batch_encoding = self.base_tokenizer.encode_batch(inputs) + + all_input_ids = [] + for encoding in batch_encoding: + input_ids = self._truncate(encoding.ids, truncate_to, self.truncate_direction) + if add_special_tokens: + input_ids = self.add_special_tokens(input_ids) + all_input_ids.append(input_ids) + + return all_input_ids + + def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str: + """ + Decode a list of token IDs to a string. 
+ """ + return self.base_tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + + +def get_progress() -> Progress: + return Progress( + "[progress.description]{task.description}", + MofNCompleteColumn(), + "files", + BarColumn(), + TaskProgressColumn(), + TimeElapsedColumn(), + ) + + +class OutputSpec(NamedTuple): + id: str + src: str + loc: int + tokens: List[int] + start: int + end: int + + @classmethod + def from_tokens(cls, id: str, src: str, loc: int, tokens: List[int]) -> "OutputSpec": + return cls(id=id, src=src, loc=loc, tokens=tokens, start=0, end=len(tokens)) + + @classmethod + def from_output_spec(cls, output_spec: "OutputSpec", start: int = -1, end: int = -1) -> "OutputSpec": + start = start if start >= 0 else output_spec.start + end = end if end >= 0 else output_spec.end + return cls(id=output_spec.id, src=output_spec.src, loc=output_spec.loc, tokens=output_spec.tokens, start=start, end=end) + + +def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[OutputSpec, None, None]: + """Tokenize a file of documents using the provided tokenizer; file is expected to be a gzipped JSON lines + file, each containing a field named `text`. 
+ """ + decoder = msgspec.json.Decoder(InputSpec) + with smart_open.open(path, mode="rt") as input_stream: + for i, line in enumerate(input_stream, start=1): + try: + row = decoder.decode(line) + if text := row.text.strip(): + # skip empty docs + tokens = tokenizer.encode(text, add_special_tokens=True) + yield OutputSpec.from_tokens(id=row.id, src=path, loc=i, tokens=tokens) + i += 1 + except Exception as ex: + log.error("Error processing %s:%d", path, i, exc_info=ex) + + +class Metadata(NamedTuple): + id: str + src: str + loc: int + start: int + end: int + + def to_csv(self) -> str: + return f"{self.id},{self.src},{self.loc},{self.start},{self.end}" + + +class MemMapWriter: + """Context manager responsible for writing, resizing, and closing / uploading a memmap file.""" + + DEFAULT_MAX_TOKENS = 512 * 1024 * 1024 # 500M tokens / 1GB + MEMMAP_EXTENSION = ".npy" + METADATA_EXTENSION = ".jsonl.gz" + + def __init__( + self, + path: str, + dtype: np.dtype, + max_tokens: int = DEFAULT_MAX_TOKENS, + ): + """Create a new memmap file. + + Args: + path (str): Location for the memmap file. If the path is not local, the memmap file will be + written to a temporary file first and then uploaded to the destination. + dtype (np.dtype): Data type for the memmap file; must be a valid numpy dtype. + max_tokens (int, optional): Maximum number of tokens per file. Defaults to 500M tokens, which is 1GB. 
+ """ + base_path = re.sub(r"(\.npy?)?(\.[a-zA-Z]+)*$", "", path) + self.memmap_path = f"{base_path}{self.MEMMAP_EXTENSION}" + self.metadata_path = f"{base_path}{self.METADATA_EXTENSION}" + self.dtype = dtype + self.max_tokens = max_tokens + + self._local_memmap_path: Optional[Path] = None + self._local_metadata_path: Optional[Path] = None + self._written_tokens = 0 + self._memmap_file: Optional[np.memmap] = None + self._metadata_file: Optional[TextIO] = None + + def __len__(self) -> int: + """Length of the memmap file in tokens that have been written.""" + return self._written_tokens + + @functools.cached_property + def metadata_writer(self): + if self._metadata_file is None: + raise RuntimeError("Metadata file is not open") + return writer(self._metadata_file) + + # def write(self, values: List[int], flush: bool = False) -> Optional[List[int]]: + def write(self, output: OutputSpec, flush: bool = False) -> Optional[OutputSpec]: + """Write a list of token IDs to the memmap file; if only a subset of the values can be written, + return the rest. + + Args: + values (List[int]): List of token IDs to write. + flush (bool, optional): Whether to flush the memmap file after writing. Defaults to False. 
+ """ + + if self._memmap_file is None: + raise RuntimeError("MemmapFile is not open") + + if self._metadata_file is None: + raise RuntimeError("Metadata file is not open") + + if (len(output.tokens) + self._written_tokens) >= self.max_tokens: + values = output.tokens[: self.max_tokens - self._written_tokens] + start = 0 + end = self.max_tokens - self._written_tokens + rest = OutputSpec.from_output_spec(output_spec=output, start=end) + else: + values = output.tokens + start = 0 + end = len(output.tokens) + rest = None + + metadata = Metadata( + id=output.id, + src=output.src, + loc=output.loc, + start=start, + end=end, + ) + self._memmap_file[self._written_tokens : self._written_tokens + end] = values + self._written_tokens += end - start + + # self._metadata_file.write(msgspec.json.encode(metadata) + b"\n") + self.metadata_writer.writerow(metadata) + + if flush: + self._memmap_file.flush() + self._metadata_file.flush() + + return rest + + @property + def is_remote_path(self) -> bool: + return re.match('[a-zA-Z0-9]+://', self.memmap_path) is not None and not self.memmap_path.startswith("file://") + + def __enter__(self) -> "MemMapWriter": + """Context manager entry point. 
Creates the memmap file and returns self.""" + + assert self._memmap_file is None and self._metadata_file is None, "MemmapFile is already open" + + if self.is_remote_path: + with ExitStack() as stack: + # if the destination for the memmap is not local, we need to write to a temporary file first + _memmap_file = stack.enter_context( + NamedTemporaryFile(delete=False, prefix="olmo_memmap", suffix=self.MEMMAP_EXTENSION) + ) + self._local_memmap_path = Path(_memmap_file.name) + _metadata_file = stack.enter_context( + NamedTemporaryFile(delete=False, prefix="olmo_metadata", suffix=self.METADATA_EXTENSION) + ) + self._local_metadata_path = Path(_metadata_file.name) + else: + self._local_memmap_path = Path(self.memmap_path) + self._local_metadata_path = Path(self.metadata_path) + # make sure the directory exists + self._local_memmap_path.parent.mkdir(parents=True, exist_ok=True) + self._local_metadata_path.parent.mkdir(parents=True, exist_ok=True) + + # these two assertions ensure type checking + assert self._local_memmap_path is not None + assert self._local_metadata_path is not None + + self._memmap_file = np.memmap( + mode="w+", + filename=self._local_memmap_path, + dtype=self.dtype, + shape=(self.max_tokens,) + ) + self._metadata_file = smart_open.open(self._local_metadata_path, mode="wt") + + log.info(f"Created memmap file at {self._local_memmap_path} of size {self._memmap_file.nbytes:,} bytes") + + return self + + def __exit__(self, *_): + """Context manager exit point. 
Closes the memmap file.""" + return self.close() + + def close(self): + """Close the memmap file and optionally upload it to the destination (in the case of a remote path).""" + assert self._local_memmap_path is not None, "Local Memmap path is not provided" + assert self._local_metadata_path is not None, "Local Metadata path is not provided" + assert self._memmap_file is not None, "Memmap file is not open" + assert self._metadata_file is not None, "Metadata file is not open" + + try: + # write the memmap to the destination + self._memmap_file.flush() + self._metadata_file.flush() + self._metadata_file.close() + + # we resize the memmap to the number of tokens actually written + if self._written_tokens < self.max_tokens: + del self._memmap_file + os.rename(self._local_memmap_path, (temp_path := self._local_memmap_path.with_suffix(".tmp"))) + new_memmap = np.memmap( + mode="w+", filename=self._local_memmap_path, dtype=self.dtype, shape=(self._written_tokens,) + ) + old_memmap = np.memmap(mode="r", filename=temp_path, dtype=self.dtype, shape=(self.max_tokens,)) + new_memmap[:] = old_memmap[: self._written_tokens] + new_memmap.flush() + log.info(f"Resized memmap file from {old_memmap.nbytes:,} to {new_memmap.nbytes:,} bytes") + os.remove(temp_path) + + if self.is_remote_path: + with ExitStack() as stack: + f = stack.enter_context(smart_open.open(self._local_memmap_path, "rb")) + g = stack.enter_context(smart_open.open(self.memmap_path, mode="wb")) + g.write(f.read()) + log.info(f"Written memmap file to {self.memmap_path}") + finally: + if self.is_remote_path: + # delete the temporary file under any circumstances + os.remove(self._local_memmap_path) + + # reset to none, clear cache + self._local_memmap_path = self._memmap_file = None + self._local_metadata_path = self._metadata_file = None + del self.metadata_writer + + +def fill_memmap( + tokenizer_id: str, + path_or_paths: Union[str, List[str]], + memmap_path: str, + dtype: np.dtype, + max_tokens: int = 1024 * 1024 * 
1024, # 1024 tokens * 2 bytes per token (uint16) = 2GB + sample_rate: float = 1.0, + random_seed: int = 3920, + repeat_sequence: int = 1, +) -> int: + """Write a memmap file from a file of documents.""" + + # set the seed in case we need to sample + np.random.seed(random_seed) + + # we need to make a new tokenizer here because it's not pickleable + tokenizer = Tokenizer.from_pretrained(tokenizer_id, truncate_to=None) + + # first memmap file will be created in the loop below + memmap_writer: Optional[MemMapWriter] = None + + # we increment this every time we create a new memmap file + file_index = 0 + + # total number of tokens written + total_tokens = 0 + + # make sure path is a list + path_or_paths = [path_or_paths] if isinstance(path_or_paths, str) else path_or_paths + + with ExitStack() as stack: + it = itertools.chain.from_iterable( + # repeat the sequence if necessary + tokenize_file(tokenizer=tokenizer, path=path) + for _ in range(repeat_sequence) + for path in path_or_paths + ) + + import tqdm + import time + start = time.time() + + for line_no, output in tqdm.tqdm(enumerate(it, start=1)): + # perform sampling if necessary + if sample_rate < 1.0 and np.random.rand() > sample_rate: + continue + + # flush any 10k lines or so; improves stability + flush = line_no % 10_000 == 0 + + # increment the total number of tokens written + total_tokens += len(output.tokens) + + if memmap_writer is not None: + # leftovers_to_write is gonna be an OutputSpec with the tokens that didn't fit in the + # current memmap, or None if all tokens fit + leftovers_to_write = memmap_writer.write(output=output, flush=flush) + else: + # memmap hasn't been created yet, so technically the entire output is leftovers + leftovers_to_write = output + + if leftovers_to_write is not None: + # close the previous memmap (if one is open) + stack.pop_all().close() + + # create a new memmap file; progressively name them with an index + curr_memmap_path = f"{memmap_path}_{file_index:05d}.npy" + 
memmap_writer = stack.enter_context( + MemMapWriter(path=curr_memmap_path, dtype=dtype, max_tokens=max_tokens) + ) + + # increment the file index and reset the tokens index + file_index += 1 + + # do the actual writing + memmap_writer.write(leftovers_to_write) + + if line_no > 50_000: + break + + # close the last memmap + stack.pop_all().close() + + end = time.time() + print(f"Time elapsed: {end - start:.2f}s") + + return total_tokens + + +def make_source_and_target( + src: Tuple[str, ...], + output: str, + random_seed: int = 3920, + paths_per_worker: int = 1, +) -> Tuple[Tuple[Union[str, List[str]], ...], Tuple[str, ...]]: + """Recursively list all files in the source directories and create a corresponding list of destination.""" + + np.random.seed(random_seed) + random.seed(random_seed) + + exploded_src = list(set(path for prefix in src for path in glob_path(prefix))) + output_digits = np.ceil(np.log10(len(exploded_src) + 1)).astype(int) + + # shuffle the source paths + random.shuffle(exploded_src) + + grouped_src: Union[List[str], List[List[str]]] + if paths_per_worker > 1: + assert ( + len(exploded_src) >= paths_per_worker + ), f"Number of paths ({len(exploded_src)}) must be <= paths_per_worker ({paths_per_worker})" + + # group the paths into chunks of paths_per_worker + grouped_src = [ + sorted(exploded_src[i : i + paths_per_worker]) for i in range(0, len(exploded_src), paths_per_worker) + ] + else: + grouped_src = exploded_src + + # determine the destination paths + exploded_dst = [f'{output.rstrip("/")}/{i:0{output_digits}d}' for i in range(len(grouped_src))] + + return tuple(grouped_src), tuple(exploded_dst) + + +@click.command() +@click.argument( + "src", + nargs=-1, + type=str, + required=True, +) +@click.option( + "-o", + "--output", + type=str, + help="Specify the output path.", + prompt="Output directory", +) +@click.option( + "--tokenizer", + "tokenizer_id", + type=str, + help="Name of path of a pretrained tokenizer", + 
default="allenai/eleuther-ai-gpt-neox-20b-pii-special", +) +@click.option("--dtype", "dtype_str", default="uint16") +@click.option("--validate/--no-validate", default=False) +@click.option("--sample-rate", type=click.FloatRange(min=0.0, max=1.0), default=1.0) +@click.option("--random-seed", type=int, default=3920) +@click.option("--repeat-sequence", type=click.IntRange(min=1), default=1) +@click.option("--paths-per-worker", type=click.IntRange(min=1), default=1) +@click.option( + "--cache-dir", + type=str, + default=None, + help="Cache directory for the tokenizer; use system default if not specified", +) +@click.option( + "--max-tokens", + default=512 * 1024 * 1024, + type=int, + help="Maximum number of tokens to store in a single memmap file (default: 512M tokens or 1GB)", +) +@click.option("--debug/--no-debug", default=False, help="Enable debug (single process mode)") +@click.option( + "--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open" +) +@click.option("-j", "--workers", "max_workers", type=int, default=1, help="Defaults to number of CPUs") +def main( + src: Tuple[str, ...], + output: str, + tokenizer_id: str = "EleutherAI/gpt-neox-20b", + dtype_str: str = "uint16", + validate: bool = False, + max_tokens: int = 512 * 1024 * 1024, + safe_mode: bool = False, + debug: bool = False, + sample_rate: float = 1.0, + random_seed: int = 3920, + repeat_sequence: int = 1, + paths_per_worker: int = 1, + max_workers: int = 1, + cache_dir: Optional[str] = None, +): + print("=== CONFIGURATION ===") + print(f"src: {src}") + print(f"output: {output}") + print(f"tokenizer_id: {tokenizer_id}") + print(f"dtype_str: {dtype_str}") + print(f"validate: {validate}") + print(f"max_tokens: {max_tokens}") + print(f"debug: {debug}") + print(f"sample_rate: {sample_rate}") + print(f"random_seed: {random_seed}") + print(f"repeat_sequence: {repeat_sequence}") + print(f"paths_per_worker: {paths_per_worker}") + print(f"max_workers: 
{max_workers}") + print("=====================") + + dtype = np.dtype(dtype_str) + exploded_src, exploded_dst = make_source_and_target( + src=src, output=output, random_seed=random_seed, paths_per_worker=paths_per_worker + ) + + # creating a partial here with all the arguments we need to pass to fill_memmap except for the paths + # so that we don't make mistakes between debug and non-debug mode + fill_memmap_fn = functools.partial( + fill_memmap, + tokenizer_id=tokenizer_id, + dtype=dtype, + max_tokens=max_tokens, + sample_rate=sample_rate, + random_seed=random_seed, + repeat_sequence=repeat_sequence, + ) + + total_tokens_written = 0 + + if debug: + log.info("Running in debug mode. Only one process will be used.") + for src_path, dst_path in zip(exploded_src, exploded_dst): + total_tokens_written += fill_memmap_fn(path_or_paths=src_path, memmap_path=dst_path) + else: + # Now tokenizer all documents again and populate the memmap array. We do this in parallel. + workers_cnt = min(max_workers or os.cpu_count() or 1, len(exploded_src)) + with concurrent.futures.ProcessPoolExecutor(max_workers=workers_cnt) as executor: + futures: List[concurrent.futures.Future[int]] = [] + for src_path, dst_path in zip(exploded_src, exploded_dst): + future = executor.submit(fill_memmap_fn, path_or_paths=src_path, memmap_path=dst_path) + futures.append(future) + with get_progress() as progress: + for future in progress.track( + concurrent.futures.as_completed(futures), + description="Filling memmap arrays...", + total=len(futures), + ): + total_tokens_written += future.result() + + log.info(f"Done! 
File(s) written to {output}") + log.info(f"Total tokens written: {total_tokens_written:,}") + + if validate: + log.info("Validating...") + tokenizer = Tokenizer.from_pretrained(tokenizer_id, truncate_to=None) + + def encode_fn(row): + return tokenizer.encode(json.loads(row)["text"], add_special_tokens=True) # noqa + + total_tokens = total_docs = 0 + for input_path in (path for prefix in src for path in glob_path(prefix)): + with smart_open.open(input_path, mode="rb") as g: + for row in g: + total_docs += 1 + total_tokens += len(encode_fn(row)) + + for output_path in glob_path(output): + if not output_path.endswith(".npy"): + continue + memmap = np.memmap(output_path, mode="r", dtype=dtype) + total_tokens -= len(memmap) + total_docs -= (memmap == tokenizer.eos_token_id).sum() + assert (memmap < tokenizer.vocab_size).all(), f"Invalid token ID in {output_path}" + + assert total_tokens == 0, f"Total tokens mismatch: {total_tokens} != 0" + assert total_docs == 0, f"Total docs mismatch: {total_docs} != 0" + + log.info("All good!") + + +if __name__ == "__main__": + mp.set_start_method("spawn") + main()