diff --git a/configs/dolma-v1_5r2/doc_dedupe/cc_en_head.yaml b/configs/dolma-v1_5r2/doc_dedupe/cc_en_head.yaml deleted file mode 100644 index f2060630..00000000 --- a/configs/dolma-v1_5r2/doc_dedupe/cc_en_head.yaml +++ /dev/null @@ -1,17 +0,0 @@ -documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.gz - -dedupe: - name: dedupe_docs_v2 - documents: - attribute_name: bff_duplicate_docs - key: $.text - skip_empty: false - -bloom_filter: - file: /tmp/cc_en_head_dedupe_docs.bloom - read_only: false - estimated_doc_count: 60000000000 - desired_false_positive_rate: 1e-06 - -processes: 188 diff --git a/configs/dolma-v1_5r2/doc_dedupe/cc_en_middle.yaml b/configs/dolma-v1_5r2/doc_dedupe/cc_en_middle.yaml deleted file mode 100644 index a23c6b33..00000000 --- a/configs/dolma-v1_5r2/doc_dedupe/cc_en_middle.yaml +++ /dev/null @@ -1,17 +0,0 @@ -documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.gz - -dedupe: - name: dedupe_docs_v2 - documents: - attribute_name: bff_duplicate_docs - key: $.text - skip_empty: true - -bloom_filter: - file: /tmp/cc_en_middle_dedupe_docs.bloom - read_only: false - estimated_doc_count: 30000000000 - desired_false_positive_rate: 1e-06 - -processes: 188 diff --git a/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part1.yaml b/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part1.yaml deleted file mode 100644 index 2763a5a0..00000000 --- a/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part1.yaml +++ /dev/null @@ -1,17 +0,0 @@ -documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-0*.json.gz - -dedupe: - name: dedupe_docs_v2 - documents: - attribute_name: bff_duplicate_docs - key: $.text - skip_empty: true - -bloom_filter: - file: /tmp/cc_en_tail_dedupe_docs.bloom - read_only: false - estimated_doc_count: 30000000000 - desired_false_positive_rate: 1e-06 - -processes: 188 diff --git a/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part2.yaml b/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part2.yaml deleted file mode 100644 index 41969673..00000000 --- a/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part2.yaml +++ /dev/null @@ -1,17 +0,0 @@ -documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-1*.json.gz - -dedupe: - name: dedupe_docs_v2 - documents: - attribute_name: bff_duplicate_docs - key: $.text - skip_empty: true - -bloom_filter: - file: /tmp/cc_en_tail_dedupe_docs.bloom - read_only: false - estimated_doc_count: 30000000000 - desired_false_positive_rate: 1e-06 - -processes: 188 diff --git a/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part3.yaml b/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part3.yaml deleted file mode 100644 index 555589ee..00000000 --- a/configs/dolma-v1_5r2/doc_dedupe/cc_en_tail_part3.yaml +++ /dev/null @@ -1,17 +0,0 @@ -documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-2*.json.gz - -dedupe: - name: dedupe_docs_v2 - documents: - attribute_name: bff_duplicate_docs - key: $.text - skip_empty: true - -bloom_filter: - file: /tmp/cc_en_tail_dedupe_docs.bloom - read_only: false - estimated_doc_count: 30000000000 - desired_false_positive_rate: 1e-06 - -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/books.yaml b/configs/dolma-v1_5r2/mixing/books.yaml deleted file mode 100644 index 82df4650..00000000 --- a/configs/dolma-v1_5r2/mixing/books.yaml +++ /dev/null @@ -1,33 +0,0 @@ - -streams: -- name: books - - documents: - - s3://ai2-llm/pretraining-data/sources/gutenberg/v0/documents/*.gz - - attributes: - # - perplexity_suite_v3_option2 - - olmo_mix_v1_taggers - - tokenizer_repetitions_v2r2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books - max_size_in_bytes: 4294967296 - discard_fields: - - attributes - - filter: - exclude: - - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] - < 25)]" - - "$.attributes[?(@.olmo_mix_v1_taggers__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] - < 0.5)]" - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] >= 100)]" - - # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" - - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/c4.yaml b/configs/dolma-v1_5r2/mixing/c4.yaml deleted file mode 100644 index ccf5d6ff..00000000 --- a/configs/dolma-v1_5r2/mixing/c4.yaml +++ /dev/null @@ -1,132 +0,0 @@ -streams: -- name: c4 - documents: - - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz - - attributes: - - olmo_mix_v1_taggers - # - perplexity_suite_v3_option2 - - dedupe_paragraphs - - dedupe_docs - - tokenizer_repetitions_v2r2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4 - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - # filter: - # include: [] - # exclude: - # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] - # && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" - # span_replacement: [] - - filter: - include: [] - exclude: - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] - && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character - && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] - < 0.8)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] - && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] - > 0.9)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis - && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] - > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] - && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && - @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > - 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] - > 0.2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] - > 0.18)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] - > 0.16)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] - > 0.15)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] - > 0.14)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] - > 0.13)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] - > 0.12)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] - > 0.11)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] - > 0.10)]" - - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] - && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" - - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && - @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] - > 5)]" - - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] - >= 100)]" - - # remove duplicate docs - - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] - && @.bff_duplicate_docs[0][2] >= 1.0)]" - span_replacement: - - span: "$.attributes.bff_duplicate_paragraph_spans" - min_score: 0.5 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" - min_score: 0.4 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" - min_score: 0.4 - replacement: '' - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" - min_score: 0.5 - replacement: " |||EMAIL_ADDRESS||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" - min_score: 0.5 - replacement: " |||PHONE_NUMBER||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" - min_score: 0.5 - replacement: " |||IP_ADDRESS||| " - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/cc-head.yaml b/configs/dolma-v1_5r2/mixing/cc-head.yaml deleted file mode 100644 index 1d7a1f40..00000000 --- a/configs/dolma-v1_5r2/mixing/cc-head.yaml +++ /dev/null @@ -1,127 +0,0 @@ -streams: - -- name: cc_en_head - documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz - - attributes: - - dedupe_paragraphs - - gopher_rules - - hatespeech_nsfw_cc_v3 - - pii_detection - - tokenizer_repetitions_v2r2 - - dedupe_docs_v2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - - filter: - include: [] - exclude: - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] - && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character - && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] - < 0.8)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] - && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] - > 0.9)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis - && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] - > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] - && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && - @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > - 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] - > 0.2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] - > 0.18)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] - > 0.16)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] - > 0.15)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] - > 0.14)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] - > 0.13)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] - > 0.12)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] - > 0.11)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] - > 0.10)]" - - - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && - @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] - > 5)]" - - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] - >= 100)]" - - # remove duplicate docs - - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] - && @.bff_duplicate_docs[0][2] >= 1.0)]" - span_replacement: - - span: "$.attributes.bff_duplicate_paragraph_spans" - min_score: 0.5 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" - min_score: 0.4 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" - min_score: 0.4 - replacement: '' - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" - min_score: 0.5 - replacement: " |||EMAIL_ADDRESS||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" - min_score: 0.5 - replacement: " |||PHONE_NUMBER||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" - min_score: 0.5 - replacement: " |||IP_ADDRESS||| " - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/cc-middle.yaml b/configs/dolma-v1_5r2/mixing/cc-middle.yaml deleted file mode 100644 index d6822ca9..00000000 --- a/configs/dolma-v1_5r2/mixing/cc-middle.yaml +++ /dev/null @@ -1,127 +0,0 @@ -streams: - -- name: cc_en_middle - documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz - - attributes: - - dedupe_paragraphs_v2 - - gopher_rules - - hatespeech_nsfw_cc_v3 - - pii_detection - - tokenizer_repetitions_v2r2 - - dedupe_docs_v2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - - filter: - include: [] - exclude: - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] - && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character - && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] - < 0.8)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] - && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] - > 0.9)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis - && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] - > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] - && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && - @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > - 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] - > 0.2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] - > 0.18)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] - > 0.16)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] - > 0.15)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] - > 0.14)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] - > 0.13)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] - > 0.12)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] - > 0.11)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] - > 0.10)]" - - - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && - @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] - > 5)]" - - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] - >= 100)]" - - # remove duplicate docs - - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] - && @.bff_duplicate_docs[0][2] >= 1.0)]" - span_replacement: - - span: "$.attributes.bff_duplicate_paragraph_spans" - min_score: 0.5 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" - min_score: 0.4 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" - min_score: 0.4 - replacement: '' - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" - min_score: 0.5 - replacement: " |||EMAIL_ADDRESS||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" - min_score: 0.5 - replacement: " |||PHONE_NUMBER||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" - min_score: 0.5 - replacement: " |||IP_ADDRESS||| " - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/cc-tail.yaml b/configs/dolma-v1_5r2/mixing/cc-tail.yaml deleted file mode 100644 index f652dfeb..00000000 --- a/configs/dolma-v1_5r2/mixing/cc-tail.yaml +++ /dev/null @@ -1,127 +0,0 @@ -streams: - -- name: cc_en_tail - documents: - - s3://ai2-llm/pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.json.gz - - attributes: - - dedupe_paragraphs - - gopher_rules - - hatespeech_nsfw_cc_v3 - - pii_detection - - tokenizer_repetitions_v2r2 - - dedupe_docs_v2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - - filter: - include: [] - exclude: - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] - && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] - && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] - && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character - && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] - < 0.8)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] - && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] - && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] - > 0.9)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis - && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] - > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] - && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && - @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > - 0.3)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] - > 0.2)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] - > 0.18)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] - > 0.16)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] - > 0.15)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] - > 0.14)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] - > 0.13)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] - > 0.12)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] - > 0.11)]" - - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] - && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] - > 0.10)]" - - - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && - @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] - > 5)]" - - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] - && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] - >= 100)]" - - # remove duplicate docs - - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] - && @.bff_duplicate_docs[0][2] >= 1.0)]" - span_replacement: - - span: "$.attributes.bff_duplicate_paragraph_spans" - min_score: 0.5 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_hatespeech_sentence_v2____label__toxic" - min_score: 0.4 - replacement: '' - - span: "$.attributes.hatespeech_nsfw_cc_v3__jigsaw_nsfw_sencence_v2____label__nsfw" - min_score: 0.4 - replacement: '' - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS" - min_score: 0.5 - replacement: " |||EMAIL_ADDRESS||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER" - min_score: 0.5 - replacement: " |||PHONE_NUMBER||| " - - span: "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS" - min_score: 0.5 - replacement: " |||IP_ADDRESS||| " - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/pes2o.yaml b/configs/dolma-v1_5r2/mixing/pes2o.yaml deleted file mode 100644 index 221937f5..00000000 --- a/configs/dolma-v1_5r2/mixing/pes2o.yaml +++ /dev/null @@ -1,46 +0,0 @@ ---- -streams: -- name: pes2o_v2 - documents: - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=0/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=1/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=2/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=3/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=4/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=5/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=6/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=7/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=8/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag/split=train/part_id=9/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=0/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=1/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=2/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=3/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=4/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=5/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=6/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=7/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=8/*.gz - - s3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc/split=train/part_id=9/*.gz - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - attributes: - - tokenizer_repetitions_v2r2 - - filter: - include: [] - exclude: - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2]>= 100)]" - # - "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] - # && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/reddit.yaml b/configs/dolma-v1_5r2/mixing/reddit.yaml deleted file mode 100644 index 9c85f4b0..00000000 --- a/configs/dolma-v1_5r2/mixing/reddit.yaml +++ /dev/null @@ -1,29 +0,0 @@ - -streams: -- name: reddit-v5-dedupe-pii-nsfw-toxic - - documents: - - s3://ai2-llm/pretraining-data/sources/reddit/v5-dedupe-pii-nsfw-toxic/documents/*.gz - - attributes: - - perplexity_suite_v3_option2 - - tokenizer_repetitions_v2r2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - filter: - exclude: - # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2] >= 100)]" - - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/stack.yaml b/configs/dolma-v1_5r2/mixing/stack.yaml deleted file mode 100644 index cba68e62..00000000 --- a/configs/dolma-v1_5r2/mixing/stack.yaml +++ /dev/null @@ -1,381 +0,0 @@ -streams: -- name: stack-v4-train - documents: - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/abap/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/actionscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ada/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/agda/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ags-script/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/alloy/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ampl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/antlr/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apacheconf/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/api-blueprint/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/apl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/applescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/arduino/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asciidoc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/asp/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/aspectj/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ats/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/augeas/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autohotkey/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/autoit/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/awk/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/batchfile/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/befunge/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bison/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bitbake/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzbasic/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/blitzmax/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bluespec/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/boo/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brainfuck/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/brightscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/bro/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c-sharp/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c++/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/c2hs-haskell/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cap'n-proto/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cartocss/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ceylon/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chapel/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/chuck/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cirru/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clarion/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clean/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/click/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clips/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/clojure/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cmake/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cobol/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coffeescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coldfusion-cfc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/common-lisp/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/component-pascal/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/coq/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/creole/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/crystal/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/csound/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/css/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cucumber/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cuda/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cycript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/cython/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/d/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/darcs-patch/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dart/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/desktop/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/diff/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/digital-command-language/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dm/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dns-zone/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dockerfile/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dogescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/dylan/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eagle/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ec/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecere-projects/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ecl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/edn/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/eiffel/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elixir/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/elm/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emacs-lisp/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/emberscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/erlang/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/f-sharp/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/factor/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fancy/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fantom/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fish/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/flux/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/forth/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/fortran/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/freemarker/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/g-code/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gams/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gap/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gas/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gdscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/genshi/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-ebuild/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gentoo-eclass/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gettext-catalog/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glsl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/glyph/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gnuplot/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/go/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/golo/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/gosu/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grace/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/grammatical-framework/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphql/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/graphviz-(dot)/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groff/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/groovy-server-pages/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/handlebars/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/harbour/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haskell/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/haxe/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hcl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hlsl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+django/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+eex/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+erb/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/html+php/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/http/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/hy/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/idris/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/igor-pro/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inform-7/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ini/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/inno-setup/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/io/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ioke/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/irc-log/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/isabelle/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/j/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jade/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jasmin/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/java-server-pages/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/javascript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jflex/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jsx/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/julia/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/jupyter-notebook/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kicad/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kit/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/kotlin/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/krl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/labview/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lasso/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/latte/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lean/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/less/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lex/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lfe/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lilypond/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/linker-script/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/liquid/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-agda/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-coffeescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/literate-haskell/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/livescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/llvm/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logos/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/logtalk/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lolcode/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lookml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lsl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/lua/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/m4/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/makefile/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mako/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maple/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/markdown/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mask/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mathematica/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/matlab/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/max/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/maxscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mediawiki/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/metal/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mirah/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/modelica/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/module-management-system/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/monkey/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/moonscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mtml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/muf/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/mupad/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/myghty/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nesc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlinx/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/netlogo/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nginx/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nimrod/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ninja/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nit/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nix/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nsis/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/nu/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/numpy/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objdump/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-c++/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/objective-j/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ocaml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/octave/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/omgrofl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ooc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opa/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opal/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/opencl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/openscad/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/org/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ox/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oxygene/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/oz/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pan/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/papyrus/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-assembly/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/parrot-internal-representation/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pascal/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pawn/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/perl6/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/php/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/piglatin/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pike/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pod/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pogoscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pony/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/postscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pov-ray-sdl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/powershell/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/processing/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/prolog/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/propeller-spin/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/protocol-buffer/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/pure-data/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purebasic/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/purescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/python-traceback/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qmake/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/qml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/r/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/racket/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ragel-in-ruby-host/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/raml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rdoc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/realbasic/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rebol/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/red/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/redcode/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ren'py/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/renderscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/restructuredtext/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rhtml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rmarkdown/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/robotframework/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rouge/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ruby/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/rust/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sage/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/saltstack/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sas/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sass/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scala/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scaml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scheme/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scilab/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/scss/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/self/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shell/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shellsession/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/shen/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slash/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/slim/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smali/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smalltalk/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smarty/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/smt/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/solidity/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sourcepawn/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sparql/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sqf/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/sql/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/squirrel/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stan/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/standard-ml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stata/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/ston/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/stylus/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/supercollider/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/swift/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/systemverilog/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tcsh/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tea/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/tex/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/text/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/textile/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/thrift/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/toml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turing/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/turtle/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/twig/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/txl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/typescript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unified-parallel-c/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unity3d-asset/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/uno/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/unrealscript/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/urweb/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vala/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vcl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/verilog/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vhdl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/viml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/visual-basic/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/volt/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/vue/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/web-ontology-language/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webassembly/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/webidl/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/wisp/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/x10/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xbase/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xojo/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xpages/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xproc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xquery/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xs/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xslt/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/xtend/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yacc/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yaml/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/yang/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zephir/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zig/*.gz - - s3://ai2-llm/pretraining-data/sources/stack-dedup/v4-train/documents/zimpl/*.gz - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - attributes: - # - perplexity_suite_v3_option2 - - dedupe_docs - - tokenizer_repetitions_v2r2 - - filter: - include: [] - exclude: - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2]>= 100)]" - - # remove duplicate docs - - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] && @.bff_duplicate_docs[0][2] >= 1.0)]" - - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/mixing/wiki.yaml b/configs/dolma-v1_5r2/mixing/wiki.yaml deleted file mode 100644 index 5c9eb7b4..00000000 --- a/configs/dolma-v1_5r2/mixing/wiki.yaml +++ /dev/null @@ -1,36 +0,0 @@ ---- -streams: -- name: en_simple_wiki_v0 - documents: - - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=en/*.gz - - s3://ai2-llm/pretraining-data/sources/wikipedia/v0/documents/lang=simple/*.gz - - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=en/*.gz - - s3://ai2-llm/pretraining-data/sources/wikibooks/v0/documents/lang=simple/*.gz - attributes: - # - perplexity_suite_v3_option2 - - olmo_mix_v1_taggers - - tokenizer_repetitions_v2r2 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki - max_size_in_bytes: 4294967296 - min_text_length: 1 - discard_fields: - - attributes - - filter: - exclude: - - "$.attributes[?(@.olmo_mix_v1_taggers__uniseg_length_paragraphs_with_doc_length_v1__document[0][2] < 25)]" - - # 100+ repetitions - - "$.attributes[?(@.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0] && @.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition[0][2]>= 100)]" - - # remove duplicate docs - # - "$@.attributes[?(@.bff_duplicate_docs && @.bff_duplicate_docs[0] && @.bff_duplicate_docs[0][2] >= 1.0)]" - - # - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" - -work_dir: - input: "/tmp/olmo-mix-v1_5/input" - output: "/tmp/olmo-mix-v1_5/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/sample.yaml b/configs/dolma-v1_5r2/sample.yaml deleted file mode 100644 index 3ad0a66c..00000000 --- a/configs/dolma-v1_5r2/sample.yaml +++ /dev/null @@ -1,31 +0,0 @@ ---- -streams: -- name: v1_5r2_sample - documents: - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.json.gz - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.json.gz - attributes: - - random_number_v1 - - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2_03p_sample/documents/ - max_size_in_bytes: 53_687_091_200 - min_text_length: 1 - discard_fields: - - attributes - - filter: - exclude: - - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] >= 0.003)]" - -work_dir: - input: "/tmp/olmo-mix-v1_5r2/input" - output: "/tmp/olmo-mix-v1_5r2/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/sample/cc-head.yaml b/configs/dolma-v1_5r2/sample/cc-head.yaml deleted file mode 100644 index fbe474b4..00000000 --- a/configs/dolma-v1_5r2/sample/cc-head.yaml +++ /dev/null @@ -1,18 +0,0 @@ - -streams: -- name: cc_en_head - documents: - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2-sample/documents/cc_en_head - max_size_in_bytes: 3894967296 - attributes: - - random_number_v1 - filter: - include: - - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" - -work_dir: - input: "/tmp/cc-head-sample/mixer/input" - output: "/tmp/cc-head-sample/mixer/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/sample/cc-middle.yaml b/configs/dolma-v1_5r2/sample/cc-middle.yaml deleted file mode 100644 index 0f4d1aff..00000000 --- a/configs/dolma-v1_5r2/sample/cc-middle.yaml +++ /dev/null @@ -1,18 +0,0 @@ - -streams: -- name: cc_en_middle - documents: - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_middle - max_size_in_bytes: 3894967296 - attributes: - - random_number_v1 - filter: - include: - - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" - -work_dir: - input: "/tmp/cc-head-sample/mixer/input" - output: "/tmp/cc-head-sample/mixer/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/sample/cc-tail.yaml b/configs/dolma-v1_5r2/sample/cc-tail.yaml deleted file mode 100644 index d07547a3..00000000 --- a/configs/dolma-v1_5r2/sample/cc-tail.yaml +++ /dev/null @@ -1,18 +0,0 @@ - -streams: -- name: cc_en_tail - documents: - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz - output: - path: s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5-sample/documents/cc_en_tail - max_size_in_bytes: 3894967296 - attributes: - - random_number_v1 - filter: - include: - - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.5104606781)]" - -work_dir: - input: "/tmp/cc-head-sample/mixer/input" - output: "/tmp/cc-head-sample/mixer/output" -processes: 188 diff --git a/configs/dolma-v1_5r2/tokenizer.yaml b/configs/dolma-v1_5r2/tokenizer.yaml deleted file mode 100644 index 25ebedd7..00000000 --- a/configs/dolma-v1_5r2/tokenizer.yaml +++ /dev/null @@ -1,7 +0,0 @@ -destination: s3://ai2-llm/preprocessed/olmo-mix/v1_5r2/gpt-neox-olmo-dolma-v1_5 -documents: - - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/*/*.json.gz -processes: 188 -seed: 3920 -max_size: 20_000_000_000 -tokenizer_name_or_path: allenai/gpt-neox-olmo-dolma-v1_5