Skip to content

Commit

Permalink
Text Modification Config
Browse files Browse the repository at this point in the history
  • Loading branch information
rodneykinney committed Oct 19, 2023
1 parent 51c4e08 commit a1a58ec
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 143 deletions.
21 changes: 11 additions & 10 deletions configs/c4-replication/mixer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,16 @@ streams:
# exclude documents that contain one or more naughty words
- $.attributes[?(@.c4_v2__c4_v2__has_naughty_word && @.c4_v2__c4_v2__has_naughty_word[0] && @.c4_v2__c4_v2__has_naughty_word[0][2] > 0.5)]

span_replacement:
# remove lines that do not end in punctuation
- span: $.attributes.c4_v2__c4_v2__lines_with_no_ending_punctuation
min_score: 0.5
replacement: ""

# remove lines that are too short (less than 3 words as defined by C4 rules)
- span: $.attributes.c4_v2__c4_v2__lines_with_too_few_words
min_score: 0.5
replacement: ""
text_modification:
span_replacement:
# remove lines that do not end in punctuation
- span: $.attributes.c4_v2__c4_v2__lines_with_no_ending_punctuation
min_score: 0.5
replacement: ""

# remove lines that are too short (less than 3 words as defined by C4 rules)
- span: $.attributes.c4_v2__c4_v2__lines_with_too_few_words
min_score: 0.5
replacement: ""

processes: 8
9 changes: 5 additions & 4 deletions docs/examples/wikipedia-mixer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ streams:
- "$.attributes[?(@.exp__ft_lang_id_en_paragraph_with_doc_score_v2__doc_en[0][2] <= 0.5)]"
- "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"

span_replacement:
- span: "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en"
min_score: 0.1
replacement: ''
text_modification:
span_replacement:
- span: "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en"
min_score: 0.1
replacement: ''

processes: 1
20 changes: 11 additions & 9 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,17 @@ Further, we override the number of processes to use to 96 using the `--processes
"$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
]
},
# span replacement allows you to replace spans of text with a different string
"span_replacement": [
{
# remove paragraphs whose not-English cld2 score is 0.1 or higher in a document
"span": "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en",
"min_score": 0.1,
"replacement": ""
}
]
"text_modification": {
# span replacement allows you to replace spans of text with a different string
"span_replacement": [
{
# remove paragraphs whose not-English cld2 score is 0.1 or higher in a document
"span": "$.attributes.exp__cld2_en_paragraph_with_doc_score_v2__not_en",
"min_score": 0.1,
"replacement": ""
}
]
}
}
],
# this process option is overridden by the command line flag
Expand Down
9 changes: 5 additions & 4 deletions docs/mixer.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ The following parameters are supported either via CLI (e.g. `dolma mix --paramet
|`streams[].output.discard_fields`|No| Top-level fields in the `discard_fields` list will be dropped from the output documents. |
|`streams[].filter.include`|No| Optional content-based filtering. Default = keep everything. Documents are retained if they match any of the `include` patterns (or if no `include` patterns are specified) AND if they match none of the `exclude` patterns. Pattern syntax is [jsonpath](https://support.smartbear.com/alertsite/docs/monitors/api/endpoint/jsonpath.html#filters). |
|`streams[].filter.exclude`|No| Optional content-based filtering. Default = keep everything. Documents are retained if they match any of the `include` patterns (or if no `include` patterns are specified) AND if they match none of the `exclude` patterns. Pattern syntax is [jsonpath](https://support.smartbear.com/alertsite/docs/monitors/api/endpoint/jsonpath.html#filters). |
|`streams[].span_replacement`|No| A list of objects specifying spans of text to be replaced. |
|`streams[].span_replacement[].span`|No| A json-path expression for an attribute that contains an array of spans. Each span should be a list of length three: `[start, end, score]`. |
|`streams[].span_replacement[].min_score`|No| If the span score is less than this value, the span will not be replaced. |
|`streams[].span_replacement[].replacement`|No| The text that should be inserted in place of the span. Use `{}` to represent the original text. |
|`streams[].text_modification.trim_whitespace`|No| Remove leading and trailing whitespace from document text. |
|`streams[].text_modification.minimum_text_length`|No| Skip writing the document if the final text is shorter than this size (in bytes). |
|`streams[].text_modification.span_replacement[].span`|No| A json-path expression for an attribute that contains an array of spans. Each span should be a list of length three: `[start, end, score]`. |
|`streams[].text_modification.span_replacement[].min_score`|No| If the span score is less than this value, the span will not be replaced. |
|`streams[].text_modification.span_replacement[].replacement`|No| The text that should be inserted in place of the span. Use `{}` to represent the original text. |
|`work_dir.input`|No| Path to a local scratch directory where temporary input files can be placed. If not provided, Dolma will make one for you and delete it upon completion. |
|`work_dir.output`|No| Path to a local scratch directory where temporary output files can be placed. If not provided, Dolma will make one for you and delete it upon completion. |
|`processes`|No| Number of processes to use for mixing. By default 1 process is used. |
Expand Down
43 changes: 31 additions & 12 deletions python/dolma/cli/mixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ class SpanReplacementConfig:
replacement: str = field(default="", help="Replacement for the span")


@dataclass
class TextModificationConfig:
    """Options describing how a stream's document text is modified.

    Groups the span-replacement rules with two further text options:
    whitespace trimming and a minimum final text length.
    """

    # Span-replacement rules; the default is an empty list (no rules).
    # NOTE(review): `field` accepts `help=`, so it is presumably the project's
    # own helper rather than `dataclasses.field` — confirm it copies the list
    # default per instance instead of sharing one list object.
    span_replacement: List[SpanReplacementConfig] = field(default=[], help="Configuration for replacing spans.")
    # Off by default; per the help text, trimming applies after span replacement.
    trim_whitespace: bool = field(
        default=False, help="If true, trim leading and trailing whitespace from text (after span replacement)"
    )
    # Size threshold in bytes (per the help text); defaults to 0.
    minimum_text_length: int = field(
        default=0, help="Skip writing the document if the final text is shorter than this size (in bytes)"
    )


@dataclass
class StreamConfig:
name: str = field(help="Name of the stream. Required.")
Expand All @@ -42,7 +53,9 @@ class StreamConfig:
filter: Optional[FilterConfig] = field( # pyright: ignore
default=None, help="Configuration for filtering documents."
)
span_replacement: List[SpanReplacementConfig] = field(default=[], help="Configuration for replacing spans.")
text_modification: Optional[TextModificationConfig] = field(
default=None, help="Configuration for modifying the document text"
)


@dataclass
Expand Down Expand Up @@ -83,17 +96,23 @@ def run(cls, parsed_config: MixerConfig):
"exclude": [str(i) for i in stream_config.filter.exclude],
}

for span_replacement in stream_config.span_replacement:
stream_config_dict.setdefault("span_replacement", []).append(
{
"span": str(span_replacement.span),
"min_score": float(span_replacement.min_score),
"replacement": str(span_replacement.replacement),
}
)

if "span_replacement" not in stream_config_dict and "filter" not in stream_config_dict:
raise DolmaConfigError("Either `filter` or `span_replacement` must be specified")
if stream_config.text_modification is not None:
text_modification_dict = {
"trim_whitespace": stream_config.text_modification.trim_whitespace,
"minimum_text_length": stream_config.text_modification.minimum_text_length,
}
stream_config_dict["text_modification"] = text_modification_dict
for span_replacement in stream_config.text_modification.span_replacement:
text_modification_dict.setdefault("span_replacement", []).append(
{
"span": str(span_replacement.span),
"min_score": float(span_replacement.min_score),
"replacement": str(span_replacement.replacement),
}
)

if "text_modification" not in stream_config_dict and "filter" not in stream_config_dict:
raise DolmaConfigError("Either `filter` or `text_modification` must be specified")

# perform some path validation to make sure we don't call the mixer with invalid config
total_matching_documents = 0
Expand Down
Loading

0 comments on commit a1a58ec

Please sign in to comment.