From 1a6aec7bb0fe670874a8605944a99cf20b299144 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Wed, 25 Sep 2024 10:44:28 -0700 Subject: [PATCH 1/2] DNM: Patch FT Tagger --- python/dolma/core/ft_tagger.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/dolma/core/ft_tagger.py b/python/dolma/core/ft_tagger.py index 53bcc031..016125ad 100644 --- a/python/dolma/core/ft_tagger.py +++ b/python/dolma/core/ft_tagger.py @@ -15,8 +15,8 @@ from fasttext import train_supervised from fasttext.FastText import _FastText -from .data_types import DocResult, Document, Span, TextSlice -from .taggers import BaseTagger +from .data_types import DocResult, Document, DocumentWithMetadata, Span, TextSlice +from .taggers import BaseTaggerWithMetadata from .utils import split_paragraphs, split_sentences @@ -25,7 +25,7 @@ class Prediction(NamedTuple): score: float -class BaseFastTextTagger(BaseTagger): +class BaseFastTextTagger(BaseTaggerWithMetadata): SENTENCE_LEVEL_TAGGER = "sentence" PARAGRAPH_LEVEL_TAGGER = "paragraph" DOCUMENT_LEVEL_TAGGER = "document" @@ -135,13 +135,13 @@ def test( model_performance = classifier.test(local_test_file) print(model_performance) - def predict(self, doc: Document) -> DocResult: + def predict(self, doc: DocumentWithMetadata) -> DocResult: if self.mode == self.SENTENCE_LEVEL_TAGGER: units = split_sentences(doc.text) elif self.mode == self.PARAGRAPH_LEVEL_TAGGER: units = split_paragraphs(doc.text) elif self.mode == self.DOCUMENT_LEVEL_TAGGER: - units = [TextSlice(doc=doc.text, start=0, end=len(doc.text))] + units = [TextSlice(doc=doc.metadata["original_text"], start=0, end=len(doc.metadata["original_text"]))] else: raise ValueError(f"Unknown mode {self.mode}") From ba87b1c69d0b8617a81eabc6d3ca8249013ba3d0 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Mon, 7 Oct 2024 15:45:26 -0700 Subject: [PATCH 2/2] Tag og text --- python/dolma/taggers/length.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/dolma/taggers/length.py b/python/dolma/taggers/length.py index a1420c7b..8b379421 100644 --- a/python/dolma/taggers/length.py +++ b/python/dolma/taggers/length.py @@ -12,9 +12,9 @@ import uniseg.wordbreak from tokenizers import Regex, Tokenizer, pre_tokenizers -from ..core.data_types import DocResult, Document, Span, TextSlice +from ..core.data_types import DocResult, Document, DocumentWithMetadata, Span, TextSlice from ..core.registry import TaggerRegistry -from ..core.taggers import BaseTagger +from ..core.taggers import BaseTagger, BaseTaggerWithMetadata from ..core.utils import split_paragraphs @@ -161,16 +161,17 @@ def predict(self, doc: Document) -> DocResult: @TaggerRegistry.add("dolma_v1_tokenizer") -class DolmaV1Tokenizer(BaseTagger): +class DolmaV1Tokenizer(BaseTaggerWithMetadata): TOKENIZER_NAME_OR_PATH = "allenai/gpt-neox-olmo-dolma-v1_5" def __init__(self) -> None: self.tokenizer = Tokenizer.from_pretrained(self.TOKENIZER_NAME_OR_PATH) super().__init__() - def predict(self, doc: Document) -> DocResult: - score = len(self.tokenizer.encode(text)) if (text := doc.text.strip()) else 0 - return DocResult(doc=doc, spans=[Span(start=0, end=len(doc.text), type="length", score=score)]) + def predict(self, doc: DocumentWithMetadata) -> DocResult: + text = doc.metadata["original_text"].strip() + score = len(self.tokenizer.encode(text)) if (text) else 0 + return DocResult(doc=doc, spans=[Span(start=0, end=len(text), type="length", score=score)]) @TaggerRegistry.add("dolma_v2_tokenizer")