From 41e95f66d15a5c50599a14529d3966868944a5b6 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Tue, 11 Feb 2025 16:30:50 -0800 Subject: [PATCH] simpler logic for calculating code taggers --- python/dolma/taggers/code/__init__.py | 2 + python/dolma/taggers/code/code_taggers.py | 48 +++++++++++++++++++++++ tests/python/test_code.py | 2 +- 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/python/dolma/taggers/code/__init__.py b/python/dolma/taggers/code/__init__.py index 13117d6c..48343fd4 100644 --- a/python/dolma/taggers/code/__init__.py +++ b/python/dolma/taggers/code/__init__.py @@ -1,5 +1,6 @@ from .code_taggers import ( CodeCopyrightTagger, + CodeDolma2Taggers, CodeRedPajamaTaggers, CodeSecretsTagger, CodeStarCoderTaggers, @@ -12,4 +13,5 @@ "CodeRedPajamaTaggers", "CodeStarCoderTaggers", "CodeStarCoderTaggers2", + "CodeDolma2Taggers" ] diff --git a/python/dolma/taggers/code/code_taggers.py b/python/dolma/taggers/code/code_taggers.py index 31b57087..e2a72fa0 100644 --- a/python/dolma/taggers/code/code_taggers.py +++ b/python/dolma/taggers/code/code_taggers.py @@ -269,3 +269,51 @@ def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore spans.append(Span(start=0, end=doc_length, type="code_to_text_ratio_html_doc", score=code_to_text_ratio)) return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("code_dolma_taggers_v1") +class CodeDolma2Taggers(BaseTaggerWithMetadata): + """ + Based on StarCoder v1 and v2 taggers, but simpler because it + always calculates the code_to_comment_ratio_doc and code_to_text_ratio. + + Language-specific logic (e.g. Python? HTML?) is offloaded to mixer. + """ + + def __init__(self) -> None: + check_code_dependencies() + self.ext_to_lang_mapping = get_ext_to_lang_mapping() + super().__init__() + + def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore + spans: List[Span] = [] + doc_length = len(doc.text) + + has_xml_template = 1.0 if "