Skip to content

Commit

Permalink
Merge pull request #353 from HLasse/fix-quality-threshold-not-set
Browse files: browse the repository at this point in the history
fix: `contains` and `symbols` not updated in `Quality`
  • Loading branch information
KennethEnevoldsen authored May 7, 2024
2 parents e470db7 + 7915eeb commit cfe9b87
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 16 deletions.
20 changes: 5 additions & 15 deletions src/textdescriptives/components/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,8 +367,6 @@ def __init__( # pylint: disable=dangerous-default-value
self,
nlp: Language,
name: str,
symbols: List[str],
contains: List[str],
top_ngram_range: Tuple[int, int],
top_ngram_min_count: int,
duplicate_n_gram_fraction_range: Tuple[int, int],
Expand All @@ -379,14 +377,14 @@ def __init__( # pylint: disable=dangerous-default-value
"""Initialise components."""
self.name = name
self.force = force
self.symbols = symbols
self.contains = contains
self.top_ngram_range = top_ngram_range
self.top_ngram_min_count = top_ngram_min_count
self.duplicate_n_gram_fraction_range = duplicate_n_gram_fraction_range

if quality_thresholds is None:
quality_thresholds = QualityThresholds()
self.quality_thresholds = quality_thresholds
self.set_quality_thresholds(quality_thresholds)

self.vocab = vocab

self.set_extensions()
Expand Down Expand Up @@ -560,6 +558,8 @@ def set_quality_thresholds(self, thresholds: QualityThresholds) -> None:
thresholds (QualityThresholds): The desired quality thresholds.
"""
self.quality_thresholds = thresholds
self.contains = list(self.quality_thresholds.contains.keys())
self.symbols = list(self.quality_thresholds.symbol_to_word_ratio.keys())

def __call__(self, doc: Doc):
"""Run the pipeline component."""
Expand All @@ -576,8 +576,6 @@ def __call__(self, doc: Doc):
"span._.passed_quality_check",
],
default_config={
"symbols": ["#"],
"contains": ["lorem ipsum"],
"top_ngram_range": [2, 4],
"top_ngram_min_count": 3,
"duplicate_n_gram_fraction_range": [5, 10],
Expand All @@ -588,8 +586,6 @@ def __call__(self, doc: Doc):
def create_quality_component(
nlp: Language,
name: str,
symbols: List[str],
contains: List[str],
top_ngram_range: Tuple[int, int],
top_ngram_min_count: int,
duplicate_n_gram_fraction_range: Tuple[int, int],
Expand Down Expand Up @@ -628,10 +624,6 @@ def create_quality_component(
nlp.add_pipe call.
name (str): name of the component. Can be optionally specified in the
nlp.add_pipe call, using the name argument.
symbols (List[str]): list of symbols for which to calculate the
proportion the ratio of symbols to words. Defaults to ["#"].
contains (List[str]): list of strings for which to check whether the
document contains them. Defaults to ["lorem ipsum"].
top_ngram_range (Tuple[int]): range of n-grams to calculate the
proportion of the top n-gram. Defaults to [2, 4].
top_ngram_min_count (int): minimum number of times a n-gram must occur to
Expand Down Expand Up @@ -663,8 +655,6 @@ def create_quality_component(
return Quality(
nlp,
name=name,
symbols=symbols,
contains=contains,
top_ngram_range=top_ngram_range,
top_ngram_min_count=top_ngram_min_count,
duplicate_n_gram_fraction_range=duplicate_n_gram_fraction_range,
Expand Down
1 change: 0 additions & 1 deletion tests/test_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@ def test_quality_component_with_config(nlp: spacy.Language):
quality_pipe = nlp.add_pipe(
"textdescriptives/quality",
config={
"symbols": ["."],
"force": True,
},
)
Expand Down

0 comments on commit cfe9b87

Please sign in to comment.