# Reconstructed from git patch 199cb241e0d9adfc1a40e772c41df08432c5bedc
# ("[PATCH] Statistics widget", Primož Godec, Sun, 1 Mar 2020 12:59:20 +0100),
# which creates orangecontrib/text/widgets/owstatistics.py.
"""Widget that appends simple per-document token statistics to a corpus."""
import re
from copy import copy
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np

from Orange.widgets import gui
from Orange.widgets.settings import DomainContextHandler
from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
from Orange.widgets.widget import OWWidget
from AnyQt.QtCore import QSize
from AnyQt.QtWidgets import QGridLayout, QLabel, QLineEdit, QComboBox

from orangecontrib.text import Corpus
from orangewidget.settings import Setting, ContextSetting
from orangewidget.utils.signals import Input, Output
from orangewidget.utils.widgetpreview import WidgetPreview


# every statistic returns a np.ndarray with statistics
# and list with variables names - it must be implemented here since some
# statistics will have more variables (count of each pos tag)


def _count_per_document(
    corpus: Corpus, count: Callable[[List[str]], int], callback: Callable
) -> np.ndarray:
    """
    Apply `count` to the token list of every document in `corpus`.

    `callback` is invoked once per document, before the document is counted,
    so the caller can report progress.

    Returns
    -------
    Column vector with one row per document.
    """
    def counted(tokens: List[str]) -> int:
        callback()
        return count(tokens)

    # np.c_ makes column vector (ndarray) out of the list
    # [1, 2, 3] -> [[1], [2], [3]]
    return np.c_[list(map(counted, corpus.tokens))]


def number_of_words(corpus: Corpus, text: str, callback: Callable):
    """
    Count the tokens of each document. `text` is unused; it is accepted so
    that all statistics share the same signature.
    """
    # TODO: discuss if ok on tokens
    return _count_per_document(corpus, len, callback), ["Words count"]


def characters_count(corpus: Corpus, text: str, callback: Callable):
    """
    Sum the lengths of the tokens of each document. `text` is unused; it is
    accepted so that all statistics share the same signature.
    """
    # TODO: discuss if ok on tokens
    return (
        _count_per_document(
            corpus, lambda tokens: sum(len(t) for t in tokens), callback
        ),
        ["Characters count"]
    )


def starts_with(corpus: Corpus, prefix: str, callback: Callable):
    """Count the tokens of each document that start with `prefix`."""
    return (
        _count_per_document(
            corpus,
            lambda tokens: sum(t.startswith(prefix) for t in tokens),
            callback
        ),
        [f"Starts with {prefix}"]
    )


def ends_with(corpus: Corpus, postfix: str, callback: Callable):
    """Count the tokens of each document that end with `postfix`."""
    return (
        _count_per_document(
            corpus,
            lambda tokens: sum(t.endswith(postfix) for t in tokens),
            callback
        ),
        [f"Ends with {postfix}"]
    )


def contains(corpus: Corpus, text: str, callback: Callable):
    """Count the tokens of each document that contain `text`."""
    return (
        _count_per_document(
            corpus, lambda tokens: sum(text in t for t in tokens), callback
        ),
        [f"Contains {text}"]
    )


def regex(corpus: Corpus, expression: str, callback: Callable):
    """
    Count the tokens of each document that match `expression`.

    Matching uses `Pattern.match`, i.e. the expression is anchored at the
    beginning of the token. An invalid expression makes `re.compile` raise
    `re.error`.
    """
    # compiled once, outside of the per-document loop
    pattern = re.compile(expression)
    return (
        _count_per_document(
            corpus,
            lambda tokens: sum(bool(pattern.match(t)) for t in tokens),
            callback
        ),
        [f"Regex {expression}"]
    )


STATISTICS = [
    # (name of the statistics, function to compute, needs text box)
    ("Words count", number_of_words, False),
    ("Characters count", characters_count, False),
    ("Starts with", starts_with, True),
    ("Ends with", ends_with, True),
    ("Contains", contains, True),
    ("Regex", regex, True)
]
# the columns of STATISTICS as three parallel lists
STATISTICS_NAMES, STATISTICS_FUNCTIONS, STATISTICS_NEEDS_TB = map(
    list, zip(*STATISTICS)
)


def run(
    corpus: Corpus, statistics: List[Tuple[int, str]], state: TaskState
) -> None:
    """
    This function runs the computation for new features.
    All results will be reported as a partial results.

    Parameters
    ----------
    corpus
        The corpus on which the computation is held.
    statistics
        List of statistic pairs to be computed:
        (statistics id, string pattern)
    state
        State used to report progress and partial results.
    """
    # callback is called for each corpus element statistics time
    tick_values = iter(np.linspace(0, 100, len(corpus) * len(statistics)))

    def advance():
        state.set_progress_value(next(tick_values))

    for statistic, pattern in statistics:
        result = STATISTICS_FUNCTIONS[statistic](corpus, pattern, advance)
        state.set_partial_result((statistic, pattern, result))


class OWStatistics(OWWidget, ConcurrentWidgetMixin):
    """
    Widget with an editable list of (statistic, pattern) rules. On Apply the
    rules are computed in a background task and the resulting columns are
    appended to a copy of the input corpus.
    """
    name = "Statistics"
    description = "Create new statistic variables for documents."
    keywords = []

    class Inputs:
        corpus = Input("Corpus", Corpus)

    class Outputs:
        corpus = Output("Corpus", Corpus)

    want_main_area = False
    settingsHandler = DomainContextHandler()

    # settings
    active_rules: List[Tuple[int, str]] = ContextSetting([(0, ""), (1, "")])
    # rules active at time of apply clicked
    applied_rules: Optional[List[Tuple[int, str]]] = None
    # NOTE(review): this setting is not read anywhere in this file
    autocommit = Setting(True)

    # cache of computed statistics, keyed by rule; created per instance in
    # __init__ so that widgets never share one dictionary
    result_dict: Dict[Tuple[int, str], Tuple[np.ndarray, List[str]]]

    def __init__(self):
        OWWidget.__init__(self)
        ConcurrentWidgetMixin.__init__(self)
        self.corpus = None
        self.result_dict = {}

        # the list with combos from the widget
        self.combos = []
        # the list with line edits from the widget
        self.line_edits = []
        # the list of buttons in front of controls that removes them
        self.remove_buttons = []

        self._init_controls()

    def _init_controls(self):
        """Build the rules box and the Apply button below it."""
        self._init_statistics_box()
        box = gui.hBox(self.controlArea)
        gui.rubber(box)
        gui.button(
            box, self, "Apply",
            autoDefault=False,
            width=180,
            callback=self.apply
        )

    def _init_statistics_box(self):
        """Build the grid that holds one row of controls per rule."""
        patternbox = gui.vBox(self.controlArea, box=True)
        self.rules_box = QGridLayout()
        patternbox.layout().addLayout(self.rules_box)
        box = gui.hBox(patternbox)
        gui.button(
            box, self, "+", callback=self._add_row, autoDefault=False,
            flat=True,
            minimumSize=(QSize(20, 20)))
        gui.rubber(box)
        self.rules_box.setColumnMinimumWidth(1, 70)
        self.rules_box.setColumnMinimumWidth(0, 10)
        self.rules_box.setColumnStretch(0, 1)
        self.rules_box.setColumnStretch(1, 1)
        self.rules_box.setColumnStretch(2, 100)
        self.rules_box.addWidget(QLabel("Feature"), 0, 1)
        self.rules_box.addWidget(QLabel("Pattern"), 0, 2)
        self._update_rules()

    def adjust_n_rule_rows(self):
        """
        Add or remove rows of controls so that there is one per active rule,
        then copy the rules into the controls.
        """
        def _add_line():
            # row 0 of the grid holds the column labels
            n_lines = len(self.combos) + 1

            # add delete symbol
            button = gui.button(
                None, self, label='×', flat=True, height=20,
                styleSheet='* {font-size: 16pt; color: silver}'
                           '*:hover {color: black}',
                autoDefault=False, callback=self._remove_row)
            button.setMinimumSize(QSize(12, 20))
            self.rules_box.addWidget(button, n_lines, 0)
            self.remove_buttons.append(button)

            # add statistics type dropdown
            combo = QComboBox()
            combo.addItems(STATISTICS_NAMES)
            combo.currentIndexChanged.connect(self._sync_edit_combo)
            self.rules_box.addWidget(combo, n_lines, 1)
            self.combos.append(combo)

            # add line edit for pattern
            line_edit = QLineEdit()
            self.rules_box.addWidget(line_edit, n_lines, 2)
            line_edit.textChanged.connect(self._sync_edit_line)
            self.line_edits.append(line_edit)

        def _remove_line():
            self.combos.pop().deleteLater()
            self.line_edits.pop().deleteLater()
            self.remove_buttons.pop().deleteLater()

        def _sync_controls():
            # TODO: write it differently - check create class
            # Setting the index/text emits the signals connected in
            # _add_line, so _sync_edit_combo/_sync_edit_line run from here.
            for rule, combo, line_edit in zip(
                    self.active_rules, self.combos, self.line_edits
            ):
                combo.setCurrentIndex(rule[0])  # update combo
                line_edit.setText(rule[1])  # update line edit
                # the pattern is shown only for statistics that use it
                line_edit.setVisible(STATISTICS_NEEDS_TB[rule[0]])

        n = len(self.active_rules)
        while n > len(self.combos):
            _add_line()
        while len(self.combos) > n:
            _remove_line()
        _sync_controls()

    def _update_rules(self):
        self.adjust_n_rule_rows()

    def _add_row(self):
        """Append a default rule (first statistic, empty pattern)."""
        self.active_rules.append((0, ""))
        self.adjust_n_rule_rows()

    def _remove_row(self):
        """Remove the rule whose delete button emitted the signal."""
        remove_idx = self.remove_buttons.index(self.sender())
        del self.active_rules[remove_idx]
        self.adjust_n_rule_rows()

    def _sync_edit_combo(self):
        """Store the statistic selected in the emitting combo box."""
        combo = self.sender()
        edit_index = self.combos.index(combo)
        self.active_rules[edit_index] = (
            combo.currentIndex(), self.active_rules[edit_index][1]
        )
        self.adjust_n_rule_rows()

    def _sync_edit_line(self):
        """Store the pattern typed in the emitting line edit."""
        line_edit = self.sender()
        edit_index = self.line_edits.index(line_edit)
        self.active_rules[edit_index] = (
            self.active_rules[edit_index][0], line_edit.text()
        )

    @Inputs.corpus
    def set_data(self, corpus):
        """Store the new corpus and drop everything computed for the old."""
        # a task still running for the previous corpus would otherwise put
        # its partial results in the cache of the new one
        self.cancel()
        self.corpus = corpus
        self.result_dict = {}  # empty computational results when new data
        if corpus is None:
            # input was removed - do not keep the old corpus on the output
            self.Outputs.corpus.send(None)

    def apply(self):
        """
        This function is called when user click apply button. It starts
        the computation. When computation is finished results are shown
        on the output - on_done.
        """
        self.applied_rules = copy(self.active_rules)
        self.cancel()  # cancel task since user clicked apply again
        if self.corpus is None:
            # nothing to compute on; run() would fail on len(None)
            self.Outputs.corpus.send(None)
            return
        # dict.fromkeys drops repeated rules (keeping order) so that each
        # rule which is not cached yet is computed only once
        rules_to_compute = [
            r for r in dict.fromkeys(self.active_rules)
            if r not in self.result_dict
        ]
        self.start(run, self.corpus, rules_to_compute)

    def on_exception(self, exception) -> None:
        raise exception

    def on_partial_result(
        self, result: Tuple[int, str, Tuple[np.ndarray, List[str]]]
    ):
        """Cache the result of one rule as reported by `run`."""
        statistic, pattern, result = result
        self.result_dict[(statistic, pattern)] = result

    def on_done(self, result: None):
        """Send the output and drop cached results that are not needed."""
        # join results
        self.output_results()

        # remove unnecessary results from dict - it can happen that user
        # already removes the statistic from gui but it is still computed
        # (a new dict is built since deleting keys while iterating over the
        # dict raises RuntimeError)
        self.result_dict = {
            rule: res for rule, res in self.result_dict.items()
            if rule in self.active_rules
        }

    def output_results(self):
        """
        Send a copy of the corpus extended with the columns of all applied
        rules for which a result is available.
        """
        if self.corpus is None or self.applied_rules is None:
            self.Outputs.corpus.send(None)
            return

        # only the new columns are collected here: extend_attributes adds
        # them to the attributes the corpus already has
        to_stack = []
        attributes = []
        for rule in self.applied_rules:
            # check for safety reasons - in practice should not happen
            if rule in self.result_dict:
                data, variables = self.result_dict[rule]
                to_stack.append(data)
                attributes += variables
        # here we will use extend_attributes function - this function add
        # attributes to existing corpus so it must be copied first
        # TODO: when change of preprocessing is finished change this function
        #  to have inplace parameter which is False by default,
        #  also I would prefer extend_attributes where you give variables
        #  instead of strings on input
        new_corpus = self.corpus.copy()
        if to_stack:  # np.hstack raises ValueError on an empty list
            new_corpus.extend_attributes(np.hstack(to_stack), attributes)
        self.Outputs.corpus.send(new_corpus)


if __name__ == "__main__":
    WidgetPreview(OWStatistics).run(Corpus.from_file('book-excerpts'))