From 199cb241e0d9adfc1a40e772c41df08432c5bedc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Sun, 1 Mar 2020 12:59:20 +0100
Subject: [PATCH] Statistics widget
---
orangecontrib/text/widgets/owstatistics.py | 329 +++++++++++++++++++++
1 file changed, 329 insertions(+)
create mode 100644 orangecontrib/text/widgets/owstatistics.py
diff --git a/orangecontrib/text/widgets/owstatistics.py b/orangecontrib/text/widgets/owstatistics.py
new file mode 100644
index 000000000..366efc56b
--- /dev/null
+++ b/orangecontrib/text/widgets/owstatistics.py
@@ -0,0 +1,329 @@
+import re
+from copy import copy
+from typing import List, Tuple, Callable
+
+import numpy as np
+
+from Orange.widgets import gui
+from Orange.widgets.settings import DomainContextHandler
+from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
+from Orange.widgets.widget import OWWidget
+from AnyQt.QtCore import QSize
+from AnyQt.QtWidgets import QGridLayout, QLabel, QLineEdit, QComboBox
+
+from orangecontrib.text import Corpus
+from orangewidget.settings import Setting, ContextSetting
+from orangewidget.utils.signals import Input, Output
+from orangewidget.utils.widgetpreview import WidgetPreview
+
+
+# every statistic returns a np.ndarray with the computed values and a list
+# with variable names - the names are produced here since some statistics
+# will yield more than one variable (e.g. a count for each POS tag)
+
+
+def number_of_words(corpus: Corpus, text: str, callback: Callable):
+    """Count tokens in every document; `text` is ignored by this statistic."""
+    # TODO: discuss if ok on tokens
+    counts = []
+    for doc_tokens in corpus.tokens:
+        callback()  # one progress tick per document
+        counts.append(len(doc_tokens))
+    return np.c_[counts], ["Words count"]  # np.c_: [1, 2] -> [[1], [2]]
+
+
+def characters_count(corpus: Corpus, text: str, callback: Callable):
+    """Sum token lengths for every document; `text` is not used here."""
+    # TODO: discuss if ok on tokens
+    totals = []
+    for doc_tokens in corpus.tokens:
+        callback()  # one progress tick per document
+        n_chars = sum(map(len, doc_tokens))
+        totals.append(n_chars)
+    return np.c_[totals], ["Characters count"]
+
+
+def starts_with(corpus: Corpus, prefix: str, callback: Callable):
+    """Count, per document, the tokens that begin with `prefix`."""
+    counts = []
+    for doc_tokens in corpus.tokens:
+        callback()  # one progress tick per document
+        hits = [tok for tok in doc_tokens if tok.startswith(prefix)]
+        counts.append(len(hits))
+    return np.c_[counts], [f"Starts with {prefix}"]
+
+
+def ends_with(corpus: Corpus, postfix: str, callback: Callable):
+    """Count, per document, the tokens that end with `postfix`."""
+    counts = []
+    for doc_tokens in corpus.tokens:
+        callback()  # one progress tick per document
+        hits = [tok for tok in doc_tokens if tok.endswith(postfix)]
+        counts.append(len(hits))
+    return np.c_[counts], [f"Ends with {postfix}"]
+
+
+def contains(corpus: Corpus, text: str, callback: Callable):
+    """Count, per document, the tokens having `text` as a substring."""
+    counts = []
+    for doc_tokens in corpus.tokens:
+        callback()  # one progress tick per document
+        hits = [tok for tok in doc_tokens if text in tok]
+        counts.append(len(hits))
+    return np.c_[counts], [f"Contains {text}"]
+
+
+def regex(corpus: Corpus, expression: str, callback: Callable):
+    """Count, per document, tokens matched by `expression` at their start."""
+    # re.match anchors at the beginning of a token only, not anywhere in it
+    matches = re.compile(expression).match
+
+    counts = []
+    for doc_tokens in corpus.tokens:
+        callback()  # one progress tick per document
+        counts.append(sum(1 for tok in doc_tokens if matches(tok)))
+    return np.c_[counts], [f"Regex {expression}"]
+
+
+# Each entry: (name of the statistic, function computing it, needs text box)
+STATISTICS = [
+    ("Words count", number_of_words, False),
+    ("Characters count", characters_count, False),
+    ("Starts with", starts_with, True),
+    ("Ends with", ends_with, True),
+    ("Contains", contains, True),
+    ("Regex", regex, True)
+]
+# column-wise views of the table above, indexed by the statistic's position
+STATISTICS_NAMES, STATISTICS_FUNCTIONS, STATISTICS_NEEDS_TB = (
+    list(column) for column in zip(*STATISTICS))
+
+
+def run(corpus: Corpus, statistics: Tuple[int, str], state: TaskState) -> None:
+    """
+    Compute the requested statistics one after another.
+    Every finished statistic is handed over as a partial result.
+
+    Parameters
+    ----------
+    corpus
+        The corpus the statistics are computed on.
+    statistics
+        Pairs describing what to compute:
+        (index into STATISTICS_FUNCTIONS, string pattern)
+    state
+        State used to report progress and partial results.
+    """
+    # one tick per (document, statistic) pair, evenly spread over 0-100
+    progress = iter(np.linspace(0, 100, len(corpus) * len(statistics)))
+
+    def advance():
+        state.set_progress_value(next(progress))
+
+    for index, pattern in statistics:
+        values = STATISTICS_FUNCTIONS[index](corpus, pattern, advance)
+        state.set_partial_result((index, pattern, values))
+
+
+class OWStatistics(OWWidget, ConcurrentWidgetMixin):
+    """Widget that appends columns with document statistics to a corpus."""
+    name = "Statistics"
+    description = "Create new statistic variables for documents."
+    keywords = []
+
+    class Inputs:
+        corpus = Input("Corpus", Corpus)
+
+    class Outputs:
+        corpus = Output("Corpus", Corpus)
+
+    want_main_area = False
+    settingsHandler = DomainContextHandler()
+
+    # rules shown in the GUI: (index into STATISTICS, pattern text)
+    active_rules: List[Tuple[int, str]] = ContextSetting([(0, ""), (1, "")])
+    # rules active at time of apply clicked
+    applied_rules: List[Tuple[int, str]] = None
+    autocommit = Setting(True)
+
+    def __init__(self):
+        OWWidget.__init__(self)
+        ConcurrentWidgetMixin.__init__(self)
+        self.corpus = None
+        # computed results by rule; created per instance, not shared by class
+        self.result_dict = {}
+
+        # controls of the rule rows, one entry per row, in row order
+        self.combos = []
+        self.line_edits = []
+        self.remove_buttons = []
+
+        self._init_controls()
+
+    def _init_controls(self):
+        """Create the rules box first and the Apply button after it."""
+        self._init_statistics_box()
+        box = gui.hBox(self.controlArea)
+        gui.rubber(box)
+        gui.button(
+            box, self, "Apply", autoDefault=False, width=180,
+            callback=self.apply)
+
+    def _init_statistics_box(self):
+        """Create the grid that holds one row of controls per rule."""
+        patternbox = gui.vBox(self.controlArea, box=True)
+        self.rules_box = rules_box = QGridLayout()
+        patternbox.layout().addLayout(rules_box)
+        box = gui.hBox(patternbox)
+        gui.button(
+            box, self, "+", callback=self._add_row, autoDefault=False,
+            flat=True, minimumSize=(QSize(20, 20)))
+        gui.rubber(box)
+        # columns: 0 - remove button, 1 - statistic combo, 2 - pattern edit
+        rules_box.setColumnMinimumWidth(0, 10)
+        rules_box.setColumnMinimumWidth(1, 70)
+        for column, stretch in enumerate((1, 1, 100)):
+            rules_box.setColumnStretch(column, stretch)
+        rules_box.addWidget(QLabel("Feature"), 0, 1)
+        rules_box.addWidget(QLabel("Pattern"), 0, 2)
+        self._update_rules()
+
+    def adjust_n_rule_rows(self):
+        """Add or remove rows to match active_rules and refresh their values."""
+        def _add_line():
+            n_lines = len(self.combos) + 1  # grid row 0 holds the headers
+
+            # add delete symbol
+            button = gui.button(
+                None, self, label='×', flat=True, height=20,
+                styleSheet='* {font-size: 16pt; color: silver}'
+                           '*:hover {color: black}',
+                autoDefault=False, callback=self._remove_row)
+            button.setMinimumSize(QSize(12, 20))
+            self.rules_box.addWidget(button, n_lines, 0)
+            self.remove_buttons.append(button)
+
+            # add statistics type dropdown
+            combo = QComboBox()
+            combo.addItems(STATISTICS_NAMES)
+            combo.currentIndexChanged.connect(self._sync_edit_combo)
+            self.rules_box.addWidget(combo, n_lines, 1)
+            self.combos.append(combo)
+
+            # add line edit for pattern
+            line_edit = QLineEdit()
+            self.rules_box.addWidget(line_edit, n_lines, 2)
+            line_edit.textChanged.connect(self._sync_edit_line)
+            self.line_edits.append(line_edit)
+
+        def _remove_line():
+            # always drops the last row; _sync_controls re-fills the others
+            self.combos.pop().deleteLater()
+            self.line_edits.pop().deleteLater()
+            self.remove_buttons.pop().deleteLater()
+
+        def _sync_controls():
+            # TODO: write it differently - check create class
+            for rule, combo, edit in zip(
+                    self.active_rules, self.combos, self.line_edits):
+                combo.setCurrentIndex(rule[0])  # update combo
+                edit.setText(rule[1])  # update line edit
+                # the pattern edit is shown only for statistics needing it
+                edit.setVisible(STATISTICS_NEEDS_TB[rule[0]])
+
+        n = len(self.active_rules)
+        while n > len(self.combos):
+            _add_line()
+        while len(self.combos) > n:
+            _remove_line()
+        _sync_controls()
+
+    def _update_rules(self):
+        """Bring the rule rows in line with active_rules."""
+        self.adjust_n_rule_rows()
+
+    def _add_row(self):
+        """Append a rule for the first statistic with an empty pattern."""
+        self.active_rules.append((0, ""))
+        self.adjust_n_rule_rows()
+
+    def _remove_row(self):
+        """Delete the rule whose remove button triggered this slot."""
+        remove_idx = self.remove_buttons.index(self.sender())
+        del self.active_rules[remove_idx]
+        self.adjust_n_rule_rows()
+
+    def _sync_edit_combo(self):
+        """Store the index selected in the sending combo into its rule."""
+        combo = self.sender()
+        row = self.combos.index(combo)
+        pattern = self.active_rules[row][1]
+        self.active_rules[row] = (combo.currentIndex(), pattern)
+        self.adjust_n_rule_rows()
+
+    def _sync_edit_line(self):
+        """Store the text of the sending line edit into its rule."""
+        line_edit = self.sender()
+        row = self.line_edits.index(line_edit)
+        self.active_rules[row] = (self.active_rules[row][0], line_edit.text())
+
+    @Inputs.corpus
+    def set_data(self, corpus):
+        """Store the new corpus and forget results computed on the old one."""
+        self.cancel()  # a running task would cache results of the old corpus
+        self.corpus = corpus
+        self.result_dict = {}
+
+    def apply(self):
+        """Start computing statistics not cached yet; on_done outputs them."""
+        self.cancel()  # cancel task since user clicked apply again
+        if self.corpus is None:
+            # nothing to compute on - clear the output instead of failing
+            self.Outputs.corpus.send(None)
+            return
+        self.applied_rules = copy(self.active_rules)
+        rules = [r for r in self.applied_rules if r not in self.result_dict]
+        self.start(run, self.corpus, rules)
+
+    def on_exception(self, exception) -> None:
+        raise exception
+
+    def on_partial_result(self, result: Tuple[int, str, Tuple[np.ndarray, str]]):
+        """Cache one computed statistic under its (index, pattern) rule."""
+        statistic, pattern, values = result
+        self.result_dict[(statistic, pattern)] = values
+
+    def on_done(self, result: None):
+        """Output the results, then drop cached results of removed rules."""
+        self.output_results()
+
+        # the user may have removed a statistic from the GUI while it was
+        # still computed; iterate over a snapshot of the keys, since deleting
+        # from a dict that is being iterated raises RuntimeError
+        for k in list(self.result_dict):
+            if k not in self.active_rules:
+                del self.result_dict[k]
+
+    def output_results(self):
+        """Send a corpus copy extended with results of the applied rules."""
+        to_stack = [self.corpus.X]
+        attributes = []
+        for rule in self.applied_rules:
+            # check for safety reasons - in practice should not happen
+            if rule in self.result_dict:
+                data, variables = self.result_dict[rule]
+                to_stack.append(data)
+                attributes += variables
+        # extend_attributes adds attributes to the corpus it is called on,
+        # so the input corpus is copied first
+        # NOTE(review): confirm extend_attributes does not stack onto X again
+        # TODO: when change of preprocessing is finished give that function
+        #  an inplace parameter (False by default) and let it accept
+        #  variables instead of strings
+        new_corpus = self.corpus.copy()
+        new_corpus.extend_attributes(np.hstack(to_stack), attributes)
+        self.Outputs.corpus.send(new_corpus)
+
+
+if __name__ == "__main__":  # manual preview on the 'book-excerpts' corpus
+ WidgetPreview(OWStatistics).run(Corpus.from_file('book-excerpts'))