From f54f8ddeb1efd56f89c3b4f4538829cdc9ddaa2f Mon Sep 17 00:00:00 2001
From: marcel
Date: Sat, 2 Dec 2023 01:53:12 +0100
Subject: [PATCH 1/3] add fast KL-Sum

---
 sumy/summarizers/fast_kl.py            | 163 +++++++++++++++++++++
 tests/test_summarizers/test_fast_kl.py | 194 +++++++++++++++++++++++++
 2 files changed, 357 insertions(+)
 create mode 100644 sumy/summarizers/fast_kl.py
 create mode 100644 tests/test_summarizers/test_fast_kl.py

diff --git a/sumy/summarizers/fast_kl.py b/sumy/summarizers/fast_kl.py
new file mode 100644
index 0000000..1d991b4
--- /dev/null
+++ b/sumy/summarizers/fast_kl.py
@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+from sumy.summarizers._summarizer import AbstractSummarizer
+
+
+class KLSummarizer(AbstractSummarizer):
+    """
+    Method that greedily adds sentences to a summary so long as it decreases the
+    KL Divergence.
+    Source: http://www.aclweb.org/anthology/N09-1041
+    """
+    MISSING_WORD_VAL = 42.  # placeholder value used for missing words in document
+    stop_words = frozenset()
+
+    def __call__(self, document, sentences_count):
+        self._ensure_dependencies_installed()
+
+        sentences = document.sentences
+        ratings = self._compute_ratings(sentences)
+
+        return self._get_best_sentences(sentences, sentences_count, ratings)
+
+    @staticmethod
+    def _ensure_dependencies_installed():
+        if np is None:
+            raise ValueError("Fast KL-Sum summarizer requires NumPy. "
+                             "Please, install it by command 'pip install numpy'.")
+
+    @staticmethod
+    def _get_all_words_in_doc(sentences):
+        return [w for s in sentences for w in s.words]
+
+    def _get_content_words_in_sentence(self, sentence):
+        normalized_words = self._normalize_words(sentence.words)
+        normalized_content_words = self._filter_out_stop_words(normalized_words)
+        return normalized_content_words
+
+    def _normalize_words(self, words):
+        return [self.normalize_word(w) for w in words]
+
+    def _filter_out_stop_words(self, words):
+        return [w for w in words if w not in self.stop_words]
+
+    @staticmethod
+    def _old_compute_word_freq(list_of_words, d=None):
+        word_freq = {} if d is None else d
+        for w in list_of_words:
+            word_freq[w] = word_freq.get(w, 0) + 1
+        return word_freq
+
+    @staticmethod
+    def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
+        for w in list_of_words:
+            word_freq_arr[word_to_ind[w]] += 1
+        return word_freq_arr
+
+    def _get_all_content_words_in_doc(self, sentences):
+        all_words = self._get_all_words_in_doc(sentences)
+        normalized_words = self._normalize_words(all_words)
+        normalized_content_words = self._filter_out_stop_words(normalized_words)
+        return normalized_content_words
+
+    def compute_tf(self, sentences):
+        """
+        Computes the normalized term frequency as explained in http://www.tfidf.com/
+
+        :type sentences: [sumy.models.dom.Sentence]
+        """
+        content_words = self._get_all_content_words_in_doc(sentences)
+        content_words_count = len(content_words)
+        content_words_freq = self._old_compute_word_freq(content_words)
+        content_word_tf = dict((w, f / content_words_count) for w, f in content_words_freq.items())
+        return content_word_tf
+
+    @staticmethod
+    def _joint_freq(wc1, wc2, total_len):
+        if total_len == 0:
+            return np.zeros_like(wc1)
+        joint_sum = wc1 + wc2
+        return joint_sum / total_len
+
+    @staticmethod
+    def _kl_divergence(summary_freq, doc_freq, doc_missing_word_mask):
+        summary_freq = np.where((summary_freq != 0.) & doc_missing_word_mask, summary_freq, doc_freq)
+        return (doc_freq * np.log(doc_freq / summary_freq)).sum()
+
+    @staticmethod
+    def _find_index_of_best_sentence(kls):
+        """
+        the best sentence is the one with the smallest kl_divergence
+        """
+        return kls.index(min(kls))
+
+    def _compute_ratings(self, sentences):
+        word_to_freq = self.compute_tf(sentences)
+
+        vocabulary = set(self._get_all_words_in_doc(sentences)).union(word_to_freq.keys())
+        word_to_ind = {word: index for index, word in enumerate(vocabulary)}
+
+        word_freq = np.repeat(self.MISSING_WORD_VAL, len(vocabulary))
+        for k, v in word_to_freq.items():
+            word_freq[word_to_ind[k]] = v
+        missing_word_mask = word_freq != self.MISSING_WORD_VAL
+
+        ratings = {}
+
+        # Keep track of number of words in summary and word frequency
+        summary_word_list_len = 0
+        summary_word_freq = np.repeat(0., len(vocabulary))
+
+        # make it a list so that it can be modified
+        sentences_list = list(sentences)
+
+        # get all content words once for efficiency
+        sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences]
+
+        # calculate all sentence lengths and word frequencies once for efficiency
+        i_to_sent_word_freq = {}
+        i_to_sent_len = {}
+        for i, s in enumerate(sentences_as_words):
+            sent_word_freq = np.zeros_like(word_freq)
+            sent_word_freq = self._compute_word_freq(s, sent_word_freq, word_to_ind)
+            i_to_sent_word_freq[i] = sent_word_freq
+            i_to_sent_len[i] = len(s)
+
+        iterations = 0
+        indices = list(range(len(sentences_as_words)))
+        # Removes one sentence per iteration by adding to summary
+        while len(indices) > 0:
+            iterations += 1
+            # will store all the kls values for this pass
+            kls = []
+
+            for i in indices:
+                # calculates the joint frequency
+                joint_freq = self._joint_freq(i_to_sent_word_freq[i], summary_word_freq,
+                                              i_to_sent_len[i] + summary_word_list_len)
+
+                # adds the calculated kl divergence to the list in index = sentence used
+                kls.append(self._kl_divergence(joint_freq, word_freq, missing_word_mask))
+
+            # find the next best sentence to consider and then add it into the summary
+            index_to_remove = self._find_index_of_best_sentence(kls)
+            best_sentence = sentences_list[indices[index_to_remove]]
+            del indices[index_to_remove]
+            best_sentence_word_list = self._get_all_words_in_doc([best_sentence])
+            # update summary length and word frequencies
+            summary_word_list_len += len(best_sentence_word_list)
+            summary_word_freq = self._compute_word_freq(best_sentence_word_list, summary_word_freq, word_to_ind)
+
+            # value is the iteration in which it was removed multiplied by -1 so that
+            # the first sentences removed (the most important) have highest values
+            ratings[best_sentence] = -1 * len(ratings)
+            print(f"Num interations: {iterations}")
+        return ratings
diff --git a/tests/test_summarizers/test_fast_kl.py b/tests/test_summarizers/test_fast_kl.py
new file mode 100644
index 0000000..08c057c
--- /dev/null
+++ b/tests/test_summarizers/test_fast_kl.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import pytest
+import numpy as np
+
+from sumy.models.dom._sentence import Sentence
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.fast_kl import KLSummarizer
+from ..utils import build_document
+
+
+@pytest.fixture
+def empty_stop_words():
+    return []
+
+
+@pytest.fixture
+def stop_words():
+    return ["the", "and", "i"]
+
+
+@pytest.fixture
+def summarizer(stop_words):
+    summarizer = KLSummarizer()
+    summarizer.stop_words = stop_words
+    return summarizer
+
+
+def test_empty_document(summarizer):
+    document = build_document()
+    returned = summarizer(document, 10)
+
+    assert len(returned) == 0
+
+
+def test_single_sentence(summarizer):
+    s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
+    document = build_document([s])
+
+    returned = summarizer(document, 10)
+
+    assert len(returned) == 1
+
+
+def test_compute_word_freq(summarizer):
+    words = ["one", "two", "three", "four"]
+    word_freq = np.zeros(len(words))
+    word_to_ind = {word: index for index, word in enumerate(words)}
+    freq = summarizer._compute_word_freq(words, word_freq, word_to_ind)
+
+    assert np.all(freq == 1)
+
+    words = ["one", "one", "two", "two"]
+    word_freq = np.zeros(len(set(words)))
+    word_to_ind = {word: index for index, word in enumerate(set(words))}
+    freq = summarizer._compute_word_freq(words, word_freq, word_to_ind)
+
+    assert np.all(freq == 2)
+
+
+def test_joint_freq(summarizer):
+    w1 = ["one", "two", "three", "four"]
+    w2 = ["one", "two", "three", "four"]
+
+    word_freq1 = np.zeros(len(w1))
+    word_freq2 = np.zeros_like(word_freq1)
+    word_to_ind = {word: index for index, word in enumerate(w1)}
+    freq1 = summarizer._compute_word_freq(w1, word_freq1, word_to_ind)
+    freq2 = summarizer._compute_word_freq(w2, word_freq2, word_to_ind)
+
+    freq = summarizer._joint_freq(freq1, freq2, len(w1) + len(w2))
+
+    assert np.all(freq == 1.0/4)
+
+    w1 = ["one", "two", "three", "four"]
+    w2 = ["one", "one", "three", "five"]
+
+    vocabulary = set(w1).union(set(w2))
+    word_freq1 = np.zeros(len(vocabulary))
+    word_freq2 = np.zeros_like(word_freq1)
+    word_to_ind = {word: index for index, word in enumerate(vocabulary)}
+    freq1 = summarizer._compute_word_freq(w1, word_freq1, word_to_ind)
+    freq2 = summarizer._compute_word_freq(w2, word_freq2, word_to_ind)
+
+    freq = summarizer._joint_freq(freq1, freq2, len(w1) + len(w2))
+
+    assert freq[word_to_ind["one"]] == 3.0/8
+    assert freq[word_to_ind["two"]] == 1.0/8
+    assert freq[word_to_ind["three"]] == 1.0/4
+    assert freq[word_to_ind["four"]] == 1.0/8
+    assert freq[word_to_ind["five"]] == 1.0/8
+
+
+def test_kl_divergence(summarizer):
+    EPS = 0.00001
+
+    words = ["one", "two", "three"]
+    word_freq1 = np.zeros(len(words))
+    word_freq2 = np.zeros_like(word_freq1)
+    word_to_ind = {word: index for index, word in enumerate(words)}
+
+    word_freq1[word_to_ind["one"]] = 0.35
+    word_freq1[word_to_ind["two"]] = 0.5
+    word_freq1[word_to_ind["three"]] = 0.15
+
+    word_freq2[word_to_ind["one"]] = 1.0/3.0
+    word_freq2[word_to_ind["two"]] = 1.0/3.0
+    word_freq2[word_to_ind["three"]] = 1.0/3.0
+
+    missing_word_mask = np.repeat(True, 3)
+
+    # This value comes from scipy.stats.entropy(w2_, w1_)
+    # Note: the order of params is different
+    kl_correct = 0.11475080798005841
+    assert abs(summarizer._kl_divergence(word_freq1, word_freq2, missing_word_mask) - kl_correct) < EPS
+
+    word_freq1[word_to_ind["one"]] = 0.1
+    word_freq1[word_to_ind["two"]] = 0.2
+    word_freq1[word_to_ind["three"]] = 0.7
+
+    word_freq2[word_to_ind["one"]] = 0.2
+    word_freq2[word_to_ind["two"]] = 0.4
+    word_freq2[word_to_ind["three"]] = 0.4
+
+    # This value comes from scipy.stats.entropy(w2_, w1_)
+    # Note: the order of params is different
+    kl_correct = 0.1920419931617981
+    assert abs(summarizer._kl_divergence(word_freq1, word_freq2, missing_word_mask) - kl_correct) < EPS
+
+
+def test_missing_word_in_document_during_kl_divergence_computation(summarizer):
+    """
+    Missing word should not affect the result.
+    See https://github.com/miso-belica/sumy/issues/41
+    """
+    EPS = 0.00001
+
+    words = ["one", "two", "three", "four"]
+    summary_frequencies = np.zeros(len(words))
+    document_frequencies = np.repeat(summarizer.MISSING_WORD_VAL, len(words))
+    word_to_ind = {word: index for index, word in enumerate(words)}
+
+    summary_frequencies[word_to_ind["one"]] = 0.35
+    summary_frequencies[word_to_ind["two"]] = 0.5
+    summary_frequencies[word_to_ind["three"]] = 0.15
+    summary_frequencies[word_to_ind["four"]] = 0.9
+
+    document_frequencies[word_to_ind["one"]] = 1.0 / 3.0
+    document_frequencies[word_to_ind["two"]] = 1.0 / 3.0
+    document_frequencies[word_to_ind["three"]] = 1.0 / 3.0
+
+    missing_word_mask = np.repeat(False, len(summary_frequencies))
+    missing_word_mask[word_to_ind["one"]] = True
+    missing_word_mask[word_to_ind["two"]] = True
+    missing_word_mask[word_to_ind["three"]] = True
+
+    # This value comes from scipy.stats.entropy(w2_, w1_)
+    # Note: the order of params is different
+    kl_correct = 0.11475080798005841
+    assert abs(summarizer._kl_divergence(summary_frequencies, document_frequencies,
+                                         missing_word_mask) - kl_correct) < EPS
+
+
+def test_tf_idf_metric_should_be_real_number():
+    """https://github.com/miso-belica/sumy/issues/41"""
+    summarizer = KLSummarizer()
+    frequencies = summarizer.compute_tf([Sentence("There are five words, jop.", Tokenizer("english"))])
+
+    assert frequencies == {
+        "there": 0.2,
+        "are": 0.2,
+        "five": 0.2,
+        "words": 0.2,
+        "jop": 0.2,
+    }
+
+
+def test_the_sentences_should_be_in_different_order(summarizer):
+    """https://github.com/miso-belica/sumy/issues/146"""
+    paragraphs = [
+        ["This is 1st sentence.", "This is 2nd sentence."],
+        ["This is 3rd sentence.", "This is 4th sentence."],
+        ["This is 5th sentence."],
+    ]
+    document = build_document(*paragraphs)
+    reversed_document = build_document(*(reversed(p) for p in reversed(paragraphs)))
+
+    sentences = summarizer(document, "100%")
+    reversed_sentences = summarizer(reversed_document, "100%")
+
+    assert tuple(reversed(sentences)) == reversed_sentences

From 3e73148e4831ed699a8ea76948b4694d19424b7e Mon Sep 17 00:00:00 2001
From: marcel
Date: Sat, 2 Dec 2023 02:08:06 +0100
Subject: [PATCH 2/3] remove print statement

---
 sumy/summarizers/fast_kl.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sumy/summarizers/fast_kl.py b/sumy/summarizers/fast_kl.py
index 1d991b4..42c860d 100644
--- a/sumy/summarizers/fast_kl.py
+++ b/sumy/summarizers/fast_kl.py
@@ -17,7 +17,7 @@ class KLSummarizer(AbstractSummarizer):
     KL Divergence.
     Source: http://www.aclweb.org/anthology/N09-1041
     """
-    MISSING_WORD_VAL = 42.  # placeholder value used for missing words in document
+    MISSING_WORD_VAL = 42.0  # placeholder value used for missing words in document
     stop_words = frozenset()
 
     def __call__(self, document, sentences_count):
@@ -89,7 +89,7 @@ def _joint_freq(wc1, wc2, total_len):
 
     @staticmethod
     def _kl_divergence(summary_freq, doc_freq, doc_missing_word_mask):
-        summary_freq = np.where((summary_freq != 0.) & doc_missing_word_mask, summary_freq, doc_freq)
+        summary_freq = np.where((summary_freq != 0.0) & doc_missing_word_mask, summary_freq, doc_freq)
         return (doc_freq * np.log(doc_freq / summary_freq)).sum()
 
     @staticmethod
@@ -114,7 +114,7 @@ def _compute_ratings(self, sentences):
 
         # Keep track of number of words in summary and word frequency
         summary_word_list_len = 0
-        summary_word_freq = np.repeat(0., len(vocabulary))
+        summary_word_freq = np.repeat(0.0, len(vocabulary))
 
         # make it a list so that it can be modified
         sentences_list = list(sentences)
@@ -159,5 +159,4 @@ def _compute_ratings(self, sentences):
         # value is the iteration in which it was removed multiplied by -1 so that
         # the first sentences removed (the most important) have highest values
         ratings[best_sentence] = -1 * len(ratings)
-        print(f"Num interations: {iterations}")
         return ratings

From e28e34d5834d90e9a21fb31bfcc9c095c6fab21b Mon Sep 17 00:00:00 2001
From: marcel
Date: Sat, 2 Dec 2023 02:18:28 +0100
Subject: [PATCH 3/3] add test_numpy_not_installed

---
 tests/test_summarizers/test_fast_kl.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_summarizers/test_fast_kl.py b/tests/test_summarizers/test_fast_kl.py
index 08c057c..9d5778e 100644
--- a/tests/test_summarizers/test_fast_kl.py
+++ b/tests/test_summarizers/test_fast_kl.py
@@ -6,6 +6,7 @@
 import pytest
 import numpy as np
 
+import sumy.summarizers.fast_kl as fast_kl_module
 from sumy.models.dom._sentence import Sentence
 from sumy.nlp.tokenizers import Tokenizer
 from sumy.summarizers.fast_kl import KLSummarizer
@@ -29,6 +30,18 @@ def summarizer(stop_words):
     return summarizer
 
 
+def test_numpy_not_installed():
+    summarizer = KLSummarizer()
+
+    numpy = fast_kl_module.np
+    fast_kl_module.np = None
+
+    with pytest.raises(ValueError):
+        summarizer(build_document(), 10)
+
+    fast_kl_module.np = numpy
+
+
 def test_empty_document(summarizer):
     document = build_document()
     returned = summarizer(document, 10)
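
Usage example (not part of the patches above; for reference only): a minimal sketch of how the new fast KL-Sum summarizer would be driven through sumy's standard plain-text pipeline. The import path sumy.summarizers.fast_kl comes from PATCH 1/3; PlaintextParser and Tokenizer are sumy's existing entry points; the sample text, stop-word list, and sentence count are illustrative, not taken from the patches.

    # usage sketch, assuming sumy and NumPy are installed and the patches above are applied
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.summarizers.fast_kl import KLSummarizer

    TEXT = (
        "The quick brown fox jumps over the lazy dog. "
        "The dog was not amused by the fox. "
        "Meanwhile, the cat ignored both of them."
    )

    parser = PlaintextParser.from_string(TEXT, Tokenizer("english"))

    summarizer = KLSummarizer()
    summarizer.stop_words = ["the", "a", "of", "by"]  # illustrative stop words

    # Greedily picks the sentences whose combined word distribution stays closest
    # (in KL divergence) to the whole document's word distribution.
    for sentence in summarizer(parser.document, 2):
        print(sentence)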