diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index f79ab09fa..e9e84a1f6 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -78,7 +78,7 @@ def supported_languages(): except LookupError: # when no NLTK data is available pass - return [file.capitalize() for file in stopwords_listdir] + return sorted(file.capitalize() for file in stopwords_listdir) @wait_nltk_data def __init__(self, language='English', word_list=None): @@ -96,7 +96,8 @@ def language(self, value): if not self._language: self.stopwords = [] else: - self.stopwords = set(stopwords.words(self.language.lower())) + self.stopwords = set( + x.strip() for x in stopwords.words(self.language.lower())) def __str__(self): config = '' diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index e2cc7f943..788bf5487 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -281,6 +281,20 @@ def test_stopwords(self): self.assertFalse(filter.check('a')) self.assertTrue(filter.check('filter')) + self.assertListEqual( + ["snake", "house"], + filter(["a", "snake", "is", "in", "a", "house"])) + + def test_stopwords_slovene(self): + filter = preprocess.StopwordsFilter('slovene') + + self.assertFalse(filter.check('in')) + self.assertTrue(filter.check('abeceda')) + + self.assertListEqual( + ["kača", "hiši"], + filter(["kača", "je", "v", "hiši", "in"])) + def test_lexicon(self): filter = preprocess.LexiconFilter(['filter']) self.assertFalse(filter.check('false'))