From 404b6dffb92e6e13d519144b8ba645788da55702 Mon Sep 17 00:00:00 2001 From: KVGarg Date: Fri, 19 Oct 2018 22:50:37 +0530 Subject: [PATCH 1/3] Added checks for capitalizing the beginning of sentences 1. Fixes Issue #800 2. Added a rule (named split_into_sentences) in tools.py that splits the text into sentences and returns all the sentences. 3. Added a check in misc.capitalization for checking the beginning of sentences. --- proselint/checks/misc/capitalization.py | 20 +++++++- proselint/tools.py | 66 +++++++++++++++++++++---- tests/test_misc_capitalization.py | 14 +++++- 3 files changed, 87 insertions(+), 13 deletions(-) diff --git a/proselint/checks/misc/capitalization.py b/proselint/checks/misc/capitalization.py index 8d12f8d14..1c84fe2bf 100644 --- a/proselint/checks/misc/capitalization.py +++ b/proselint/checks/misc/capitalization.py @@ -13,7 +13,10 @@ Incorrect capitalization. """ -from proselint.tools import memoize, preferred_forms_check +from proselint.tools import (memoize, + preferred_forms_check, + split_into_sentences, + existence_check) @memoize @@ -91,3 +94,18 @@ def check_days(text): ] return preferred_forms_check(text, list, err, msg, ignore_case=False) + + +@memoize +def check_beginning_of_sentences(text): + """Check for capitalizing the beginning of sentences.""" + sentences = split_into_sentences(text) + err = "misc.capitalization" + msg = "Sentence `{}` should begin with capital letter." 
+ sentences_tobe_capitalized = [_[:50] for _ in sentences if _[0].islower()] + if len(sentences_tobe_capitalized) > 0: + return existence_check(text, + sentences_tobe_capitalized, + err, msg, require_padding=False) + else: + return [] \ No newline at end of file diff --git a/proselint/tools.py b/proselint/tools.py index 2e47e1a08..627c78a49 100644 --- a/proselint/tools.py +++ b/proselint/tools.py @@ -4,16 +4,17 @@ from __future__ import print_function from __future__ import unicode_literals -import sys -import traceback -import os -import shelve -import inspect + import functools -import re import hashlib -import json import importlib +import inspect +import json +import os +import re +import shelve +import sys +import traceback try: import dbm @@ -30,6 +31,50 @@ proselint_path = os.path.dirname(os.path.realpath(__file__)) +alphabets = "([A-Za-z])" +prefixes = "(Mr|St|Mrs|Ms|Dr)[.]" +suffixes = "(Inc|Ltd|Jr|Sr|Co)" +starters = """ + (Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s| + Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)""" +acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +websites = "[.]([A-Za-z0-9]+)[.](com|net|org|io|gov)" +digits = "([0-9])" + + +def split_into_sentences(text): + """Split the text into sentences.""" + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(prefixes, "\\1", text) + text = re.sub(websites, "\\1\\2", text) + text = re.sub(digits + "[.]" + digits, "\\1\\2", text) + text = re.sub("\s" + alphabets + "[.] ", " \\1 ", text) + text = re.sub(acronyms + " " + starters, "\\1 \\2", text) + text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", + "\\1\\2\\3", text) + text = re.sub(alphabets + "[.]" + alphabets + "[.]", + "\\1\\2", text) + text = re.sub(" " + suffixes + "[.] 
" + starters, " \\1 \\2", text) + text = re.sub(" " + suffixes + "[.]", " \\1", text) + text = re.sub(" " + alphabets + "[.]", " \\1", text) + if "”" in text: + text = text.replace(".”", "”.") + if "\"" in text: + text = text.replace(".\"", "\".") + if "!" in text: + text = text.replace("!\"", "\"!") + if "?" in text: + text = text.replace("?\"", "\"?") + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = sentences[:-1] + sentences = [s.strip() for s in sentences] + return sentences + def close_cache_shelves(): """Close previously opened cache shelves.""" @@ -44,6 +89,7 @@ def close_cache_shelves_after(f): def wrapped(*args, **kwargs): f(*args, **kwargs) close_cache_shelves() + return wrapped @@ -261,7 +307,7 @@ def lint(input_file, debug=False): (line, column) = line_and_column(text, start) if not is_quoted(start, text): errors += [(check, message, line, column, start, end, - end - start, "warning", replacements)] + end - start, "warning", replacements)] if len(errors) > options["max_errors"]: break @@ -275,7 +321,7 @@ def lint(input_file, debug=False): def assert_error(text, check, n=1): """Assert that text has n errors of type check.""" assert_error.description = "No {} error for '{}'".format(check, text) - assert(check in [error[0] for error in lint(text)]) + assert (check in [error[0] for error in lint(text)]) def consistency_check(text, word_pairs, err, msg, offset=0): @@ -426,7 +472,7 @@ def find_ranges(text): s = 2 elif s == 2: if c in seps: - ranges.append((start+1, i-1)) + ranges.append((start + 1, i - 1)) start = None s = 0 else: diff --git a/tests/test_misc_capitalization.py b/tests/test_misc_capitalization.py index 6ec8bc32f..475f4afad 100644 --- a/tests/test_misc_capitalization.py +++ b/tests/test_misc_capitalization.py @@ -1,10 +1,10 @@ """Tests for misc.capitalization check.""" from __future__ import absolute_import -from .check 
import Check - from proselint.checks.misc import capitalization as chk +from .check import Check + class TestCheck(Check): """The test class for misc.capitalization.""" @@ -30,3 +30,13 @@ def test_smoke_check_days(self): """Basic smoke test for misc.capitalization.check_days.""" assert chk.check_days("""Smoke phrase with nothing flagged""") == [] assert chk.check_days("""It happened on friday.""") != [] + + def test_smoke_check_beginning_of_sentences(self): + """ + Basic smoke test for misc.capitalization.check_beginning_of_sentences. + """ + assert chk.check_beginning_of_sentences("Smoke " + + "sentence with nothing " + + "flagged.") == [] + assert chk.check_beginning_of_sentences("the sentence begins " + + "with lowercase letter.") != [] From cfeb471927dcd1f9be8d431c6a3e9ab322bdbd3e Mon Sep 17 00:00:00 2001 From: KVGarg Date: Fri, 19 Oct 2018 23:10:00 +0530 Subject: [PATCH 2/3] Minor changes done --- proselint/checks/misc/capitalization.py | 2 +- tests/test_misc_capitalization.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/proselint/checks/misc/capitalization.py b/proselint/checks/misc/capitalization.py index 1c84fe2bf..cbc400650 100644 --- a/proselint/checks/misc/capitalization.py +++ b/proselint/checks/misc/capitalization.py @@ -108,4 +108,4 @@ def check_beginning_of_sentences(text): sentences_tobe_capitalized, err, msg, require_padding=False) else: - return [] \ No newline at end of file + return [] diff --git a/tests/test_misc_capitalization.py b/tests/test_misc_capitalization.py index 475f4afad..ccd013052 100644 --- a/tests/test_misc_capitalization.py +++ b/tests/test_misc_capitalization.py @@ -32,9 +32,7 @@ def test_smoke_check_days(self): assert chk.check_days("""It happened on friday.""") != [] def test_smoke_check_beginning_of_sentences(self): - """ - Basic smoke test for misc.capitalization.check_beginning_of_sentences. 
- """ + """Smoke test for misc.capitalization.check_beginning_of_sentences.""" assert chk.check_beginning_of_sentences("Smoke " + "sentence with nothing " + "flagged.") == [] From 96050f2a8669328da0e234f9c2249ea3ab7caade Mon Sep 17 00:00:00 2001 From: KVGarg Date: Sat, 20 Oct 2018 15:43:43 +0530 Subject: [PATCH 3/3] Modified websites rules --- proselint/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proselint/tools.py b/proselint/tools.py index 627c78a49..610042bab 100644 --- a/proselint/tools.py +++ b/proselint/tools.py @@ -38,7 +38,7 @@ (Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s| Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)""" acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" -websites = "[.]([A-Za-z0-9]+)[.](com|net|org|io|gov)" +websites = "[.]([\w\W\d\D]+)[.](com|net|org|io|gov)" digits = "([0-9])"