From 404b6dffb92e6e13d519144b8ba645788da55702 Mon Sep 17 00:00:00 2001
From: KVGarg <co16326.ccet@gmail.com>
Date: Fri, 19 Oct 2018 22:50:37 +0530
Subject: [PATCH 1/3] Added checks for capitalizing the beginning of sentences

1. Fixes Issue #800
2. Added a function (split_into_sentences) in tools.py that splits the text into sentences and returns all of them.
3. Added a check in misc.capitalization that verifies the beginning of each sentence is capitalized.
---
 proselint/checks/misc/capitalization.py | 20 +++++++-
 proselint/tools.py                      | 66 +++++++++++++++++++++----
 tests/test_misc_capitalization.py       | 14 +++++-
 3 files changed, 87 insertions(+), 13 deletions(-)
diff --git a/proselint/checks/misc/capitalization.py b/proselint/checks/misc/capitalization.py
index 8d12f8d14..1c84fe2bf 100644
--- a/proselint/checks/misc/capitalization.py
+++ b/proselint/checks/misc/capitalization.py
@@ -13,7 +13,10 @@
 Incorrect capitalization.
 
 """
-from proselint.tools import memoize, preferred_forms_check
+from proselint.tools import (memoize,
+                             preferred_forms_check,
+                             split_into_sentences,
+                             existence_check)
 
 
 @memoize
@@ -91,3 +94,18 @@ def check_days(text):
     ]
 
     return preferred_forms_check(text, list, err, msg, ignore_case=False)
+
+
+@memoize
+def check_beginning_of_sentences(text):
+    """Check for capitalizing the beginning of sentences."""
+    sentences = split_into_sentences(text)
+    err = "misc.capitalization"
+    msg = "Sentence `{}` should begin with capital letter."
+    sentences_tobe_capitalized = [_[:50] for _ in sentences if _[0].islower()]
+    if len(sentences_tobe_capitalized) > 0:
+        return existence_check(text,
+                               sentences_tobe_capitalized,
+                               err, msg, require_padding=False)
+    else:
+        return []
\ No newline at end of file
diff --git a/proselint/tools.py b/proselint/tools.py
index 2e47e1a08..627c78a49 100644
--- a/proselint/tools.py
+++ b/proselint/tools.py
@@ -4,16 +4,17 @@
 
 from __future__ import print_function
 from __future__ import unicode_literals
-import sys
-import traceback
-import os
-import shelve
-import inspect
+
 import functools
-import re
 import hashlib
-import json
 import importlib
+import inspect
+import json
+import os
+import re
+import shelve
+import sys
+import traceback
 
 try:
     import dbm
@@ -30,6 +31,50 @@
 
 proselint_path = os.path.dirname(os.path.realpath(__file__))
 
+alphabets = "([A-Za-z])"
+prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
+suffixes = "(Inc|Ltd|Jr|Sr|Co)"
+starters = """
+            (Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|
+            Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"""
+acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+websites = "[.]([A-Za-z0-9]+)[.](com|net|org|io|gov)"
+digits = "([0-9])"
+
+
+def split_into_sentences(text):
+    """Split the text into a list of stripped sentences.
+
+    Periods that do not end a sentence (title prefixes, websites,
+    decimals, initials, acronyms, suffixes) are masked as ``<prd>`` first.
+    """
+    # `starters` is written across several lines; drop that layout
+    # whitespace, otherwise its alternation can never match the text.
+    openers = "".join(starters.split())
+    text = (" " + text + "  ").replace("\n", " ")
+    # Mask non-terminal periods; flag known sentence breaks with <stop>.
+    text = re.sub(prefixes, r"\1<prd>", text)
+    text = re.sub(websites, r"<prd>\1<prd>\2", text)
+    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
+    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
+    text = re.sub(acronyms + " " + openers, r"\1<stop> \2", text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
+                  r"\1<prd>\2<prd>\3<prd>", text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]",
+                  r"\1<prd>\2<prd>", text)
+    text = re.sub(" " + suffixes + "[.] " + openers, r" \1<stop> \2", text)
+    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
+    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
+    # Move terminators outside closing quotes so the quote stays attached.
+    for inside, outside in ((".”", "”."), (".\"", "\"."),
+                            ("!\"", "\"!"), ("?\"", "\"?")):
+        text = text.replace(inside, outside)
+    for mark in ".?!":
+        text = text.replace(mark, mark + "<stop>")
+    text = text.replace("<prd>", ".")
+    # The last piece is only padding or an unterminated remainder.
+    return [s.strip() for s in text.split("<stop>")[:-1]]
+
 
 def close_cache_shelves():
     """Close previously opened cache shelves."""
@@ -44,6 +89,7 @@ def close_cache_shelves_after(f):
     def wrapped(*args, **kwargs):
         f(*args, **kwargs)
         close_cache_shelves()
+
     return wrapped
 
 
@@ -261,7 +307,7 @@ def lint(input_file, debug=False):
             (line, column) = line_and_column(text, start)
             if not is_quoted(start, text):
                 errors += [(check, message, line, column, start, end,
-                           end - start, "warning", replacements)]
+                            end - start, "warning", replacements)]
 
         if len(errors) > options["max_errors"]:
             break
@@ -275,7 +321,7 @@ def lint(input_file, debug=False):
 def assert_error(text, check, n=1):
     """Assert that text has n errors of type check."""
     assert_error.description = "No {} error for '{}'".format(check, text)
-    assert(check in [error[0] for error in lint(text)])
+    assert (check in [error[0] for error in lint(text)])
 
 
 def consistency_check(text, word_pairs, err, msg, offset=0):
@@ -426,7 +472,7 @@ def find_ranges(text):
                 s = 2
             elif s == 2:
                 if c in seps:
-                    ranges.append((start+1, i-1))
+                    ranges.append((start + 1, i - 1))
                     start = None
                     s = 0
                 else:
diff --git a/tests/test_misc_capitalization.py b/tests/test_misc_capitalization.py
index 6ec8bc32f..475f4afad 100644
--- a/tests/test_misc_capitalization.py
+++ b/tests/test_misc_capitalization.py
@@ -1,10 +1,10 @@
 """Tests for misc.capitalization check."""
 from __future__ import absolute_import
 
-from .check import Check
-
 from proselint.checks.misc import capitalization as chk
 
+from .check import Check
+
 
 class TestCheck(Check):
     """The test class for misc.capitalization."""
@@ -30,3 +30,13 @@ def test_smoke_check_days(self):
         """Basic smoke test for misc.capitalization.check_days."""
         assert chk.check_days("""Smoke phrase with nothing flagged""") == []
         assert chk.check_days("""It happened on friday.""") != []
+
+    def test_smoke_check_beginning_of_sentences(self):
+        """
+        Basic smoke test for misc.capitalization.check_beginning_of_sentences.
+        """
+        assert chk.check_beginning_of_sentences("Smoke " +
+                                                "sentence with nothing " +
+                                                "flagged.") == []
+        assert chk.check_beginning_of_sentences("the sentence begins " +
+                                                "with lowercase letter.") != []

From cfeb471927dcd1f9be8d431c6a3e9ab322bdbd3e Mon Sep 17 00:00:00 2001
From: KVGarg <co16326.ccet@gmail.com>
Date: Fri, 19 Oct 2018 23:10:00 +0530
Subject: [PATCH 2/3] Add missing end-of-file newline and shorten test docstring

---
 proselint/checks/misc/capitalization.py | 2 +-
 tests/test_misc_capitalization.py       | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/proselint/checks/misc/capitalization.py b/proselint/checks/misc/capitalization.py
index 1c84fe2bf..cbc400650 100644
--- a/proselint/checks/misc/capitalization.py
+++ b/proselint/checks/misc/capitalization.py
@@ -108,4 +108,4 @@ def check_beginning_of_sentences(text):
                                sentences_tobe_capitalized,
                                err, msg, require_padding=False)
     else:
-        return []
\ No newline at end of file
+        return []
diff --git a/tests/test_misc_capitalization.py b/tests/test_misc_capitalization.py
index 475f4afad..ccd013052 100644
--- a/tests/test_misc_capitalization.py
+++ b/tests/test_misc_capitalization.py
@@ -32,9 +32,7 @@ def test_smoke_check_days(self):
         assert chk.check_days("""It happened on friday.""") != []
 
     def test_smoke_check_beginning_of_sentences(self):
-        """
-        Basic smoke test for misc.capitalization.check_beginning_of_sentences.
-        """
+        """Smoke test for misc.capitalization.check_beginning_of_sentences."""
         assert chk.check_beginning_of_sentences("Smoke " +
                                                 "sentence with nothing " +
                                                 "flagged.") == []

From 96050f2a8669328da0e234f9c2249ea3ab7caade Mon Sep 17 00:00:00 2001
From: KVGarg <co16326.ccet@gmail.com>
Date: Sat, 20 Oct 2018 15:43:43 +0530
Subject: [PATCH 3/3] Loosened the websites pattern to accept any characters

---
 proselint/tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proselint/tools.py b/proselint/tools.py
index 627c78a49..610042bab 100644
--- a/proselint/tools.py
+++ b/proselint/tools.py
@@ -38,7 +38,7 @@
             (Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|
             Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"""
 acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
-websites = "[.]([A-Za-z0-9]+)[.](com|net|org|io|gov)"
+websites = "[.]([\w\W\d\D]+)[.](com|net|org|io|gov)"
 digits = "([0-9])"