Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added checks for capitalizing the beginning of sentences #897

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion proselint/checks/misc/capitalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
Incorrect capitalization.

"""
from proselint.tools import memoize, preferred_forms_check
from proselint.tools import (memoize,
preferred_forms_check,
split_into_sentences,
existence_check)


@memoize
Expand Down Expand Up @@ -91,3 +94,18 @@ def check_days(text):
]

return preferred_forms_check(text, list, err, msg, ignore_case=False)


@memoize
def check_beginning_of_sentences(text):
    """Check that every sentence begins with a capital letter.

    The text is split with ``split_into_sentences``; each sentence whose
    first character is a lowercase letter is reported through
    ``existence_check``.

    :param text: the text to lint.
    :returns: the list of errors produced by ``existence_check``, or an
        empty list when no sentence starts with a lowercase letter.
    """
    # Imported locally so this check does not depend on a module-level
    # ``import re`` being present.
    import re

    err = "misc.capitalization"
    msg = "Sentence `{}` should begin with capital letter."

    # Only the first 50 characters of an offending sentence are searched
    # for, which keeps the reported snippet short.  The snippet is escaped
    # because existence_check builds a regular expression from its entries:
    # an unescaped "(" or "?" in ordinary prose would otherwise raise
    # re.error or match the wrong span.
    # The ``sentence and`` guard avoids an IndexError on empty entries.
    offenders = [
        re.escape(sentence[:50])
        for sentence in split_into_sentences(text)
        if sentence and sentence[0].islower()
    ]

    # An empty pattern list must not reach existence_check.
    if not offenders:
        return []

    return existence_check(text, offenders, err, msg, require_padding=False)
66 changes: 56 additions & 10 deletions proselint/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@

from __future__ import print_function
from __future__ import unicode_literals
import sys
import traceback
import os
import shelve
import inspect

import functools
import re
import hashlib
import json
import importlib
import inspect
import json
import os
import re
import shelve
import sys
import traceback

try:
import dbm
Expand All @@ -30,6 +31,50 @@

proselint_path = os.path.dirname(os.path.realpath(__file__))

# Pattern fragments used by split_into_sentences.  They are kept as plain
# (uncompiled) strings because the splitter concatenates them into larger
# patterns.  Raw strings are used so that ``\s`` / ``\w`` reach the regex
# engine untouched instead of being treated as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
# Words that typically open a new sentence.  Built by implicit string
# concatenation: a triple-quoted literal would embed newline characters in
# the pattern, and since the splitter replaces every newline with a space
# such a pattern could never match.
starters = (r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|"
            r"Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)")
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A whitespace-free run of dot-separated labels ending in a known top-level
# domain, e.g. ``www.example.com``.  The whole host name is one group so
# that every dot inside it can be protected.
websites = r"((?:[\w-]+[.])+(?:com|net|org|io|gov))\b"
digits = r"([0-9])"


def split_into_sentences(text):
    """Split the text into sentences.

    Periods that do not end a sentence (titles such as ``Dr.``, host names,
    decimal numbers, initials, acronyms, company suffixes) are temporarily
    replaced by the marker ``<prd>``; real sentence ends get the marker
    ``<stop>``.  The text is then split on ``<stop>`` and the protected
    periods are restored.

    :param text: the text to split; newlines are treated as spaces.
    :returns: a list of non-empty sentences with surrounding whitespace
        stripped.  Trailing text without a terminating ``.``, ``?`` or
        ``!`` is returned as a final sentence rather than dropped.
    """
    # Pad with spaces so rules anchored on a leading/trailing space also
    # apply at the very start and end of the text.
    text = " " + text + " "
    text = text.replace("\n", " ")

    # --- protect periods that are not sentence terminators ---------------
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites,
                  lambda match: match.group(1).replace(".", "<prd>"),
                  text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                  r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                  r"\1<prd>\2<prd>", text)
    # A company suffix followed by a sentence starter ends the sentence;
    # the suffix keeps its period (as <prd>, so it is not split a second
    # time by the generic "." rule below).
    text = re.sub(" " + suffixes + "[.] " + starters,
                  r" \1<prd><stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # --- move terminators outside closing quotes -------------------------
    # so that the closing quote stays attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # --- mark real sentence ends, then restore protected periods ---------
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Drop empty fragments (e.g. the padding after the final terminator)
    # instead of blindly discarding the last fragment, which would lose an
    # unterminated final sentence.
    fragments = (fragment.strip() for fragment in text.split("<stop>"))
    return [fragment for fragment in fragments if fragment]


def close_cache_shelves():
"""Close previously opened cache shelves."""
Expand All @@ -44,6 +89,7 @@ def close_cache_shelves_after(f):
def wrapped(*args, **kwargs):
f(*args, **kwargs)
close_cache_shelves()

return wrapped


Expand Down Expand Up @@ -261,7 +307,7 @@ def lint(input_file, debug=False):
(line, column) = line_and_column(text, start)
if not is_quoted(start, text):
errors += [(check, message, line, column, start, end,
end - start, "warning", replacements)]
end - start, "warning", replacements)]

if len(errors) > options["max_errors"]:
break
Expand All @@ -275,7 +321,7 @@ def lint(input_file, debug=False):
def assert_error(text, check, n=1):
    """Assert that linting ``text`` reports an error of type ``check``.

    The failure description is stored on the function object itself as
    ``assert_error.description``.

    NOTE(review): ``n`` is accepted but not used by the visible code; the
    assertion only requires that ``check`` is reported at least once.
    """
    assert_error.description = "No {} error for '{}'".format(check, text)
    # The first field of every lint result is the name of the check.
    reported_checks = [error[0] for error in lint(text)]
    assert check in reported_checks


def consistency_check(text, word_pairs, err, msg, offset=0):
Expand Down Expand Up @@ -426,7 +472,7 @@ def find_ranges(text):
s = 2
elif s == 2:
if c in seps:
ranges.append((start+1, i-1))
ranges.append((start + 1, i - 1))
start = None
s = 0
else:
Expand Down
12 changes: 10 additions & 2 deletions tests/test_misc_capitalization.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
"""Tests for misc.capitalization check."""
from __future__ import absolute_import

from .check import Check

from proselint.checks.misc import capitalization as chk

from .check import Check


class TestCheck(Check):
"""The test class for misc.capitalization."""
Expand All @@ -30,3 +30,11 @@ def test_smoke_check_days(self):
"""Basic smoke test for misc.capitalization.check_days."""
assert chk.check_days("""Smoke phrase with nothing flagged""") == []
assert chk.check_days("""It happened on friday.""") != []

def test_smoke_check_beginning_of_sentences(self):
    """Smoke test for misc.capitalization.check_beginning_of_sentences."""
    # A properly capitalized sentence must not be flagged ...
    capitalized = "Smoke sentence with nothing flagged."
    assert chk.check_beginning_of_sentences(capitalized) == []

    # ... while a sentence starting with a lowercase letter must be.
    uncapitalized = "the sentence begins with lowercase letter."
    assert chk.check_beginning_of_sentences(uncapitalized) != []