Skip to content

Commit 3ab0ff1

Browse files
committed
Fix: replace page breaks with a newline character
1 parent bf39ed1 commit 3ab0ff1

File tree

1 file changed

+2
-3
lines changed

1 file changed

+2
-3
lines changed

umb/audit_data/audit_parser.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ def extract_as_txt(filename):
1717
for i in range(number_of_pages):
1818
page = reader.pages[i]
1919
text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
20-
text = re.sub(re.compile("\s+Page\s[0-9]+\sof\s[0-9]+(\s+)?"), "", text)
20+
text = re.sub(re.compile("\s+Page\s[0-9]+\sof\s[0-9]+(\s+)?"), "\n", text)
2121
f.write(text)
2222

2323
f.close()
@@ -51,9 +51,8 @@ def add_entry(group_list):
5151
f.close()
5252

5353
entry_data = []
54-
text = "\n".join(re.split(re.compile("\s+Page\s[0-9]+\sof\s[0-9]+\s+"), text))
5554
section_pattern = r"(NO|OK|\+\-)\s+(((\s+[A-Z0-9\:\&\/\']+)+)(\s+\*+(\s+(\s+([A-Z0-9\:\&\/\']+))+)?)?)"
56-
subsection_pattern = r"(\s{3,}|\n\s*|GPA)(\-|\+|ip\s*(\-|\+))(\s+R\s)?(\s+[0-9]\))?\s*(([A-Z0-9(\/)\:\']([A-Za-z0-9\/\:\'\-]+\s)*)([A-Za-z0-9\/\:\-\']+))(\s{3,}|\n\s*)"
55+
subsection_pattern = r"(\s{3,}|\n\s*)(\-|\+|ip\s*(\-|\+))(\s+R\s)?(\s+[0-9]\))?\s*(([A-Z0-9(\/)\:\']([A-Za-z0-9\/\:\'\-]+\s)*)([A-Za-z0-9\/\:\-\']+))(\s{3,}|\n\s*)"
5756
entry_pattern = r"(([a-zA-Z0-9]{4})\s+([a-zA-z]+\s?[0-9]+([a-zA-Z]?)+)\s+([0-9]+\.[0-9]+)\s+([A-Za-z\-\+\/]+)\s+(\>S|\>X|\>\-|RP|\>D|\>R)?\s+((([\w\&\/]+\s)+)?\w+(\n|[^.])?))"
5857
sections = extract_section(text, section_pattern)
5958
validation_length = len([_ for _ in extract_section(text, entry_pattern)])

0 commit comments

Comments
 (0)