
Commit cbaf7aa

fixed broken push: copied over files
1 parent 3ab0ff1 commit cbaf7aa

3 files changed: +165 -32 lines changed

.gitignore

+1 -1

@@ -7,5 +7,5 @@ gpt/data/*/*.bin
 gpt/data/openwebtext/test.hf
 gpt/lm_config.py
 */__pycache__
-umb/audit_data/*/
+umb/audit_data/*/*
 
umb/audit_data/audit_parser.py

+117 -31

@@ -8,6 +8,54 @@
 import regex as re
 import json
 import sys
+import json
+
+section_pattern = r"(NO|OK|\+\-)\s+(((\s+[A-Z0-9\:\&\/\']+)+)(\s+\*+(\s+(\s+([A-Z0-9\:\&\/\']+))+)?)?)"
+subsection_pattern = r"(\s{3,}|\n\s*)(\-|\+|ip\s*(\-|\+))(\s+R\s)?(\s+[0-9]\))?\s*(([A-Z0-9(\/)\:\']([A-Za-z0-9\/\:\'\-]+\s)*)([A-Za-z0-9\/\:\-\']+))(\s{3,}|\n\s*)"
+entry_pattern = r"(([a-zA-Z0-9]{4})\s+([a-zA-z]+\s?[0-9]+([a-zA-Z]?)+)\s+([0-9]+\.[0-9]+)\s+([A-Za-z\-\+\/]+)\s+(\>S|\>X|\>\-|RP|\>D|\>R)?\s+((([\w\&\/]+\s)+)?\w+(\n|[^.])?))"
+reqs_pattern = r"((Needs\:)\s+([1-9]+)\s*.+(\n\s+)?)?(Select from\:)(((\s+[A-Z]+\s*)?(\,)?([0-9]+[A-Z]*)(\([A-Z0-9\s]+\))?(\,)?)+)"
+req_option_pattern = r"([A-Z]+)?\s*((([0-9]+)([A-Z])?)(\s*TO\s*([0-9]+))?)"
+
+course_catalog = {}
+with open("../course_data/data.json", "r") as f:
+    course_catalog = json.load(f)
+
+print(len(course_catalog.keys()))
+
+class Response:
+    def __init__(self, status: bool, message: str):
+        self.status = status
+        self.message = message
+
+    def __dict__(self):
+        return {
+            "status": self.status,
+            "message": self.message
+        }
+
+    def __str__(self):
+        print(self.__dict__())
+
+def is_course_available(key):
+    if key not in course_catalog.keys():
+        return Response(False, "not found")
+
+    if "sessions" not in course_catalog[key].keys():
+        return Response(False, "no sessions available")
+
+    session_available = False
+    for course_session in course_catalog[key]["sessions"]:
+        is_full = int(course_session["enrolled"]) >= int(course_session["capacity"])
+        is_closed = course_session["status"].lower() != "open"
+        if not is_full and not is_closed:
+            session_available = True
+
+    if not session_available:
+        return Response(False, "all sessions are closed or unavailable")
+
+    return Response(True, "session is available")
+
+
 
 def extract_as_txt(filename):
     reader = PdfReader(f"pdf/{filename}.pdf")
@@ -22,7 +70,7 @@ def extract_as_txt(filename):
 
     f.close()
 
-def extract_section(text, pattern):
+def extract_section(pattern, text):
     s = list(re.finditer(pattern, text))
     matches = [x.start() for x in s]
     matches_shifted = matches[1:] + [len(text)]
@@ -42,6 +90,56 @@ def add_entry(group_list):
     }
     entry_data.append(data)
 
+def add_all_entries(text):
+    global entry_pattern
+    entries = extract_section(entry_pattern, text)
+    for entry_obj, _ in entries:
+        group_list = list(entry_obj.groups())
+        add_entry(group_list)
+
+def add_req(key):
+    if key in course_catalog.keys():
+        data = {
+            "course": key,
+            "type": " ".join(str(section_key).split()),
+            "subtype": " ".join(str(entry_title).split()),
+            "availability": is_course_available(key).__dict__(),
+            **(course_catalog[key])
+        }
+    else:
+        data = {
+            "course": key,
+            "type": " ".join(str(section_key).split()),
+            "subtype": " ".join(str(entry_title).split()),
+            "availability": is_course_available(key).__dict__()
+        }
+    req_data.append(data)
+
+def add_all_reqs(text):
+    reqs = extract_section(reqs_pattern, text)
+    course_code = ""
+    for _, reqs_str in reqs:
+        req_options = extract_section(req_option_pattern, reqs_str)
+        for req_option_obj, req_option_str in list(req_options)[1:]:
+            program_prefix = req_option_obj.group(1)
+            program_suffix = req_option_obj.group(4)
+            program_range = req_option_obj.group(7)
+            min = int(program_suffix)
+            max = min
+            if program_prefix is not None:
+                course_code = program_prefix
+
+            if program_range is not None:
+                max = int(program_range)
+
+            for i in range(min, max):
+                key = course_code + str(i) if i != min else course_code + req_option_obj.group(3)
+                if key in course_catalog.keys():
+                    add_req(key)
+
+            key = course_code + req_option_obj.group(3)
+            add_req(key)
+
 if __name__ == "__main__":
     audit_name = sys.argv[1]
     extract_as_txt(audit_name)
@@ -51,49 +149,37 @@ def add_entry(group_list):
     f.close()
 
     entry_data = []
-    section_pattern = r"(NO|OK|\+\-)\s+(((\s+[A-Z0-9\:\&\/\']+)+)(\s+\*+(\s+(\s+([A-Z0-9\:\&\/\']+))+)?)?)"
-    subsection_pattern = r"(\s{3,}|\n\s*)(\-|\+|ip\s*(\-|\+))(\s+R\s)?(\s+[0-9]\))?\s*(([A-Z0-9(\/)\:\']([A-Za-z0-9\/\:\'\-]+\s)*)([A-Za-z0-9\/\:\-\']+))(\s{3,}|\n\s*)"
-    entry_pattern = r"(([a-zA-Z0-9]{4})\s+([a-zA-z]+\s?[0-9]+([a-zA-Z]?)+)\s+([0-9]+\.[0-9]+)\s+([A-Za-z\-\+\/]+)\s+(\>S|\>X|\>\-|RP|\>D|\>R)?\s+((([\w\&\/]+\s)+)?\w+(\n|[^.])?))"
-    sections = extract_section(text, section_pattern)
-    validation_length = len([_ for _ in extract_section(text, entry_pattern)])
+    req_data = []
+
+    sections = extract_section(section_pattern, text)
+    validation_length = len([_ for _ in extract_section(entry_pattern, text)])
 
     for section_obj, section_str in sections:
-        subsections = extract_section(section_str, subsection_pattern)
+        subsections = list(extract_section(subsection_pattern, section_str))
         section_key = section_obj.groups()[2]
-        has_subsection = False
+
+        if len(subsections) == 0:
+            entry_title = section_key
+            add_all_entries(section_str)
+            add_all_reqs(section_str)
+            continue
 
         for subsection_obj, subsection_str in subsections:
             has_subsection = True
             entry_title = subsection_obj.groups()[5]
-
-            entries = extract_section(subsection_str, entry_pattern)
-            for entry_obj, entry_str in entries:
-                group_list = list(entry_obj.groups())
-                add_entry(group_list)
-
-        if has_subsection:
-            continue
-
-        entries = extract_section(section_str, entry_pattern)
-        entry_title = section_key
-        for entry_obj, entry_str in entries:
-            group_list = list(entry_obj.groups())
-            add_entry(group_list)
+            add_all_entries(subsection_str)
+            add_all_reqs(subsection_str)
 
-    with open(f"json/{audit_name}.json", "w+") as outfile:
+    with open(f"json/{audit_name}-past.json", "w+") as outfile:
         json.dump(entry_data, outfile, indent=4)
 
+    with open(f"json/{audit_name}.json", "w+") as outfile:
+        json.dump(req_data, outfile, indent=4)
+
     if len(entry_data) != validation_length:
         print(f"Warning: Failed to parse {validation_length - len(entry_data)} entries")
         print("\tTotal entries:", len(entry_data), "Expected entries:", validation_length, "\n")
     else:
        print("Parsing complete! ", end="")
 
-    print("Saved as", f"json/{audit_name}.json")
-
-
-# (NO|OK|\+\-)\s+(((\s+[A-Z0-9\:\&\/\']+)+)(\s+\*+(\s+(\s+([A-Z0-9\:\&\/\']+))+)?)?)
-# ((NO|OK|\+\-)\s+(((\s+[A-Z0-9\:\&\/\']+)+)(\s+\*+(\s+(\s+([A-Z0-9\:\&\/\']+))+)?)?)|(\n\s|\s\s|\n)(ip\s)?(\-|\+)\s{2,}([0-9]+\))?\s?([A-Za-z\/\-\:0-9]+\s(\-\s)?)+)\s(^(?:(?!((NO|OK|\+\-)\s+([A-Za-z0-9]+( [A-Za-z0-9]+)+)\s+\*+|(\n\s|\s\s|\n)(ip\s)?(\-|\+)\s{2,}([0-9]+\))?\s?([A-Za-z\/\-\:0-9]+\s(\-\s)?)+)).)*$\n){0,}
-# ((NO|OK|\+\-)\s+([A-Za-z0-9]+( [A-Za-z0-9]+)+)\s+\*+|(\n\s|\s\s|\n)(ip\s)?(\-|\+)\s{2,}([0-9]+\))?\s?([A-Za-z\/\-\:0-9]+\s(\-\s)?)+)\s(^(?:(?!((NO|OK|\+\-)\s+([A-Za-z0-9]+( [A-Za-z0-9]+)+)\s+\*+|(\n\s|\s\s|\n)(ip\s)?(\-|\+)\s{2,}([0-9]+\))?\s?([A-Za-z\/\-\:0-9]+\s(\-\s)?)+)).)*$\n){0,}
-# (\s{3,}|\n\s*)(\-|\+|ip\s*(\-|\+))(\s+R\s)?(\s+[0-9]\))?\s*([A-Za-z0-9]+)(\s[A-Za-z0-9]+)*(\s{3,}|\n\s*)
-# (\s{3,}|\n\s*)(\-|\+|ip\s*(\-|\+))(\s+R\s)?(\s+[0-9]\))?\s*(([A-Z0-9(\/)\:\']([A-Za-z0-9\/\:\'\-]+\s)*)([A-Za-z0-9\/\:\-\']+))(\s{3,}|\n\s*)
+    print("Saved as", f"json/{audit_name}.json")

umb/audit_data/extract_urls.py

+47

@@ -0,0 +1,47 @@
+import PyPDF2
+import pdfplumber
+
+def extract_urls_with_text(pdf_path):
+    urls_and_text = []
+
+    # Use PyPDF2 to get URLs
+    with open(pdf_path, 'rb') as pdf_file:
+        reader = PyPDF2.PdfReader(pdf_file)
+
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_num, page in enumerate(reader.pages):
+                # Extract annotations (URLs) if they exist
+                if "/Annots" in page:
+                    annotations = page["/Annots"]
+
+                    for annotation in annotations:
+                        annot_obj = annotation.get_object()
+
+                        # Check if the annotation has a URI (URL)
+                        if annot_obj.get("/Subtype") == "/Link" and "/A" in annot_obj:
+                            uri = annot_obj["/A"].get("/URI")
+                            # Attempt to get /Contents directly, or set to None if not available
+                            text = annot_obj.get("/Contents")
+
+                            # If /Contents isn't available, fall back to extracting surrounding text
+                            if not text:
+                                # Get coordinates for the link annotation (if available)
+                                rect = annot_obj.get("/Rect")
+                                if rect:
+                                    x0, y0, x1, y1 = rect # Coordinates of the link area
+
+                                    # Use pdfplumber to extract text in this area
+                                    pdf_page = pdf.pages[page_num]
+                                    text = pdf_page.within_bbox((float(x0), float(y0), float(x1), float(y1))).extract_text() or "No text available"
+
+                            if uri:
+                                urls_and_text.append((text, uri))
+
+    return urls_and_text
+
+# Example usage
+pdf_path = "pdf/ahnaf_audit.pdf"
+extracted_data = extract_urls_with_text(pdf_path)
+
+for text, url in extracted_data:
+    print(f"Text: {text} -> URL: {url}")
