From 9e39a1c78a1a62a9dccbbae15e964c2396176665 Mon Sep 17 00:00:00 2001 From: jpahm <20374744+jpahm@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:23:25 -0500 Subject: [PATCH] Parser cleanup & validator optimization --- parser/courseParser.go | 81 +++ parser/parser.go | 1071 +------------------------------------ parser/professorParser.go | 43 ++ parser/profileLoader.go | 51 ++ parser/requisiteParser.go | 578 ++++++++++++++++++++ parser/sectionParser.go | 157 ++++++ parser/utils.go | 90 ++++ parser/validator.go | 108 ++++ 8 files changed, 1120 insertions(+), 1059 deletions(-) create mode 100644 parser/courseParser.go create mode 100644 parser/professorParser.go create mode 100644 parser/profileLoader.go create mode 100644 parser/requisiteParser.go create mode 100644 parser/sectionParser.go create mode 100644 parser/utils.go create mode 100644 parser/validator.go diff --git a/parser/courseParser.go b/parser/courseParser.go new file mode 100644 index 0000000..1f36069 --- /dev/null +++ b/parser/courseParser.go @@ -0,0 +1,81 @@ +package parser + +import ( + "fmt" + "regexp" + "strconv" + + "github.com/UTDNebula/nebula-api/api/schema" + "go.mongodb.org/mongo-driver/bson/primitive" +) + +var coursePrefixRexp *regexp.Regexp = regexp.MustCompile(`^([A-Z]{2,4})([0-9V]{4})`) +var contactRegexp *regexp.Regexp = regexp.MustCompile(`\(([0-9]+)-([0-9]+)\)\s+([SUFY]+)`) + +func getCatalogYear(session schema.AcademicSession) string { + sessionYear, err := strconv.Atoi(session.Name[0:2]) + if err != nil { + panic(err) + } + sessionSemester := session.Name[2] + switch sessionSemester { + case 'F': + return strconv.Itoa(sessionYear) + case 'S': + return strconv.Itoa(sessionYear - 1) + case 'U': + return strconv.Itoa(sessionYear - 1) + default: + panic(fmt.Errorf("encountered invalid session semester '%c!'", sessionSemester)) + } +} + +func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) *schema.Course { + // Courses are internally keyed by their internal course number and the catalog year they're part of + catalogYear := getCatalogYear(session) + courseKey := courseNum + catalogYear + + // Don't recreate the course if it already exists + course, courseExists := Courses[courseKey] + if courseExists { + return course + } + + // Get subject prefix and course number by doing a regexp match on the section id + sectionId := classInfo["Class Section:"] + idMatches := coursePrefixRexp.FindStringSubmatch(sectionId) + + course = &schema.Course{} + + course.Id = primitive.NewObjectID() + course.Course_number = idMatches[2] + course.Subject_prefix = idMatches[1] + course.Title = rowInfo["Course Title:"] + course.Description = rowInfo["Description:"] + course.School = rowInfo["College:"] + course.Credit_hours = classInfo["Semester Credit Hours:"] + course.Class_level = classInfo["Class Level:"] + course.Activity_type = classInfo["Activity Type:"] + course.Grading = classInfo["Grading:"] + course.Internal_course_number = courseNum + + // Get closure for parsing course requisites (god help me) + enrollmentReqs, hasEnrollmentReqs := rowInfo["Enrollment Reqs:"] + ReqParsers[course.Id] = getReqParser(course, hasEnrollmentReqs, enrollmentReqs) + + // Try to get lecture/lab contact hours and offering frequency from course description + contactMatches := contactRegexp.FindStringSubmatch(course.Description) + // Length of contactMatches should be 4 upon successful match + if len(contactMatches) == 4 { + course.Lecture_contact_hours = contactMatches[1] + 
course.Laboratory_contact_hours = contactMatches[2] + course.Offering_frequency = contactMatches[3] + } + + // Set the catalog year + course.Catalog_year = catalogYear + + Courses[courseKey] = course + CourseIDMap[course.Id] = courseKey + return course +} diff --git a/parser/parser.go b/parser/parser.go index 65330d4..978d1da 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -1,12 +1,9 @@ package parser import ( - "encoding/json" "fmt" "log" "os" - "regexp" - "strconv" "strings" "time" @@ -52,7 +49,11 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { // Find paths of all scraped data paths := getAllSectionFilepaths(inDir) - log.Printf("Parsing and validating %d files...\n", len(paths)) + if !skipValidation { + log.Printf("Parsing and validating %d files...\n", len(paths)) + } else { + log.Printf("Parsing %d files WITHOUT VALIDATION...\n", len(paths)) + } // Parse all data for _, path := range paths { @@ -62,8 +63,10 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { log.Printf("\nParsing complete. Created %d courses, %d sections, and %d professors.\n", len(Courses), len(Sections), len(Professors)) log.Print("\nParsing course requisites...\n") + // Initialize matchers at runtime for requisite parsing; this is necessary to avoid circular reference errors with compile-time initialization initMatchers() + for _, course := range Courses { ReqParsers[course.Id]() } @@ -82,73 +85,9 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { } // Write validated data to output files - fptr, err := os.Create(fmt.Sprintf("%s/courses.json", outDir)) - if err != nil { - panic(err) - } - encoder := json.NewEncoder(fptr) - encoder.SetIndent("", "\t") - encoder.Encode(getMapValues(Courses)) - fptr.Close() - - fptr, err = os.Create(fmt.Sprintf("%s/sections.json", outDir)) - if err != nil { - panic(err) - } - encoder = json.NewEncoder(fptr) - encoder.SetIndent("", "\t") - encoder.Encode(getMapValues(Sections)) - fptr.Close() - - fptr, err = os.Create(fmt.Sprintf("%s/professors.json", outDir)) - if err != nil { - panic(err) - } - encoder = json.NewEncoder(fptr) - encoder.SetIndent("", "\t") - encoder.Encode(getMapValues(Professors)) - fptr.Close() -} - -func loadProfiles(inDir string) { - fptr, err := os.Open(fmt.Sprintf("%s/profiles.json", inDir)) - if err != nil { - log.Print("Couldn't find/open profiles.json in the input directory. 
Skipping profile load.\n") - return - } - - decoder := json.NewDecoder(fptr) - - log.Print("Beginning profile load.\n") - - // Read open bracket - _, err = decoder.Token() - if err != nil { - panic(err) - } - - // While the array contains values - profileCount := 0 - for ; decoder.More(); profileCount++ { - // Decode a professor - var prof schema.Professor - err := decoder.Decode(&prof) - if err != nil { - panic(err) - } - professorKey := prof.First_name + prof.Last_name - Professors[professorKey] = &prof - ProfessorIDMap[prof.Id] = professorKey - } - - // Read closing bracket - _, err = decoder.Token() - if err != nil { - panic(err) - } - - log.Printf("Loaded %d profiles!\n\n", profileCount) - fptr.Close() + writeJSON(fmt.Sprintf("%s/courses.json", outDir), getMapValues(Courses)) + writeJSON(fmt.Sprintf("%s/sections.json", outDir), getMapValues(Sections)) + writeJSON(fmt.Sprintf("%s/professors.json", outDir), getMapValues(Professors)) } // Internal parse function @@ -214,993 +153,7 @@ func parse(path string) { session := getAcademicSession(rowInfo, classInfo) // Try to create the course and section based on collected info - courseRef := addCourse(courseNum, session, rowInfo, classInfo) - addSection(courseRef, classNum, syllabusURI, session, rowInfo, classInfo) + courseRef := parseCourse(courseNum, session, rowInfo, classInfo) + parseSection(courseRef, classNum, syllabusURI, session, rowInfo, classInfo) log.Print("Parsed!\n") } - -var coursePrefixRexp *regexp.Regexp = regexp.MustCompile(`^([A-Z]{2,4})([0-9V]{4})`) -var contactRegexp *regexp.Regexp = regexp.MustCompile(`\(([0-9]+)-([0-9]+)\)\s+([SUFY]+)`) - -func getCatalogYear(session schema.AcademicSession) string { - sessionYear, err := strconv.Atoi(session.Name[0:2]) - if err != nil { - panic(err) - } - sessionSemester := session.Name[2] - switch sessionSemester { - case 'F': - return strconv.Itoa(sessionYear) - case 'S': - return strconv.Itoa(sessionYear - 1) - case 'U': - return strconv.Itoa(sessionYear - 1) - default: - panic(fmt.Errorf("encountered invalid session semester '%c!'", sessionSemester)) - } -} - -func addCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) *schema.Course { - - // Courses are internally keyed by their internal course number and the catalog year they're part of - catalogYear := getCatalogYear(session) - courseKey := courseNum + catalogYear - - // Don't recreate the course if it already exists - course, courseExists := Courses[courseKey] - if courseExists { - return course - } - - // Get subject prefix and course number by doing a regexp match on the section id - sectionId := classInfo["Class Section:"] - idMatches := coursePrefixRexp.FindStringSubmatch(sectionId) - - course = &schema.Course{} - - course.Id = primitive.NewObjectID() - course.Course_number = idMatches[2] - course.Subject_prefix = idMatches[1] - course.Title = rowInfo["Course Title:"] - course.Description = rowInfo["Description:"] - course.School = rowInfo["College:"] - course.Credit_hours = classInfo["Semester Credit Hours:"] - course.Class_level = classInfo["Class Level:"] - course.Activity_type = classInfo["Activity Type:"] - course.Grading = classInfo["Grading:"] - course.Internal_course_number = courseNum - - // Get closure for parsing course requisites (god help me) - enrollmentReqs, hasEnrollmentReqs := rowInfo["Enrollment Reqs:"] - ReqParsers[course.Id] = getReqParser(course, hasEnrollmentReqs, enrollmentReqs) - - // Try to get lecture/lab contact hours and offering frequency 
from course description - contactMatches := contactRegexp.FindStringSubmatch(course.Description) - // Length of contactMatches should be 4 upon successful match - if len(contactMatches) == 4 { - course.Lecture_contact_hours = contactMatches[1] - course.Laboratory_contact_hours = contactMatches[2] - course.Offering_frequency = contactMatches[3] - } - - // Set the catalog year - course.Catalog_year = catalogYear - - Courses[courseKey] = course - CourseIDMap[course.Id] = courseKey - return course -} - -/* -// Below is the code for the requisite parser. It is *by far* the most complicated code in this entire project. -// In summary, it uses a bottom-up "stack"-based parsing technique, building requisites by taking small groups of text, parsing those groups, -// storing them on the "stack", and then uses those previously parsed groups as dependencies for parsing the larger "higher level" groups. -*/ - -////////////////////////////////////////////////// BEGIN REQUISITE PARSER CODE ////////////////////////////////////////////////// - -// Regex matcher object for requisite group parsing -type Matcher struct { - Regex *regexp.Regexp - Handler func(string, []string) interface{} -} - -// Regex for group tags -var groupTagRegex = regexp.MustCompile(`@(\d+)`) - -////////////////////// BEGIN MATCHER FUNCS ////////////////////// - -var ANDRegex = regexp.MustCompile(`(?i)\s+and\s+`) - -func ANDMatcher(group string, subgroups []string) interface{} { - // Split text along " and " boundaries, then parse subexpressions as groups into an "AND" CollectionRequirement - subExpressions := ANDRegex.Split(group, -1) - parsedSubExps := make([]interface{}, 0, len(subExpressions)) - for _, exp := range subExpressions { - parsedExp := parseGroup(trimWhitespace(exp)) - // Don't include throwaways - if !reqIsThrowaway(parsedExp) { - parsedSubExps = append(parsedSubExps, parsedExp) - } - } - - parsedSubExps = joinAdjacentOthers(parsedSubExps, " and ") - - if len(parsedSubExps) > 1 { - return schema.NewCollectionRequirement("AND", len(parsedSubExps), parsedSubExps) - } else { - return parsedSubExps[0] - } -} - -// First regex subgroup represents the text to be subgrouped and parsed with parseFnc -// Ex: Text is: "(OPRE 3360 or STAT 3360 or STAT 4351), and JSOM majors and minors only" -// Regex would be: "(JSOM majors and minors only)" -// Resulting substituted text would be: "(OPRE 3360 or STAT 3360 or STAT 4351), and @N", where N is some group number -// When @N is dereferenced from the requisite list, it will have a value equivalent to the result of parseFnc(group, subgroups) - -func SubstitutionMatcher(parseFnc func(string, []string) interface{}) func(string, []string) interface{} { - // Return a closure that uses parseFnc to substitute subgroups[1] - return func(group string, subgroups []string) interface{} { - // If there's no text to substitute, just return an OtherRequirement - if len(subgroups) < 2 { - return OtherMatcher(group, subgroups) - } - // Otherwise, substitute subgroups[1] and parse it with parseFnc - return parseGroup(makeSubgroup(group, subgroups[1], parseFnc(group, subgroups))) - } -} - -var ORRegex = regexp.MustCompile(`(?i)\s+or\s+`) - -func ORMatcher(group string, subgroups []string) interface{} { - // Split text along " or " boundaries, then parse subexpressions as groups into an "OR" CollectionRequirement - subExpressions := ORRegex.Split(group, -1) - parsedSubExps := make([]interface{}, 0, len(subExpressions)) - for _, exp := range subExpressions { - parsedExp := parseGroup(trimWhitespace(exp)) - 
// Don't include throwaways - if !reqIsThrowaway(parsedExp) { - parsedSubExps = append(parsedSubExps, parsedExp) - } - } - - parsedSubExps = joinAdjacentOthers(parsedSubExps, " or ") - - if len(parsedSubExps) > 1 { - return schema.NewCollectionRequirement("OR", 1, parsedSubExps) - } else { - return parsedSubExps[0] - } -} - -func CourseMinGradeMatcher(group string, subgroups []string) interface{} { - icn, err := findICN(subgroups[1], subgroups[2]) - if err != nil { - log.Printf("WARN: %s\n", err) - return OtherMatcher(group, subgroups) - } - return schema.NewCourseRequirement(icn, subgroups[3]) -} - -func CourseMatcher(group string, subgroups []string) interface{} { - icn, err := findICN(subgroups[1], subgroups[2]) - if err != nil { - log.Printf("WARN: %s\n", err) - return OtherMatcher(group, subgroups) - } - return schema.NewCourseRequirement(icn, "F") -} - -func ConsentMatcher(group string, subgroups []string) interface{} { - return schema.NewConsentRequirement(subgroups[1]) -} - -func LimitMatcher(group string, subgroups []string) interface{} { - hourLimit, err := strconv.Atoi(subgroups[1]) - if err != nil { - panic(err) - } - return schema.NewLimitRequirement(hourLimit) -} - -func MajorMatcher(group string, subgroups []string) interface{} { - return schema.NewMajorRequirement(subgroups[1]) -} - -func MinorMatcher(group string, subgroups []string) interface{} { - return schema.NewMinorRequirement(subgroups[1]) -} - -func MajorMinorMatcher(group string, subgroups []string) interface{} { - return schema.NewCollectionRequirement("OR", 1, []interface{}{*schema.NewMajorRequirement(subgroups[1]), *schema.NewMinorRequirement(subgroups[1])}) -} - -func CoreMatcher(group string, subgroups []string) interface{} { - hourReq, err := strconv.Atoi(subgroups[1]) - if err != nil { - panic(err) - } - return schema.NewCoreRequirement(subgroups[2], hourReq) -} - -func CoreCompletionMatcher(group string, subgroups []string) interface{} { - return schema.NewCoreRequirement(subgroups[1], -1) -} - -func ChoiceMatcher(group string, subgroups []string) interface{} { - collectionReq, ok := parseGroup(subgroups[1]).(*schema.CollectionRequirement) - if !ok { - log.Printf("WARN: ChoiceMatcher wasn't able to parse subgroup '%s' into a CollectionRequirement!", subgroups[1]) - return OtherMatcher(group, subgroups) - } - return schema.NewChoiceRequirement(collectionReq) -} - -func GPAMatcher(group string, subgroups []string) interface{} { - GPAFloat, err := strconv.ParseFloat(subgroups[1], 32) - if err != nil { - panic(err) - } - return schema.NewGPARequirement(GPAFloat, "") -} - -func ThrowawayMatcher(group string, subgroups []string) interface{} { - return schema.Requirement{Type: "throwaway"} -} - -func GroupTagMatcher(group string, subgroups []string) interface{} { - groupIndex, err := strconv.Atoi(subgroups[1]) - if err != nil { - panic(err) - } - // Return a throwaway if index is out of range - if groupIndex < 0 || groupIndex >= len(requisiteList) { - return schema.Requirement{Type: "throwaway"} - } - // Find referenced group and return it - parsedGrp := requisiteList[groupIndex] - return parsedGrp -} - -func OtherMatcher(group string, subgroups []string) interface{} { - return schema.NewOtherRequirement(ungroupText(group), "") -} - -/////////////////////// END MATCHER FUNCS /////////////////////// - -// Matcher container, matchers must be in order of precedence -// NOTE: PARENTHESES ARE OF HIGHEST PRECEDENCE! 
(This is due to groupParens() handling grouping of parenthesized text before parsing begins) -var Matchers []Matcher - -// Must init matchers via function at runtime to avoid compile-time circular definition error -func initMatchers() { - Matchers = []Matcher{ - - // Throwaways - Matcher{ - regexp.MustCompile(`^(?i)(?:better|\d-\d|same as.+)$`), - ThrowawayMatcher, - }, - - /* TO IMPLEMENT: - - X or Y or ... Z Major/Minor - - SUBJECT NUMBER, SUBJECT NUMBER, ..., or SUBJECT NUMBER - - ... probably many more - - */ - - // * only - Matcher{ - regexp.MustCompile(`(?i).+(?:freshman|sophomores|juniors|seniors)\s+only$`), - OtherMatcher, - }, - - // * in any combination of * - Matcher{ - regexp.MustCompile(`(?i).+\s+in\s+any\s+combination\s+of\s+.+`), - OtherMatcher, - }, - - // majors and minors only - Matcher{ - regexp.MustCompile(`(?i)(([A-Z]+)\s+majors\s+and\s+minors\s+only)`), - SubstitutionMatcher(func(group string, subgroups []string) interface{} { - return MajorMinorMatcher(subgroups[1], subgroups[1:3]) - }), - }, - - // Core completion - Matcher{ - regexp.MustCompile(`(?i)(Completion\s+of\s+(?:an?\s+)?(\d{3}).+core(?:\s+course)?)`), - SubstitutionMatcher(func(group string, subgroups []string) interface{} { - return CoreCompletionMatcher(subgroups[1], subgroups[1:3]) - }), - }, - - // Credit cannot be received for both courses, - Matcher{ - regexp.MustCompile(`(?i)(Credit\s+cannot\s+be\s+received\s+for\s+both\s+(?:courses)?,?(.+))`), - SubstitutionMatcher(func(group string, subgroups []string) interface{} { - return ChoiceMatcher(subgroups[1], subgroups[1:3]) - }), - }, - - // Credit cannot be received for more than one of *: - Matcher{ - regexp.MustCompile(`(?i)(Credit\s+cannot\s+be\s+received\s+for\s+more\s+than\s+one\s+of.+:(.+))`), - SubstitutionMatcher(func(group string, subgroups []string) interface{} { - return ChoiceMatcher(subgroups[1], subgroups[1:3]) - }), - }, - - // Logical & - Matcher{ - ANDRegex, - ANDMatcher, - }, - - // " with a [grade] [of] or better" - Matcher{ - regexp.MustCompile(`^(?i)(([A-Z]{2,4})\s+([0-9V]{4})\s+with\s+a(?:\s+grade)?(?:\s+of)?\s+([ABCF][+-]?)\s+or\s+better)`), // [name, number, min grade] - SubstitutionMatcher(func(group string, subgroups []string) interface{} { - return CourseMinGradeMatcher(subgroups[1], subgroups[1:5]) - }), - }, - - // Logical | - Matcher{ - ORRegex, - ORMatcher, - }, - - // with a [minimum] grade of [at least] [a] - Matcher{ - regexp.MustCompile(`^(?i)([A-Z]{2,4})\s+([0-9V]{4})\s+with\s+a\s+(?:minimum\s+)?grade\s+of\s+(?:at least\s+)?(?:a\s+)?([ABCF][+-]?)$`), // [name, number, min grade] - CourseMinGradeMatcher, - }, - - // A grade of [at least] [a] in - Matcher{ - regexp.MustCompile(`^(?i)A\s+grade\s+of(?:\s+at\s+least)?(?:\s+a)?\s+([ABCF][+-]?)\s+in\s+([A-Z]{2,4})\s+([0-9V]{4})$`), // [min grade, name, number] - func(group string, subgroups []string) interface{} { - return CourseMinGradeMatcher(group, []string{subgroups[0], subgroups[2], subgroups[3], subgroups[1]}) - }, - }, - - // - Matcher{ - regexp.MustCompile(`^\s*([A-Z]{2,4})\s+([0-9V]{4})\s*$`), // [name, number] - CourseMatcher, - }, - - // consent required - Matcher{ - regexp.MustCompile(`^(?i)(.+)\s+consent\s+required`), // [granter] - ConsentMatcher, - }, - - // semester credit hours maximum - Matcher{ - regexp.MustCompile(`^(?i)(\d+)\s+semester\s+credit\s+hours\s+maximum$`), - LimitMatcher, - }, - // This course may only be repeated for credit hours - Matcher{ - 
regexp.MustCompile(`^(?:[A-Z]{2,4}\s+[0-9V]{4}\s+)?Repeat\s+Limit\s+-\s+(?:[A-Z]{2,4}\s+[0-9V]{4}|This\s+course)\s+may\s+only\s+be\s+repeated\s+for(?:\s+a\s+maximum\s+of)?\s+(\d+)\s+semester\s+cre?dit\s+hours(?:\s+maximum)?$`), - LimitMatcher, - }, - - // majors only - Matcher{ - regexp.MustCompile(`^(?i)(.+)\s+major(?:s\s+only)?$`), - MajorMatcher, - }, - - // minors only - Matcher{ - regexp.MustCompile(`^(?i)(.+)\s+minor(?:s\s+only)?$`), - MinorMatcher, - }, - - // Any semester credit hour course - Matcher{ - regexp.MustCompile(`^(?i)any\s+(\d+)\s+semester\s+credit\s+hour\s+(\d{3})(?:\s+@\d+)?\s+core(?:\s+course)?$`), - CoreMatcher, - }, - - // Minimum GPA of - Matcher{ - regexp.MustCompile(`^(?i)(?:minimum\s+)?GPA\s+of\s+([0-9\.]+)$`), // [GPA] - GPAMatcher, - }, - // GPA - Matcher{ - regexp.MustCompile(`^(?i)([0-9\.]+) GPA$`), // [GPA] - GPAMatcher, - }, - // A university grade point average of at least - Matcher{ - regexp.MustCompile(`^(?i)a(?:\s+university)?\s+grade\s+point\s+average\s+of(?:\s+at\s+least)?\s+([0-9\.]+)$`), // [GPA] - GPAMatcher, - }, - - // Group tags (i.e. @1) - Matcher{ - groupTagRegex, // [group #] - GroupTagMatcher, - }, - } -} - -var preOrCoreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)((?:Prerequisites?\s+or\s+corequisites?|Corequisites?\s+or\s+prerequisites?):(.*))`) -var prereqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Prerequisites?:(.*))`) -var coreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Corequisites?:(.*))`) - -// It is very important that these remain in the same order -- this keeps proper precedence in the below function! -var reqRegexes [3]*regexp.Regexp = [3]*regexp.Regexp{preOrCoreqRegexp, prereqRegexp, coreqRegexp} - -// Returns a closure that parses the course's requisites -func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs string) func() { - return func() { - // Pointer array to course requisite properties must be in same order as reqRegexes above - courseReqs := [3]**schema.CollectionRequirement{&course.Co_or_pre_requisites, &course.Prerequisites, &course.Corequisites} - // The actual text to check for requisites - var checkText string - // Extract req text from the enrollment req info if it exists, otherwise try using the description - if hasEnrollmentReqs { - course.Enrollment_reqs = enrollmentReqs - checkText = enrollmentReqs - } else { - checkText = course.Description - } - // Iterate over and parse each type of requisite, populating the course's relevant requisite property - for index, reqPtr := range courseReqs { - reqMatches := reqRegexes[index].FindStringSubmatch(checkText) - if reqMatches != nil { - // Actual useful text is the inner match, index 2 - reqText := reqMatches[2] - // Erase any sub-matches for other requisite types by matching outer text, index 1 - for _, regex := range reqRegexes { - matches := regex.FindStringSubmatch(reqText) - if matches != nil { - reqText = strings.Replace(reqText, matches[1], "", -1) - } - } - // Erase current match from checkText to prevent erroneous duplicated Reqs - checkText = strings.Replace(checkText, reqMatches[1], "", -1) - // Split reqText into chunks based on period-space delimiters - textChunks := strings.Split(trimWhitespace(reqText), ". 
") - parsedChunks := make([]interface{}, 0, len(textChunks)) - // Parse each chunk, then add non-throwaway chunks to parsedChunks - for _, chunk := range textChunks { - // Trim any remaining rightmost periods - chunk = trimWhitespace(strings.TrimRight(chunk, ".")) - parsedChunk := parseChunk(chunk) - if !reqIsThrowaway(parsedChunk) { - parsedChunks = append(parsedChunks, parsedChunk) - } - } - // Build CollectionRequirement from parsed chunks and apply to the course property - if len(parsedChunks) > 0 { - *reqPtr = schema.NewCollectionRequirement("REQUISITES", len(parsedChunks), parsedChunks) - } - log.Printf("\n\n") - } - } - } -} - -// Function for pulling all requisite references (reqs referenced via group tags) from text -/* -func getReqRefs(text string) []interface{} { - matches := groupTagRegex.FindAllStringSubmatch(text, -1) - refs := make([]interface{}, len(matches)) - for i, submatches := range matches { - refs[i] = GroupTagMatcher(submatches[0], submatches) - } - return refs -} -*/ - -// Function for creating a new group by replacing subtext in an existing group, and pushing the new group's info to the req and group list -func makeSubgroup(group string, subtext string, requisite interface{}) string { - newGroup := strings.Replace(group, subtext, fmt.Sprintf("@%d", len(requisiteList)), -1) - requisiteList = append(requisiteList, requisite) - groupList = append(groupList, newGroup) - return newGroup -} - -// Function for joining adjacent OtherRequirements into one OtherRequirement by joining their descriptions with a string -func joinAdjacentOthers(reqs []interface{}, joinString string) []interface{} { - joinedReqs := make([]interface{}, 0, len(reqs)) - // Temp is a blank OtherRequirement - temp := *schema.NewOtherRequirement("", "") - // Iterate over each existing req - for _, req := range reqs { - // Determine whether req is an OtherRequirement - otherReq, isOtherReq := req.(schema.OtherRequirement) - if !isOtherReq { - // If temp contains data, append its final result to the joinedReqs - if temp.Description != "" { - joinedReqs = append(joinedReqs, temp) - } - // Append the non-OtherRequirement to the joinedReqs - joinedReqs = append(joinedReqs, req) - // Reset temp's description - temp.Description = "" - continue - } - // If temp is blank, and req is an otherReq, use otherReq as the initial value of temp - // Otherwise, join temp's existing description with otherReq's description - if temp.Description == "" { - temp = otherReq - } else { - temp.Description = strings.Join([]string{temp.Description, otherReq.Description}, joinString) - } - } - // If temp contains data, append its final result to the joinedReqs - if temp.Description != "" { - joinedReqs = append(joinedReqs, temp) - } - //log.Printf("JOINEDREQS ARE: %v\n", joinedReqs) - return joinedReqs -} - -// Function for finding the Internal Course Number associated with the course with the specified subject and course number -func findICN(subject string, number string) (string, error) { - for _, coursePtr := range Courses { - if coursePtr.Subject_prefix == subject && coursePtr.Course_number == number { - return coursePtr.Internal_course_number, nil - } - } - return "ERROR", fmt.Errorf("couldn't find an ICN for %s %s", subject, number) -} - -// This is the list of produced requisites. Indices coincide with group indices -- aka group @0 will also be the 0th index of the list since it will be processed first. -var requisiteList []interface{} - -// This is the list of groups that are to be parsed. 
They are the raw text chunks associated with the reqs above. -var groupList []string - -// Innermost function for parsing individual text groups (used recursively by some Matchers) -func parseGroup(grp string) interface{} { - // Make sure we trim any mismatched right parentheses - grp = strings.TrimRight(grp, ")") - // Find an applicable matcher in Matchers - for _, matcher := range Matchers { - matches := matcher.Regex.FindStringSubmatch(grp) - if matches != nil { - // If an applicable matcher has been found, return the result of calling its handler - result := matcher.Handler(grp, matches) - log.Printf("'%s' -> %T\n", grp, result) - return result - } - } - // Panic if no matcher was able to be found for a given group -- this means we need to add handling for it!!! - //log.Panicf("NO MATCHER FOUND FOR GROUP '%s'\nSTACK IS: %#v\n", grp, requisiteList) - //log.Printf("NO MATCHER FOR: '%s'\n", grp) - log.Printf("'%s' -> parser.OtherRequirement\n", grp) - //var temp string - //fmt.Scanf("%s", temp) - return *schema.NewOtherRequirement(ungroupText(grp), "") -} - -// Outermost function for parsing a chunk of requisite text (potentially containing multiple nested text groups) -func parseChunk(chunk string) interface{} { - log.Printf("\nPARSING CHUNK: '%s'\n", chunk) - // Extract parenthesized groups from chunk text - parseText, parseGroups := groupParens(chunk) - // Initialize the requisite list and group list - requisiteList = make([]interface{}, 0, len(parseGroups)) - groupList = parseGroups - // Begin recursive group parsing -- order is bottom-up - for _, grp := range parseGroups { - parsedReq := parseGroup(grp) - // Only append requisite to stack if it isn't marked as throwaway - if !reqIsThrowaway(parsedReq) { - requisiteList = append(requisiteList, parsedReq) - } - } - finalGroup := parseGroup(parseText) - return finalGroup -} - -// Check whether a requisite is a throwaway or not by trying a type assertion to Requirement -func reqIsThrowaway(req interface{}) bool { - baseReq, isBaseReq := req.(schema.Requirement) - return isBaseReq && baseReq.Type == "throwaway" -} - -// Use stack-based parentheses parsing to form text groups and reference them in the original string -func groupParens(text string) (string, []string) { - var groups []string = make([]string, 0, 5) - var positionStack []int = make([]int, 0, 5) - var depth int = 0 - for pos := 0; pos < len(text); pos++ { - if text[pos] == '(' { - depth++ - positionStack = append(positionStack, pos) - } else if text[pos] == ')' && depth > 0 { - depth-- - lastIndex := len(positionStack) - 1 - // Get last '(' position from stack - lastPos := positionStack[lastIndex] - // Pop stack - positionStack = positionStack[:lastIndex] - // Make group and replace group text with group index reference - groupText := text[lastPos+1 : pos] - groupNum := len(groups) - groups = append(groups, groupText) - subText := fmt.Sprintf("@%d", groupNum) - text = strings.Replace(text, text[lastPos:pos+1], subText, -1) - // Adjust position to account for replaced text - pos += len(subText) - len(groupText) - 2 - } - } - return text, groups -} - -// Function for replacing all group references (groups referenced via group tags) with their actual text -func ungroupText(text string) string { - text = trimWhitespace(text) - for groupNum := len(groupList) - 1; groupNum >= 0; groupNum-- { - subText := fmt.Sprintf("@%d", groupNum) - replacementText := fmt.Sprintf("(%s)", groupList[groupNum]) - text = strings.Replace(text, subText, replacementText, -1) - } - return text -} - 
-/////////////////////////////////////////////////// END REQUISITE PARSER CODE /////////////////////////////////////////////////// - -var sectionPrefixRegexp *regexp.Regexp = regexp.MustCompile(`^(?i)[A-Z]{2,4}[0-9V]{4}\.([0-9A-z]+)`) -var coreRegexp *regexp.Regexp = regexp.MustCompile(`[0-9]{3}`) -var personRegexp *regexp.Regexp = regexp.MustCompile(`\s*([\w ]+)\s+・\s+([A-z ]+)\s+・\s+([\w@.]+)`) - -func addSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) { - // Get subject prefix and course number by doing a regexp match on the section id - sectionId := classInfo["Class Section:"] - idMatches := sectionPrefixRegexp.FindStringSubmatch(sectionId) - - section := &schema.Section{} - - section.Id = primitive.NewObjectID() - section.Section_number = idMatches[1] - section.Course_reference = courseRef.Id - - //TODO: section requisites? - - // Set academic session - section.Academic_session = session - // Add professors - section.Professors = addProfessors(section.Id, rowInfo, classInfo) - - // Get all TA/RA info - assistantText := rowInfo["TA/RA(s):"] - assistantMatches := personRegexp.FindAllStringSubmatch(assistantText, -1) - section.Teaching_assistants = make([]schema.Assistant, 0, len(assistantMatches)) - for _, match := range assistantMatches { - assistant := schema.Assistant{} - nameStr := match[1] - names := strings.Split(nameStr, " ") - assistant.First_name = names[0] - assistant.Last_name = names[len(names)-1] - assistant.Role = match[2] - assistant.Email = match[3] - section.Teaching_assistants = append(section.Teaching_assistants, assistant) - } - - section.Internal_class_number = classNum - section.Instruction_mode = classInfo["Instruction Mode:"] - section.Meetings = getMeetings(rowInfo, classInfo) - - // Parse core flags (may or may not exist) - coreText, hasCore := rowInfo["Core:"] - if hasCore { - section.Core_flags = coreRegexp.FindAllString(coreText, -1) - } - - section.Syllabus_uri = syllabusURI - - semesterGrades, exists := GradeMap[session.Name] - if exists { - sectionGrades, exists := semesterGrades[courseRef.Subject_prefix+courseRef.Course_number+section.Section_number] - if exists { - section.Grade_distribution = sectionGrades - } - } - - // Add new section to section map - Sections[section.Id] = section - - // Append new section to course's section listing - courseRef.Sections = append(courseRef.Sections, section.Id) -} - -var termRegexp *regexp.Regexp = regexp.MustCompile(`Term: ([0-9]+[SUF])`) -var datesRegexp *regexp.Regexp = regexp.MustCompile(`(?:Start|End)s: ([A-z]+ [0-9]{1,2}, [0-9]{4})`) - -func getAcademicSession(rowInfo map[string]string, classInfo map[string]string) schema.AcademicSession { - session := schema.AcademicSession{} - scheduleText := rowInfo["Schedule:"] - - session.Name = termRegexp.FindStringSubmatch(scheduleText)[1] - dateMatches := datesRegexp.FindAllStringSubmatch(scheduleText, -1) - - datesFound := len(dateMatches) - switch { - case datesFound == 1: - startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation) - if err != nil { - panic(err) - } - session.Start_date = startDate - case datesFound == 2: - startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation) - if err != nil { - panic(err) - } - endDate, err := time.ParseInLocation("January 2, 2006", dateMatches[1][1], timeLocation) - if err != nil { - panic(err) - } - session.Start_date = startDate - session.End_date = 
endDate - } - return session -} - -func addProfessors(sectionId primitive.ObjectID, rowInfo map[string]string, classInfo map[string]string) []primitive.ObjectID { - professorText := rowInfo["Instructor(s):"] - professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1) - var profRefs []primitive.ObjectID = make([]primitive.ObjectID, 0, len(professorMatches)) - for _, match := range professorMatches { - - nameStr := match[1] - names := strings.Split(nameStr, " ") - - firstName := names[0] - lastName := names[len(names)-1] - - profKey := firstName + lastName - - prof, profExists := Professors[profKey] - if profExists { - prof.Sections = append(prof.Sections, sectionId) - profRefs = append(profRefs, prof.Id) - continue - } - - prof = &schema.Professor{} - prof.Id = primitive.NewObjectID() - prof.First_name = firstName - prof.Last_name = lastName - prof.Titles = []string{match[2]} - prof.Email = match[3] - prof.Sections = []primitive.ObjectID{sectionId} - profRefs = append(profRefs, prof.Id) - Professors[profKey] = prof - ProfessorIDMap[prof.Id] = profKey - } - return profRefs -} - -var meetingsRegexp *regexp.Regexp = regexp.MustCompile(`([A-z]+\s+[0-9]+,\s+[0-9]{4})-([A-z]+\s+[0-9]+,\s+[0-9]{4})\W+((?:(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day(?:, )?)+)\W+([0-9]+:[0-9]+(?:am|pm))-([0-9]+:[0-9]+(?:am|pm))(?:\W+(?:(\S+)\s+(\S+)))`) - -func getMeetings(rowInfo map[string]string, classInfo map[string]string) []schema.Meeting { - scheduleText := rowInfo["Schedule:"] - meetingMatches := meetingsRegexp.FindAllStringSubmatch(scheduleText, -1) - var meetings []schema.Meeting = make([]schema.Meeting, 0, len(meetingMatches)) - for _, match := range meetingMatches { - meeting := schema.Meeting{} - - startDate, err := time.ParseInLocation("January 2, 2006", match[1], timeLocation) - if err != nil { - panic(err) - } - meeting.Start_date = startDate - - endDate, err := time.ParseInLocation("January 2, 2006", match[2], timeLocation) - if err != nil { - panic(err) - } - meeting.End_date = endDate - - meeting.Meeting_days = strings.Split(match[3], ", ") - - startTime, err := time.ParseInLocation("3:04pm", match[4], timeLocation) - if err != nil { - panic(err) - } - meeting.Start_time = startTime - - endTime, err := time.ParseInLocation("3:04pm", match[5], timeLocation) - if err != nil { - panic(err) - } - meeting.End_time = endTime - - // Only add location data if it's available - if len(match) > 6 { - location := schema.Location{} - location.Building = match[6] - location.Room = match[7] - location.Map_uri = fmt.Sprintf("https://locator.utdallas.edu/%s_%s", location.Building, location.Room) - meeting.Location = location - } - - meetings = append(meetings, meeting) - } - return meetings -} - -func validate() { - // Set up deferred handler for panics to display validation fails - defer func() { - if err := recover(); err != nil { - log.Printf("VALIDATION FAILED: %s", err) - } - }() - - log.Printf("\nValidating courses...\n") - for _, course1 := range Courses { - // Check for duplicate courses by comparing course_number and subject_prefix as a compound key - for _, course2 := range Courses { - // Make sure the course doesn't check itself - if course1.Internal_course_number == course2.Internal_course_number { - continue - } - if course2.Course_number == course1.Course_number && course2.Subject_prefix == course1.Subject_prefix { - log.Printf("Duplicate course found for %s%s!\n", course1.Subject_prefix, course1.Course_number) - log.Printf("Course 1: %v\n\nCourse 2: %v", course1, course2) - 
log.Panic("Courses failed to validate!") - } - } - // Make sure course isn't referencing any nonexistent sections, and that course-section references are consistent both ways - for _, sectionId := range course1.Sections { - section, exists := Sections[sectionId] - if !exists { - log.Printf("Nonexistent section reference found for %s%s!\n", course1.Subject_prefix, course1.Course_number) - log.Printf("Referenced section ID: %s\nCourse ID: %s\n", sectionId, course1.Id) - log.Panic("Courses failed to validate!") - } - if section.Course_reference != course1.Id { - log.Printf("Inconsistent section reference found for %s%s! The course references the section, but not vice-versa!\n", course1.Subject_prefix, course1.Course_number) - log.Printf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s\n", sectionId, course1.Id, section.Course_reference) - log.Panic("Courses failed to validate!") - } - } - } - log.Print("No invalid courses!\n\n") - - log.Print("Validating sections...\n") - for _, section1 := range Sections { - // Check for duplicate sections by comparing section_number, course_reference, and academic_session as a compound key - for _, section2 := range Sections { - // Make sure the section doesn't check itself - if section1.Internal_class_number == section2.Internal_class_number { - continue - } - if section2.Section_number == section1.Section_number && - section2.Course_reference == section1.Course_reference && - section2.Academic_session == section1.Academic_session { - log.Print("Duplicate section found!\n") - log.Printf("Section 1: %v\n\nSection 2: %v", section1, section2) - log.Panic("Sections failed to validate!") - } - } - // Make sure section isn't referencing any nonexistent professors, and that section-professor references are consistent both ways - for _, profId := range section1.Professors { - professorKey, exists := ProfessorIDMap[profId] - if !exists { - log.Printf("Nonexistent professor reference found for section ID %s!\n", section1.Id) - log.Printf("Referenced professor ID: %s\n", profId) - log.Panic("Sections failed to validate!") - } - profRefsSection := false - for _, profSection := range Professors[professorKey].Sections { - if profSection == section1.Id { - profRefsSection = true - break - } - } - if !profRefsSection { - log.Printf("Inconsistent professor reference found for section ID %s! 
The section references the professor, but not vice-versa!\n", section1.Id) - log.Printf("Referenced professor ID: %s\n", profId) - log.Panic("Sections failed to validate!") - } - } - // Make sure section isn't referencing a nonexistant course - _, exists := CourseIDMap[section1.Course_reference] - if !exists { - log.Printf("Nonexistent course reference found for section ID %s!\n", section1.Id) - log.Printf("Referenced course ID: %s\n", section1.Course_reference) - log.Panic("Sections failed to validate!") - } - } - log.Printf("No invalid sections!\n\n") - - log.Printf("Validating professors...\n") - // Check for duplicate professors by comparing first_name, last_name, and sections as a compound key - for _, prof1 := range Professors { - for _, prof2 := range Professors { - // Make sure the professor doesn't check itself - if prof1.Id == prof2.Id { - continue - } - if prof2.First_name == prof1.First_name && - prof2.Last_name == prof1.Last_name && - prof2.Profile_uri == prof1.Profile_uri { - log.Printf("Duplicate professor found!\n") - log.Printf("Professor 1: %v\n\nProfessor 2: %v", prof1, prof2) - log.Panic("Professors failed to validate!") - } - } - } - log.Printf("No invalid professors!\n\n") -} - -func getAllSectionFilepaths(inDir string) []string { - var sectionFilePaths []string - // Try to open inDir - fptr, err := os.Open(inDir) - if err != nil { - panic(err) - } - // Try to get term directories in inDir - termFiles, err := fptr.ReadDir(-1) - fptr.Close() - if err != nil { - panic(err) - } - // Iterate over term directories - for _, file := range termFiles { - if !file.IsDir() { - continue - } - termPath := fmt.Sprintf("%s/%s", inDir, file.Name()) - fptr, err = os.Open(termPath) - if err != nil { - panic(err) - } - courseFiles, err := fptr.ReadDir(-1) - fptr.Close() - if err != nil { - panic(err) - } - // Iterate over course directories - for _, file := range courseFiles { - coursePath := fmt.Sprintf("%s/%s", termPath, file.Name()) - fptr, err = os.Open(coursePath) - if err != nil { - panic(err) - } - sectionFiles, err := fptr.ReadDir(-1) - fptr.Close() - if err != nil { - panic(err) - } - // Get all section file paths from course directory - for _, file := range sectionFiles { - sectionFilePaths = append(sectionFilePaths, fmt.Sprintf("%s/%s", coursePath, file.Name())) - } - } - } - return sectionFilePaths -} - -func trimWhitespace(text string) string { - return strings.Trim(text, " \t\n\r") -} - -func getMapValues[M ~map[K]V, K comparable, V any](m M) []V { - r := make([]V, 0, len(m)) - for _, v := range m { - r = append(r, v) - } - return r -} diff --git a/parser/professorParser.go b/parser/professorParser.go new file mode 100644 index 0000000..2d935e6 --- /dev/null +++ b/parser/professorParser.go @@ -0,0 +1,43 @@ +package parser + +import ( + "strings" + + "github.com/UTDNebula/nebula-api/api/schema" + "go.mongodb.org/mongo-driver/bson/primitive" +) + +func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]string, classInfo map[string]string) []primitive.ObjectID { + professorText := rowInfo["Instructor(s):"] + professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1) + var profRefs []primitive.ObjectID = make([]primitive.ObjectID, 0, len(professorMatches)) + for _, match := range professorMatches { + + nameStr := match[1] + names := strings.Split(nameStr, " ") + + firstName := names[0] + lastName := names[len(names)-1] + + profKey := firstName + lastName + + prof, profExists := Professors[profKey] + if profExists { + prof.Sections = 
append(prof.Sections, sectionId) + profRefs = append(profRefs, prof.Id) + continue + } + + prof = &schema.Professor{} + prof.Id = primitive.NewObjectID() + prof.First_name = firstName + prof.Last_name = lastName + prof.Titles = []string{match[2]} + prof.Email = match[3] + prof.Sections = []primitive.ObjectID{sectionId} + profRefs = append(profRefs, prof.Id) + Professors[profKey] = prof + ProfessorIDMap[prof.Id] = profKey + } + return profRefs +} diff --git a/parser/profileLoader.go b/parser/profileLoader.go new file mode 100644 index 0000000..9382911 --- /dev/null +++ b/parser/profileLoader.go @@ -0,0 +1,51 @@ +package parser + +import ( + "encoding/json" + "fmt" + "log" + "os" + + "github.com/UTDNebula/nebula-api/api/schema" +) + +func loadProfiles(inDir string) { + fptr, err := os.Open(fmt.Sprintf("%s/profiles.json", inDir)) + if err != nil { + log.Print("Couldn't find/open profiles.json in the input directory. Skipping profile load.\n") + return + } + + decoder := json.NewDecoder(fptr) + + log.Print("Beginning profile load.\n") + + // Read open bracket + _, err = decoder.Token() + if err != nil { + panic(err) + } + + // While the array contains values + profileCount := 0 + for ; decoder.More(); profileCount++ { + // Decode a professor + var prof schema.Professor + err := decoder.Decode(&prof) + if err != nil { + panic(err) + } + professorKey := prof.First_name + prof.Last_name + Professors[professorKey] = &prof + ProfessorIDMap[prof.Id] = professorKey + } + + // Read closing bracket + _, err = decoder.Token() + if err != nil { + panic(err) + } + + log.Printf("Loaded %d profiles!\n\n", profileCount) + fptr.Close() +} diff --git a/parser/requisiteParser.go b/parser/requisiteParser.go new file mode 100644 index 0000000..e58fcd8 --- /dev/null +++ b/parser/requisiteParser.go @@ -0,0 +1,578 @@ +package parser + +import ( + "fmt" + "log" + "regexp" + "strconv" + "strings" + + "github.com/UTDNebula/nebula-api/api/schema" +) + +/* + Below is the code for the requisite parser. It is *by far* the most complicated code in this entire project. + In summary, it uses a bottom-up "stack"-based parsing technique, building requisites by taking small groups of text, parsing those groups, + storing them on the "stack", and then uses those previously parsed groups as dependencies for parsing the larger "higher level" groups. + + It's worth noting that I say stack in quotes above because it's not treated as strictly LIFO like a stack would normally be. 
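+
+  As a rough illustration (using a made-up requisite string rather than one taken from the catalog data): for the chunk
+  "(CS 1337 or CE 1337) and MATH 2413", groupParens() first extracts the parenthesized text as group 0, leaving
+  "@0 and MATH 2413". parseGroup() runs on "CS 1337 or CE 1337" first, and the resulting "OR" CollectionRequirement is
+  pushed onto requisiteList; the outer text is then parsed, where the AND matcher splits it and GroupTagMatcher
+  dereferences "@0" back into that already-parsed requirement. Assuming both courses can be resolved to internal course
+  numbers by findICN(), the end result is an "AND" collection containing the "OR" collection and a course requirement
+  for MATH 2413; any piece that can't be resolved falls back to an OtherRequirement instead.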
+*/ + +// Regex matcher object for requisite group parsing +type Matcher struct { + Regex *regexp.Regexp + Handler func(string, []string) interface{} +} + +////////////////////// BEGIN MATCHER FUNCS ////////////////////// + +var ANDRegex = regexp.MustCompile(`(?i)\s+and\s+`) + +func ANDMatcher(group string, subgroups []string) interface{} { + // Split text along " and " boundaries, then parse subexpressions as groups into an "AND" CollectionRequirement + subExpressions := ANDRegex.Split(group, -1) + parsedSubExps := make([]interface{}, 0, len(subExpressions)) + for _, exp := range subExpressions { + parsedExp := parseGroup(trimWhitespace(exp)) + // Don't include throwaways + if !reqIsThrowaway(parsedExp) { + parsedSubExps = append(parsedSubExps, parsedExp) + } + } + + parsedSubExps = joinAdjacentOthers(parsedSubExps, " and ") + + if len(parsedSubExps) > 1 { + return schema.NewCollectionRequirement("AND", len(parsedSubExps), parsedSubExps) + } else { + return parsedSubExps[0] + } +} + +// First regex subgroup represents the text to be subgrouped and parsed with parseFnc +// Ex: Text is: "(OPRE 3360 or STAT 3360 or STAT 4351), and JSOM majors and minors only" +// Regex is: "(JSOM majors and minors only)" +// Resulting substituted text would be: "(OPRE 3360 or STAT 3360 or STAT 4351), and @N", where N is some group number +// When @N is dereferenced from the requisite list, it will have a value equivalent to the result of parseFnc(group, subgroups) + +func SubstitutionMatcher(parseFnc func(string, []string) interface{}) func(string, []string) interface{} { + // Return a closure that uses parseFnc to substitute subgroups[1] + return func(group string, subgroups []string) interface{} { + // If there's no text to substitute, just return an OtherRequirement + if len(subgroups) < 2 { + return OtherMatcher(group, subgroups) + } + // Otherwise, substitute subgroups[1] and parse it with parseFnc + return parseGroup(makeSubgroup(group, subgroups[1], parseFnc(group, subgroups))) + } +} + +var ORRegex = regexp.MustCompile(`(?i)\s+or\s+`) + +func ORMatcher(group string, subgroups []string) interface{} { + // Split text along " or " boundaries, then parse subexpressions as groups into an "OR" CollectionRequirement + subExpressions := ORRegex.Split(group, -1) + parsedSubExps := make([]interface{}, 0, len(subExpressions)) + for _, exp := range subExpressions { + parsedExp := parseGroup(trimWhitespace(exp)) + // Don't include throwaways + if !reqIsThrowaway(parsedExp) { + parsedSubExps = append(parsedSubExps, parsedExp) + } + } + + parsedSubExps = joinAdjacentOthers(parsedSubExps, " or ") + + if len(parsedSubExps) > 1 { + return schema.NewCollectionRequirement("OR", 1, parsedSubExps) + } else { + return parsedSubExps[0] + } +} + +func CourseMinGradeMatcher(group string, subgroups []string) interface{} { + icn, err := findICN(subgroups[1], subgroups[2]) + if err != nil { + log.Printf("WARN: %s\n", err) + return OtherMatcher(group, subgroups) + } + return schema.NewCourseRequirement(icn, subgroups[3]) +} + +func CourseMatcher(group string, subgroups []string) interface{} { + icn, err := findICN(subgroups[1], subgroups[2]) + if err != nil { + log.Printf("WARN: %s\n", err) + return OtherMatcher(group, subgroups) + } + return schema.NewCourseRequirement(icn, "D") +} + +func ConsentMatcher(group string, subgroups []string) interface{} { + return schema.NewConsentRequirement(subgroups[1]) +} + +func LimitMatcher(group string, subgroups []string) interface{} { + hourLimit, err := strconv.Atoi(subgroups[1]) + if err 
!= nil { + panic(err) + } + return schema.NewLimitRequirement(hourLimit) +} + +func MajorMatcher(group string, subgroups []string) interface{} { + return schema.NewMajorRequirement(subgroups[1]) +} + +func MinorMatcher(group string, subgroups []string) interface{} { + return schema.NewMinorRequirement(subgroups[1]) +} + +func MajorMinorMatcher(group string, subgroups []string) interface{} { + return schema.NewCollectionRequirement("OR", 1, []interface{}{*schema.NewMajorRequirement(subgroups[1]), *schema.NewMinorRequirement(subgroups[1])}) +} + +func CoreMatcher(group string, subgroups []string) interface{} { + hourReq, err := strconv.Atoi(subgroups[1]) + if err != nil { + panic(err) + } + return schema.NewCoreRequirement(subgroups[2], hourReq) +} + +func CoreCompletionMatcher(group string, subgroups []string) interface{} { + return schema.NewCoreRequirement(subgroups[1], -1) +} + +func ChoiceMatcher(group string, subgroups []string) interface{} { + collectionReq, ok := parseGroup(subgroups[1]).(*schema.CollectionRequirement) + if !ok { + log.Printf("WARN: ChoiceMatcher wasn't able to parse subgroup '%s' into a CollectionRequirement!", subgroups[1]) + return OtherMatcher(group, subgroups) + } + return schema.NewChoiceRequirement(collectionReq) +} + +func GPAMatcher(group string, subgroups []string) interface{} { + GPAFloat, err := strconv.ParseFloat(subgroups[1], 32) + if err != nil { + panic(err) + } + return schema.NewGPARequirement(GPAFloat, "") +} + +func ThrowawayMatcher(group string, subgroups []string) interface{} { + return schema.Requirement{Type: "throwaway"} +} + +// Regex for group tags +var groupTagRegex = regexp.MustCompile(`@(\d+)`) + +func GroupTagMatcher(group string, subgroups []string) interface{} { + groupIndex, err := strconv.Atoi(subgroups[1]) + if err != nil { + panic(err) + } + // Return a throwaway if index is out of range + if groupIndex < 0 || groupIndex >= len(requisiteList) { + return schema.Requirement{Type: "throwaway"} + } + // Find referenced group and return it + parsedGrp := requisiteList[groupIndex] + return parsedGrp +} + +func OtherMatcher(group string, subgroups []string) interface{} { + return schema.NewOtherRequirement(ungroupText(group), "") +} + +/////////////////////// END MATCHER FUNCS /////////////////////// + +// Matcher container, matchers must be in order of precedence +// NOTE: PARENTHESES ARE OF HIGHEST PRECEDENCE! (This is due to groupParens() handling grouping of parenthesized text before parsing begins) +var Matchers []Matcher + +// Must init matchers via function at runtime to avoid compile-time circular definition error +func initMatchers() { + Matchers = []Matcher{ + + // Throwaways + { + regexp.MustCompile(`^(?i)(?:better|\d-\d|same as.+)$`), + ThrowawayMatcher, + }, + + /* TO IMPLEMENT: + + X or Y or ... Z Major/Minor + + SUBJECT NUMBER, SUBJECT NUMBER, ..., or SUBJECT NUMBER + + ... 
probably many more + + */ + + // * only + { + regexp.MustCompile(`(?i).+(?:freshman|sophomores|juniors|seniors)\s+only$`), + OtherMatcher, + }, + + // * in any combination of * + { + regexp.MustCompile(`(?i).+\s+in\s+any\s+combination\s+of\s+.+`), + OtherMatcher, + }, + + // majors and minors only + { + regexp.MustCompile(`(?i)(([A-Z]+)\s+majors\s+and\s+minors\s+only)`), + SubstitutionMatcher(func(group string, subgroups []string) interface{} { + return MajorMinorMatcher(subgroups[1], subgroups[1:3]) + }), + }, + + // Core completion + { + regexp.MustCompile(`(?i)(Completion\s+of\s+(?:an?\s+)?(\d{3}).+core(?:\s+course)?)`), + SubstitutionMatcher(func(group string, subgroups []string) interface{} { + return CoreCompletionMatcher(subgroups[1], subgroups[1:3]) + }), + }, + + // Credit cannot be received for both courses, + { + regexp.MustCompile(`(?i)(Credit\s+cannot\s+be\s+received\s+for\s+both\s+(?:courses)?,?(.+))`), + SubstitutionMatcher(func(group string, subgroups []string) interface{} { + return ChoiceMatcher(subgroups[1], subgroups[1:3]) + }), + }, + + // Credit cannot be received for more than one of *: + { + regexp.MustCompile(`(?i)(Credit\s+cannot\s+be\s+received\s+for\s+more\s+than\s+one\s+of.+:(.+))`), + SubstitutionMatcher(func(group string, subgroups []string) interface{} { + return ChoiceMatcher(subgroups[1], subgroups[1:3]) + }), + }, + + // Logical & + { + ANDRegex, + ANDMatcher, + }, + + // " with a [grade] [of] or better" + { + regexp.MustCompile(`^(?i)(([A-Z]{2,4})\s+([0-9V]{4})\s+with\s+a(?:\s+grade)?(?:\s+of)?\s+([ABCF][+-]?)\s+or\s+better)`), // [name, number, min grade] + SubstitutionMatcher(func(group string, subgroups []string) interface{} { + return CourseMinGradeMatcher(subgroups[1], subgroups[1:5]) + }), + }, + + // Logical | + { + ORRegex, + ORMatcher, + }, + + // with a [minimum] grade of [at least] [a] + { + regexp.MustCompile(`^(?i)([A-Z]{2,4})\s+([0-9V]{4})\s+with\s+a\s+(?:minimum\s+)?grade\s+of\s+(?:at least\s+)?(?:a\s+)?([ABCF][+-]?)$`), // [name, number, min grade] + CourseMinGradeMatcher, + }, + + // A grade of [at least] [a] in + { + regexp.MustCompile(`^(?i)A\s+grade\s+of(?:\s+at\s+least)?(?:\s+a)?\s+([ABCF][+-]?)\s+in\s+([A-Z]{2,4})\s+([0-9V]{4})$`), // [min grade, name, number] + func(group string, subgroups []string) interface{} { + return CourseMinGradeMatcher(group, []string{subgroups[0], subgroups[2], subgroups[3], subgroups[1]}) + }, + }, + + // + { + regexp.MustCompile(`^\s*([A-Z]{2,4})\s+([0-9V]{4})\s*$`), // [name, number] + CourseMatcher, + }, + + // consent required + { + regexp.MustCompile(`^(?i)(.+)\s+consent\s+required`), // [granter] + ConsentMatcher, + }, + + // semester credit hours maximum + { + regexp.MustCompile(`^(?i)(\d+)\s+semester\s+credit\s+hours\s+maximum$`), + LimitMatcher, + }, + + // This course may only be repeated for credit hours + { + regexp.MustCompile(`^(?:[A-Z]{2,4}\s+[0-9V]{4}\s+)?Repeat\s+Limit\s+-\s+(?:[A-Z]{2,4}\s+[0-9V]{4}|This\s+course)\s+may\s+only\s+be\s+repeated\s+for(?:\s+a\s+maximum\s+of)?\s+(\d+)\s+semester\s+cre?dit\s+hours(?:\s+maximum)?$`), + LimitMatcher, + }, + + // majors only + { + regexp.MustCompile(`^(?i)(.+)\s+major(?:s\s+only)?$`), + MajorMatcher, + }, + + // minors only + { + regexp.MustCompile(`^(?i)(.+)\s+minor(?:s\s+only)?$`), + MinorMatcher, + }, + + // Any semester credit hour course + { + regexp.MustCompile(`^(?i)any\s+(\d+)\s+semester\s+credit\s+hour\s+(\d{3})(?:\s+@\d+)?\s+core(?:\s+course)?$`), + CoreMatcher, + }, + + // Minimum GPA of + { + 
regexp.MustCompile(`^(?i)(?:minimum\s+)?GPA\s+of\s+([0-9\.]+)$`), // [GPA] + GPAMatcher, + }, + + // GPA + { + regexp.MustCompile(`^(?i)([0-9\.]+) GPA$`), // [GPA] + GPAMatcher, + }, + + // A university grade point average of at least + { + regexp.MustCompile(`^(?i)a(?:\s+university)?\s+grade\s+point\s+average\s+of(?:\s+at\s+least)?\s+([0-9\.]+)$`), // [GPA] + GPAMatcher, + }, + + // Group tags (i.e. @1) + { + groupTagRegex, // [group #] + GroupTagMatcher, + }, + } +} + +var preOrCoreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)((?:Prerequisites?\s+or\s+corequisites?|Corequisites?\s+or\s+prerequisites?):(.*))`) +var prereqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Prerequisites?:(.*))`) +var coreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Corequisites?:(.*))`) + +// It is very important that these remain in the same order -- this keeps proper precedence in the below function! +var reqRegexes [3]*regexp.Regexp = [3]*regexp.Regexp{preOrCoreqRegexp, prereqRegexp, coreqRegexp} + +// Returns a closure that parses the course's requisites +func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs string) func() { + return func() { + // Pointer array to course requisite properties must be in same order as reqRegexes above + courseReqs := [3]**schema.CollectionRequirement{&course.Co_or_pre_requisites, &course.Prerequisites, &course.Corequisites} + // The actual text to check for requisites + var checkText string + // Extract req text from the enrollment req info if it exists, otherwise try using the description + if hasEnrollmentReqs { + course.Enrollment_reqs = enrollmentReqs + checkText = enrollmentReqs + } else { + checkText = course.Description + } + // Iterate over and parse each type of requisite, populating the course's relevant requisite property + for index, reqPtr := range courseReqs { + reqMatches := reqRegexes[index].FindStringSubmatch(checkText) + if reqMatches != nil { + // Actual useful text is the inner match, index 2 + reqText := reqMatches[2] + // Erase any sub-matches for other requisite types by matching outer text, index 1 + for _, regex := range reqRegexes { + matches := regex.FindStringSubmatch(reqText) + if matches != nil { + reqText = strings.Replace(reqText, matches[1], "", -1) + } + } + // Erase current match from checkText to prevent erroneous duplicated Reqs + checkText = strings.Replace(checkText, reqMatches[1], "", -1) + // Split reqText into chunks based on period-space delimiters + textChunks := strings.Split(trimWhitespace(reqText), ". 
") + parsedChunks := make([]interface{}, 0, len(textChunks)) + // Parse each chunk, then add non-throwaway chunks to parsedChunks + for _, chunk := range textChunks { + // Trim any remaining rightmost periods + chunk = trimWhitespace(strings.TrimRight(chunk, ".")) + parsedChunk := parseChunk(chunk) + if !reqIsThrowaway(parsedChunk) { + parsedChunks = append(parsedChunks, parsedChunk) + } + } + // Build CollectionRequirement from parsed chunks and apply to the course property + if len(parsedChunks) > 0 { + *reqPtr = schema.NewCollectionRequirement("REQUISITES", len(parsedChunks), parsedChunks) + } + log.Printf("\n\n") + } + } + } +} + +// Function for pulling all requisite references (reqs referenced via group tags) from text +/* +func getReqRefs(text string) []interface{} { + matches := groupTagRegex.FindAllStringSubmatch(text, -1) + refs := make([]interface{}, len(matches)) + for i, submatches := range matches { + refs[i] = GroupTagMatcher(submatches[0], submatches) + } + return refs +} +*/ + +// Function for creating a new group by replacing subtext in an existing group, and pushing the new group's info to the req and group list +func makeSubgroup(group string, subtext string, requisite interface{}) string { + newGroup := strings.Replace(group, subtext, fmt.Sprintf("@%d", len(requisiteList)), -1) + requisiteList = append(requisiteList, requisite) + groupList = append(groupList, newGroup) + return newGroup +} + +// Function for joining adjacent OtherRequirements into one OtherRequirement by joining their descriptions with a string +func joinAdjacentOthers(reqs []interface{}, joinString string) []interface{} { + joinedReqs := make([]interface{}, 0, len(reqs)) + // Temp is a blank OtherRequirement + temp := *schema.NewOtherRequirement("", "") + // Iterate over each existing req + for _, req := range reqs { + // Determine whether req is an OtherRequirement + otherReq, isOtherReq := req.(schema.OtherRequirement) + if !isOtherReq { + // If temp contains data, append its final result to the joinedReqs + if temp.Description != "" { + joinedReqs = append(joinedReqs, temp) + } + // Append the non-OtherRequirement to the joinedReqs + joinedReqs = append(joinedReqs, req) + // Reset temp's description + temp.Description = "" + continue + } + // If temp is blank, and req is an otherReq, use otherReq as the initial value of temp + // Otherwise, join temp's existing description with otherReq's description + if temp.Description == "" { + temp = otherReq + } else { + temp.Description = strings.Join([]string{temp.Description, otherReq.Description}, joinString) + } + } + // If temp contains data, append its final result to the joinedReqs + if temp.Description != "" { + joinedReqs = append(joinedReqs, temp) + } + //log.Printf("JOINEDREQS ARE: %v\n", joinedReqs) + return joinedReqs +} + +// Function for finding the Internal Course Number associated with the course with the specified subject and course number +func findICN(subject string, number string) (string, error) { + for _, coursePtr := range Courses { + if coursePtr.Subject_prefix == subject && coursePtr.Course_number == number { + return coursePtr.Internal_course_number, nil + } + } + return "ERROR", fmt.Errorf("couldn't find an ICN for %s %s", subject, number) +} + +// This is the list of produced requisites. Indices coincide with group indices -- aka group @0 will also be the 0th index of the list since it will be processed first. +var requisiteList []interface{} + +// This is the list of groups that are to be parsed. 
They are the raw text chunks associated with the reqs above. +var groupList []string + +// Innermost function for parsing individual text groups (used recursively by some Matchers) +func parseGroup(grp string) interface{} { + // Make sure we trim any mismatched right parentheses + grp = strings.TrimRight(grp, ")") + // Find an applicable matcher in Matchers + for _, matcher := range Matchers { + matches := matcher.Regex.FindStringSubmatch(grp) + if matches != nil { + // If an applicable matcher has been found, return the result of calling its handler + result := matcher.Handler(grp, matches) + log.Printf("'%s' -> %T\n", grp, result) + return result + } + } + // Panic if no matcher was able to be found for a given group -- this means we need to add handling for it!!! + //log.Panicf("NO MATCHER FOUND FOR GROUP '%s'\nSTACK IS: %#v\n", grp, requisiteList) + //log.Printf("NO MATCHER FOR: '%s'\n", grp) + log.Printf("'%s' -> parser.OtherRequirement\n", grp) + //var temp string + //fmt.Scanf("%s", temp) + return *schema.NewOtherRequirement(ungroupText(grp), "") +} + +// Outermost function for parsing a chunk of requisite text (potentially containing multiple nested text groups) +func parseChunk(chunk string) interface{} { + log.Printf("\nPARSING CHUNK: '%s'\n", chunk) + // Extract parenthesized groups from chunk text + parseText, parseGroups := groupParens(chunk) + // Initialize the requisite list and group list + requisiteList = make([]interface{}, 0, len(parseGroups)) + groupList = parseGroups + // Begin recursive group parsing -- order is bottom-up + for _, grp := range parseGroups { + parsedReq := parseGroup(grp) + // Only append requisite to stack if it isn't marked as throwaway + if !reqIsThrowaway(parsedReq) { + requisiteList = append(requisiteList, parsedReq) + } + } + finalGroup := parseGroup(parseText) + return finalGroup +} + +// Check whether a requisite is a throwaway or not by trying a type assertion to Requirement +func reqIsThrowaway(req interface{}) bool { + baseReq, isBaseReq := req.(schema.Requirement) + return isBaseReq && baseReq.Type == "throwaway" +} + +// Use stack-based parentheses parsing to form text groups and reference them in the original string +func groupParens(text string) (string, []string) { + var groups []string = make([]string, 0, 5) + var positionStack []int = make([]int, 0, 5) + var depth int = 0 + for pos := 0; pos < len(text); pos++ { + if text[pos] == '(' { + depth++ + positionStack = append(positionStack, pos) + } else if text[pos] == ')' && depth > 0 { + depth-- + lastIndex := len(positionStack) - 1 + // Get last '(' position from stack + lastPos := positionStack[lastIndex] + // Pop stack + positionStack = positionStack[:lastIndex] + // Make group and replace group text with group index reference + groupText := text[lastPos+1 : pos] + groupNum := len(groups) + groups = append(groups, groupText) + subText := fmt.Sprintf("@%d", groupNum) + text = strings.Replace(text, text[lastPos:pos+1], subText, -1) + // Adjust position to account for replaced text + pos += len(subText) - len(groupText) - 2 + } + } + return text, groups +} + +// Function for replacing all group references (groups referenced via group tags) with their actual text +func ungroupText(text string) string { + text = trimWhitespace(text) + for groupNum := len(groupList) - 1; groupNum >= 0; groupNum-- { + subText := fmt.Sprintf("@%d", groupNum) + replacementText := fmt.Sprintf("(%s)", groupList[groupNum]) + text = strings.Replace(text, subText, replacementText, -1) + } + return text +} diff 
--git a/parser/sectionParser.go b/parser/sectionParser.go new file mode 100644 index 0000000..48a2bcc --- /dev/null +++ b/parser/sectionParser.go @@ -0,0 +1,157 @@ +package parser + +import ( + "fmt" + "regexp" + "strings" + "time" + + "github.com/UTDNebula/nebula-api/api/schema" + "go.mongodb.org/mongo-driver/bson/primitive" +) + +var sectionPrefixRegexp *regexp.Regexp = regexp.MustCompile(`^(?i)[A-Z]{2,4}[0-9V]{4}\.([0-9A-Za-z]+)`) +var coreRegexp *regexp.Regexp = regexp.MustCompile(`[0-9]{3}`) +var personRegexp *regexp.Regexp = regexp.MustCompile(`\s*([\w ]+)\s+・\s+([A-Za-z ]+)\s+・\s+([\w@.]+)`) + +func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) { + // Get the section number by doing a regexp match on the section id + sectionId := classInfo["Class Section:"] + idMatches := sectionPrefixRegexp.FindStringSubmatch(sectionId) + + section := &schema.Section{} + + section.Id = primitive.NewObjectID() + section.Section_number = idMatches[1] + section.Course_reference = courseRef.Id + + //TODO: section requisites? + + // Set academic session + section.Academic_session = session + // Add professors + section.Professors = parseProfessors(section.Id, rowInfo, classInfo) + + // Get all TA/RA info + assistantText := rowInfo["TA/RA(s):"] + assistantMatches := personRegexp.FindAllStringSubmatch(assistantText, -1) + section.Teaching_assistants = make([]schema.Assistant, 0, len(assistantMatches)) + for _, match := range assistantMatches { + assistant := schema.Assistant{} + nameStr := match[1] + names := strings.Split(nameStr, " ") + assistant.First_name = names[0] + assistant.Last_name = names[len(names)-1] + assistant.Role = match[2] + assistant.Email = match[3] + section.Teaching_assistants = append(section.Teaching_assistants, assistant) + } + + section.Internal_class_number = classNum + section.Instruction_mode = classInfo["Instruction Mode:"] + section.Meetings = getMeetings(rowInfo, classInfo) + + // Parse core flags (may or may not exist) + coreText, hasCore := rowInfo["Core:"] + if hasCore { + section.Core_flags = coreRegexp.FindAllString(coreText, -1) + } + + section.Syllabus_uri = syllabusURI + + semesterGrades, exists := GradeMap[session.Name] + if exists { + sectionGrades, exists := semesterGrades[courseRef.Subject_prefix+courseRef.Course_number+section.Section_number] + if exists { + section.Grade_distribution = sectionGrades + } + } + + // Add new section to section map + Sections[section.Id] = section + + // Append new section to course's section listing + courseRef.Sections = append(courseRef.Sections, section.Id) +} + +var termRegexp *regexp.Regexp = regexp.MustCompile(`Term: ([0-9]+[SUF])`) +var datesRegexp *regexp.Regexp = regexp.MustCompile(`(?:Start|End)s: ([A-Za-z]+ [0-9]{1,2}, [0-9]{4})`) + +func getAcademicSession(rowInfo map[string]string, classInfo map[string]string) schema.AcademicSession { + session := schema.AcademicSession{} + scheduleText := rowInfo["Schedule:"] + + session.Name = termRegexp.FindStringSubmatch(scheduleText)[1] + dateMatches := datesRegexp.FindAllStringSubmatch(scheduleText, -1) + + datesFound := len(dateMatches) + switch { + case datesFound == 1: + startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation) + if err != nil { + panic(err) + } + session.Start_date = startDate + case datesFound == 2: + startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation) + if err != 
nil { + panic(err) + } + endDate, err := time.ParseInLocation("January 2, 2006", dateMatches[1][1], timeLocation) + if err != nil { + panic(err) + } + session.Start_date = startDate + session.End_date = endDate + } + return session +} + +var meetingsRegexp *regexp.Regexp = regexp.MustCompile(`([A-Za-z]+\s+[0-9]+,\s+[0-9]{4})-([A-Za-z]+\s+[0-9]+,\s+[0-9]{4})\W+((?:(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day(?:, )?)+)\W+([0-9]+:[0-9]+(?:am|pm))-([0-9]+:[0-9]+(?:am|pm))(?:\W+(?:(\S+)\s+(\S+)))`) + +func getMeetings(rowInfo map[string]string, classInfo map[string]string) []schema.Meeting { + scheduleText := rowInfo["Schedule:"] + meetingMatches := meetingsRegexp.FindAllStringSubmatch(scheduleText, -1) + var meetings []schema.Meeting = make([]schema.Meeting, 0, len(meetingMatches)) + for _, match := range meetingMatches { + meeting := schema.Meeting{} + + startDate, err := time.ParseInLocation("January 2, 2006", match[1], timeLocation) + if err != nil { + panic(err) + } + meeting.Start_date = startDate + + endDate, err := time.ParseInLocation("January 2, 2006", match[2], timeLocation) + if err != nil { + panic(err) + } + meeting.End_date = endDate + + meeting.Meeting_days = strings.Split(match[3], ", ") + + startTime, err := time.ParseInLocation("3:04pm", match[4], timeLocation) + if err != nil { + panic(err) + } + meeting.Start_time = startTime + + endTime, err := time.ParseInLocation("3:04pm", match[5], timeLocation) + if err != nil { + panic(err) + } + meeting.End_time = endTime + + // Only add location data if it's available + if match[6] != "" { + location := schema.Location{} + location.Building = match[6] + location.Room = match[7] + location.Map_uri = fmt.Sprintf("https://locator.utdallas.edu/%s_%s", location.Building, location.Room) + meeting.Location = location + } + + meetings = append(meetings, meeting) + } + return meetings +} diff --git a/parser/utils.go b/parser/utils.go new file mode 100644 index 0000000..c00aa14 --- /dev/null +++ b/parser/utils.go @@ -0,0 +1,90 @@ +package parser + +import ( + "encoding/json" + "fmt" + "os" + "strings" +) + +func writeJSON(filepath string, data interface{}) error { + fptr, err := os.Create(filepath) + if err != nil { + return err + } + defer fptr.Close() + encoder := json.NewEncoder(fptr) + encoder.SetIndent("", "\t") + encoder.Encode(data) + return nil +} + +// TODO: Do this in a cleaner manner via filepath.Walk or similar +func getAllSectionFilepaths(inDir string) []string { + var sectionFilePaths []string + // Try to open inDir + fptr, err := os.Open(inDir) + if err != nil { + panic(err) + } + // Try to get term directories in inDir + termFiles, err := fptr.ReadDir(-1) + fptr.Close() + if err != nil { + panic(err) + } + // Iterate over term directories + for _, file := range termFiles { + if !file.IsDir() { + continue + } + termPath := fmt.Sprintf("%s/%s", inDir, file.Name()) + fptr, err = os.Open(termPath) + if err != nil { + panic(err) + } + courseFiles, err := fptr.ReadDir(-1) + fptr.Close() + if err != nil { + panic(err) + } + // Iterate over course directories + for _, file := range courseFiles { + coursePath := fmt.Sprintf("%s/%s", termPath, file.Name()) + fptr, err = os.Open(coursePath) + if err != nil { + panic(err) + } + sectionFiles, err := fptr.ReadDir(-1) + fptr.Close() + if err != nil { + panic(err) + } + // Get all section file paths from course directory + for _, file := range sectionFiles { + sectionFilePaths = append(sectionFilePaths, fmt.Sprintf("%s/%s", coursePath, file.Name())) + } + } + } + return 
sectionFilePaths +} + +func trimWhitespace(text string) string { + return strings.Trim(text, " \t\n\r") +} + +func getMapValues[M ~map[K]V, K comparable, V any](m M) []V { + r := make([]V, 0, len(m)) + for _, v := range m { + r = append(r, v) + } + return r +} + +func getMapKeys[M ~map[K]V, K comparable, V any](m M) []K { + r := make([]K, 0, len(m)) + for k := range m { + r = append(r, k) + } + return r +} diff --git a/parser/validator.go b/parser/validator.go new file mode 100644 index 0000000..e1a803c --- /dev/null +++ b/parser/validator.go @@ -0,0 +1,108 @@ +package parser + +import "log" + +func validate() { + // Set up deferred handler for panics to display validation fails + defer func() { + if err := recover(); err != nil { + log.Printf("VALIDATION FAILED: %s", err) + } + }() + + log.Printf("\nValidating courses...\n") + courseKeys := getMapKeys(Courses) + for i := 0; i < len(courseKeys)-1; i++ { + course1 := Courses[courseKeys[i]] + // Check for duplicate courses by comparing course_number, subject_prefix, and catalog_year as a compound key + for j := i + 1; j < len(courseKeys); j++ { + course2 := Courses[courseKeys[j]] + if course2.Catalog_year == course1.Catalog_year && course2.Course_number == course1.Course_number && course2.Subject_prefix == course1.Subject_prefix { + log.Printf("Duplicate course found for %s%s!\n", course1.Subject_prefix, course1.Course_number) + log.Printf("Course 1: %v\n\nCourse 2: %v", course1, course2) + log.Panic("Courses failed to validate!") + } + } + // Make sure course isn't referencing any nonexistent sections, and that course-section references are consistent both ways + for _, sectionId := range course1.Sections { + section, exists := Sections[sectionId] + if !exists { + log.Printf("Nonexistent section reference found for %s%s!\n", course1.Subject_prefix, course1.Course_number) + log.Printf("Referenced section ID: %s\nCourse ID: %s\n", sectionId, course1.Id) + log.Panic("Courses failed to validate!") + } + if section.Course_reference != course1.Id { + log.Printf("Inconsistent section reference found for %s%s! 
The course references the section, but not vice-versa!\n", course1.Subject_prefix, course1.Course_number) + log.Printf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s\n", sectionId, course1.Id, section.Course_reference) + log.Panic("Courses failed to validate!") + } + } + } + courseKeys = nil + log.Print("No invalid courses!\n\n") + + log.Print("Validating sections...\n") + sectionKeys := getMapKeys(Sections) + for i := 0; i < len(sectionKeys)-1; i++ { + section1 := Sections[sectionKeys[i]] + // Check for duplicate sections by comparing section_number, course_reference, and academic_session as a compound key + for j := i + 1; j < len(sectionKeys); j++ { + section2 := Sections[sectionKeys[j]] + if section2.Section_number == section1.Section_number && + section2.Course_reference == section1.Course_reference && + section2.Academic_session == section1.Academic_session { + log.Print("Duplicate section found!\n") + log.Printf("Section 1: %v\n\nSection 2: %v", section1, section2) + log.Panic("Sections failed to validate!") + } + } + // Make sure section isn't referencing any nonexistent professors, and that section-professor references are consistent both ways + for _, profId := range section1.Professors { + professorKey, exists := ProfessorIDMap[profId] + if !exists { + log.Printf("Nonexistent professor reference found for section ID %s!\n", section1.Id) + log.Printf("Referenced professor ID: %s\n", profId) + log.Panic("Sections failed to validate!") + } + profRefsSection := false + for _, profSection := range Professors[professorKey].Sections { + if profSection == section1.Id { + profRefsSection = true + break + } + } + if !profRefsSection { + log.Printf("Inconsistent professor reference found for section ID %s! The section references the professor, but not vice-versa!\n", section1.Id) + log.Printf("Referenced professor ID: %s\n", profId) + log.Panic("Sections failed to validate!") + } + } + // Make sure section isn't referencing a nonexistent course + _, exists := CourseIDMap[section1.Course_reference] + if !exists { + log.Printf("Nonexistent course reference found for section ID %s!\n", section1.Id) + log.Printf("Referenced course ID: %s\n", section1.Course_reference) + log.Panic("Sections failed to validate!") + } + } + sectionKeys = nil + log.Printf("No invalid sections!\n\n") + + log.Printf("Validating professors...\n") + profKeys := getMapKeys(Professors) + // Check for duplicate professors by comparing first_name, last_name, and profile_uri as a compound key + for i := 0; i < len(profKeys)-1; i++ { + prof1 := Professors[profKeys[i]] + for j := i + 1; j < len(profKeys); j++ { + prof2 := Professors[profKeys[j]] + if prof2.First_name == prof1.First_name && + prof2.Last_name == prof1.Last_name && + prof2.Profile_uri == prof1.Profile_uri { + log.Printf("Duplicate professor found!\n") + log.Printf("Professor 1: %v\n\nProfessor 2: %v", prof1, prof2) + log.Panic("Professors failed to validate!") + } + } + } + log.Printf("No invalid professors!\n\n") +}
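
A quick illustration of why the ordering of reqRegexes matters: the combined "prerequisite or corequisite" pattern has to be matched and erased from checkText before the plain prerequisite/corequisite patterns run, because the plain corequisite pattern also matches inside the combined phrase. A package-internal test sketch, not part of this patch (the test name and sample text are hypothetical), could check that overlap directly:

package parser

import "testing"

func TestReqRegexPrecedenceSketch(t *testing.T) {
	text := "Prerequisite or Corequisite: MATH 2413."
	if preOrCoreqRegexp.FindStringSubmatch(text) == nil {
		t.Error("expected the combined pre-or-co pattern to match")
	}
	// The plain corequisite pattern matches the same text, which is why
	// getReqParser erases the combined match from checkText before trying it.
	if coreqRegexp.FindStringSubmatch(text) == nil {
		t.Error("expected the plain corequisite pattern to also match")
	}
}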
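
Similarly, the stack-based groupParens routine is easiest to follow from a concrete input/output pair: each parenthesized span is replaced with an @N reference and collected (nested spans come out innermost-first), and ungroupText later reverses the substitution. A minimal package-internal test sketch, again hypothetical and not part of the patch:

package parser

import "testing"

func TestGroupParensSketch(t *testing.T) {
	text, groups := groupParens("(CS 1337 or CS 1336) and MATH 2413")
	if text != "@0 and MATH 2413" {
		t.Errorf("unexpected rewritten text: %q", text)
	}
	if len(groups) != 1 || groups[0] != "CS 1337 or CS 1336" {
		t.Errorf("unexpected groups: %#v", groups)
	}
}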
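
Finally, the writeJSON helper in parser/utils.go exists to collapse the repeated create/encode/close sequence for the parser's JSON outputs into a single call. A hedged sketch of a possible call site follows; the writeOutputsSketch name, the outDir parameter, and the courses.json filename are assumptions for illustration, not code from the patch:

package parser

import "fmt"

// writeOutputsSketch shows how an output-writing step could use the generic
// writeJSON helper together with getMapValues on one of the package's maps.
func writeOutputsSketch(outDir string) {
	if err := writeJSON(fmt.Sprintf("%s/courses.json", outDir), getMapValues(Courses)); err != nil {
		panic(err)
	}
}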