Skip to content

Commit

Permalink
Created toolkit uploader (#26)
Browse files Browse the repository at this point in the history
* Created toolkit uploader

Created an uploader for the toolkit which uploads parsed JSON content to the Mongo database. Both replace and merge options are implemented.

* Updated uploader and database methods

Changed connectDB() to use non-deprecated methods, and fixed many issues in uploader (currently still broken)

* Adjust uploader code for IdWrapper removal

* Create generic upload function, clean up uploader.go

Created a generic function for uploading JSON files to Mongo, and added some comments for clarification. Currently the function requires that the file name is the same as the collection that the file contents are being uploaded to. Only courses, professors, and sections are supported.

* Delete data.7z

---------

Co-authored-by: jpahm <[email protected]>
  • Loading branch information
mohammadmehrab and jpahm authored Apr 25, 2024
1 parent 8fd0174 commit b93064c
Show file tree
Hide file tree
Showing 11 changed files with 321 additions and 70 deletions.
49 changes: 41 additions & 8 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,55 @@ go 1.19

require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/UTDNebula/nebula-api/api v0.0.0-20231204040332-adccdc08b203
github.com/chromedp/cdproto v0.0.0-20240116100315-4a0ec5e4c400
github.com/chromedp/chromedp v0.9.3
github.com/UTDNebula/nebula-api/api v0.0.0-20240423212728-2ef02f280c6c
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d
github.com/chromedp/chromedp v0.9.5
github.com/joho/godotenv v1.5.1
go.mongodb.org/mongo-driver v1.13.0
go.mongodb.org/mongo-driver v1.15.0
)

require (
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/bytedance/sonic v1.11.5 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/cloudwego/base64x v0.1.3 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/gin-gonic/gin v1.9.1 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.19.0 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.3.0 // indirect
github.com/google/go-cmp v0.5.5 // indirect
github.com/gobwas/ws v1.3.2 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gorilla/schema v1.3.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.8 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.14.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/montanaflynn/stats v0.7.1 // indirect
github.com/pelletier/go-toml/v2 v2.2.1 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
github.com/xdg-go/pbkdf2 v1.0.0 // indirect
github.com/xdg-go/scram v1.1.2 // indirect
github.com/xdg-go/stringprep v1.0.4 // indirect
github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a // indirect
golang.org/x/arch v0.7.0 // indirect
golang.org/x/crypto v0.22.0 // indirect
golang.org/x/net v0.24.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.19.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
136 changes: 112 additions & 24 deletions go.sum

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion parser/courseParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[s

course = &schema.Course{}

course.Id = schema.IdWrapper(primitive.NewObjectID().Hex())
course.Id = primitive.NewObjectID()
course.Course_number = idMatches[2]
course.Subject_prefix = idMatches[1]
course.Title = rowInfo["Course Title:"]
Expand Down
9 changes: 5 additions & 4 deletions parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,23 @@ import (
"time"

"github.com/UTDNebula/api-tools/utils"
"go.mongodb.org/mongo-driver/bson/primitive"

"github.com/PuerkitoBio/goquery"
"github.com/UTDNebula/nebula-api/api/schema"
)

// Main dictionaries for mapping unique keys to the actual data
var Sections = make(map[schema.IdWrapper]*schema.Section)
var Sections = make(map[primitive.ObjectID]*schema.Section)
var Courses = make(map[string]*schema.Course)
var Professors = make(map[string]*schema.Professor)

// Auxiliary dictionaries for mapping the generated ObjectIDs to the keys used in the above maps, used for validation purposes
var CourseIDMap = make(map[schema.IdWrapper]string)
var ProfessorIDMap = make(map[schema.IdWrapper]string)
var CourseIDMap = make(map[primitive.ObjectID]string)
var ProfessorIDMap = make(map[primitive.ObjectID]string)

// Requisite parser closures associated with courses
var ReqParsers = make(map[schema.IdWrapper]func())
var ReqParsers = make(map[primitive.ObjectID]func())

// Grade mappings for section grade distributions, mapping is MAP[SEMESTER] -> MAP[SUBJECT + NUMBER + SECTION] -> GRADE DISTRIBUTION
var GradeMap map[string]map[string][]int
Expand Down
8 changes: 4 additions & 4 deletions parser/professorParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ import (
"go.mongodb.org/mongo-driver/bson/primitive"
)

func parseProfessors(sectionId schema.IdWrapper, rowInfo map[string]string, classInfo map[string]string) []schema.IdWrapper {
func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]string, classInfo map[string]string) []primitive.ObjectID {
professorText := rowInfo["Instructor(s):"]
professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1)
var profRefs []schema.IdWrapper = make([]schema.IdWrapper, 0, len(professorMatches))
var profRefs []primitive.ObjectID = make([]primitive.ObjectID, 0, len(professorMatches))
for _, match := range professorMatches {

nameStr := utils.TrimWhitespace(match[1])
Expand All @@ -35,12 +35,12 @@ func parseProfessors(sectionId schema.IdWrapper, rowInfo map[string]string, clas
}

prof = &schema.Professor{}
prof.Id = schema.IdWrapper(primitive.NewObjectID().Hex())
prof.Id = primitive.NewObjectID()
prof.First_name = firstName
prof.Last_name = lastName
prof.Titles = []string{utils.TrimWhitespace(match[2])}
prof.Email = utils.TrimWhitespace(match[3])
prof.Sections = []schema.IdWrapper{sectionId}
prof.Sections = []primitive.ObjectID{sectionId}
profRefs = append(profRefs, prof.Id)
Professors[profKey] = prof
ProfessorIDMap[prof.Id] = profKey
Expand Down
2 changes: 1 addition & 1 deletion parser/sectionParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,

section := &schema.Section{}

section.Id = schema.IdWrapper(primitive.NewObjectID().Hex())
section.Id = primitive.NewObjectID()
section.Section_number = idMatches[1]
section.Course_reference = courseRef.Id

Expand Down
2 changes: 1 addition & 1 deletion scrapers/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ func ScrapeEvents(outDir string) {
utils.VPrintf("Scraped contact phone info: %s", contactInformationPhone)

events = append(events, schema.Event{
Id: schema.IdWrapper(primitive.NewObjectID().Hex()),
Id: primitive.NewObjectID(),
Summary: summary,
Location: location,
StartTime: dateTimeStart,
Expand Down
2 changes: 1 addition & 1 deletion scrapers/organizations.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ func parseCsvRecord(ctx context.Context, entry []string) (*schema.Organization,
utils.VPrintf("Error retrieving image for %s: %v", entry[0], err)
}
return &schema.Organization{
Id: schema.IdWrapper(primitive.NewObjectID().Hex()),
Id: primitive.NewObjectID(),
Title: entry[0],
Categories: parseCategories(entry[1]),
Description: entry[2],
Expand Down
4 changes: 2 additions & 2 deletions scrapers/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ func ScrapeProfiles(outDir string) {
utils.VPrintf("Parsed list! #: %s, Office: %v", phoneNumber, office)

professors = append(professors, schema.Professor{
Id: schema.IdWrapper(primitive.NewObjectID().Hex()),
Id: primitive.NewObjectID(),
First_name: firstName,
Last_name: lastName,
Titles: titles,
Expand All @@ -285,7 +285,7 @@ func ScrapeProfiles(outDir string) {
Profile_uri: link,
Image_uri: imageUri,
Office_hours: []schema.Meeting{},
Sections: []schema.IdWrapper{},
Sections: []primitive.ObjectID{},
})

utils.VPrintf("Scraped profile for %s %s!", firstName, lastName)
Expand Down
15 changes: 6 additions & 9 deletions uploader/database.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

package uploader

/* import (
import (
//"go.mongodb.org/mongo-driver/bson"
//"go.mongodb.org/mongo-driver/bson/primitive"
"context"
Expand All @@ -17,18 +17,15 @@ package uploader
)

func connectDB() *mongo.Client {
client, err := mongo.NewClient(options.Client().ApplyURI(getEnvMongoURI()))
if err != nil {
log.Panic("Unable to create MongoDB client")
os.Exit(1)
}

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

err = client.Connect(ctx)
opts := options.Client().ApplyURI(getEnvMongoURI())

client, err := mongo.Connect(ctx, opts)
if err != nil {
log.Panic("Unable to connect to database")
log.Panic("Unable to create MongoDB client and connect to database")
os.Exit(1)
}

Expand Down Expand Up @@ -56,4 +53,4 @@ func getEnvMongoURI() string {
os.Exit(1)
}
return uri
} */
}
162 changes: 147 additions & 15 deletions uploader/uploader.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,23 @@

package uploader

/*
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"strings"

"time"

"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/bson/primitive"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
"github.com/joho/godotenv"

"github.com/UTDNebula/nebula-api/api/schema"
"github.com/joho/godotenv"
)

// It's important to note that all of the files must be updated/uploaded TOGETHER!
Expand All @@ -22,23 +30,147 @@ import (
// Also note that this uploader assumes that the collection names match the names of these files, which they should.
// If the names of these collections ever change, the file names should be updated accordingly.

var filesToUpload []string = []string{"courses.json", "professors.json", "sections.json"}
*/
var filesToUpload [3]string = [3]string{"courses.json", "professors.json", "sections.json"}

func Upload(inDir string, replace bool) {
/*
//Load env vars
if err := godotenv.Load(); err != nil {

//Load env vars
if err := godotenv.Load(); err != nil {
log.Panic("Error loading .env file")
}
//Connect to mongo
client := connectDB()
for _, path := range(filesToUpload) {
//Open data file for reading
fptr, err := os.Open(path)
}

//Connect to mongo
client := connectDB()

// Get 5 minute context
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()

for _, path := range filesToUpload {

// Open data file for reading
fptr, err := os.Open(fmt.Sprintf("%s/"+path, inDir))
if err != nil {
panic(err)
log.Panic(err)
}

defer fptr.Close()

switch path {
case "courses.json":
UploadData[schema.Course](client, ctx, fptr, replace)
case "professors.json":
UploadData[schema.Professor](client, ctx, fptr, replace)
case "sections.json":
UploadData[schema.Section](client, ctx, fptr, replace)
}
*/
}

}

// UploadData decodes the JSON array of documents stored in fptr and uploads it
// to the Mongo collection whose name matches the file name (for example,
// courses.json is uploaded to the "courses" collection).
//
// Make sure that the name of the file being parsed matches the name of the
// collection you are uploading to! Merging is currently only supported for
// courses, professors, and sections, because those are the only collections
// with match keys defined below.
//
// When replace is true, every document in the target collection is deleted and
// the decoded documents are inserted in its place. Otherwise the documents are
// staged in a temporary "temp" collection and merged into the target with a
// $merge stage: matched documents are replaced, unmatched ones are inserted.
//
// fptr is always closed before UploadData returns, including on failure. Every
// failure is reported through log.Panic.
func UploadData[T any](client *mongo.Client, ctx context.Context, fptr *os.File, replace bool) {
	// Registered first so the file is released on the panic paths as well.
	defer fptr.Close()

	// The collection name is the file's base name without the .json extension.
	// TrimSuffix (rather than slicing off five bytes) cannot go out of range on
	// short names and leaves unexpected extensions visible in the logs.
	baseName := fptr.Name()[strings.LastIndex(fptr.Name(), "/")+1:]
	fileName := strings.TrimSuffix(baseName, ".json")
	log.Println("Uploading " + fileName + ".json ...")

	// In merge mode, resolve the match keys before touching the database so an
	// unsupported file fails fast instead of leaving a populated temp collection.
	var matchFilters []string
	if !replace {
		switch fileName {
		case "courses":
			matchFilters = []string{"catalog_year", "course_number", "subject_prefix"}
		case "professors":
			matchFilters = []string{"first_name", "last_name"}
		case "sections":
			matchFilters = []string{"section_number", "course_reference", "academic_session"}
		default:
			log.Panic("Unrecognizable filename: " + fileName)
		}
	}

	// Decode documents from file
	var docs []T
	if err := json.NewDecoder(fptr).Decode(&docs); err != nil {
		log.Panic(err)
	}

	// The driver's InsertMany takes a slice of untyped documents
	docsInterface := make([]any, len(docs))
	for i := range docs {
		docsInterface[i] = docs[i]
	}
	insertOpts := options.InsertMany().SetOrdered(false)

	if replace {
		collection := getCollection(client, fileName)

		// Delete all documents from collection
		if _, err := collection.DeleteMany(ctx, bson.D{}); err != nil {
			log.Panic(err)
		}

		// InsertMany rejects an empty slice, so an empty file simply leaves the
		// collection empty instead of panicking after the delete.
		if len(docsInterface) > 0 {
			if _, err := collection.InsertMany(ctx, docsInterface, insertOpts); err != nil {
				log.Panic(err)
			}
		}
	} else if len(docsInterface) > 0 {
		// If a temp collection already exists (e.g. from an aborted run), drop it
		tempCollection := getCollection(client, "temp")
		if err := tempCollection.Drop(ctx); err != nil {
			log.Panic(err)
		}

		// Create the temporary collection in the same database the handle points
		// at, so the staging data and the $merge target always share a database.
		if err := tempCollection.Database().CreateCollection(ctx, "temp"); err != nil {
			log.Panic(err)
		}

		// Add all documents decoded from the file into the temporary collection
		if _, err := tempCollection.InsertMany(ctx, docsInterface, insertOpts); err != nil {
			log.Panic(err)
		}

		// Merge aggregate pipeline:
		// matched documents from the temporary collection replace the matching
		// documents in the target collection; unmatched ones are inserted.
		// The target collection has the same name as the file.
		mergeStage := bson.D{primitive.E{Key: "$merge", Value: bson.D{
			primitive.E{Key: "into", Value: fileName},
			primitive.E{Key: "on", Value: matchFilters},
			primitive.E{Key: "whenMatched", Value: "replace"},
			primitive.E{Key: "whenNotMatched", Value: "insert"},
		}}}

		// Execute aggregate pipeline; the cursor carries no results for $merge
		// but still has to be closed to release its server-side resources.
		cursor, err := tempCollection.Aggregate(ctx, mongo.Pipeline{mergeStage})
		if err != nil {
			log.Panic(err)
		}
		if err := cursor.Close(ctx); err != nil {
			log.Panic(err)
		}

		// Drop the temporary collection
		if err := tempCollection.Drop(ctx); err != nil {
			log.Panic(err)
		}
	}

	log.Println("Done uploading " + fileName + ".json!")
}

0 comments on commit b93064c

Please sign in to comment.