-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
265 lines (231 loc) · 10.4 KB
/
data.py
File metadata and controls
265 lines (231 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import sys
import re
import requests
import json
import time
import random
from bs4 import BeautifulSoup
#Utility class that contains all the info and does all the work
class classesData:
    """Download one academic year of course data from oscar.wpi.edu.

    For the given year the object fetches the year index page, extracts the
    professor and course lists from its ``<option>`` elements, then requests
    the per-session statistics for every course. Results are written to
    ``<year>Index.html``, ``<year>Prof.json``, ``<year>OnlyClasses.json`` and
    ``<year>Classes.json`` in the current working directory.
    """

    #URL for class page and individual class data, doesn't change with year
    classURL = "https://oscar.wpi.edu/cgi-bin/oscar/1.3/byC.cgi"
    indivURL = "https://oscar.wpi.edu/cgi-bin/oscar/1.3/byPandC.cgi"

    #Browser identification sent with every request
    _USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0"

    #Single-pass translation used by unscramble(): w -> 0, b -> ., every vowel
    #(including y) -> comma, every remaining lowercase consonant is dropped
    _UNSCRAMBLE_TABLE = str.maketrans("wbaeiouy", "0.,,,,,,", "rstvxzcdfghjklmnpq")

    def __init__(self, year, cookie):
        """Store the year and cookie and prepare the URLs and request headers.

        Args:
            year: Academic year string used in the index URL and in every
                output file name (e.g. ``"2019-2020"``).
            cookie: Complete ``Cookie`` header value sent with each request.
        """
        self.cookie = cookie
        self.year = year
        self.yearURL = "https://oscar.wpi.edu/oscar/" + year + ".html"
        #The index page is requested like a normal browser navigation
        self.yearHeaders = self._buildHeaders(
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "https://oscar.wpi.edu/")
        self.yearHeaders["Upgrade-Insecure-Requests"] = "1"
        #The two data endpoints are requested with the year page as referer;
        #each gets its own dict so they can be modified independently
        self.classHeaders = self._buildHeaders("*/*", self.yearURL)
        self.indivHeaders = self._buildHeaders("*/*", self.yearURL)

    def _buildHeaders(self, accept, referer):
        """Return a fresh header dict differing only in Accept and Referer."""
        return {"Host": "oscar.wpi.edu",
                "User-Agent": self._USER_AGENT,
                "Accept": accept,
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "DNT": "1",
                "Connection": "keep-alive",
                "Referer": referer,
                "Cookie": self.cookie}

    #Driver function that calls everything else
    def getAllInfo(self):
        """Run getYearInfo() followed by getClassInfo()."""
        self.getYearInfo()
        self.getClassInfo()

    def getYearInfo(self):
        """Fetch the year index page and extract the professor and course lists.

        Writes the raw page to ``<year>Index.html`` and the two id -> name
        mappings to ``<year>Prof.json`` and ``<year>OnlyClasses.json``, then
        stores them on the object as ``self.profs`` and ``self.classes``.
        """
        r = requests.get(self.yearURL, headers=self.yearHeaders)
        #Use the object's own year (not a module global) for the file name and
        #write as UTF-8 so non-ASCII page content cannot fail to encode
        with open(self.year + "Index.html", "w", encoding="utf-8") as f:
            f.write(r.text)
        soup = BeautifulSoup(r.text, 'html.parser')
        isProf = True
        profs = {}
        classes = {}
        #Look through all the options in the html, skipping the first one
        #because it isn't needed
        for option in soup.find_all("option")[1:]:
            #The options are switching from professors to classes
            if option["value"] == '0':
                isProf = False
            #Add to either the prof or class dictionary
            elif isProf:
                profs[option["value"]] = option.text
            else:
                classes[option["value"]] = option.text
        #Save the files out
        with open(self.year + "Prof.json", "w") as fProf:
            json.dump(profs, fProf, indent=4)
        with open(self.year + "OnlyClasses.json", "w") as fClass:
            json.dump(classes, fClass, indent=4)
        #Save the data in the object
        self.profs = profs
        self.classes = classes

    def getClassInfo(self):
        """Fetch the statistics of every session of every course in self.classes.

        Requires getYearInfo() to have populated ``self.classes``. On success
        the results (course id -> list of session dictionaries) are written to
        ``<year>Classes.json``.

        Returns:
            1 if a per-session request answers with a status other than 200
            (nothing is written in that case); otherwise None.
        """
        random.seed()
        results = {}
        #Loops through every class
        for key in self.classes:
            print(key)
            classPayload = {"courseNumber": key, "year": self.year}
            r = requests.get(self.classURL, params=classPayload, headers=self.classHeaders)
            sessions = []
            #Loops through all the actual class sessions of that course; the
            #final character of the response is dropped before splitting
            for item in r.text[:-1].split("\n"):
                #An empty listing produces a blank row with no fields to read
                if not item:
                    continue
                itemSplit = item.split(";")
                indivPayload = {"pidm_id": itemSplit[0], "courseNumber": itemSplit[3], "term": itemSplit[5]}
                r2 = requests.get(self.indivURL, params=indivPayload, headers=self.indivHeaders)
                if r2.status_code != 200:
                    print("ERROR: ", r2.status_code)
                    return 1
                #Append the array with the results converted to a dictionary
                sessions.append(self.resultsToDictionary(r2.text))
                #Sleep for a random amount of time <1 second to hopefully fool onlookers. Any longer and the program might just take forever
                time.sleep(random.random())
            #Add all the class information to the full results
            results[key] = sessions
        #Save these results to a file named after the object's own year
        with open(self.year + "Classes.json", "w") as fout:
            json.dump(results, fout, indent=4)

    def resultsToDictionary(self, r):
        """Convert one semicolon-separated session response into a dictionary.

        Args:
            r: Response text with at least eight ``;``-separated fields.

        Returns:
            Dictionary with the first seven fields stored verbatim and the
            eighth passed through unscramble() as ``courseRatings``.

        Raises:
            IndexError: If the text has fewer than eight fields.
        """
        #Split on semicolons
        fields = r.split(";")
        #Manually assign everything to where it belongs, the actual statistics are passed through the unscrambler
        return {"profID": fields[0],
                "prof": fields[1],
                "courseName": fields[2],
                "courseID": fields[3],
                "numStudents": fields[4],
                "courseType": fields[5],
                "courseYearTerm": fields[6],
                "courseRatings": self.unscramble(fields[7])}

    def unscramble(self, text):
        """Decode the scrambled statistics string into comma-separated numbers.

        ``w`` becomes ``0``, ``b`` becomes ``.``, the vowels ``aeiouy`` become
        commas and all other lowercase consonants are removed. Trailing
        whitespace and a single trailing comma are then stripped.

        Args:
            text: Scrambled statistics text; may be empty.

        Returns:
            The decoded string (empty if nothing remains).
        """
        #The substitutions never produce a character that another one
        #consumes, so one translate pass equals the chained replacements
        text = text.translate(self._UNSCRAMBLE_TABLE)
        #Strip out the newline at the end
        text = text.rstrip()
        #And if the string ends with a trailing comma then just cut it off
        #(endswith is safe on an empty string, unlike indexing [-1])
        if text.endswith(","):
            text = text[:-1]
        return text
#TODO: TEST THIS
#Utility class to join together all files and update the entries with an easy value
#This class could be simplified and joined together with the previous one but I already wrote the code in a different format so I am being lazy and just putting it here. Maybe I will one day rework it
#Joins and updates are also done the quick way in memory so if the data files actually grow big enough this won't work. Won't be a problem for a while though
class joinUpdate:
    """Merge the per-year JSON files and attach an "easy" score to each session.

    Reads ``<year>Classes.json`` and ``<year>Prof.json`` for every configured
    year from the current working directory and writes the merged results to
    ``allClassesEasy.json`` and ``allProf.json``. Everything is held in memory.
    """

    #Features per question in the comma-separated ratings string
    _FPQ = 9

    def __init__(self, years, gradeWeight, timeWeight):
        """Set the years and weights which will all be used later.

        Args:
            years: Iterable of year strings used as file-name prefixes.
            gradeWeight: Multiplier applied to the average grade.
            timeWeight: Multiplier applied to the average time value.
        """
        self.years = years
        self.gradeWeight = gradeWeight
        self.timeWeight = timeWeight

    #Simple driver to call the functions
    def joinAndUpdate(self):
        """Run classJoinUpdate() followed by profJoin()."""
        self.classJoinUpdate()
        self.profJoin()

    def _loadYearFiles(self, suffix):
        """Return the parsed JSON of ``<year><suffix>`` for every year, in order."""
        loaded = []
        for year in self.years:
            #The context manager closes the file even if parsing fails
            with open(year + suffix, "r") as f:
                loaded.append(json.load(f))
        return loaded

    def _easyScore(self, courseRatings):
        """Compute the easy value for one comma-separated ratings string.

        Returns -1 when the score is not valid: the ratings are too short or
        not numeric, no grades were reported, or the time average is 0.
        """
        FPQ = self._FPQ
        #Split up all the ratings
        ratings = courseRatings.split(",")
        try:
            #Find the time avg value hidden in the data using the FPQ
            timeAvg = float(ratings[FPQ + (FPQ * 18) - 1])
            #Get all the grade ratings and convert them all into ints
            gradeCounts = [int(value) for value in ratings[(1 + (FPQ * 17)):(5 + (FPQ * 17))]]
        except (IndexError, ValueError):
            #A malformed record must not abort the whole join; mark it invalid
            return -1
        #Compute the average grade if possible
        total = sum(gradeCounts)
        if total == 0:
            gradeAvg = 0
        else:
            gradeAvg = (gradeCounts[0] + (gradeCounts[1] * 2) + (gradeCounts[2] * 3) + (gradeCounts[3] * 4)) / total
        #If either component is 0 then this won't be a valid easy number and should be marked as such
        if gradeAvg == 0 or timeAvg == 0:
            return -1
        return round((self.gradeWeight * gradeAvg) * (self.timeWeight * timeAvg), 2)

    def classJoinUpdate(self):
        """Join all years into one data set and add an ``easy`` key to each session.

        Sessions of the same course from different years are concatenated in
        the order of ``self.years``. The result is written to
        ``allClassesEasy.json``.
        """
        allData = {}
        #For every year's data, tack the sessions onto the course's list,
        #creating a fresh list the first time a course is seen so the loaded
        #per-year lists are never mutated
        for yearData in self._loadYearFiles("Classes.json"):
            for course, sessions in yearData.items():
                allData.setdefault(course, []).extend(sessions)
        #For every session compute and add an easy value
        for sessions in allData.values():
            for indiv in sessions:
                indiv["easy"] = self._easyScore(indiv["courseRatings"])
        #Save it out
        with open("allClassesEasy.json", "w") as fout:
            json.dump(allData, fout, indent=4)

    def profJoin(self):
        """Join all years into one professor mapping and write ``allProf.json``.

        When a professor id appears in several years, the entry from the
        earliest listed year is kept.
        """
        allData = {}
        for yearData in self._loadYearFiles("Prof.json"):
            for prof, name in yearData.items():
                #Only set it if the prof is not already in the dictionary
                allData.setdefault(prof, name)
        #Save it out
        with open("allProf.json", "w") as fout:
            json.dump(allData, fout, indent=4)
if __name__ == "__main__":
    #The CAS auth string is the one mandatory argument
    if len(sys.argv) < 2:
        print("Need to enter your CAS auth string")
        sys.exit(1)
    COOKIE = "MOD_AUTH_CAS_S=" + sys.argv[1]
    #Weights are only read when exactly both are supplied, otherwise both default to 1
    if len(sys.argv) == 4:
        gradeWeight = float(sys.argv[2])
        timeWeight = float(sys.argv[3])
    else:
        gradeWeight = 1
        timeWeight = 1
    #All the years to get info from
    years = ["2018-2019", "2019-2020"]
    for year in years:
        print(year)
        #Build a scraper for this year and cookie, then download everything for it
        scraper = classesData(year, COOKIE)
        scraper.getAllInfo()
    #Merge the per-year files and add the easy scores using the chosen weights
    joiner = joinUpdate(years, gradeWeight, timeWeight)
    joiner.joinAndUpdate()