Skip to content

Commit 3f0ac26

Browse files
authored
Merge pull request #1 from Submitty/Image_Scraper
SIS images scraper
2 parents 6bcd444 + cdbaa0d commit 3f0ac26

File tree

1 file changed

+224
-0
lines changed

1 file changed

+224
-0
lines changed

SIS_Images_Scraper.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
import getpass
import os
import re
import urllib
import urllib.request
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

# Run Chrome without opening a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")
9+
10+
11+
# Login to SIS
12+
def login(driver):
    """Prompt for SIS credentials and submit them through the login form.

    The RIN is read with ``input`` and the PIN with ``getpass`` so the PIN
    is not echoed to the terminal.

    Args:
        driver: Selenium WebDriver currently showing a page that has a
            'Login' link (the caller loads the SIS home page first).

    Returns:
        False if the resulting page contains the SIS authorization-failure
        message (which is also printed), True otherwise.
    """
    rin_id = input("RIN: ")
    pin_id = getpass.getpass("PIN: ")

    # Click into login page.
    # find_element(By..., ...) is used instead of the find_element_by_*
    # helpers, which were removed from Selenium in 4.3.
    driver.find_element(By.LINK_TEXT, 'Login').click()

    # Type the RIN and PIN into the login form.
    driver.find_element(By.NAME, 'sid').send_keys(rin_id)
    driver.find_element(By.NAME, 'PIN').send_keys(pin_id)

    # Click the login button.
    driver.find_element(By.XPATH, "//input[@value='Login']").click()

    # SIS reports bad credentials in the page body rather than via a
    # distinct URL, so the page source is searched for the message.
    failure_message = "Authorization Failure - Invalid User ID or PIN."
    if failure_message in driver.page_source:
        print(failure_message)
        return False
    return True
33+
34+
35+
# Gets the session/term the user wants
36+
def getSession(driver):
    """Ask the user which term to use and select it in SIS.

    Navigates to the term-selection page, prints every available term,
    then prompts until the user types one of them (or asks to exit).

    Args:
        driver: Selenium WebDriver that is already logged in to SIS.

    Returns:
        The text of the selected term, or the string "Exit" if the user
        chose to quit (callers compare against exactly "Exit").
    """
    # Click Instructors & Advisors Menu.
    driver.find_element(By.LINK_TEXT, 'Instructor & Advisor Menu').click()

    # Click Select a Semester or Summer Session.
    driver.find_element(By.LINK_TEXT, 'Select a Semester or Summer Session').click()

    select_term = Select(driver.find_element(By.NAME, 'term'))
    # Read the option labels once; each `.text` access is a round trip to
    # the browser, so they are not re-read on every retry.
    term_names = [option.text for option in select_term.options]

    print("Here are the following Semester/Summer Sessions:")
    for term_name in term_names:
        print(term_name)

    # Loop until the user enters Exit or a valid term.
    while True:
        term = input("Select a term ( or Exit to terminate ): ").strip()
        # Accept any capitalisation of "exit", matching the Y/N/Exit prompt
        # used for courses, but always return the canonical "Exit".
        if term.lower() == "exit":
            return "Exit"
        if term in term_names:
            # index() yields the first matching option.
            select_term.select_by_index(term_names.index(term))
            break
        print("Cannot find the term")

    # Click the submit button.
    driver.find_element(By.XPATH, "//input[@value='Submit']").click()
    return term
72+
73+
74+
# Saves the images with rcs id as image name to a term/course folder
75+
def saveImagesToFolder(term, course, class_list):
    """Download every student's image into a ``<term>/<course>`` folder.

    The folder is created relative to the current working directory. Each
    image is saved as ``<rcs_id>.png``; students without an email address
    are saved as ``error-<first>-<second>.png`` built from their name.

    Args:
        term: Term label such as "Fall 2018"; its first two
            whitespace-separated words are joined with "-" for the folder.
        course: Course label that must start with
            ``"<4 capital letters> <4 digits> <section digits>:"``.
        class_list: List of dicts with an "img url" key, a "name" key and
            optionally an "email" key.

    Returns:
        None. Prints a message and returns early if ``course`` does not
        match the expected format.
    """
    # Create shortened name for the class folder.
    course_name = re.match(r'([A-Z]{4}) ([0-9]{4}) ([0-9]+)\:', course)
    if course_name is None:
        print ("Invalid format for course name")
        return
    course_folder_name = "{}-{}-{}".format(*course_name.groups())

    # Turn the term ("month year") into "month-year".
    folder_term = "-".join(term.split()[:2])

    # Create the destination folder if it does not exist yet.
    path = Path(folder_term, course_folder_name)
    path.mkdir(exist_ok=True, parents=True)

    for record in class_list:
        img_url = record.get("img url")
        if not img_url:
            continue
        # The id is looked up by key instead of while iterating the dict:
        # records are built with "img url" first, so an iteration-order
        # dependent id was unset (or stale from the previous student) at
        # the moment the image was saved.
        filepath = path / (_rcs_id_for(record) + ".png")
        urllib.request.urlretrieve(img_url, str(filepath))


def _rcs_id_for(record):
    """Return the image file-name stem for one student record."""
    email = record.get("email")
    if email:
        # Everything before the "@" (for "<id>@rpi.edu" this is the id).
        return email.partition("@")[0]
    # No email available: label the image from the first two name words,
    # tolerating names with fewer than two words.
    name_parts = record.get("name", "").split()[:2]
    return "-".join(["error"] + (name_parts or ["unknown"]))
110+
111+
112+
# returns the class list of dictionaries of info collected about each student's img url, name, and email
113+
def getStudentInfoFromCourse(driver, select_course, index, class_list):
    """Collect image URL, name and email for every student in one course.

    Selects the course, opens its summary class list, and visits each
    student's page in turn. One dict per student is appended to
    ``class_list`` with keys "img url", "name" and, when a campus email
    row is found, "email".

    Args:
        driver: Selenium WebDriver showing the CRN selection page.
        select_course: ``Select`` wrapper around the CRN drop-down.
        index: Index of the course option to select.
        class_list: List that the student dicts are appended to.

    Returns:
        ``class_list`` after all students were appended, or the integer 0
        if SIS reports an error for the class list (treated as an empty
        class). In both cases the browser is navigated back two pages.
    """
    select_course.select_by_index(index)
    # Click the submit button.
    driver.find_element(By.XPATH, "//input[@value='Submit']").click()

    # Click Summary Class List & Electronic Warning System (EWS).
    driver.find_element(By.LINK_TEXT, 'Summary Class List & Electronic Warning System (EWS)').click()

    # Check if class is size 0.
    if len(driver.find_elements(By.CLASS_NAME, 'errortext')) == 1:
        driver.back()
        driver.back()
        print("Error: Class size is 0!")
        return 0

    # Row 0 is skipped (presumably the table header - confirm against the
    # SIS page layout).
    row_count = len(_student_rows(driver))

    for s in range(1, row_count):
        student_record = {}
        # Rows are looked up again on every pass because the previous pass
        # navigated away from this page and back.
        student = _student_rows(driver)[s]
        student.find_elements(By.TAG_NAME, 'td')[1].find_element(By.CLASS_NAME, 'fieldmediumtext').click()

        # Load the student page by its own URL, as the original code did.
        driver.get(driver.current_url)

        # Image: the seventh <img> on the page (layout-dependent index).
        image = driver.find_elements(By.TAG_NAME, 'img')[6].get_attribute('src')
        student_record['img url'] = image

        # Name: second cell of the first row of the fifth 'plaintable'.
        name_cell = (
            driver.find_elements(By.CLASS_NAME, 'plaintable')[4]
            .find_element(By.TAG_NAME, 'tbody')
            .find_element(By.TAG_NAME, 'tr')
            .find_elements(By.TAG_NAME, 'td')[1]
        )
        # The first 16 characters are dropped (presumably a fixed label in
        # front of the name - confirm against the page).
        student_record['name'] = name_cell.text[16:]

        # Email address.
        driver.find_element(By.LINK_TEXT, 'Student E-mail Address').click()
        if len(driver.find_elements(By.CLASS_NAME, 'datadisplaytable')) == 1:
            emails = (
                driver.find_element(By.CLASS_NAME, 'datadisplaytable')
                .find_element(By.TAG_NAME, 'tbody')
                .find_elements(By.TAG_NAME, 'tr')
            )
            # The address sits in the row right after its heading row.
            # Pairing each row with its successor means a heading in the
            # last row cannot index past the end of the table.
            for heading_row, value_row in zip(emails, emails[1:]):
                if heading_row.text == "Campus Student Email Address":
                    student_record['email'] = value_row.find_element(By.TAG_NAME, 'td').text
                    break
        class_list.append(student_record)

        # Return from the email page and the student page to the class list.
        # NOTE(review): the source's indentation was lost; the split of the
        # four back() calls (two per student, two after the loop) is
        # reconstructed from the navigation flow - confirm.
        driver.back()
        driver.back()

    # Leave the class list and return to the CRN selection page.
    driver.back()
    driver.back()
    return class_list


def _student_rows(driver):
    """Return the <tr> rows of the third 'datadisplaytable' on the page."""
    return (
        driver.find_elements(By.CLASS_NAME, 'datadisplaytable')[2]
        .find_element(By.TAG_NAME, 'tbody')
        .find_elements(By.TAG_NAME, 'tr')
    )
165+
166+
167+
# Gets the info regarding each course of student images with their rcs id
168+
def getInfoFromCourse(driver):
    """Interactively download student images for the courses of one term.

    Asks for a term via ``getSession``, then walks through every course in
    the CRN drop-down and asks whether its pictures should be saved.
    Answering "y" scrapes the course and saves the images, "n" skips the
    course, and "exit" stops immediately; answers are case-insensitive.

    Args:
        driver: Selenium WebDriver that is already logged in to SIS.

    Returns:
        None.
    """
    # Get the term to use to save images.
    term = getSession(driver)
    if term == "Exit":
        return

    # Click Course Information- Select a CRN.
    driver.find_element(By.LINK_TEXT, 'Course Information- Select a CRN').click()

    # Check if there are any sections assigned for this term.
    if len(driver.find_elements(By.CLASS_NAME, 'warningtext')) == 1:
        print ("Error: No sections assigned for this term!")
        return

    course_count = len(Select(driver.find_element(By.NAME, 'crn')).options)

    for index in range(course_count):
        # The drop-down is located again for every course because scraping
        # the previous course navigated away from this page and back.
        select_course = Select(driver.find_element(By.NAME, 'crn'))
        course = select_course.options[index].text

        # Keep asking until the user gives a valid answer for this course.
        while True:
            print("Do you want pictures from {}?".format(course))
            answer = input("Y/N/Exit\n").strip().lower()
            if answer == "n":
                break
            if answer == "exit":
                return
            if answer == "y":
                # Each course starts with its own empty list of students.
                class_list = getStudentInfoFromCourse(driver, select_course, index, [])
                # 0 signals an empty class: nothing to save.
                if class_list == 0:
                    break
                # Save the images under term/course in the current directory.
                saveImagesToFolder(term, course, class_list)
                break
            print("Invalid answer! Try again!")
213+
214+
if __name__ == "__main__":
    # `options=` replaces the `chrome_options=` keyword, which Selenium
    # deprecated in 3.8 and no longer accepts in current releases.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Open SIS.
        driver.get('https://sis.rpi.edu/')
        # If the login succeeds, continue the program by collecting data.
        if login(driver):
            getInfoFromCourse(driver)
    finally:
        # Always close the browser, even if scraping raised.
        driver.quit()

0 commit comments

Comments
 (0)