Skip to content

Commit 5d6c0cd

Browse files
authored
Changes to accommodate requests and fixed images
Changes to accommodate requests and fixed images
2 parents 3f0ac26 + f8a44c5 commit 5d6c0cd

File tree

1 file changed

+29
-8
lines changed

1 file changed

+29
-8
lines changed

SIS_Images_Scraper.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
1-
import getpass, urllib, os, re
1+
import getpass, requests, os, re
22
from pathlib import Path
33
from selenium.webdriver.support.ui import Select
44
from selenium import webdriver
55
from selenium.webdriver.chrome.options import Options
66
chrome_options = Options()
77
chrome_options.add_argument("--headless")
88
from selenium.webdriver.common.keys import Keys
9-
9+
10+
# Workaround: if pyopenssl is installed, detach it from urllib3 so the weak ciphers SIS needs can be used
11+
try:
12+
from urllib3.contrib import pyopenssl
13+
pyopenssl.extract_from_urllib3()
14+
except ImportError:
15+
pass
1016

1117
# Login to SIS
1218
def login(driver):
@@ -104,10 +110,13 @@ def saveImagesToFolder(term, course, class_list):
104110
# regardless if email or not, get image if the current dict key is img url
105111
if k == "img url":
106112
img_url = class_list[i].get(k)
113+
# download and save the image to a specific folder (term/course_section) from the image url
107114
img_name = rcs_id+".png"
108115
filepath = path / img_name
109-
urllib.request.urlretrieve(img_url, str(filepath))
110-
116+
#TODO: Get the SSL cipher setting to work with requests; right now we are still getting handshake errors
117+
r = requests.get(img_url)
118+
with open(str(filepath),'wb') as f:
119+
f.write(r.content)
111120

112121
# returns the class list of dictionaries of info collected about each student's img url, name, and email
113122
def getStudentInfoFromCourse(driver, select_course, index, class_list):
@@ -138,9 +147,18 @@ def getStudentInfoFromCourse(driver, select_course, index, class_list):
138147
img_url = driver.current_url
139148
driver.get(img_url)
140149

141-
# image
142-
image = driver.find_elements_by_tag_name('img')[6].get_attribute('src')
143-
student_record['img url'] = image
150+
# image, initialize to empty string
151+
student_record['img url'] = ""
152+
image_arr = driver.find_elements_by_tag_name('img')
153+
154+
#search through all <img> tags for the first non-header-layout tag
155+
#have to skip 2 more <img> tags because they are transparent images
156+
for i in range(len(image_arr)):
157+
if image_arr[i].get_attribute('NAME') != "web_tab_corner_right":
158+
student_record['img url'] = image_arr[i+2].get_attribute('src')
159+
#Uncomment this line to print the image URLs we are attempting, useful for debugging
160+
#print("found non-match, +2 is " + student_record['img url'])
161+
break
144162

145163
# name
146164
info_name = driver.find_elements_by_class_name('plaintable')[4].find_element_by_tag_name('tbody').find_element_by_tag_name('tr').find_elements_by_tag_name('td')[1].text
@@ -212,11 +230,14 @@ def getInfoFromCourse(driver):
212230
print("Invalid answer! Try again!")
213231

214232
if __name__ == "__main__":
233+
#Just setting the default ciphers (for this session) to be weak DES/SHA for SIS compatibility
234+
#Be careful about navigating to any other sites...
235+
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'DES-CBC3-SHA:AES128-SHA'
215236
driver = webdriver.Chrome(chrome_options=chrome_options)
216237
try:
217238
# open SIS
218239
driver.get('https://sis.rpi.edu/')
219-
# if login is valid with correct User ID or PIN, econtinue the program by collecting data
240+
# if login is valid with correct User ID or PIN, continue the program by collecting data
220241
if login(driver):
221242
getInfoFromCourse(driver)
222243
finally:

0 commit comments

Comments
 (0)