Changes to accomodate requests and fixed images

bmcutler · web-flow · commit 5d6c0cdd0980 · 2018-06-06T09:52:12.000-04:00
diff --git a/SIS_Images_Scraper.py b/SIS_Images_Scraper.py
@@ -1,12 +1,18 @@
-import getpass, urllib, os, re
+import getpass, requests, os, re
 from pathlib import Path
 from selenium.webdriver.support.ui import Select
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 chrome_options = Options()
 chrome_options.add_argument("--headless")
 from selenium.webdriver.common.keys import Keys
-    
+
+#Workaround for if pyopenssl is installed and we want weak keys
+try:
+    from urllib3.contrib import pyopenssl
+    pyopenssl.extract_from_urllib3()
+except ImportError:
+    pass    
 
 # Login to SIS
 def login(driver):
@@ -104,10 +110,13 @@ def saveImagesToFolder(term, course, class_list):
             # regardless if email or not, get image if the current dict key is img url
             if k == "img url":
                 img_url = class_list[i].get(k)
+        # download and save the image to a specific folder (term/course_section) from the image url
         img_name = rcs_id+".png"
         filepath = path / img_name
-        urllib.request.urlretrieve(img_url, str(filepath))
-
+        #TODO: Get SSL cipher setting to work with requests, right now still getting handshake errors
+        r = requests.get(img_url)
+        with open(str(filepath),'wb') as f:
+            f.write(r.content)
 
 # returns the class list of dictionaries of info collected about each student's img url, name, and email
 def getStudentInfoFromCourse(driver, select_course, index, class_list):
@@ -138,9 +147,18 @@ def getStudentInfoFromCourse(driver, select_course, index, class_list):
         img_url = driver.current_url
         driver.get(img_url)
 
-        # image
-        image = driver.find_elements_by_tag_name('img')[6].get_attribute('src')
-        student_record['img url'] = image
+        # image, initalize to empty string
+        student_record['img url'] = ""
+        image_arr = driver.find_elements_by_tag_name('img')
+
+        #do search through all <img> tags for first non-header-layout tag
+        #have to skip 2 more <img> tags because they are transparent images
+        for i in range(len(image_arr)):
+            if image_arr[i].get_attribute('NAME') != "web_tab_corner_right":
+                student_record['img url'] = image_arr[i+2].get_attribute('src')
+                #Uncomment this line to print the image URLs we are attempting, useful for debugging
+                #print("found non-match, +2 is " + student_record['img url'])
+                break
 
         # name
         info_name = driver.find_elements_by_class_name('plaintable')[4].find_element_by_tag_name('tbody').find_element_by_tag_name('tr').find_elements_by_tag_name('td')[1].text
@@ -212,11 +230,14 @@ def getInfoFromCourse(driver):
                 print("Invalid answer! Try again!")
 
 if __name__ == "__main__":      
+    #Just setting the default ciphers (for this session) to be weak DES/SHA for SIS compatibility
+    #Be careful about navigating to any other sites...
+    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'DES-CBC3-SHA:AES128-SHA'
     driver = webdriver.Chrome(chrome_options=chrome_options)
     try:
         # open SIS
         driver.get('https://sis.rpi.edu/')
-        # if login is valid with correct User ID or PIN, econtinue the program by collecting data
+        # if login is valid with correct User ID or PIN, continue the program by collecting data
         if login(driver):
             getInfoFromCourse(driver)
     finally: