64 commits
3660c10
Create ReadMe
dsincl1999 Jan 22, 2018
16043d0
Add files via upload
dsincl1999 Jan 22, 2018
daa2d3e
Updated and changed answers
dsincl1999 Jan 24, 2018
8b7be37
Update to pdf and problem2 req4a and problem2.py
dsincl1999 Jan 24, 2018
6b1a96c
Create assignment2
dsincl1999 Feb 10, 2018
986b9ab
Create ReadMe
dsincl1999 Feb 10, 2018
9768990
Add files via upload
dsincl1999 Feb 10, 2018
f9ca658
Update assignment2
dsincl1999 Feb 10, 2018
5796a21
Create ReadMe
dsincl1999 Feb 10, 2018
2d65209
Add files via upload
dsincl1999 Feb 10, 2018
ff4fad9
Add files via upload
dsincl1999 Feb 10, 2018
4600ae2
Add files via upload
dsincl1999 Feb 10, 2018
4f4bf4a
Add files via upload
dsincl1999 Feb 10, 2018
84da09f
Add files via upload
dsincl1999 Feb 10, 2018
b28fe63
Add files via upload
dsincl1999 Feb 10, 2018
d925b5e
Add files via upload
dsincl1999 Feb 10, 2018
98f64ee
Add files via upload
dsincl1999 Feb 10, 2018
3806a64
Add files via upload
dsincl1999 Feb 10, 2018
c2af1ff
Add files via upload
dsincl1999 Feb 10, 2018
9fa814e
Update assignment2
dsincl1999 Feb 10, 2018
dbd9279
Add files via upload
dsincl1999 Feb 10, 2018
e7913f4
Add files via upload
dsincl1999 Feb 10, 2018
6c36adf
Add files via upload
dsincl1999 Feb 10, 2018
b09481b
Create ReadMe
dsincl1999 Feb 17, 2018
0cb1ecd
Add files via upload
dsincl1999 Feb 17, 2018
226a77e
Add files via upload
dsincl1999 Feb 17, 2018
46edf8c
Create ReadMe
dsincl1999 Feb 17, 2018
7a0de53
Add files via upload
dsincl1999 Feb 17, 2018
9779c43
Add files via upload
dsincl1999 Feb 17, 2018
c01cca2
Add files via upload
dsincl1999 Feb 17, 2018
ba39586
Create ReadMe
dsincl1999 Feb 17, 2018
a107d2a
Add files via upload
dsincl1999 Feb 17, 2018
3ae5928
Add files via upload
dsincl1999 Feb 17, 2018
e19bf9a
Add files via upload
dsincl1999 Feb 17, 2018
906242f
Add files via upload
dsincl1999 Feb 17, 2018
a14a6d7
Add files via upload
dsincl1999 Feb 18, 2018
e8b1a19
Add files via upload
dsincl1999 Feb 18, 2018
1a28bb8
Update ReadMe
dsincl1999 Feb 18, 2018
30612ef
Create ReadMe
dsincl1999 Feb 24, 2018
d163475
Add files via upload
dsincl1999 Feb 24, 2018
c282769
Add files via upload
dsincl1999 Feb 27, 2018
23facc4
Create ReadMe
dsincl1999 Mar 3, 2018
a9cb74a
Add files via upload
dsincl1999 Mar 3, 2018
518f901
Add files via upload
dsincl1999 Mar 5, 2018
67c5b61
Update ReadMe
dsincl1999 Mar 5, 2018
66f9ebd
Create ReadMe
dsincl1999 Mar 17, 2018
3b4b783
Add files via upload
dsincl1999 Mar 17, 2018
a6aa728
Create ReadMe
dsincl1999 Mar 26, 2018
fb7c368
Add files via upload
dsincl1999 Mar 26, 2018
decd6e6
Create ReadMe
dsincl1999 Apr 5, 2018
e2deb53
Add files via upload
dsincl1999 Apr 5, 2018
63a6528
Create ReadMe
dsincl1999 Apr 5, 2018
b708e6f
Create ReadMe
dsincl1999 Apr 5, 2018
e9af322
Add files via upload
dsincl1999 Apr 5, 2018
e8ab7a0
Add files via upload
dsincl1999 Apr 5, 2018
90deda7
Create assignment9
dsincl1999 Apr 20, 2018
a3e8f76
Create ReadMe
dsincl1999 Apr 20, 2018
992cb2a
Add files via upload
dsincl1999 Apr 20, 2018
ea61b7c
Create Readme
dsincl1999 Apr 20, 2018
c133beb
Add files via upload
dsincl1999 Apr 20, 2018
8dc2fbc
Add files via upload
dsincl1999 Apr 20, 2018
a83135d
Add files via upload
dsincl1999 Apr 20, 2018
c2d9ad0
Add files via upload
dsincl1999 Apr 20, 2018
f26b0c3
Add files via upload
dsincl1999 Apr 20, 2018
15 changes: 15 additions & 0 deletions assignments/DavidSinclair/assignment2
@@ -0,0 +1,15 @@
Assignment 2 is located in the assignment_2 folder.

The file to look for is assignment2.pdf, which contains the questions and answers for assignment 2.

The rest of the items are included to show the work.
The programs are numbered from 01_...py ==> 15_...py.
If the programs are run in order they will produce similar results, but the numbers will change because the URLs (the data) will change.
01 produces the initial link file, which is called raw.
All the other programs produce a .txt, .csv, or other file, and the output can be inspected.
The 09_json.py program produces a directory called data and writes both the memento and carbon-date files into it.
I copied the files from the data directory but may have missed one or two, because I was copying 100 at a time. I
then created a json.tar.gz file, which is the data directory archived.
15_counter only prints text to the screen showing the counts of the data.

David Sinclair
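The run order described above can be sketched with a small driver. This is a sketch, not part of the original submission; it assumes the scripts sit in the current directory and keep the zero-padded 01_ through 15_ naming, so lexicographic sort equals run order.

```python
# Hypothetical driver for the numbered pipeline scripts (01_...py -> 15_...py).
# Assumes zero-padded prefixes, so sorting filenames gives the run order.
import glob
import subprocess

def ordered_scripts(pattern="[0-9][0-9]_*.py"):
    # Return the matching scripts sorted into run order.
    return sorted(glob.glob(pattern))

def run_all():
    for script in ordered_scripts():
        subprocess.run(["python3", script], check=True)

if __name__ == "__main__":
    run_all()
```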
1 change: 1 addition & 0 deletions assignments/DavidSinclair/assignment9
@@ -0,0 +1 @@
Please see the pdf file for information
89 changes: 89 additions & 0 deletions assignments/DavidSinclair/assignment_2/01_twt.py
@@ -0,0 +1,89 @@
'''
prerequisites:
0. create a twitter account
1. obtain your access tokens: https://apps.twitter.com/
1.0 create new app
2. install tweepy (pip install tweepy)

credit:
http://docs.tweepy.org/en/v3.4.0/streaming_how_to.html
http://adilmoujahid.com/posts/2014/07/twitter-analytics/
https://pythonprogramming.net/twitter-api-streaming-tweets-python-tutorial/

Tweet JSON: use http://jsonviewer.stack.hu/ to view the object
https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/intro-to-tweet-json

rate limiting:
https://developer.twitter.com/en/docs/basics/rate-limiting

streaming rate limiting:
https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/connecting.html
'''

#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import time

#get keys from: https://apps.twitter.com/
#consumer key, consumer secret, access token, access secret (keep these secret; do not commit real keys)
ckey = 'TulEhMcwsvnMGgDnYXJS2O0Mr'
csecret = '9ZfQvJy6CDF20fAVoFDL0YZhBMBwoN6Kb9NXtwxIJWvVaH31DX'
atoken = '955941770369609733-gcO2pTEx5gNikLQb05l6gEuTfAlw6li'
asecret = 'wSwUnXYJodfT78rhXkzQpjfaFVYfwpwfwfmtOTZrqPkwS'

class listener(StreamListener):

def on_data(self, data):
#learn about tweet json structure: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/intro-to-tweet-json
tweetJson = json.loads(data)

#tweet = tweetJson['text']
username = tweetJson['user']['screen_name']
links = tweetJson['entities']['urls']

if( len(links) != 0 and tweetJson['truncated'] == False ):
links = self.getLinksFromTweet(links)

print( username , '\t' , )
for l in links :
print(l ,file=open("twitterlinks.raw", "a"),)
print(l,)
print ()
filename = "twitterlinks.raw"
numLines = sum(1 for line in open(filename))
print (numLines)
if numLines==2000: #Number of links you want to collect and have the file stop.
exit()

#print('...sleep for 5 seconds')
#time.sleep(5)

return True

def getLinksFromTweet(self, linksDict):

links = []
for uri in linksDict:
links.append( uri['expanded_url'] )

return links

def on_error(self, status_code):
print( status_code )

if status_code == 420:
#returning False in on_error disconnects the stream
return False
return True


auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track=['government'])
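Twitter's streaming connection guide (linked in the docstring above) recommends exponential backoff before reconnecting after an HTTP 420, starting around one minute and doubling each attempt. A minimal sketch of that delay schedule, not part of the original script:

```python
# Sketch: exponential backoff schedule for reconnecting after HTTP 420.
# Assumes the 60 s starting delay suggested by Twitter's connection guide.
def backoff_delays(start=60, attempts=4):
    # Doubles the wait on each successive attempt: 60, 120, 240, 480 ...
    return [start * (2 ** i) for i in range(attempts)]
```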


193 changes: 193 additions & 0 deletions assignments/DavidSinclair/assignment_2/02_validate.py
@@ -0,0 +1,193 @@
import requests #used for url request information
import time #used for time of day
import os #used for searching and removing string information
import sys #used for searching and removing string information
import fileinput #used for searching and removing string information
import re #used for searching and removing string information
import json #used by memo and carbondate

#Gets the twt.py output and inputs into this python program
tests = [test.rstrip('\n') for test in open('twitterlinks.raw')]
print("TEST DATA: ", tests)
print(len(tests))

#Validates if the links are valid and returns a response
for test in tests:
try:
r = requests.head(test)
print(r.status_code)
if (r.status_code) >= 400:
print(r.status_code, test, file=open("400ws.txt", "a",))
print(test, file=open("400.txt", "a",))
elif(r.status_code) >= 300 and (r.status_code) <= 399:
print(r.status_code, test, file=open("300ws.txt", 'a'))
print(test, file=open("300.txt", 'a'))
elif(r.status_code) == 200:
print(r.status_code, test, file=open("200ws.txt", 'a'))
print(test, file=open("200.txt", 'a'))
else:
print(r.status_code, test, file=open("everythingelsews.txt", 'a'))
print(test, file=open("everythingelse.txt", 'a'))
except requests.ConnectionError:
print("failed to connect",)
print(test, file=open("notvalidurl.txt", "a",))
'''
#Takes the validated url and attempts to get a server code of 200
with open('validurl.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

for url in text:
response = requests.get(url)
print('initial url',url)
if response.history:
print("Request was redirected")
for resp in response.history:
print(resp.status_code, )
print(resp.url, file=open("300.txt", "a",))#copy redirect link to file.
print("Final destination: ")
print(response.status_code,)
print(response.url, file=open("200.txt", "a"))#copy response url to file.
else:
print(url, file=open("200.txt", "a"))#copy url to file

#Takes all the 200 code links and makes sure there are no duplicates
content = open('200.txt', 'r').readlines()
content_set = set(content)
cleandata = open('200clean.txt', 'w')

for line in content_set:
cleandata.write(line)


#Takes the validated 200 codes that are not duplicates and removes the twitter.com urls.
err_occur = []
pattern = re.compile("twitter.com", re.IGNORECASE)
try:
with open ('200clean.txt','rt') as in_file:
for linenum, line in enumerate(in_file):
if pattern.search(line) != None:
err_occur.append((linenum, line.rstrip('\n')))
else:
print(line, file=open("twitline.txt",'a'))
for iinenum, line in err_occur:
print(line, file=open("twitonly.txt",'a'))
except FileNotFoundError:
print("Log file not found.")

#Removes blank lines from the twitline.txt and changes the name to twitremove.txt
for line in open('twitline.txt'):
line = line.rstrip()
if line != '':
print(line, file=open('twitremove.txt','a'))

#Takes the urls from twitremove.txt and adds the memgator http to the front
with open('twitremove.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

for url in text:
print('http://memgator.cs.odu.edu/timemap/json/' + url, file=open("memgat.txt", "a"))

with open('memgat.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

#Takes the memgator http information and puts it into a file in the data directory and creates a key to reference url to filename.
for idx,url in enumerate(text):
r = requests.get(url)
response = r.text
i = idx
print('{0:04}'.format(i),url)
print('{0:04}'.format(i),url,file=open('./data/memokey.txt','a'))
filename = ''.join(str(x) for x in ("./data/memo",'{0:04}'.format(i),".txt"))
print(response,file=open(filename,'w'))


#Takes the urls from twitremove.txt and adds the carbondate http to the front
with open('twitremove.txt') as f:
text = f.read().splitlines()


for url in text:
print('http://localhost:8888/cd/' + url, file=open("carbdate.txt", "a"))

with open('carbdate.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

#Takes the carbondate http information and puts it into a file in the data directory and creates a key to reference url to filename.
for idx,url in enumerate(text):
r = requests.get(url)
response = r.text
i = idx
print('{0:04}'.format(i),url)
print('{0:04}'.format(i),url,file=open('./data/cbdatekey.txt','a'))
filename = ''.join(str(x) for x in ("./data/cbdate",'{0:04}'.format(i),".txt"))
print(response,file=open(filename,'w'))

file=open('200.txt', 'r')
urlcomp = file.read().split('\n')
file.close()

file=open('300.txt', 'r')
redirlink = file.read().split('\n')
file.close()

file=open('400.txt', 'r')
notfound = file.read().split('\n')
file.close()

file=open('200clean.txt', 'r')
cleantwitterurl = file.read().split('\n')
file.close()

file=open('twitterlinks.raw', 'r')
twitterlinks = file.read().split('\n')
file.close()

file=open('notvalidurl.txt', 'r')
notvalid = file.read().split('\n')
file.close()

file=open('validurl.txt', 'r')
valid = file.read().split('\n')
file.close()

file=open('twitremove.txt', 'r')
twitremove = file.read().split('\n')
file.close()

file=open('twitonly.txt', 'r')
twitonly = file.read().split('\n')
file.close()

file=open('memgat.txt', 'r')
memgat = file.read().split('\n')
file.close()

file=open('carbdate.txt', 'r')
carbdate = file.read().split('\n')
file.close()


print(len(twitterlinks), "is the number of links from the original twitter search.")
print(len(notvalid), "is the number of links that were not valid.")
print(len(valid), "is the number of links that were valid.")
print(len(urlcomp), "is the number of urls with code 200.")
print(len(redirlink), "is the number of urls with code 301.")
print(len(notfound), "is the number of urls with code 400.")
print(len(cleantwitterurl), "is the number of urls without duplicates.")
print(len(twitonly), "is the number of urls that had twitter.com.")
print(len(twitremove), "is the number of urls with twitter.com removed.")
print(len(memgat), "is the number of urls in memgat.")
print(len(carbdate), "is the number of urls in carbdate.")
'''
38 changes: 38 additions & 0 deletions assignments/DavidSinclair/assignment_2/03_redir.py
@@ -0,0 +1,38 @@
import requests #used for url request information
import time #used for time of day
import os #used for searching and removing string information
import sys #used for searching and removing string information
import fileinput #used for searching and removing string information
import re #used for searching and removing string information
import json #used by memo and carbondate

#Takes the 300.txt file and attempts to get a server code of 200
text = []

with open('300.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

for url in text:
try:
response = requests.get(url)
print(response.history, response.url)
for resp in response.history:
print(response.history, response.url, resp.status_code, resp.url, file=open('300comp.txt','a'))

if response.status_code == 200:
print(response.status_code, response.url, 'save code 200')
print(response.status_code, response.url, file=open('300goodws.txt','a'))
print(response.url, file=open('300good.txt','a'))
else:
print(response.status_code, response.url, 'save not')
print(response.status_code, response.url, file=open('300failsws.txt','a'))
print(response.url, file=open('300fails.txt','a'))

except requests.ConnectionError:
#response may be unbound if the request itself failed, so report the url
print("failed to connect", url)
print(url, file=open('300connectfailws.txt','a'))
print(url, file=open('300connectfail.txt','a'))

30 changes: 30 additions & 0 deletions assignments/DavidSinclair/assignment_2/04_combine.py
@@ -0,0 +1,30 @@
import requests #used for url request information
import time #used for time of day
import os #used for searching and removing string information
import sys #used for searching and removing string information
import fileinput #used for searching and removing string information
import re #used for searching and removing string information
import json #used by memo and carbondate


#Combines the urls from 200.txt and 300good.txt into 200comp.txt
with open('200.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

for url in text:
print(url)
print(url, file=open("200comp.txt", "a"))

with open('300good.txt') as f:
text = f.read().splitlines()

print(text)
print(len(text))

for url in text:
print(url)
print(url, file=open("200comp.txt", "a"))

17 changes: 17 additions & 0 deletions assignments/DavidSinclair/assignment_2/05_rmdup.py
@@ -0,0 +1,17 @@
import requests #used for url request information
import time #used for time of day
import os #used for searching and removing string information
import sys #used for searching and removing string information
import fileinput #used for searching and removing string information
import re #used for searching and removing string information
import json #used by memo and carbondate


#Takes all the 200 code links and makes sure there are no duplicates
content = open('200comp.txt', 'r').readlines()
content_set = set(content)
cleandata = open('200clean.txt', 'w')

for line in content_set:
cleandata.write(line)
cleandata.close()
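Note that set() discards the original line order. An order-preserving variant of the same dedup, offered as a sketch rather than part of the original script:

```python
# Sketch: drop duplicate lines while keeping first-seen order.
# Relies on dict preserving insertion order (Python 3.7+).
def dedupe_preserve_order(lines):
    return list(dict.fromkeys(lines))
```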
