
Commit 19e76d8

Add files via upload
1 parent e5b489f commit 19e76d8

16 files changed, +106409 -0 lines changed

scraping/adremover.py

Lines changed: 45 additions & 0 deletions
import cssselect


class AdRemover(object):
    """
    This class applies elemhide rules from AdBlock Plus to an lxml
    document or element object. One or more AdBlock Plus filter
    subscription files must be provided.

    Example usage:

    >>> import lxml.html
    >>> remover = AdRemover('fanboy-annoyance.txt')
    >>> doc = lxml.html.document_fromstring("<html>...</html>")
    >>> remover.remove_ads(doc)
    """

    def __init__(self, *rules_files):
        if not rules_files:
            raise ValueError("one or more rules_files required")

        translator = cssselect.HTMLTranslator()
        rules = []

        for rules_file in rules_files:
            with open(rules_file, 'r', encoding='utf-8') as f:
                for line in f:
                    # elemhide rules are prefixed by ## in the AdBlock Plus filter syntax
                    if line[:2] == '##':
                        try:
                            rules.append(translator.css_to_xpath(line[2:]))
                        except cssselect.SelectorError:
                            # just skip bad selectors
                            pass

        # create one large query by joining the rules with the XPath | (or) operator
        self.xpath_query = '|'.join(rules)

    def remove_ads(self, tree):
        """Remove ads from an lxml document or element object.

        The object passed to this method will be modified in place."""
        for elem in tree.xpath(self.xpath_query):
            elem.getparent().remove(elem)
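
For context, a minimal usage sketch of the class above, assuming it lives in adremover.py, that an AdBlock Plus filter list is available locally (the file name 'easylist.txt' and the URL below are placeholders):

# Hypothetical usage of AdRemover; the filter file and URL are placeholders.
import requests
import lxml.html

from adremover import AdRemover

remover = AdRemover('easylist.txt')              # any AdBlock Plus filter list file
html = requests.get('https://example.com').text
doc = lxml.html.document_fromstring(html)
remover.remove_ads(doc)                          # matching elements are removed in place
cleaned_html = lxml.html.tostring(doc).decode()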

scraping/db_manager.py

Lines changed: 66 additions & 0 deletions
import os
import sqlite3 as lite


class DB(object):
    def __init__(self, dst_path, dbname):
        self.DB_Name = os.path.join(dst_path, dbname)
        if not os.path.exists(self.DB_Name):
            open(self.DB_Name, 'w').close()
        self.setupDBCon()

    def setupDBCon(self):
        self.con = lite.connect(self.DB_Name, check_same_thread=False, isolation_level=None, timeout=20)
        self.cur = self.con.cursor()
        self.cur.execute("PRAGMA synchronous=OFF")
        self.cur.execute("PRAGMA journal_mode=WAL")

    def createTable(self, tbl_name, fields):
        # self.dropTable(tbl_name=tbl_name)
        query_fields_list = [f'"{field}" TEXT' for field in fields]
        query_fields = '(' + ", ".join(query_fields_list) + ')'
        final_query = f'CREATE TABLE IF NOT EXISTS {tbl_name} ' + query_fields
        self.cur.execute(final_query)

    def insert_row(self, tbl_name, result_row, start_pos=0, filter_limit=6):
        # NOTE: values are interpolated directly into the SQL string (not parameterized),
        # so this is only safe for trusted input.
        values = list(result_row.values())
        fields = list(result_row.keys())

        if not self.isExist(tbl_name=tbl_name, condition_row=result_row, filter_limit=filter_limit):
            fields = [f"'{elm}'" for elm in fields]
            query_values = '(' + ', '.join(fields) + ')' + ' VALUES(' + ', '.join([f"'{elm}'" for elm in values]) + ')'
            final_query = f'INSERT OR REPLACE INTO {tbl_name}' + query_values
            self.cur.execute(final_query)
            self.con.commit()

    def select_data(self, tbl_name, condition_row=None, filter_limit=6, start_pos=0):
        query_select = '*'

        if condition_row:
            # pair the condition fields (from start_pos) with the first filter_limit values
            query_condition = ' WHERE ' + ' AND '.join(
                [f'"{condition_field}"="{condition_value}"' for condition_field, condition_value in
                 zip(list(condition_row.keys())[start_pos:], list(condition_row.values())[:filter_limit])])
        else:
            query_condition = ''

        final_query = 'SELECT ' + query_select + f' FROM {tbl_name}' + query_condition
        self.cur.execute(final_query)
        return self.cur.fetchall()

    def isExist(self, tbl_name, condition_row, filter_limit=6, start_pos=0):
        return bool(self.select_data(tbl_name, condition_row, filter_limit, start_pos))

    def delete(self, tbl_name):
        final_query = f"DELETE FROM {tbl_name} WHERE email NOT LIKE '%@%'"
        self.cur.execute(final_query)
        self.con.commit()

    def dropTable(self, tbl_name):
        final_query = f"DROP TABLE IF EXISTS {tbl_name}"
        self.cur.execute(final_query)

    def closeDB(self):
        self.cur.close()
        self.con.close()
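
A short hypothetical usage sketch of the DB wrapper above; the database name, table name, field list, and row values are made up for illustration:

# Hypothetical usage of the DB wrapper; names and values are placeholders.
import os

from db_manager import DB

db = DB(dst_path=os.getcwd(), dbname="example.db")
db.createTable(tbl_name="contacts", fields=["name", "email", "date"])
db.insert_row(tbl_name="contacts",
              result_row={"name": "Jane Doe", "email": "jane@example.com", "date": "2020-01-01"})
print(db.select_data(tbl_name="contacts", condition_row={"name": "Jane Doe"}))
db.closeDB()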
Lines changed: 29 additions & 0 deletions
import requests
import math
import os
from urllib.parse import urlparse

from tqdm import tqdm


def download_zip(self, url):
    """Intended as a method of a class that provides a `working_dir` attribute."""
    print(f"[download_zip] url: {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    }
    r = requests.get(url, stream=True, headers=headers)
    total_length = int(r.headers.get('content-length', 0))
    zip_fname = os.path.basename(urlparse(r.url).path)

    # one progress-bar tick per 4096-byte chunk
    bar = tqdm(total=math.ceil(total_length / 4096))
    bar.set_description(f"\t{zip_fname} is downloading now...")

    full_zip_fname = os.path.join(self.working_dir, zip_fname)
    with open(full_zip_fname, "wb") as z:
        for chunk in r.iter_content(chunk_size=4096):
            if chunk:
                z.write(chunk)
                bar.update()

    bar.close()

    return full_zip_fname
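
Since the function above expects a self with a working_dir attribute, a hypothetical standalone call could pass any object that provides one; the URL below is a placeholder:

# Hypothetical standalone call; the URL is a placeholder.
import os
from types import SimpleNamespace

ctx = SimpleNamespace(working_dir=os.getcwd())
# zip_path = download_zip(ctx, "https://example.com/archive.zip")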
Lines changed: 15 additions & 0 deletions
import requests


def download_img(file_url, save_file_name, retry_num=3):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        }
        r = requests.get(file_url, stream=True, headers=headers)
        with open(save_file_name, "wb") as img:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    img.write(chunk)
    except Exception:
        # retry a limited number of times before giving up
        if retry_num > 0:
            download_img(file_url=file_url, save_file_name=save_file_name, retry_num=retry_num - 1)
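
A one-line hypothetical call; the image URL and output file name are placeholders:

# Hypothetical call; URL and file name are placeholders.
download_img("https://example.com/banner.png", "banner.png", retry_num=3)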

scraping/get_free_proxy.py

Lines changed: 50 additions & 0 deletions
import os
from datetime import datetime

from free_proxy import FreeProxy  # imported but not used below
from proxyscrape import create_collector

from db_manager import DB


def get_now_str():
    now = datetime.now()
    return now.strftime("%Y-%m-%d")


class ScrapeFreeProxies(object):
    def __init__(self):
        self.result_dir = os.path.join(os.getcwd(), "Result")
        if not os.path.exists(self.result_dir):
            os.makedirs(self.result_dir)

        self.collector = create_collector('my-collector', 'http')

        self.DB_Name = "proxies.db"
        self.tbl_name = "http"
        self.db_proxy = DB(dst_path=self.result_dir, dbname=self.DB_Name)

        self.fields = [
            "anonymous", "code", "country", "host", "port", "type", "date", "status"
        ]
        self.db_proxy.createTable(tbl_name=self.tbl_name, fields=self.fields)

    def get_proxies(self):
        # Retrieve only anonymous proxies
        proxies = self.collector.get_proxies({'anonymous': True})
        for proxy in proxies:
            result_row = {
                'anonymous': proxy.anonymous,
                'code': proxy.code,
                'country': proxy.country,
                'host': proxy.host,
                'port': proxy.port,
                'type': proxy.type,
                'date': get_now_str()
            }
            self.db_proxy.insert_row(tbl_name=self.tbl_name, result_row=result_row)
            print(result_row)


if __name__ == '__main__':
    app = ScrapeFreeProxies()
    app.get_proxies()

scraping/google maps API.py

Lines changed: 21 additions & 0 deletions
import googlemaps
from datetime import datetime

gmaps = googlemaps.Client(key='Your API Key')

# Geocoding an address
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

# Look up an address with reverse geocoding
reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit
now = datetime.now()
directions_result = gmaps.directions("Sydney Town Hall",
                                     "Parramatta, NSW",
                                     mode="transit",
                                     departure_time=now)

print(directions_result)
Lines changed: 14 additions & 0 deletions
from urllib.parse import urlencode

# original hand-built URL, kept for reference; the same query string is rebuilt below with urlencode()
url = "http://in-stjoseph-assessor.governmax.com/propertymax/search_property.asp?l_nm=owner&user=guest_in-stjoseph-assessor&pass=manatron&sid={}"

query = {
    'l_nm': 'owner',
    'user': 'guest_in-stjoseph-assessor',
    'pass': 'manatron',
    'sid': 'F83D36C5F0094B86B4BE9307649DA66F'
}

query_encoded = urlencode(query)
new_url = "http://in-stjoseph-assessor.governmax.com/propertymax/search_property.asp?" + query_encoded
print(new_url)

scraping/read sav file.py

Lines changed: 14 additions & 0 deletions
import multiprocessing as mp
from time import time

import pandas as pd
import pyreadstat
from multiprocessing import freeze_support

if __name__ == '__main__':
    start_ts = time()
    # df_spss, meta = pyreadstat.read_sav("Surgery.sav", user_missing=False)
    df_spss, meta = pyreadstat.read_file_multiprocessing(pyreadstat.read_sav, "Surgery.sav")

    elapsed_ts = time() - start_ts
    print(elapsed_ts)
Lines changed: 33 additions & 0 deletions
import multiprocessing as mp
from time import time

import pandas as pd
import pyreadstat


def worker(inpt):
    import pyreadstat
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    return df


# calculate the number of rows in the file
_, meta = pyreadstat.read_sav("big.sav", metadataonly=True)
numrows = meta.number_rows
# calculate number of cores in the machine, this could also be set manually to some number, i.e. 8
numcores = mp.cpu_count()
# calculate the chunksize and offsets
divs = [numrows // numcores + (1 if x < numrows % numcores else 0) for x in range(numcores)]
chunksize = divs[0]
offsets = [indx * chunksize for indx in range(numcores)]
# pack the data for the jobs
jobs = [(x, chunksize, "big.sav") for x in offsets]

# note: on Windows (spawn start method) this module-level code should sit under an
# `if __name__ == '__main__':` guard so the pool can be created safely
pool = mp.Pool(processes=numcores)
# let's go!
t0 = time()
chunks = pool.map(worker, jobs)
t1 = time()
print(t1 - t0)  # this prints 29 seconds
# chunks is a list of dataframes in the right order
# you can concatenate all the chunks into a single big dataframe if you like
final = pd.concat(chunks, axis=0, ignore_index=True)
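
For comparison, the same parallel read can be delegated to pyreadstat's built-in helper, as the read sav file.py script above already does; a minimal sketch, reusing the same "big.sav" file name:

# Built-in alternative to the manual Pool approach (same helper used in read sav file.py above).
import pyreadstat

df, meta = pyreadstat.read_file_multiprocessing(pyreadstat.read_sav, "big.sav")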
Lines changed: 45 additions & 0 deletions
import multiprocessing as mp
from time import time

import pandas as pd
import pyreadstat
import math
import threading


def worker(inpt):
    print(inpt)
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    # df, meta = pyreadstat.read_file_in_chunks(pyreadstat.read_sav, path, offset=offset, chunksize=chunksize,
    #                                           multiprocess=True, num_processes=10)
    # NOTE: when called via threading.Thread the return value is discarded
    return df


start_ts = time()

# calculate the number of rows in the file
_, meta = pyreadstat.read_sav("Surgery.sav", metadataonly=True)
numrows = meta.number_rows
# calculate the chunksize and offsets
chunksize = 200
offsets = [indx * chunksize for indx in range(math.ceil(numrows / chunksize))]
# pack the data for the jobs
jobs = [(x, chunksize, "Surgery.sav") for x in offsets]

threads = []
max_threads = 30
# keep at most max_threads reader threads alive until every job has been processed
while threads or jobs:
    # drop finished threads
    threads = [thread for thread in threads if thread.is_alive()]

    while len(threads) < max_threads and jobs:
        job = jobs.pop()
        thread = threading.Thread(target=worker, args=(job,))
        thread.start()
        threads.append(thread)

elapsed_ts = time() - start_ts
print(elapsed_ts)
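
Because threading.Thread discards the worker's return value, the chunks read above are never assembled. A hypothetical variant that collects each chunk into a shared list, keyed by offset so the frames can be re-ordered, might look like this:

# Hypothetical variant: collect each chunk so the frames can be concatenated afterwards.
import threading
import pandas as pd
import pyreadstat

chunks = []
chunks_lock = threading.Lock()

def collecting_worker(inpt):
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    with chunks_lock:
        chunks.append((offset, df))  # keep the offset so chunks can be sorted back into order

# start and drain the threads exactly as above, but with target=collecting_worker; afterwards:
# final = pd.concat([df for _, df in sorted(chunks, key=lambda t: t[0])], ignore_index=True)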
