
Commit 19e76d8

Add files via upload
1 parent e5b489f commit 19e76d8

16 files changed, +106409 -0 lines changed

scraping/adremover.py

Lines changed: 45 additions & 0 deletions
import cssselect


class AdRemover(object):
    """
    This class applies elemhide rules from AdBlock Plus to an lxml
    document or element object. One or more AdBlock Plus filter
    subscription files must be provided.

    Example usage:

    >>> import lxml.html
    >>> remover = AdRemover('fanboy-annoyance.txt')
    >>> doc = lxml.html.document_fromstring("<html>...</html>")
    >>> remover.remove_ads(doc)
    """

    def __init__(self, *rules_files):
        if not rules_files:
            raise ValueError("one or more rules_files required")

        translator = cssselect.HTMLTranslator()
        rules = []

        for rules_file in rules_files:
            with open(rules_file, 'r', encoding='utf-8') as f:
                for line in f:
                    # elemhide rules are prefixed by ## in the AdBlock Plus filter syntax
                    if line[:2] == '##':
                        try:
                            rules.append(translator.css_to_xpath(line[2:]))
                        except cssselect.SelectorError:
                            # just skip bad selectors
                            pass

        # create one large query by joining the rules with the XPath | (or) operator
        self.xpath_query = '|'.join(rules)

    def remove_ads(self, tree):
        """Remove ads from an lxml document or element object.

        The object passed to this method will be modified in place."""
        for elem in tree.xpath(self.xpath_query):
            elem.getparent().remove(elem)
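
For context, a minimal usage sketch of the class above, assuming it lives in adremover.py, that an AdBlock Plus filter list is available locally (the file name 'easylist.txt' and the URL below are placeholders):

# Hypothetical usage of AdRemover; the filter file and URL are placeholders.
import requests
import lxml.html

from adremover import AdRemover

remover = AdRemover('easylist.txt')              # any AdBlock Plus filter list file
html = requests.get('https://example.com').text
doc = lxml.html.document_fromstring(html)
remover.remove_ads(doc)                          # matching elements are removed in place
cleaned_html = lxml.html.tostring(doc).decode()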

scraping/db_manager.py

Lines changed: 66 additions & 0 deletions
import os
import sqlite3 as lite


class DB(object):
    def __init__(self, dst_path, dbname):
        self.DB_Name = os.path.join(dst_path, dbname)
        if not os.path.exists(self.DB_Name):
            open(self.DB_Name, 'w').close()
        self.setupDBCon()

    def setupDBCon(self):
        self.con = lite.connect(self.DB_Name, check_same_thread=False, isolation_level=None, timeout=20)
        self.cur = self.con.cursor()
        self.cur.execute("PRAGMA synchronous=OFF")
        self.cur.execute("PRAGMA journal_mode=WAL")

    def createTable(self, tbl_name, fields):
        # self.dropTable(tbl_name=tbl_name)
        query_fields_list = [f'"{field}" TEXT' for field in fields]
        query_fields = '(' + ", ".join(query_fields_list) + ')'
        final_query = f'CREATE TABLE IF NOT EXISTS {tbl_name} ' + query_fields
        self.cur.execute(final_query)

    def insert_row(self, tbl_name, result_row, start_pos=0, filter_limit=6):
        # NOTE: values are interpolated directly into the SQL string (not parameterized),
        # so this is only safe for trusted input.
        values = list(result_row.values())
        fields = list(result_row.keys())

        if not self.isExist(tbl_name=tbl_name, condition_row=result_row, filter_limit=filter_limit):
            fields = [f"'{elm}'" for elm in fields]
            query_values = '(' + ', '.join(fields) + ')' + ' VALUES(' + ', '.join([f"'{elm}'" for elm in values]) + ')'
            final_query = f'INSERT OR REPLACE INTO {tbl_name}' + query_values
            self.cur.execute(final_query)
            self.con.commit()

    def select_data(self, tbl_name, condition_row=None, filter_limit=6, start_pos=0):
        query_select = '*'

        if condition_row:
            # pair the condition fields (from start_pos) with the first filter_limit values
            query_condition = ' WHERE ' + ' AND '.join(
                [f'"{condition_field}"="{condition_value}"' for condition_field, condition_value in
                 zip(list(condition_row.keys())[start_pos:], list(condition_row.values())[:filter_limit])])
        else:
            query_condition = ''

        final_query = 'SELECT ' + query_select + f' FROM {tbl_name}' + query_condition
        self.cur.execute(final_query)
        return self.cur.fetchall()

    def isExist(self, tbl_name, condition_row, filter_limit=6, start_pos=0):
        return bool(self.select_data(tbl_name, condition_row, filter_limit, start_pos))

    def delete(self, tbl_name):
        final_query = f"DELETE FROM {tbl_name} WHERE email NOT LIKE '%@%'"
        self.cur.execute(final_query)
        self.con.commit()

    def dropTable(self, tbl_name):
        final_query = f"DROP TABLE IF EXISTS {tbl_name}"
        self.cur.execute(final_query)

    def closeDB(self):
        self.cur.close()
        self.con.close()
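
A short hypothetical usage sketch of the DB wrapper above; the database name, table name, field list, and row values are made up for illustration:

# Hypothetical usage of the DB wrapper; names and values are placeholders.
import os

from db_manager import DB

db = DB(dst_path=os.getcwd(), dbname="example.db")
db.createTable(tbl_name="contacts", fields=["name", "email", "date"])
db.insert_row(tbl_name="contacts",
              result_row={"name": "Jane Doe", "email": "jane@example.com", "date": "2020-01-01"})
print(db.select_data(tbl_name="contacts", condition_row={"name": "Jane Doe"}))
db.closeDB()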
Lines changed: 29 additions & 0 deletions
import requests
import math
import os
from urllib.parse import urlparse

from tqdm import tqdm


def download_zip(self, url):
    """Intended as a method of a class that provides a `working_dir` attribute."""
    print(f"[download_zip] url: {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    }
    r = requests.get(url, stream=True, headers=headers)
    total_length = int(r.headers.get('content-length', 0))
    zip_fname = os.path.basename(urlparse(r.url).path)

    # one progress-bar tick per 4096-byte chunk
    bar = tqdm(total=math.ceil(total_length / 4096))
    bar.set_description(f"\t{zip_fname} is downloading now...")

    full_zip_fname = os.path.join(self.working_dir, zip_fname)
    with open(full_zip_fname, "wb") as z:
        for chunk in r.iter_content(chunk_size=4096):
            if chunk:
                z.write(chunk)
                bar.update()

    bar.close()

    return full_zip_fname
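
Since the function above expects a self with a working_dir attribute, a hypothetical standalone call could pass any object that provides one; the URL below is a placeholder:

# Hypothetical standalone call; the URL is a placeholder.
import os
from types import SimpleNamespace

ctx = SimpleNamespace(working_dir=os.getcwd())
# zip_path = download_zip(ctx, "https://example.com/archive.zip")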
Lines changed: 15 additions & 0 deletions
import requests


def download_img(file_url, save_file_name, retry_num=3):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        }
        r = requests.get(file_url, stream=True, headers=headers)
        with open(save_file_name, "wb") as img:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    img.write(chunk)
    except Exception:
        # retry a limited number of times before giving up
        if retry_num > 0:
            download_img(file_url=file_url, save_file_name=save_file_name, retry_num=retry_num - 1)
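
A one-line hypothetical call; the image URL and output file name are placeholders:

# Hypothetical call; URL and file name are placeholders.
download_img("https://example.com/banner.png", "banner.png", retry_num=3)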

scraping/get_free_proxy.py

Lines changed: 50 additions & 0 deletions
import os
from datetime import datetime

from free_proxy import FreeProxy  # imported but not used below
from proxyscrape import create_collector

from db_manager import DB


def get_now_str():
    now = datetime.now()
    return now.strftime("%Y-%m-%d")


class ScrapeFreeProxies(object):
    def __init__(self):
        self.result_dir = os.path.join(os.getcwd(), "Result")
        if not os.path.exists(self.result_dir):
            os.makedirs(self.result_dir)

        self.collector = create_collector('my-collector', 'http')

        self.DB_Name = "proxies.db"
        self.tbl_name = "http"
        self.db_proxy = DB(dst_path=self.result_dir, dbname=self.DB_Name)

        self.fields = [
            "anonymous", "code", "country", "host", "port", "type", "date", "status"
        ]
        self.db_proxy.createTable(tbl_name=self.tbl_name, fields=self.fields)

    def get_proxies(self):
        # Retrieve only anonymous proxies
        proxies = self.collector.get_proxies({'anonymous': True})
        for proxy in proxies:
            result_row = {
                'anonymous': proxy.anonymous,
                'code': proxy.code,
                'country': proxy.country,
                'host': proxy.host,
                'port': proxy.port,
                'type': proxy.type,
                'date': get_now_str()
            }
            self.db_proxy.insert_row(tbl_name=self.tbl_name, result_row=result_row)
            print(result_row)


if __name__ == '__main__':
    app = ScrapeFreeProxies()
    app.get_proxies()

scraping/google maps API.py

Lines changed: 21 additions & 0 deletions
import googlemaps
from datetime import datetime

gmaps = googlemaps.Client(key='Your API Key')

# Geocoding an address
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

# Look up an address with reverse geocoding
reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit
now = datetime.now()
directions_result = gmaps.directions("Sydney Town Hall",
                                     "Parramatta, NSW",
                                     mode="transit",
                                     departure_time=now)

print(directions_result)
Lines changed: 14 additions & 0 deletions
from urllib.parse import urlencode

# original hand-built URL, kept for reference; the same query string is rebuilt below with urlencode()
url = "http://in-stjoseph-assessor.governmax.com/propertymax/search_property.asp?l_nm=owner&user=guest_in-stjoseph-assessor&pass=manatron&sid={}"

query = {
    'l_nm': 'owner',
    'user': 'guest_in-stjoseph-assessor',
    'pass': 'manatron',
    'sid': 'F83D36C5F0094B86B4BE9307649DA66F'
}

query_encoded = urlencode(query)
new_url = "http://in-stjoseph-assessor.governmax.com/propertymax/search_property.asp?" + query_encoded
print(new_url)

scraping/read sav file.py

Lines changed: 14 additions & 0 deletions
import multiprocessing as mp
from time import time

import pandas as pd
import pyreadstat
from multiprocessing import freeze_support

if __name__ == '__main__':
    start_ts = time()
    # df_spss, meta = pyreadstat.read_sav("Surgery.sav", user_missing=False)
    df_spss, meta = pyreadstat.read_file_multiprocessing(pyreadstat.read_sav, "Surgery.sav")

    elapsed_ts = time() - start_ts
    print(elapsed_ts)
Lines changed: 33 additions & 0 deletions
import multiprocessing as mp
from time import time

import pandas as pd
import pyreadstat


def worker(inpt):
    import pyreadstat
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    return df


# calculate the number of rows in the file
_, meta = pyreadstat.read_sav("big.sav", metadataonly=True)
numrows = meta.number_rows
# calculate number of cores in the machine, this could also be set manually to some number, i.e. 8
numcores = mp.cpu_count()
# calculate the chunksize and offsets
divs = [numrows // numcores + (1 if x < numrows % numcores else 0) for x in range(numcores)]
chunksize = divs[0]
offsets = [indx * chunksize for indx in range(numcores)]
# pack the data for the jobs
jobs = [(x, chunksize, "big.sav") for x in offsets]

# note: on Windows (spawn start method) this module-level code should sit under an
# `if __name__ == '__main__':` guard so the pool can be created safely
pool = mp.Pool(processes=numcores)
# let's go!
t0 = time()
chunks = pool.map(worker, jobs)
t1 = time()
print(t1 - t0)  # this prints 29 seconds
# chunks is a list of dataframes in the right order
# you can concatenate all the chunks into a single big dataframe if you like
final = pd.concat(chunks, axis=0, ignore_index=True)
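
For comparison, the same parallel read can be delegated to pyreadstat's built-in helper, as the read sav file.py script above already does; a minimal sketch, reusing the same "big.sav" file name:

# Built-in alternative to the manual Pool approach (same helper used in read sav file.py above).
import pyreadstat

df, meta = pyreadstat.read_file_multiprocessing(pyreadstat.read_sav, "big.sav")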
Lines changed: 45 additions & 0 deletions
import multiprocessing as mp
from time import time

import pandas as pd
import pyreadstat
import math
import threading


def worker(inpt):
    print(inpt)
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    # df, meta = pyreadstat.read_file_in_chunks(pyreadstat.read_sav, path, offset=offset, chunksize=chunksize,
    #                                           multiprocess=True, num_processes=10)
    # NOTE: when called via threading.Thread the return value is discarded
    return df


start_ts = time()

# calculate the number of rows in the file
_, meta = pyreadstat.read_sav("Surgery.sav", metadataonly=True)
numrows = meta.number_rows
# calculate the chunksize and offsets
chunksize = 200
offsets = [indx * chunksize for indx in range(math.ceil(numrows / chunksize))]
# pack the data for the jobs
jobs = [(x, chunksize, "Surgery.sav") for x in offsets]

threads = []
max_threads = 30
# keep at most max_threads reader threads alive until every job has been processed
while threads or jobs:
    # drop finished threads
    threads = [thread for thread in threads if thread.is_alive()]

    while len(threads) < max_threads and jobs:
        job = jobs.pop()
        thread = threading.Thread(target=worker, args=(job,))
        thread.start()
        threads.append(thread)

elapsed_ts = time() - start_ts
print(elapsed_ts)
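
Because threading.Thread discards the worker's return value, the chunks read above are never assembled. A hypothetical variant that collects each chunk into a shared list, keyed by offset so the frames can be re-ordered, might look like this:

# Hypothetical variant: collect each chunk so the frames can be concatenated afterwards.
import threading
import pandas as pd
import pyreadstat

chunks = []
chunks_lock = threading.Lock()

def collecting_worker(inpt):
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    with chunks_lock:
        chunks.append((offset, df))  # keep the offset so chunks can be sorted back into order

# start and drain the threads exactly as above, but with target=collecting_worker; afterwards:
# final = pd.concat([df for _, df in sorted(chunks, key=lambda t: t[0])], ignore_index=True)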
