|
| 1 | +import random |
| 2 | +import requests |
| 3 | +import traceback |
| 4 | +from bs4 import BeautifulSoup |
| 5 | + |
| 6 | +from settings import * |
| 7 | + |
| 8 | +from server.proxies import * |
| 9 | +from server.user_agent import * |
| 10 | +from db.redis_session import * |
| 11 | +from server.pipelines import * |
| 12 | + |
# Both Redis-backed containers live in the same Redis database, so the
# connection settings (taken from ``settings``) are spelled out only once.
_REDIS_CONNECTION = dict(
    host=REDIS_HOST,
    port=REDIS_PORT,
    db=REDIS_DB,
    password=REDIS_PWD,
)

# Set stored under the 'proxies' key; the engine draws random proxies from it.
proxies_set = RedisSet('proxies', **_REDIS_CONNECTION)
# Queue stored under the 'pageurls' key; the engine pushes extracted URLs to it.
pageurls_queue = RedisQueue('pageurls', **_REDIS_CONNECTION)
| 15 | + |
class Engine():
    """Collect page URLs from a paginated listing endpoint.

    For every page number from 1 to ``page_num`` the engine POSTs a search
    request to ``root_url`` through a randomly chosen proxy, hands the
    response to ``url_generator`` and pushes each resulting URL onto
    ``pageurls_queue``.
    """

    # Request headers that never change between attempts. The User-Agent is
    # added per attempt in ``get_urls``.
    _STATIC_HEADERS = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Content-Length': '26',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_ga=GA1.2.2102803584.1542767027; user_trace_token=20181121102346-794ca69c-ed34-11e8-af2a-525400f775ce; LGUID=20181121102346-794ca90c-ed34-11e8-af2a-525400f775ce; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.417210577.1545100733; JSESSIONID=ABAAABAAAFCAAEGA2594CCC871A5C6033C439A7D767B848; _gat=1; LGSID=20181218212005-a2432f21-02c7-11e9-8f5b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542767027,1545100734,1545139206; TG-TRACK-CODE=index_search; SEARCH_ID=3062075691be4d739e276ef7ea9921c1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545139212; LGRID=20181218212011-a5f61409-02c7-11e9-8f5b-5254005c3644'
    }

    def __init__(self, root_url, page_num):
        """Store the crawl target.

        Args:
            root_url: URL that the search POST requests are sent to.
            page_num: Number of result pages to fetch (pages 1..page_num).
        """
        self.root_url = root_url
        self.page_num = page_num

    def get_urls(self, num):
        """Fetch result page ``num`` and enqueue the URLs extracted from it.

        The request is retried indefinitely, with a freshly drawn proxy and
        User-Agent each time, until a response with status 200 is received.
        A proxy that triggers ``ProxyError`` or ``ConnectTimeout`` is removed
        from ``proxies_set``. Any other exception is printed and the request
        is retried.

        Args:
            num: 1-based page number, sent as the ``pn`` form field.
        """
        while True:
            # Bound before the try so the handlers can always refer to it.
            proxy = None
            try:
                headers = {'User-Agent': random.choice(user_agent_list)}
                headers.update(self._STATIC_HEADERS)

                proxy = proxies_set.get_rand()
                # NOTE(review): only the "http" scheme is mapped; requests
                # picks a proxy by URL scheme, so an https:// root_url would
                # bypass the proxy -- confirm against SERVER_ROOT_URL.
                proxies = {
                    "http": proxy
                }
                data = {
                    'first': 'false',
                    'pn': str(num),
                    'kd': 'python'
                }
                params = {
                    'city': '深圳',
                    'needAddtionalResult': False
                }
                res = requests.post(
                    self.root_url,
                    params=params,
                    data=data,
                    headers=headers,
                    proxies=proxies,
                    timeout=3,
                )

                # Check the status before parsing: a failed response must not
                # be fed to the URL extractor or put on the queue.
                if res.status_code != 200:
                    print('status failed')
                    continue

                for url in url_generator(res):
                    pageurls_queue.put(url)
                print('success', str(num), proxy)
                break
            except requests.exceptions.ProxyError:
                proxies_set.remove(proxy)
                print('ProxyError. Removing proxy:', proxy)
            except requests.exceptions.ConnectTimeout:
                proxies_set.remove(proxy)
                print('ConnectTimeout. Removing proxy:', proxy)
            except Exception:
                # Best-effort retry, but make the failure visible instead of
                # discarding the formatted traceback.
                print(traceback.format_exc())

    def main(self):
        """Fetch pages 1 through ``page_num`` in order."""
        for num in range(1, self.page_num + 1):
            self.get_urls(num)
| 78 | + |
if __name__ == '__main__':
    # Run the GetIps job first (GetIps comes from the star imports above),
    # then fetch 30 listing pages.
    GetIps(100).main()
    Engine(SERVER_ROOT_URL, 30).main()
0 commit comments