
Commit e7980a1

A `CS`-architecture (client/server) distributed web spider example

1 parent 8ebb870 commit e7980a1

15 files changed: +481 -0 lines changed

README.md

+2
```diff
@@ -8,6 +8,8 @@
 
 * [Crawl WeChat official-account articles for a given keyword via Sogou WeChat search](./wechat_spider)
 
+* [A `CS`-architecture distributed web spider example](./distributed_web_spider_demo)
+
 ## `Web` development
 
 * [Implementing `api token` with `Flask`](./redis_token)
```

distributed_web_spider_demo/README.md

+21

# A `CS`-architecture (client/server) distributed web spider example

Uses a cache queue built on `Redis` to crawl job postings for a given keyword from Lagou.

#### Usage

* Client

Collects job-detail page links from the job listings and puts them into the cache queue.

```bash
python3 client.py
```

* Server

Takes job-detail page links out of the cache queue, parses the detail pages, and stores the structured data in the database.

```bash
python3 server.py
```
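
The two processes only share state through a Redis list. A minimal sketch of that hand-off, assuming a local, password-less Redis; the key name mirrors the `queue:pageurls` key that the `RedisQueue` wrapper later in this commit builds, and the job id is hypothetical:

```python
# Minimal sketch of the queue hand-off between the two processes.
import redis

r = redis.Redis(host='localhost', port=6379, db=0)
QUEUE_KEY = 'queue:pageurls'  # same key the RedisQueue wrapper constructs

def enqueue(urls):
    # producer side: push each detail-page URL onto the tail of the list
    for url in urls:
        r.rpush(QUEUE_KEY, url)

def drain():
    # consumer side: pop URLs until the list is empty
    while r.llen(QUEUE_KEY):
        url = r.lpop(QUEUE_KEY).decode('utf-8')
        print('would fetch and parse', url)

enqueue(['https://www.lagou.com/jobs/1234567.html'])  # hypothetical job id
drain()
```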

distributed_web_spider_demo/client.py

+38
```python
import asyncio
import traceback

from settings import *

from db.redis_session import *
from client.downloader import *
from client.proxies import *

proxies_set = RedisSet('proxies', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)
pageurls_queue = RedisQueue('pageurls', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)

if __name__ == '__main__':
    # GetIps(100).main()
    tasks = []
    while True:
        if pageurls_queue.qsize() == 0:
            break
        url = pageurls_queue.get_nowait()
        url = url.decode('utf-8')

        header = {
            'User-Agent': random.sample(user_agent_list, 1)[0]
        }
        proxies = {}
        if 'https' in proxies_set.get_rand():
            proxies.update({
                "https": proxies_set.get_rand() if proxies_set.size() != 0 else ''
            })
        else:
            proxies.update({
                "http": proxies_set.get_rand() if proxies_set.size() != 0 else ''
            })

        tasks.append(asyncio.ensure_future(get_content(url, header, proxies)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
```
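
One subtlety in the proxy selection above: `proxies_set.get_rand()` is called once to inspect the scheme and again to fill the dict, so the proxy that is checked and the proxy that is used can differ. A possible rework, offered only as a sketch rather than the author's code, draws the proxy once and then branches on its scheme:

```python
# Sketch (suggestion only): draw the proxy a single time, then branch on its scheme.
proxy = proxies_set.get_rand() if proxies_set.size() != 0 else ''
proxies = {'https': proxy} if proxy.startswith('https') else {'http': proxy}
```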

distributed_web_spider_demo/client/__init__.py

Whitespace-only changes.

distributed_web_spider_demo/client/downloader.py

+39

```python
import random
import requests
import asyncio
import traceback
import functools
from concurrent import futures

from server.proxies import *
from server.user_agent import *
from client.pipelines import *

proxies_set = RedisSet('proxies', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)
pageurls_queue = RedisQueue('pageurls', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)

executor = futures.ThreadPoolExecutor(max_workers = 10)

async def get_content(url, header, proxies):
    loop = asyncio.get_event_loop()
    try:
        response = await loop.run_in_executor(executor, functools.partial(requests.get, url = url, headers = header, proxies = proxies, timeout = 30))
        if response.status_code == 200:
            if analyze_content(response.text):
                print('Success crawl page: {}, {}, {}'.format(url, header, proxies))
            else:
                proxies_set.remove(list(proxies.values())[0])
                print('Analyze error. Removing proxy:', list(proxies.values())[0])
                pageurls_queue.put(url)
                print('Add page url to queue again.')
        else:
            proxies_set.remove(list(proxies.values())[0])
            print('Response error. Removing proxy:', list(proxies.values())[0])
            pageurls_queue.put(url)
            print('Add page url to queue again.')
    except Exception as e:
        proxies_set.remove(list(proxies.values())[0])
        traceback.print_exc()
        print('Removing proxy:', list(proxies.values())[0])
        pageurls_queue.put(url)
        print('Add page url to queue again.')
```
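
`get_content` wraps the blocking `requests.get` call in `loop.run_in_executor` so many downloads can run concurrently on a thread pool. A stripped-down, self-contained sketch of the same pattern, with a placeholder URL:

```python
# Self-contained sketch of the thread-pool + asyncio pattern used by get_content.
import asyncio
import functools
from concurrent import futures

import requests

executor = futures.ThreadPoolExecutor(max_workers=4)

async def fetch(url):
    loop = asyncio.get_event_loop()
    # run the blocking HTTP call on the pool and await its result
    response = await loop.run_in_executor(
        executor, functools.partial(requests.get, url, timeout=10))
    return response.status_code

async def main():
    codes = await asyncio.gather(*(fetch(u) for u in ['https://example.com']))
    print(codes)

asyncio.get_event_loop().run_until_complete(main())
```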

distributed_web_spider_demo/client/pipelines.py

+43

```python
import traceback
from bs4 import BeautifulSoup

from settings import *
from db.redis_session import *

jobdatas = RedisSet('jobdatas', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)

def analyze_content(html_text):
    try:
        soup = BeautifulSoup(html_text, 'html5lib')

        job_names = soup.find_all(name = 'div', attrs = {"class": "job-name"})[0]
        name = job_names.attrs['title']
        company = job_names.find(name = 'div', attrs = {"class": "company"}).text

        job_requests = soup.find_all(name = 'dd', attrs = {"class": "job_request"})[0]
        request = "".join([i.text for i in job_requests.find_all(name = 'span')])

        job_detail = soup.select('#job_detail')[0]
        job_advantage = job_detail.select('.job-advantage p')[0].text

        job_description = [i.text for i in job_detail.select('.job_bt div p')]

        job_address = job_detail.select('.work_addr')[0].text.replace('\n', '').replace(' ', '').replace('查看地图', '')

        job_url = soup.find(name = 'link', attrs = {"rel": "canonical"}).attrs['href']

        job_data = {
            'name': name,
            'company': company,
            'request': request,
            'advantage': job_advantage,
            'description': job_description,
            'address': job_address,
            'url': job_url
        }

        jobdatas.add(job_data)
        return True
    except Exception as e:
        traceback.print_exc()
        return False
```

distributed_web_spider_demo/client/proxies.py

+55

```python
import requests
import asyncio
import functools
from bs4 import BeautifulSoup
from concurrent import futures

from settings import *
from db.redis_session import *

executor = futures.ThreadPoolExecutor(max_workers = 10)

proxies_set = RedisSet('proxies', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)

class GetIps():
    def __init__(self, page):
        self.ips = []
        self.urls = []
        for i in range(page):
            self.urls.append(PROXY_CRAWL_URL + "%s"%i)
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
        }

    def get_ips(self):
        for url in self.urls:
            res = requests.get(url, headers = self.header)
            soup = BeautifulSoup(res.text, 'html5lib')
            ips = soup.find_all('tr')
            for ip in ips:
                tds = ip.find_all('td')
                if len(tds) < 2:
                    continue
                ip_temp = 'http://' + tds[0].contents[0] + ':' + tds[1].contents[0]
                self.ips.append(str(ip_temp))

    async def review_ips(self, url, ip):
        loop = asyncio.get_event_loop()
        try:
            proxy = {'http': ip}
            response = await loop.run_in_executor(executor, functools.partial(requests.get, url = url, proxies = proxy, timeout = 3))
            if response.status_code == 200:
                proxies_set.add(ip)
            else:
                print('Time Out!')
        except Exception as e:
            print(e)
            pass

    def main(self):
        self.get_ips()
        url = PROXY_TEST_URL
        tasks = [asyncio.ensure_future(self.review_ips(url, ip)) for ip in self.ips]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
```

distributed_web_spider_demo/db/__init__.py

Whitespace-only changes.

distributed_web_spider_demo/db/redis_session.py

+37

```python
import redis

class RedisQueue(object):
    def __init__(self, name, namespace='queue', **redis_kwargs):
        self.__db = redis.Redis(**redis_kwargs)
        self.key = '%s:%s' %(namespace, name)

    def qsize(self):
        return self.__db.llen(self.key)

    def put(self, item):
        self.__db.rpush(self.key, item)

    def get_wait(self, timeout=None):
        item = self.__db.blpop(self.key, timeout=timeout)
        return item

    def get_nowait(self):
        item = self.__db.lpop(self.key)
        return item

class RedisSet(object):
    def __init__(self, name, namespace='set', **redis_kwargs):
        self.__db = redis.Redis(**redis_kwargs)
        self.name = '%s:%s' %(namespace, name)

    def size(self):
        return self.__db.scard(self.name)

    def add(self, value):
        self.__db.sadd(self.name, value)

    def get_rand(self):
        return self.__db.srandmember(self.name, 1)[0].decode('utf-8')

    def remove(self, value):
        self.__db.srem(self.name, value)
```
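
A quick usage sketch for these wrappers, assuming a reachable local Redis; note that `get_nowait` returns raw bytes (callers decode them), while `get_rand` already decodes:

```python
# Usage sketch for RedisQueue / RedisSet, assuming a local Redis; values are illustrative.
from db.redis_session import RedisQueue, RedisSet

q = RedisQueue('pageurls', host='localhost', port=6379, db=0)
q.put('https://www.lagou.com/jobs/1234567.html')
print(q.qsize())       # 1
print(q.get_nowait())  # b'https://www.lagou.com/jobs/1234567.html'

s = RedisSet('proxies', host='localhost', port=6379, db=0)
s.add('http://1.2.3.4:8080')
print(s.get_rand())    # 'http://1.2.3.4:8080'
s.remove('http://1.2.3.4:8080')
```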

distributed_web_spider_demo/server.py

+82
```python
import random
import requests
import traceback
from bs4 import BeautifulSoup

from settings import *

from server.proxies import *
from server.user_agent import *
from db.redis_session import *
from server.pipelines import *

proxies_set = RedisSet('proxies', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)
pageurls_queue = RedisQueue('pageurls', host = REDIS_HOST, port = REDIS_PORT, db = REDIS_DB, password = REDIS_PWD)

class Engine():
    def __init__(self, root_url, page_num):
        self.root_url = root_url
        self.page_num = page_num

    def get_urls(self, num):
        while True:
            try:
                headers = {
                    'User-Agent': random.sample(user_agent_list, 1)[0],
                    'Host': 'www.lagou.com',
                    'Connection': 'keep-alive',
                    'Content-Length': '26',
                    'Pragma': 'no-cache',
                    'Cache-Control': 'no-cache',
                    'Origin': 'https://www.lagou.com',
                    'X-Anit-Forge-Code': '0',
                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                    'Accept': 'application/json, text/javascript, */*; q=0.01',
                    'X-Requested-With': 'XMLHttpRequest',
                    'X-Anit-Forge-Token': 'None',
                    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Cookie': '_ga=GA1.2.2102803584.1542767027; user_trace_token=20181121102346-794ca69c-ed34-11e8-af2a-525400f775ce; LGUID=20181121102346-794ca90c-ed34-11e8-af2a-525400f775ce; index_location_city=%E6%B7%B1%E5%9C%B3; _gid=GA1.2.417210577.1545100733; JSESSIONID=ABAAABAAAFCAAEGA2594CCC871A5C6033C439A7D767B848; _gat=1; LGSID=20181218212005-a2432f21-02c7-11e9-8f5b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542767027,1545100734,1545139206; TG-TRACK-CODE=index_search; SEARCH_ID=3062075691be4d739e276ef7ea9921c1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545139212; LGRID=20181218212011-a5f61409-02c7-11e9-8f5b-5254005c3644'
                }
                proxies = {
                    "http": proxies_set.get_rand()
                }
                data = {
                    'first': 'false',
                    'pn': str(num),
                    'kd': 'python'
                }
                params = {
                    'city': '深圳',
                    'needAddtionalResult': False
                }
                res = requests.post(self.root_url, params = params, data = data, headers = headers, proxies = proxies, timeout = 3)
                url_list = url_generator(res)
                [pageurls_queue.put(url) for url in url_list]
                if res.status_code == 200:
                    print('success', str(num), proxies['http'])
                    break
                else:
                    print('status failed')
                    continue
            except requests.exceptions.ProxyError:
                proxies_set.remove(proxies['http'])
                print('ProxyError. Removing proxy:', proxies['http'])
                continue
            except requests.exceptions.ConnectTimeout:
                proxies_set.remove(proxies['http'])
                print('ConnectTimeout. Removing proxy:', proxies['http'])
                continue
            except Exception as e:
                traceback.format_exc()
                continue

    def main(self):
        for num in range(self.page_num):
            self.get_urls(num + 1)

if __name__ == '__main__':
    GetIps(100).main()
    engine = Engine(SERVER_ROOT_URL, 30)
    engine.main()
```

distributed_web_spider_demo/server/__init__.py

Whitespace-only changes.

distributed_web_spider_demo/server/pipelines.py

+13

```python
import requests
from pprint import pprint

def url_generator(response):
    '''
    :param response: :class:`Response <Response>` object
    :return: list of detail-page URLs
    :rtype: list
    '''
    url_list = []
    for result in response.json()['content']['positionResult']['result']:
        url_list.append('https://www.lagou.com/jobs/{}.html'.format(result['positionId']))
    return url_list
```
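
To illustrate the contract the docstring describes, here is a stub run in the same module: the stub's JSON mocks only the keys `url_generator` actually reads (the real schema belongs to Lagou's listing API), and the position ids are hypothetical:

```python
# Illustration with a stubbed response object.
class FakeResponse:
    def json(self):
        return {'content': {'positionResult': {'result': [
            {'positionId': 1234567},
            {'positionId': 7654321},
        ]}}}

print(url_generator(FakeResponse()))
# ['https://www.lagou.com/jobs/1234567.html', 'https://www.lagou.com/jobs/7654321.html']
```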
