Incorrect detection of empty search results #506
Comments
(Accidentally hit Enter and posted too early just now.)
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
print(f"{datetime.now()}: 搜索{response.meta.get('start_date')}至{response.meta.get('end_date')}")
# if is_empty:
# print(f"{datetime.now()}: 当前页面搜索结果为空")
# 如果第一页全被夹了,也会被认为是空的
if page_count < self.further_threshold and not is_empty:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={
'keyword': keyword,
'page_count': page_count,
'page_index': 1
})
else:
start_date = datetime.strptime(response.meta.get('start_date'), '%Y-%m-%d')
end_date = datetime.strptime(response.meta.get('end_date'), '%Y-%m-%d')
if start_date < end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
meta = {
'base_url': base_url,
'keyword': keyword
} if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION') else {
'base_url': base_url,
'keyword': keyword,
'province': province
}
mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, mid_str)
meta['start_date'] = start_str[:-2]
meta['end_date'] = mid_str[:-2]
# 获取前半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
mid_date = mid_date + timedelta(days=1)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
mid_str, end_str)
meta['start_date'] = mid_str[:-2]
meta['end_date'] = end_str[:-2]
# 获取后半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
elif start_date == end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一天的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
I hope the author can add handling for this situation, and also provide an option to adjust the splitting granularity.
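For illustration only, here is a minimal standalone sketch of what such a granularity option could look like. `MIN_SPLIT_DAYS` is a hypothetical setting name that does not exist in the project's settings.py, and the plain dict below merely stands in for Scrapy's settings object; the idea is simply to stop bisecting the date range once it is shorter than the configured value and to fall back to the day-by-day search instead.

```python
from datetime import datetime, timedelta

# Hypothetical option (not in settings.py): the smallest span, in days,
# that is still worth bisecting before falling back to a day-by-day search.
settings = {'MIN_SPLIT_DAYS': 3}

start_date = datetime.strptime('2023-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2023-01-10', '%Y-%m-%d')

if (end_date - start_date).days >= settings.get('MIN_SPLIT_DAYS', 3):
    mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)
    first_half = (start_date, mid_date)
    second_half = (mid_date + timedelta(days=1), end_date)  # halves do not overlap
    print('bisect into', first_half, 'and', second_half)
else:
    print('span is short enough, search it one day at a time')
```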
Thanks for the suggestion. I have no way to debug this right now, but I can pin your method to the top for everyone's reference. Thanks again.
# -*- coding: utf-8 -*-
import os
import re
import sys
from datetime import datetime, timedelta
from urllib.parse import unquote
import requests
import scrapy
import weibo.utils.util as util
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings
from weibo.items import WeiboItem
class SearchSpider(scrapy.Spider):
name = 'search'
allowed_domains = ['weibo.com']
settings = get_project_settings()
keyword_list = settings.get('KEYWORD_LIST')
if not isinstance(keyword_list, list):
if not os.path.isabs(keyword_list):
keyword_list = os.getcwd() + os.sep + keyword_list
if not os.path.isfile(keyword_list):
sys.exit('不存在%s文件' % keyword_list)
keyword_list = util.get_keyword_list(keyword_list)
for i, keyword in enumerate(keyword_list):
if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
keyword_list[i] = '%23' + keyword[1:-1] + '%23'
weibo_type = util.convert_weibo_type(settings.get('WEIBO_TYPE'))
contain_type = util.convert_contain_type(settings.get('CONTAIN_TYPE'))
regions = util.get_regions(settings.get('REGION'))
base_url = 'https://s.weibo.com'
start_date = settings.get('START_DATE',
datetime.now().strftime('%Y-%m-%d'))
end_date = settings.get('END_DATE', datetime.now().strftime('%Y-%m-%d'))
if util.str_to_time(start_date) > util.str_to_time(end_date):
sys.exit('settings.py配置错误,START_DATE值应早于或等于END_DATE值,请重新配置settings.py')
further_threshold = settings.get('FURTHER_THRESHOLD', 46)
mongo_error = False
pymongo_error = False
mysql_error = False
pymysql_error = False
with open('log.txt', 'w') as f:
f.write('')
def start_requests(self):
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date,
'%Y-%m-%d') + timedelta(days=1)
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
for keyword in self.keyword_list:
if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION'):
base_url = 'https://s.weibo.com/weibo?q=%s' % keyword
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}'.format(start_str, end_str)
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'start_date': self.start_date,
'end_date': self.end_date
})
else:
for region in self.regions.values():
base_url = (
'https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}'.format(start_str, end_str)
# 获取一个省的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'province': region,
'start_date': self.start_date,
'end_date': self.end_date
})
def check_environment(self):
"""判断配置要求的软件是否已安装"""
if self.pymongo_error:
print('系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
raise CloseSpider()
if self.mongo_error:
print('系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
raise CloseSpider()
if self.pymysql_error:
print('系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
raise CloseSpider()
if self.mysql_error:
print('系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
raise CloseSpider()
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
print(f"{datetime.now()}: 搜索{response.meta.get('start_date')}至{response.meta.get('end_date')}")
# if is_empty:
# print(f"{datetime.now()}: 当前页面搜索结果为空")
# 如果第一页全被夹了,也会被认为是空的
if page_count < self.further_threshold and not is_empty:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={
'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
start_date = datetime.strptime(response.meta.get('start_date'), '%Y-%m-%d')
end_date = datetime.strptime(response.meta.get('end_date'), '%Y-%m-%d')
if start_date + timedelta(days=3) <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = (end_date + timedelta(days=1)).strftime('%Y-%m-%d') + '-0'
meta = {
'base_url': base_url,
'keyword': keyword
} if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION') else {
'base_url': base_url,
'keyword': keyword,
'province': province
}
mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, mid_str)
meta['start_date'] = start_date.strftime('%Y-%m-%d')
meta['end_date'] = mid_date.strftime('%Y-%m-%d')
# 获取前半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
mid_date = mid_date + timedelta(days=1)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
mid_str, end_str)
meta['start_date'] = mid_date.strftime('%Y-%m-%d')
meta['end_date'] = end_date.strftime('%Y-%m-%d')
# 获取后半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
else:
while start_date <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一天的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
def parse_by_day(self, response):
"""以天为单位筛选"""
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
date = response.meta.get('date')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print(f"{datetime.now()}: {date} 日搜索结果为空")
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
start_date_str = date + '-0'
start_date = datetime.strptime(start_date_str, '%Y-%m-%d-%H')
for i in range(1, 25):
start_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
start_date = start_date + timedelta(hours=1)
end_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一小时的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province
if province else self.parse_by_hour,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'start_time': start_str,
'end_time': end_str
})
def parse_by_hour(self, response):
"""以小时为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print(f"{datetime.now()}: {start_time} 搜索结果为空")
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
for region in self.regions.values():
url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url += self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
# 获取一小时一个省的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': region
})
def parse_by_hour_province(self, response):
"""以小时和直辖市/省为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
province = response.meta.get('province')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print(f"{datetime.now()}: {start_time} {province} 搜索结果为空")
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
for city in province['city'].values():
url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:{}'
).format(keyword, province['code'], city)
url += self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
# 获取一小时一个城市的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_page,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': province,
'city': city,
'page_count': -1,
'page_index': 1
})
def parse_page(self, response):
"""解析一页搜索结果的信息"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = response.meta.get('page_count')
page_index = response.meta.get('page_index')
if is_empty and page_count < 0:
print(f"{datetime.now()}: {page_index}/{page_count} 页搜索结果为空")
with open('log.txt', 'a') as f:
f.write(f"{response.url} 0\n")
else:
if not is_empty:
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
else:
with open('log.txt', 'a') as f:
f.write(f"{response.url} 0\n")
if page_index < page_count:
next_url = response.url.split('&page=')[0] + f'&page={page_index + 1}'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': page_index + 1})
def get_ip(self, bid):
url = f"https://weibo.com/ajax/statuses/show?id={bid}&locale=zh-CN"
response = requests.get(url, headers=self.settings.get('DEFAULT_REQUEST_HEADERS'))
if response.status_code != 200:
return ""
try:
data = response.json()
except requests.exceptions.JSONDecodeError:
return ""
ip_str = data.get("region_name", "")
if ip_str:
ip_str = ip_str.split()[-1]
return ip_str
def get_article_url(self, selector):
"""获取微博头条文章url"""
article_url = ''
text = selector.xpath('string(.)').extract_first().replace(
'\u200b', '').replace('\ue627', '').replace('\n',
'').replace(' ', '')
if text.startswith('发布了头条文章'):
urls = selector.xpath('.//a')
for url in urls:
if url.xpath(
'i[@class="wbicon"]/text()').extract_first() == 'O':
if url.xpath('@href').extract_first() and url.xpath(
'@href').extract_first().startswith('https://t.cn'):
article_url = url.xpath('@href').extract_first()
break
return article_url
def n_get_article_url(self, selector, text_selector):
"""获取微博头条文章url"""
article_url = self.get_article_url(text_selector)
paths = ['div[@class="from"]/a[1]/@href', 'p[@class="from"]/a[1]/@href']
if not article_url or article_url == '':
for path in paths:
try:
article_url = 'https:' + selector.xpath('.//'+path).extract_first().split('?')[0]
break
except:
pass
if not article_url or article_url == '':
print('未找到文章链接')
print(selector.extract())
return article_url
def get_location(self, selector):
"""获取微博发布位置"""
a_list = selector.xpath('.//a')
location = ''
for a in a_list:
if a.xpath('./i[@class="wbicon"]') and a.xpath(
'./i[@class="wbicon"]/text()').extract_first() == '2':
location = a.xpath('string(.)').extract_first()[1:]
break
return location
def get_at_users(self, selector):
"""获取微博中@的用户昵称"""
a_list = selector.xpath('.//a')
at_users = ''
at_list = []
for a in a_list:
if len(unquote(a.xpath('@href').extract_first())) > 14 and len(
a.xpath('string(.)').extract_first()) > 1:
if unquote(a.xpath('@href').extract_first())[14:] == a.xpath(
'string(.)').extract_first()[1:]:
at_user = a.xpath('string(.)').extract_first()[1:]
if at_user not in at_list:
at_list.append(at_user)
if at_list:
at_users = ','.join(at_list)
return at_users
def get_topics(self, selector):
"""获取参与的微博话题"""
a_list = selector.xpath('.//a')
topics = ''
topic_list = []
for a in a_list:
text = a.xpath('string(.)').extract_first()
if len(text) > 2 and text[0] == '#' and text[-1] == '#':
if text[1:-1] not in topic_list:
topic_list.append(text[1:-1])
if topic_list:
topics = ','.join(topic_list)
return topics
def parse_weibo(self, response):
"""解析网页中的微博信息"""
keyword = response.meta.get('keyword')
card_warp_len = len(response.xpath('//div[@class="card-wrap"]'))
with open('log.txt', 'a') as f:
f.write(f"{response.url} {card_warp_len}\n")
for sel in response.xpath("//div[@class='card-wrap']"):
info = sel.xpath(
"div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
)
if info:
weibo = WeiboItem()
weibo['id'] = sel.xpath('@mid').extract_first()
bid = sel.xpath(
'.//div[@class="from"]/a[1]/@href').extract_first(
).split('/')[-1].split('?')[0]
weibo['bid'] = bid
weibo['user_id'] = info[0].xpath(
'div[2]/a/@href').extract_first().split('?')[0].split(
'/')[-1]
weibo['screen_name'] = info[0].xpath(
'div[2]/a/@nick-name').extract_first()
txt_sel = sel.xpath('.//p[@class="txt"]')[0]
retweet_sel = sel.xpath('.//div[@class="card-comment"]')
retweet_txt_sel = ''
if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//p[@class="txt"]')[0]
content_full = sel.xpath(
'.//p[@node-type="feed_list_content_full"]')
is_long_weibo = False
is_long_retweet = False
if content_full:
if not retweet_sel:
txt_sel = content_full[0]
is_long_weibo = True
elif len(content_full) == 2:
txt_sel = content_full[0]
retweet_txt_sel = content_full[1]
is_long_weibo = True
is_long_retweet = True
elif retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]')[0]
is_long_retweet = True
else:
txt_sel = content_full[0]
is_long_weibo = True
weibo['text'] = txt_sel.xpath(
'string(.)').extract_first().replace('\u200b', '').replace(
'\ue627', '')
weibo['article_url'] = self.n_get_article_url(sel, txt_sel)
weibo['location'] = self.get_location(txt_sel)
if weibo['location']:
weibo['text'] = weibo['text'].replace(
'2' + weibo['location'], '')
weibo['text'] = weibo['text'][2:].replace(' ', '')
if is_long_weibo:
weibo['text'] = weibo['text'][:-4]
weibo['at_users'] = self.get_at_users(txt_sel)
weibo['topics'] = self.get_topics(txt_sel)
reposts_count = sel.xpath(
'.//a[@action-type="feed_list_forward"]/text()').extract()
reposts_count = "".join(reposts_count)
try:
reposts_count = re.findall(r'\d+.*', reposts_count)
except TypeError:
print(
"无法解析转发按钮,可能是 1) 网页布局有改动 2) cookie无效或已过期。\n"
"请在 https://github.com/dataabc/weibo-search 查看文档,以解决问题,"
)
raise CloseSpider()
weibo['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = sel.xpath(
'.//a[@action-type="feed_list_comment"]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
weibo['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = sel.xpath(
'.//a[@action-type="feed_list_like"]/button/span[2]/text()').extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
weibo['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = sel.xpath(
'.//div[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
weibo['created_at'] = util.standardize_date(created_at)
source = sel.xpath('.//div[@class="from"]/a[2]/text()'
).extract_first()
weibo['source'] = source if source else ''
pics = ''
is_exist_pic = sel.xpath(
'.//div[@class="media media-piclist"]')
if is_exist_pic:
pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
pics = [pic[8:] for pic in pics]
pics = [
re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
]
pics = ['https://' + pic for pic in pics]
video_url = ''
is_exist_video = sel.xpath(
'.//div[@class="thumbnail"]//video-player').extract_first()
if is_exist_video:
video_url = re.findall(r'src:\'(.*?)\'', is_exist_video)[0]
video_url = video_url.replace('&amp;', '&')
video_url = 'http:' + video_url
if not retweet_sel:
weibo['pics'] = pics
weibo['video_url'] = video_url
else:
weibo['pics'] = ''
weibo['video_url'] = ''
weibo['retweet_id'] = ''
if retweet_sel and retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'):
retweet = WeiboItem()
retweet['id'] = retweet_sel[0].xpath(
'.//a[@action-type="feed_list_like"]/@action-data'
).extract_first()[4:]
retweet['bid'] = retweet_sel[0].xpath(
'.//p[@class="from"]/a/@href').extract_first().split(
'/')[-1].split('?')[0]
info = retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'
)[0]
retweet['user_id'] = info.xpath(
'@href').extract_first().split('/')[-1]
retweet['screen_name'] = info.xpath(
'@nick-name').extract_first()
retweet['text'] = retweet_txt_sel.xpath(
'string(.)').extract_first().replace('\u200b',
'').replace(
'\ue627', '')
retweet['article_url'] = self.n_get_article_url(
retweet_sel[0], retweet_txt_sel)
retweet['location'] = self.get_location(retweet_txt_sel)
if retweet['location']:
retweet['text'] = retweet['text'].replace(
'2' + retweet['location'], '')
retweet['text'] = retweet['text'][2:].replace(' ', '')
if is_long_retweet:
retweet['text'] = retweet['text'][:-4]
retweet['at_users'] = self.get_at_users(retweet_txt_sel)
retweet['topics'] = self.get_topics(retweet_txt_sel)
reposts_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[1]/a[1]/text()'
).extract_first()
reposts_count = re.findall(r'\d+.*', reposts_count)
retweet['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[2]/a[1]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
retweet['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = retweet_sel[0].xpath(
'.//a[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]//span[@class="woo-like-count"]/text()'
).extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
retweet['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = retweet_sel[0].xpath(
'.//p[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
retweet['created_at'] = util.standardize_date(created_at)
source = retweet_sel[0].xpath(
'.//p[@class="from"]/a[2]/text()').extract_first()
retweet['source'] = source if source else ''
retweet['pics'] = pics
retweet['video_url'] = video_url
retweet['retweet_id'] = ''
yield {'weibo': retweet, 'keyword': keyword}
weibo['retweet_id'] = retweet['id']
weibo["ip"] = self.get_ip(bid)
avator = sel.xpath(
"div[@class='card']/div[@class='card-feed']/div[@class='avator']"
)
if avator:
user_auth = avator.xpath('.//svg/@id').extract_first()
# print(user_auth)
if user_auth == 'woo_svg_vblue':
weibo['user_authentication'] = '蓝V'
elif user_auth == 'woo_svg_vyellow':
weibo['user_authentication'] = '黄V'
elif user_auth == 'woo_svg_vorange':
weibo['user_authentication'] = '红V'
elif user_auth == 'woo_svg_vgold':
weibo['user_authentication'] = '金V'
else:
weibo['user_authentication'] = '普通用户'
# print(weibo)
yield {'weibo': weibo, 'keyword': keyword}
I adjusted the search logic so that the date intervals no longer overlap (the logging for the filtered-out case is pretty rough, I hope the author can tidy it up). Also, video links from very old weibos (2014 in my case) cannot be crawled correctly.
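To make the non-overlapping boundaries concrete, here is a small standalone snippet (not part of the spider) that prints the two timescope ranges the modified parse() above would generate for a ten-day span; the dates are invented for the example.

```python
from datetime import datetime, timedelta

start_date = datetime.strptime('2023-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2023-01-10', '%Y-%m-%d')

# Same arithmetic as the modified parse(): the first half ends at mid_date,
# the second half starts one day later, and the overall end is exclusive.
mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)
first_half = '&timescope=custom:{}:{}'.format(
    start_date.strftime('%Y-%m-%d') + '-0',
    mid_date.strftime('%Y-%m-%d') + '-0')
second_half = '&timescope=custom:{}:{}'.format(
    (mid_date + timedelta(days=1)).strftime('%Y-%m-%d') + '-0',
    (end_date + timedelta(days=1)).strftime('%Y-%m-%d') + '-0')
print(first_half)   # &timescope=custom:2023-01-01-0:2023-01-05-0
print(second_half)  # &timescope=custom:2023-01-06-0:2023-01-11-0
```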
When searching weibo, the following situation can occur: a page contains very few posts (well below the 7 per page I normally get here), presumably because Weibo has filtered some of the content. In the most extreme case a page may contain nothing at all, which then looks like empty search results.
Because of this, parse_page seems to terminate early.
In particular, if the first page contains nothing, methods such as parse and parse_by_day stop immediately.
These are my current changes.
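For reference, here is a minimal sketch (not code from this repository) of a stricter emptiness check along these lines. It assumes that a page whose posts were merely filtered still renders a pagination bar or at least one feed card, whereas a genuinely empty query renders only the no-result card; the XPaths are the same ones search.py already uses.

```python
def is_truly_empty(response):
    """Heuristic: treat the query as empty only when Weibo shows the
    no-result card AND there is neither a pagination bar nor any feed card."""
    no_result = response.xpath('//div[@class="card card-no-result s-pt20b40"]')
    page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
    feed_cards = response.xpath('//div[@class="card-wrap"][@mid]')
    return bool(no_result) and page_count == 0 and not feed_cards
```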