@@ -42,7 +42,7 @@ def save_one_article(self, article_content, img_save_flag=True, content_save_fla
42
42
"""
43
43
# 整理文章关键信息
44
44
nickname = re .search (r'var nickname.*"(.*?)".*' , article_content ).group (1 ) # 公众号名称
45
- article_link = re .search (r'var msg_link = .*"(.*?)".*' , article_content ).group (1 ) # 文章短链接
45
+ article_link = re .search (r'var msg_link = .*"(.*?)".*' , article_content ).group (1 ) # 文章链接
46
46
createTime = re .search (r"var createTime = '(.*?)'.*" , article_content ).group (1 ) # 文章创建时间
47
47
# year, month, day = createTime.split(" ")[0].split("-") # 年,月,日
48
48
# hour, minute = createTime.split(" ")[1].split(":") # 小时,分钟
@@ -121,8 +121,15 @@ def get_one_article(self, url, img_save_flag=True, content_save_flag=True):
121
121
# 验证请求
122
122
if 'var createTime = ' in res .text : # 正常获取到文章内容
123
123
print ('正常获取到文章内容,开始保存操作' )
124
- self .save_one_article (res .text , img_save_flag , content_save_flag ) # 开始保存单篇文章
125
- return {'content_flag' : 1 , 'content' : res .text } # 用来获取公众号主页链接
124
+ try :
125
+ self .save_one_article (res .text , img_save_flag , content_save_flag ) # 开始保存单篇文章
126
+ return {'content_flag' : 1 , 'content' : res .text } # 用来获取公众号主页链接
127
+ except :
128
+ article_title = re .search (r"var title = '(.*?)'.*" , res .text ) # 文章标题
129
+ if article_title : article_title = article_title .group (1 )
130
+ print ('检测到抓取出错,文章名>>>> ' + article_title )
131
+ print ('检测到抓取出错,文章链接>>>> ' + url )
132
+ return {'content_flag' : 0 }
126
133
elif '>当前环境异常,完成验证后即可继续访问。<' in res .text :
127
134
print ('当前环境异常,请检查链接后访问!!!' ) # 代码访问遇到人机验证,需进行验证操作
128
135
return {'content_flag' : 0 }
@@ -599,29 +606,7 @@ def get_detail_new(self, link):
599
606
if read_num == [] or read_num == '' :
600
607
return '' , '' , '' , ''
601
608
else :
602
- print (local_time , createTime , article_title , link , article_texts , # 本地创建时间,文章发布时间,标题,链接,文本,
603
- read_num [0 ], like_num [0 ], share_num [0 ], show_read [0 ], # 阅读量,点赞数,转发数,在看数,
604
- comments , comments_star_nums ) # 评论,评论点赞
605
609
return (local_time , createTime , article_title , link , article_texts , # 本地创建时间,文章发布时间,标题,链接,文本,
606
610
read_num [0 ], like_num [0 ], share_num [0 ], show_read [0 ], # 阅读量,点赞数,转发数,在看数,
607
611
comments , comments_star_nums ) # 评论,评论点赞
608
612
609
-
610
- # url = 'https://mp.weixin.qq.com/s/JcsoqS2u2qC_J1V4EdlWmg'
611
- # url = 'http://mp.weixin.qq.com/s?__biz=MjM5MjAxNDM4MA==&mid=2666927281&idx=1&sn=e5608f6bc1f681f2495550596605d73e&chksm=bc94670a04237bf6638785f637da33b013d5971ff012073f44b73b199e8ea2e0497b3dc17060&scene=27'
612
-
613
- # # url = 'http://mp.weixin.qq.com/s?__biz=Mzg2MzUzNTA3MA==&mid=2247625647&idx=2&sn=aab4c87e990845af7299bd506837d81d&chksm=cf1218503cb08eec0a81148e7911689ae04a68de12ef672ddd591504bf2142b46a3ae573e59f&scene=27'
614
- # access_token = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5MjAxNDM4MA==&scene=124&uin=MTM1NDgyNTcyNw%3D%3D&key=daf9bdc5abc4e8d0fc74d77961957e5251665583011fa0c9de5108844084772e40373d4806a7b8ed6dc20c64c8cfb3af2892e050f37dd255af53bdba182a541be14e5178d05dff7b365afa3390f9c4af1c47272125eb8e8e37f1a9855530feb33245128692baf6801dceca5f4f25f2243429a419322cc1d5fc69fbfaf3cc8c48&devicetype=Windows+11+x64&version=63090c2d&lang=zh_CN&a8scene=1&acctmode=0&pass_ticket=PNwCUSrnyMnXHGldjYvgys9R6BSA%2FhSd6VFTeCBWd74jJFF%2F1FvGh1G2cLSpcDe8&wx_header=1'
615
-
616
-
617
- # ap = ArticleDetail()
618
- # # ap.access_origin_list(access_token, 2) # 获取公众号的前2页文章并保存,默认获取全部文章
619
- # # ap.get_list_article(url, False) # 此处url是公众号下任意一篇文章
620
- # ap.get_detail_list(access_token)
621
-
622
- # 检验access_token是否合法
623
- # ap.biz = str(re.search('biz=(.*?)&', access_token).group(1))
624
- # ap.uin = str(re.search('uin=(.*?)&', access_token).group(1))
625
- # ap.key = str(re.search('key=(.*?)&', access_token).group(1))
626
- # ap.pass_ticket = str(re.search('pass_ticket=(.*?)&', access_token).group(1))
627
- # ap.get_detail_new(url)
0 commit comments