+ # -*- coding:utf-8 -*-
+
+ import re
+ import requests
+ import os
+ import urllib.request
+ import ssl
+
+ from urllib.parse import urlsplit
+ from os.path import basename
+ import json
+
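+ # Use an unverified SSL context for all HTTPS requests made through urllib,
+ # so urlretrieve below does not abort on certificate errors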
+ ssl._create_default_https_context = ssl._create_unverified_context
+
+ headers = {
+     'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
+     'Accept-Encoding': 'gzip, deflate'
+ }
+
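+ # Page through the answers of question `qid` via Zhihu's v4 API (10 answers
+ # per request), extract every data-original image URL from each answer's HTML,
+ # and append the full-size ones to <title>.txt via write_file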
+ def get_image_url(qid, title):
+     answers_url = 'https://www.zhihu.com/api/v4/questions/' + str(qid) + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset={}&limit=10&sort_by=default&platform=desktop'
+     offset = 0
+     session = requests.Session()
+
+     while True:
+         page = session.get(answers_url.format(offset), headers=headers)
+         json_text = json.loads(page.text)
+         answers = json_text['data']
+
+         offset += 10
+
+         # An empty data list means we have paged past the last answer
+         if not answers:
+             print('Finished collecting image URLs')
+             return
+
+         pic_re = re.compile('data-original="(.*?)"', re.S)
+
+         for answer in answers:
+             tmp_list = []
+             pic_urls = re.findall(pic_re, answer['content'])
+
+             for item in pic_urls:
+                 # Strip the escape character \
+                 pic_url = item.replace("\\", "")
+                 # Drop the query string to get the canonical image URL
+                 pic_url = pic_url.split('?')[0]
+
+                 # Deduplicate within this answer
+                 if pic_url not in tmp_list:
+                     tmp_list.append(pic_url)
+
+             # Keep only URLs ending in r.jpg (Zhihu's full-size variant)
+             for pic_url in tmp_list:
+                 if pic_url.endswith('r.jpg'):
+                     print(pic_url)
+                     write_file(title, pic_url)
+
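+ # Append one image URL per line to <title>.txt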
+ def write_file(title, pic_url):
+     file_name = title + '.txt'
+
+     with open(file_name, 'a') as f:
+         f.write(pic_url + '\n')
+
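+ # Read <title>.txt back and return the list of unique image URLs it contains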
+ def read_file(title):
+     file_name = title + '.txt'
+
+     pic_urls = []
+
+     # Check whether the file exists
+     if not os.path.exists(file_name):
+         return pic_urls
+
+     with open(file_name, 'r') as f:
+         for line in f:
+             url = line.replace("\n", "")
+             if url not in pic_urls:
+                 pic_urls.append(url)
+
+     print("The file contains {} unique URLs".format(len(pic_urls)))
+     return pic_urls
+
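+ # Download every URL into a folder named after the question title, skipping
+ # files that already exist and collecting failures for a summary at the end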
+ def download_pic(pic_urls, title):
+
+     # Create the output folder
+     if not os.path.exists(title):
+         os.makedirs(title)
+
+     error_pic_urls = []
+     success_pic_num = 0
+     repeat_pic_num = 0
+
+     index = 1
+
+     for url in pic_urls:
+         # Name the local file after the last path segment of the URL
+         file_name = os.sep.join((title, basename(urlsplit(url)[2])))
+
+         if os.path.exists(file_name):
+             print("Image {} already exists".format(file_name))
+             index += 1
+             repeat_pic_num += 1
+             continue
+
+         try:
+             urllib.request.urlretrieve(url, file_name)
+             success_pic_num += 1
+             index += 1
+             print("Downloaded {}! ({}/{})".format(file_name, index, len(pic_urls)))
+         except Exception:
+             print("Failed to download {}! ({}/{})".format(file_name, index, len(pic_urls)))
+             error_pic_urls.append(url)
+             index += 1
+             continue
+
+     print("All images downloaded! (success: {} / duplicate: {} / failed: {})".format(success_pic_num, repeat_pic_num, len(error_pic_urls)))
+
+     if len(error_pic_urls) > 0:
+         print('URLs that failed to download:')
+         for error_url in error_pic_urls:
+             print(error_url)
+
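+ # Entry point: collect the image URLs for one question, then download them all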
+ if __name__ == '__main__':
+
+     qid = 406321189
+     title = '你们身边有什么素人美女吗(颜值身材巨好的那种)?'
+
+     get_image_url(qid, title)
+
+     pic_urls = read_file(title)
+     # Download the files
+     download_pic(pic_urls, title)