{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python36664bitea6884f10f474b21a2a2f022451e0d09",
   "display_name": "Python 3.6.6 64-bit",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from googleapiclient.discovery import build\n",
    "from google_auth_oauthlib.flow import InstalledAppFlow\n",
    "from google.auth.transport.requests import Request\n",
    "\n",
    "import urllib.parse as p\n",
    "import re\n",
    "import os\n",
    "import pickle\n",
    "\n",
    "SCOPES = [\"https://www.googleapis.com/auth/youtube.force-ssl\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def youtube_authenticate():\n",
    "    os.environ[\"OAUTHLIB_INSECURE_TRANSPORT\"] = \"1\"\n",
    "    api_service_name = \"youtube\"\n",
    "    api_version = \"v3\"\n",
    "    client_secrets_file = \"credentials.json\"\n",
    "    creds = None\n",
    "    # the file token.pickle stores the user's access and refresh tokens, and is\n",
    "    # created automatically when the authorization flow completes for the first time\n",
    "    if os.path.exists(\"token.pickle\"):\n",
    "        with open(\"token.pickle\", \"rb\") as token:\n",
    "            creds = pickle.load(token)\n",
    "    # if there are no (valid) credentials available, let the user log in\n",
    "    if not creds or not creds.valid:\n",
    "        if creds and creds.expired and creds.refresh_token:\n",
    "            creds.refresh(Request())\n",
    "        else:\n",
    "            flow = InstalledAppFlow.from_client_secrets_file(client_secrets_file, SCOPES)\n",
    "            creds = flow.run_local_server(port=0)\n",
    "        # save the credentials for the next run\n",
    "        with open(\"token.pickle\", \"wb\") as token:\n",
    "            pickle.dump(creds, token)\n",
    "\n",
    "    return build(api_service_name, api_version, credentials=creds)\n",
    "\n",
    "# authenticate to the YouTube API\n",
    "youtube = youtube_authenticate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_video_id_by_url(url):\n",
    "    \"\"\"\n",
    "    Return the video ID from the video `url`\n",
    "    \"\"\"\n",
    "    # split URL parts\n",
    "    parsed_url = p.urlparse(url)\n",
    "    # get the video ID by parsing the query of the URL\n",
    "    video_id = p.parse_qs(parsed_url.query).get(\"v\")\n",
    "    if video_id:\n",
    "        return video_id[0]\n",
    "    else:\n",
    "        raise Exception(f\"Wasn't able to parse video URL: {url}\")\n",
    "\n",
    "def get_video_details(youtube, **kwargs):\n",
    "    return youtube.videos().list(\n",
    "        part=\"snippet,contentDetails,statistics\",\n",
    "        **kwargs\n",
    "    ).execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_video_infos(video_response):\n",
    "    items = video_response.get(\"items\")[0]\n",
    "    # get the snippet, statistics & content details from the video response\n",
    "    snippet = items[\"snippet\"]\n",
    "    statistics = items[\"statistics\"]\n",
    "    content_details = items[\"contentDetails\"]\n",
    "    # get infos from the snippet\n",
    "    channel_title = snippet[\"channelTitle\"]\n",
    "    title = snippet[\"title\"]\n",
    "    description = snippet[\"description\"]\n",
    "    publish_time = snippet[\"publishedAt\"]\n",
    "    # get stats infos\n",
    "    comment_count = statistics[\"commentCount\"]\n",
    "    like_count = statistics[\"likeCount\"]\n",
    "    # dislikeCount may be absent since YouTube made dislike counts private (late 2021)\n",
    "    dislike_count = statistics.get(\"dislikeCount\", \"N/A\")\n",
    "    view_count = statistics[\"viewCount\"]\n",
    "    # get duration from content details\n",
    "    duration = content_details[\"duration\"]\n",
    "    # duration comes in ISO 8601 form, e.g. 'PT5H50M15S'\n",
    "    # parse it into something like '5:50:15' (each part is optional)\n",
    "    parsed_duration = re.search(r\"PT(\\\d+H)?(\\\d+M)?(\\\d+S)?\", duration).groups()\n",
    "    duration_str = \"\"\n",
    "    for d in parsed_duration:\n",
    "        if d:\n",
    "            duration_str += f\"{d[:-1]}:\"\n",
    "    duration_str = duration_str.strip(\":\")\n",
    "    print(f\"\"\"\\\n",
    "    Title: {title}\n",
    "    Description: {description}\n",
    "    Channel Title: {channel_title}\n",
    "    Publish time: {publish_time}\n",
    "    Duration: {duration_str}\n",
    "    Number of comments: {comment_count}\n",
    "    Number of likes: {like_count}\n",
    "    Number of dislikes: {dislike_count}\n",
    "    Number of views: {view_count}\n",
    "    \"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "video_url = \"https://www.youtube.com/watch?v=jNQXAC9IVRw&ab_channel=jawed\"\n",
    "# parse video ID from URL\n",
    "video_id = get_video_id_by_url(video_url)\n",
    "# make API call to get video info\n",
    "response = get_video_details(youtube, id=video_id)\n",
    "# print extracted video infos\n",
    "print_video_infos(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def search(youtube, **kwargs):\n",
    "    return youtube.search().list(\n",
    "        part=\"snippet\",\n",
    "        **kwargs\n",
    "    ).execute()\n",
    "\n",
    "# search for the query 'python' and retrieve 2 items only\n",
    "response = search(youtube, q=\"python\", maxResults=2)\n",
    "items = response.get(\"items\")\n",
    "for item in items:\n",
    "    # get the video ID\n",
    "    video_id = item[\"id\"][\"videoId\"]\n",
    "    # get the video details\n",
    "    video_response = get_video_details(youtube, id=video_id)\n",
    "    # print the video details\n",
    "    print_video_infos(video_response)\n",
    "    print(\"=\"*50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_channel_url(url):\n",
    "    \"\"\"\n",
    "    This function takes a channel `url` and checks whether it includes a\n",
    "    channel ID, a user ID or a channel name\n",
    "    \"\"\"\n",
    "    path = p.urlparse(url).path\n",
    "    id = path.split(\"/\")[-1]\n",
    "    if \"/c/\" in path:\n",
    "        return \"c\", id\n",
    "    elif \"/channel/\" in path:\n",
    "        return \"channel\", id\n",
    "    elif \"/user/\" in path:\n",
    "        return \"user\", id\n",
    "\n",
    "\n",
    "def get_channel_id_by_url(youtube, url):\n",
    "    \"\"\"\n",
    "    Returns the channel ID for a given channel `url`.\n",
    "    The URL is parsed into a `method` and an `id`:\n",
    "    - `method` (str): can be 'c', 'channel' or 'user'\n",
    "    - `id` (str): if method is 'c', then `id` is the display name,\n",
    "      if method is 'channel', then it's the channel ID,\n",
    "      if method is 'user', then it's the username\n",
    "    \"\"\"\n",
    "    # parse the channel URL\n",
    "    method, id = parse_channel_url(url)\n",
    "    if method == \"channel\":\n",
    "        # if it's a channel ID, then just return it\n",
    "        return id\n",
    "    elif method == \"user\":\n",
    "        # if it's a user ID, make a request to get the channel ID\n",
    "        response = get_channel_details(youtube, forUsername=id)\n",
    "        items = response.get(\"items\")\n",
    "        if items:\n",
    "            channel_id = items[0].get(\"id\")\n",
    "            return channel_id\n",
    "    elif method == \"c\":\n",
    "        # if it's a channel name, search for the channel using the name\n",
    "        # may be inaccurate\n",
    "        response = search(youtube, q=id, maxResults=1)\n",
    "        items = response.get(\"items\")\n",
    "        if items:\n",
    "            channel_id = items[0][\"snippet\"][\"channelId\"]\n",
    "            return channel_id\n",
    "    raise Exception(f\"Cannot find ID:{id} with {method} method\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_channel_videos(youtube, **kwargs):\n",
    "    return youtube.search().list(\n",
    "        **kwargs\n",
    "    ).execute()\n",
    "\n",
    "\n",
    "def get_channel_details(youtube, **kwargs):\n",
    "    return youtube.channels().list(\n",
    "        part=\"statistics,snippet,contentDetails\",\n",
    "        **kwargs\n",
    "    ).execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "channel_url = \"https://www.youtube.com/channel/UC8butISFwT-Wl7EV0hUK0BQ\"\n",
    "# get the channel ID from the URL\n",
    "channel_id = get_channel_id_by_url(youtube, channel_url)\n",
    "# get the channel details\n",
    "response = get_channel_details(youtube, id=channel_id)\n",
    "# extract channel infos\n",
    "snippet = response[\"items\"][0][\"snippet\"]\n",
    "statistics = response[\"items\"][0][\"statistics\"]\n",
    "channel_country = snippet[\"country\"]\n",
    "channel_description = snippet[\"description\"]\n",
    "channel_creation_date = snippet[\"publishedAt\"]\n",
    "channel_title = snippet[\"title\"]\n",
    "channel_subscriber_count = statistics[\"subscriberCount\"]\n",
    "channel_video_count = statistics[\"videoCount\"]\n",
    "channel_view_count = statistics[\"viewCount\"]\n",
    "print(f\"\"\"\n",
    "Title: {channel_title}\n",
    "Published At: {channel_creation_date}\n",
    "Description: {channel_description}\n",
    "Country: {channel_country}\n",
    "Number of videos: {channel_video_count}\n",
    "Number of subscribers: {channel_subscriber_count}\n",
    "Total views: {channel_view_count}\n",
    "\"\"\")\n",
    "# the following is grabbing channel videos\n",
    "# number of pages you want to get\n",
    "n_pages = 2\n",
    "# counting number of videos grabbed\n",
    "n_videos = 0\n",
    "next_page_token = None\n",
    "for i in range(n_pages):\n",
    "    params = {\n",
    "        'part': 'snippet',\n",
    "        'q': '',\n",
    "        'channelId': channel_id,\n",
    "        'type': 'video',\n",
    "    }\n",
    "    if next_page_token:\n",
    "        params['pageToken'] = next_page_token\n",
    "    res = get_channel_videos(youtube, **params)\n",
    "    channel_videos = res.get(\"items\")\n",
    "    for video in channel_videos:\n",
    "        n_videos += 1\n",
    "        video_id = video[\"id\"][\"videoId\"]\n",
    "        # easily construct video URL by its ID\n",
    "        video_url = f\"https://www.youtube.com/watch?v={video_id}\"\n",
    "        video_response = get_video_details(youtube, id=video_id)\n",
    "        print(f\"================Video #{n_videos}================\")\n",
    "        # print the video details\n",
    "        print_video_infos(video_response)\n",
    "        print(f\"Video URL: {video_url}\")\n",
    "        print(\"=\"*40)\n",
    "    print(\"*\"*100)\n",
    "    # if there is a next page, then add it to our parameters\n",
    "    # to proceed to the next page\n",
    "    if \"nextPageToken\" in res:\n",
    "        next_page_token = res[\"nextPageToken\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_comments(youtube, **kwargs):\n",
    "    return youtube.commentThreads().list(\n",
    "        part=\"snippet\",\n",
    "        **kwargs\n",
    "    ).execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# URL can be a channel or a video, to extract comments\n",
    "url = \"https://www.youtube.com/watch?v=jNQXAC9IVRw&ab_channel=jawed\"\n",
    "# parameters to send to the commentThreads API endpoint\n",
    "params = {\n",
    "    'maxResults': 2,\n",
    "    'order': 'relevance', # default is 'time' (newest)\n",
    "}\n",
    "if \"watch\" in url:\n",
    "    # that's a video\n",
    "    video_id = get_video_id_by_url(url)\n",
    "    params['videoId'] = video_id\n",
    "else:\n",
    "    # should be a channel\n",
    "    channel_id = get_channel_id_by_url(youtube, url)\n",
    "    params['allThreadsRelatedToChannelId'] = channel_id\n",
    "# get the first 2 pages (2 API requests)\n",
    "n_pages = 2\n",
    "for i in range(n_pages):\n",
    "    # make API call to get all comments from the channel (including posts & videos)\n",
    "    response = get_comments(youtube, **params)\n",
    "    items = response.get(\"items\")\n",
    "    # if items is empty, break out of the loop\n",
    "    if not items:\n",
    "        break\n",
    "    for item in items:\n",
    "        comment = item[\"snippet\"][\"topLevelComment\"][\"snippet\"][\"textDisplay\"]\n",
    "        updated_at = item[\"snippet\"][\"topLevelComment\"][\"snippet\"][\"updatedAt\"]\n",
    "        like_count = item[\"snippet\"][\"topLevelComment\"][\"snippet\"][\"likeCount\"]\n",
    "        comment_id = item[\"snippet\"][\"topLevelComment\"][\"id\"]\n",
    "        print(f\"\"\"\\\n",
    "        Comment: {comment}\n",
    "        Likes: {like_count}\n",
    "        Updated At: {updated_at}\n",
    "        ==================================\\\n",
    "        \"\"\")\n",
    "    if \"nextPageToken\" in response:\n",
    "        # if there is a next page,\n",
    "        # add the next page token to the params we pass to the function\n",
    "        params[\"pageToken\"] = response[\"nextPageToken\"]\n",
    "    else:\n",
    "        # must be the end of the comments\n",
    "        break\n",
    "    print(\"*\"*70)"
   ]
  },
+ },
388
+ {
389
+ "cell_type" : " code" ,
390
+ "execution_count" : null ,
391
+ "metadata" : {},
392
+ "outputs" : [],
393
+ "source" : []
394
+ }
395
+ ]
396
+ }