-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
70 lines (66 loc) · 2.63 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from __future__ import division
from bs4 import BeautifulSoup
import requests
import shutil
import os.path
import re
'''
This script scrapes the memegenerator.net website and downloads meme images and caption text to the local drive.
'''
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
images_path = 'memes3'           # directory that receives one .jpg per template
captions_path = 'Captions3.txt'  # text file that caption lines are appended to
# Number of caption pages requested per template. Per the original author's
# note, this number * 15 is the total number of captions per template.
n_captions = 15
# Total number of listing pages of popular templates to walk through.
n_pages = 300
# Seconds to wait on any single HTTP request, so that a stalled connection
# cannot hang the whole run.
request_timeout = 30

# ---------------------------------------------------------------------------
# Run state
# ---------------------------------------------------------------------------
Uerrors = 0                # captions skipped because they could not be encoded
num_memes_downloaded = 0   # template images actually written to disk
# Strips everything except ASCII letters, digits and spaces from a template
# name so that it can be used as a file name.
regex = re.compile('[^a-zA-Z0-9 ]')

# open(..., 'wb') does not create missing directories, so make sure the image
# directory exists before the first download.
os.makedirs(images_path, exist_ok=True)

# range() excludes its stop value; the +1 makes n_pages the number of pages
# that are really visited.
for i in range(1, n_pages + 1):
    # The first listing page has no '/page/N' suffix. To scrape the monthly
    # ranking instead, replace 'alltime' with 'month' in both URLs.
    if i == 1:
        url = 'https://memegenerator.net/memes/popular/alltime'
    else:
        url = 'https://memegenerator.net/memes/popular/alltime/page/' + str(i)

    r = requests.get(url, timeout=request_timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    chars = soup.find_all(class_='char-img')
    if not chars:
        # No template entries on this page: stop walking the listing.
        break

    for char in chars:
        link = char.find('a')
        img = char.find('img')
        # find() returns None when the element is missing; skip incomplete
        # entries instead of failing on a subscript of None.
        if link is None or img is None:
            continue
        href = link.get('href')
        img_url = img.get('src')
        if not href or not img_url:
            continue

        # File stem: sanitised alt text with whitespace runs turned into '_'.
        image_name = regex.sub('', img.get('alt', ''))
        name_of_file = '_'.join(image_name.split())

        # Download the template image right away so that the streamed
        # connection is not kept open while the caption pages are fetched;
        # the context manager releases it even if writing the file fails.
        completeName = os.path.join(images_path, name_of_file + '.jpg')
        with requests.get(img_url, stream=True,
                          timeout=request_timeout) as response:
            # Only store (and count) successful responses, so that an HTTP
            # error body is never saved under a .jpg name.
            if response.ok:
                # Have the raw stream undo any Content-Encoding (gzip or
                # deflate) so that the bytes on disk are the image itself.
                response.raw.decode_content = True
                with open(completeName, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                num_memes_downloaded += 1

        # Walk the caption pages of this template; as with the listing, the
        # first page has no '/page/N' suffix and the +1 makes n_captions the
        # number of pages visited.
        for k in range(1, n_captions + 1):
            if k == 1:
                URL = 'https://memegenerator.net' + href
            else:
                URL = ('https://memegenerator.net' + href
                       + '/images/popular/alltime/page/' + str(k))

            R = requests.get(URL, timeout=request_timeout)
            SOUP = BeautifulSoup(R.text, 'html.parser')
            POSTS = SOUP.find_all(class_='generator-img')
            if not POSTS:
                # No memes on this page: this template has no more captions.
                break

            CHARS = SOUP.find_all(class_='single-generator')
            TAGS = [entry.find(class_='optimized-instance-container img')
                    for entry in CHARS]
            # Entries without a caption container are ignored; multi-line
            # captions are flattened onto a single line.
            MEMES = [tag.getText().strip().replace('\n', ' ')
                     for tag in TAGS if tag is not None]

            # Explicit UTF-8 lets non-ASCII captions be written instead of
            # being rejected by a narrower platform default encoding. The
            # handler stays for text that still cannot be encoded.
            with open(captions_path, 'a', encoding='utf-8') as f:
                for meme in MEMES:
                    try:
                        f.write(f'{name_of_file} - {meme}\n')
                    except UnicodeEncodeError:
                        Uerrors += 1

    # Progress report after every tenth listing page.
    if i % 10 == 0:
        print('<'+'='*10+'>')
        print(i)
        print(Uerrors)

print('num_memes_downloaded =',num_memes_downloaded)