Skip to content

Commit 5812521

Browse files
committed
crawler
1 parent 5db1a5c commit 5812521

File tree

3 files changed

+45
-0
lines changed

3 files changed

+45
-0
lines changed

crawler/1_getsource.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
"""Download a sample web page and print its HTML source."""
from urllib.request import urlopen

PAGE_URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"


def fetch_html(url=PAGE_URL):
    """Return the body of *url* decoded as UTF-8 text.

    The response is opened in a ``with`` block so the underlying
    connection is closed even if reading or decoding fails.

    Raises whatever ``urlopen`` raises for an unreachable URL, and
    ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    with urlopen(url) as response:
        # Decoding the raw bytes keeps non-ASCII text (e.g. Chinese) readable.
        return response.read().decode('utf-8')


def main():
    """Fetch the sample page and print its source."""
    print(fetch_html())


if __name__ == "__main__":
    main()

crawler/2_match.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
"""Fetch a sample page and extract its title, first paragraph and links with regexes."""
import re
from urllib.request import urlopen

PAGE_URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"

# Compiled once at import time; each pattern captures exactly one group.
TITLE_RE = re.compile(r"<title>(.+?)</title>")
# re.DOTALL lets "." span newlines, so a multi-line paragraph still matches.
PARAGRAPH_RE = re.compile(r"<p>(.*?)</p>", flags=re.DOTALL)
HREF_RE = re.compile(r'href="(.*?)"')


def fetch_html(url=PAGE_URL):
    """Return the body of *url* decoded as UTF-8 text.

    The response is opened in a ``with`` block so the underlying
    connection is closed even if reading or decoding fails.
    """
    with urlopen(url) as response:
        # Decoding the raw bytes keeps non-ASCII text (e.g. Chinese) readable.
        return response.read().decode('utf-8')


def first_match(pattern, text, default=None):
    """Return group 1 of the first match of *pattern* in *text*.

    Returns *default* when nothing matches, instead of raising
    ``IndexError`` the way indexing an empty ``findall`` result would.
    """
    found = pattern.search(text)
    return found.group(1) if found else default


def main():
    """Print the page source, then its title, first paragraph and all links."""
    html = fetch_html()
    print(html)

    print("-" * 100)

    print("\nPage title is: ", first_match(TITLE_RE, html))
    print("\nPage paragraph is: ", first_match(PARAGRAPH_RE, html))
    print("\nAll links: ", HREF_RE.findall(html))


if __name__ == "__main__":
    main()

crawler/3_bs4test.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
"""Fetch a sample page and inspect it with BeautifulSoup."""
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Third-party requirements:
#   pip3 install beautifulsoup4
#   pip3 install lxml

PAGE_URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"


def fetch_html(url=PAGE_URL):
    """Return the body of *url* decoded as UTF-8 text.

    The response is opened in a ``with`` block so the underlying
    connection is closed even if reading or decoding fails.
    """
    with urlopen(url) as response:
        # Decoding the raw bytes keeps non-ASCII text (e.g. Chinese) readable.
        return response.read().decode('utf-8')


def extract_links(soup):
    """Return the ``href`` value of every ``<a>`` tag in *soup* that has one.

    ``href=True`` restricts the search to anchors carrying the attribute,
    so an ``<a>`` without ``href`` is skipped rather than raising
    ``KeyError`` on lookup.
    """
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]


def main():
    """Print the page source, its first <h1>, its first <p> and all link targets."""
    html = fetch_html()
    print(html)
    print("-" * 100)

    soup = BeautifulSoup(html, features='lxml')
    print(soup.h1)
    print('\n', soup.p)

    print('\n', extract_links(soup))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)