File tree 3 files changed +45
-0
lines changed
3 files changed +45
-0
lines changed Original file line number Diff line number Diff line change
"""Download a static demo page and print its HTML source."""
from urllib.request import urlopen

PAGE_URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"


def fetch_html(url: str = PAGE_URL) -> str:
    """Return the body of *url* decoded as UTF-8 text.

    ``read()`` yields raw bytes; decoding turns them into ``str`` so that
    non-ASCII content (for example Chinese text) is readable when printed.

    Args:
        url: Address of the page to download. Defaults to the demo page.

    Returns:
        The response body as text.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The ``with`` block closes the response once the body has been read;
    # the bare ``urlopen(...).read()`` form left it open.
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def main() -> None:
    """Fetch the demo page and print its HTML."""
    print(fetch_html())


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
"""Fetch a demo page and extract its title, first paragraph and links.

The extraction uses regular expressions only, which is adequate for the
small, well-formed demo page this script targets.
"""
import re
from typing import List, Optional
from urllib.request import urlopen

PAGE_URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"

# Printed in place of a value the page does not contain.
NOT_FOUND = "(not found)"


def fetch_html(url: str = PAGE_URL) -> str:
    """Return the body of *url* decoded as UTF-8 text.

    ``read()`` yields raw bytes; decoding turns them into ``str`` so that
    non-ASCII content (for example Chinese text) is readable when printed.
    The ``with`` block closes the response once the body has been read.
    """
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def find_title(html: str) -> Optional[str]:
    """Return the text of the first ``<title>`` element, or ``None``.

    ``.`` does not match newlines here, so only a title whose text sits on
    a single line is found.
    """
    match = re.search(r"<title>(.+?)</title>", html)
    return match.group(1) if match else None


def find_first_paragraph(html: str) -> Optional[str]:
    """Return the text of the first ``<p>`` element, or ``None``.

    ``re.DOTALL`` lets ``.`` match newlines, so a paragraph spanning
    several lines is captured whole.
    """
    match = re.search(r"<p>(.*?)</p>", html, flags=re.DOTALL)
    return match.group(1) if match else None


def find_links(html: str) -> List[str]:
    """Return the value of every double-quoted ``href`` attribute, in order."""
    return re.findall(r'href="(.*?)"', html)


def main() -> None:
    """Download the demo page and print its HTML, title, paragraph and links."""
    html = fetch_html()
    print(html)

    print("-" * 100)

    # Indexing ``findall(...)[0]`` raised IndexError on pages lacking the
    # element; a placeholder is printed instead.
    title = find_title(html)
    print("\n Page title is: ", NOT_FOUND if title is None else title)

    paragraph = find_first_paragraph(html)
    print("\n Page paragraph is: ", NOT_FOUND if paragraph is None else paragraph)

    print("\n All links: ", find_links(html))


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
"""Fetch a demo page and extract its heading, first paragraph and links.

Parsing is done with BeautifulSoup using the lxml parser.

Third-party requirements:
    pip3 install beautifulsoup4
    pip3 install lxml
"""
from urllib.request import urlopen

from bs4 import BeautifulSoup

PAGE_URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"


def fetch_html(url: str = PAGE_URL) -> str:
    """Return the body of *url* decoded as UTF-8 text.

    ``read()`` yields raw bytes; decoding turns them into ``str`` so that
    non-ASCII content (for example Chinese text) is readable when printed.
    The ``with`` block closes the response once the body has been read.
    """
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def find_links(soup: BeautifulSoup) -> list:
    """Return the ``href`` value of every ``<a>`` tag that has one.

    ``href=True`` restricts the search to anchors that carry the
    attribute; subscripting an anchor without it raised ``KeyError``.
    """
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]


def main() -> None:
    """Download the demo page and print its HTML, heading, paragraph and links."""
    html = fetch_html()
    print(html)
    print("-" * 100)

    soup = BeautifulSoup(html, features="lxml")
    # Attribute access returns the first matching tag, or None if absent.
    print(soup.h1)
    print('\n ', soup.p)

    print('\n ', find_links(soup))


if __name__ == "__main__":
    main()
You can’t perform that action at this time.
0 commit comments