from bs4 import BeautifulSoup

html_doc = """
<html><head><title>index</title></head>
<body>
<p class="title"><b>Home</b></p>
<p class="main">My frequently used websites
<a href="https://www.google.com" class="website" id="google">Google</a>
<a href="https://www.baidu.com" class="website" id="baidu">Baidu</a>
<a href="https://cn.bing.com" class="website" id="bing">Bing</a>
</p>
<div><!--This is a comment--></div>
<p class="content1">...</p>
<p class="content2">...</p>
</body>
</html>
"""

# demo 1: find tags by name
soup = BeautifulSoup(html_doc, "lxml")
tags = soup.find_all('b')
print(tags)
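
# A related call (a small sketch, not in the original demo): find() returns
# only the first matching tag, or None when nothing matches.
first_b = soup.find('b')
print(first_b)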


# demo 2: the name filter can also be a regular expression;
# "^b" matches tag names starting with "b" (<body> and <b> here)
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)


# demo 3: a list of names matches any of them
for tag in soup.find_all(['a', 'b']):
    print(tag)


# demo 4: True matches every tag in the document
for tag in soup.find_all(True):
    print(tag.name, end=', ')


# demo 5: a function filter receives each tag and keeps it when True is returned
def has_id_class(tag):
    return tag.has_attr('id') and tag.has_attr('class')

tags = soup.find_all(has_id_class)
for tag in tags:
    print(tag)


# demo 6: keyword arguments filter on tag attributes
tags = soup.find_all(id='google')
print(tags[0]['href'])

for tag in soup.find_all(id=True):
    print(tag['href'])
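
# The attrs dict is an equivalent spelling, handy when an attribute name
# clashes with a Python keyword (a small sketch, not in the original demo).
tags = soup.find_all(attrs={"id": "google"})
print(tags)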


# demo 7: "class" is a reserved word in Python, so Beautiful Soup uses class_
tags = soup.find_all("a", class_="website")
for tag in tags:
    print(tag['href'])

def has_seven_characters(css_class):
    return css_class is not None and len(css_class) == 7

for tag in soup.find_all(class_=has_seven_characters):
    print(tag['id'])
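
# class_ accepts the same kinds of filters as the name argument, e.g. a
# regular expression (a small sketch, not in the original demo).
for tag in soup.find_all(class_=re.compile("^web")):
    print(tag['id'])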


# demo 8: matching a single value of a multi-valued class attribute
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')
tags = css_soup.find_all("p", class_="strikeout")
print(tags)
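
# Requiring both classes at once (in any order) works with a CSS selector;
# a small sketch, not in the original demo.
print(css_soup.select("p.strikeout.body"))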


# demo 9: text= searches the document's strings instead of its tags
tags = soup.find_all(text="Google")
print("google : ", tags)

tags = soup.find_all(text=["Baidu", "Bing"])
print("baidu & bing : ", tags)

tags = soup.find_all('a', text="Google")
print("a[text=google] : ", tags)
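
# In Beautiful Soup 4.4+ the same argument is also spelled string=, which the
# newer documentation prefers (a sketch, assuming a recent bs4 version).
tags = soup.find_all(string="Google")
print("string=google : ", tags)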


# demo 10: limit= caps the number of results; recursive=False searches only
# direct children, and the document's only direct child is <html>, so no <p>
# is found and an empty list is printed
tag = soup.find_all("a", limit=1)
print(tag)

tags = soup.find_all("p", recursive=False)
print(tags)
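
# Contrast sketch (not in the original demo): the <p> tags are direct
# children of <body>, so searching from there does find them.
print(soup.body.find_all("p", recursive=False))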


# demo 11: CSS descendant selector
tags = soup.select("body a")
for tag in tags:
    print(tag['href'])


# demo 12: CSS child combinator, alone and combined with an id
tags = soup.select("p > a")
print(tags)

tags = soup.select("p > #google")
print(tags)
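
# A sibling combinator also works: every .website tag that follows #google
# under the same parent (a small sketch, not in the original demo).
print(soup.select("#google ~ .website"))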


# demo 13: select by CSS class
tags = soup.select(".website")
for tag in tags:
    print(tag.string)


# demo 14: select by id
tags = soup.select("#google")
print(tags)
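
# select_one() (bs4 4.4+) returns the first match directly instead of a list
# (a small sketch, not in the original demo).
print(soup.select_one("#google"))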


# demo 15: select by attribute value
tags = soup.select('a[href="https://cn.bing.com"]')
print(tags)
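
# Attribute selectors also support prefix matching, e.g. every link whose
# href starts with "https://www." (a small sketch, not in the original demo).
print(soup.select('a[href^="https://www."]'))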