Skip to content

Commit 976ab65

Browse files
committed
update package
1 parent 7ad44fb commit 976ab65

File tree

4 files changed

+573
-203
lines changed

4 files changed

+573
-203
lines changed

README.md

Lines changed: 142 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -5,130 +5,159 @@
55
[![Go Report Card](https://goreportcard.com/badge/github.com/gitgitcode/alink)](https://goreportcard.com/report/github.com/gitgitcode/alink)
66
- [ZH](#简介)
77

8-
Golang package to read href,video,title ... tags from an HTML page。
8+
Golang package to read href, video, title, img ... tags from an HTML page.
99

1010

1111
## 简介
1212

13-
一个简单的Golang package 主要用来读取HTML页面中的 ``` <title> ,<video>,<a>``` 等元素.
14-
通过 ```alink.NewRespBody``` 方法处理可以读取 ```http.Get``` 返回的```response.Body```内容。
15-
注意如果要多次读取使用io.Reader 要通过 ```body, err := ioutil.ReadAll(b.Body)```读取后再次新建 ``` readerHref := bytes.NewReader(body)``` 的方式来进行。
16-
内部使用html.Parse 解析后返回一个字符串数组指针。
13+
一个简单的Golang package 主要用来读取HTML页面中的 ``` <title> ,<video>的src,<a>的href,<img>的src``` 等元素的内容.
14+
在库里提供了两种方式处理 ```http.Get``` 返回的```response.Body```内容,一是通过 ```alink.GetBytesReaderWithIoReader```方法处理可以读取 ```http.Get``` 返回的```response.Body```内容。
15+
但是如果要***多次***读取使用io.Reader 要通过 ```body, err := ioutil.ReadAll(b.Body)```读取后再次新建 ``` readerHref := bytes.NewReader(body)``` 的方式来进行。
16+
第二种就是使用 ```alink.GetByteWithIoReader``` 方法读取```http.Get``` 返回的```response.Body```,然后使用``WithByte``后缀的方法进行多次读取.
17+
内部方法使用 html.Parse 解析页面内容。
18+
1719

1820
### 例子 Example
1921

20-
- 一个读取google/baidu主页的例子。获取页面的title和全部a连接并打印出来
22+
- 一个读取google/baidu主页的例子。获取页面的img和全部a链接并打印出来
2123

22-
- Use http client Get google/baidu Index Page and collect tags title ,href
24+
- Use the HTTP client to GET the google/baidu index page and collect the img and href tags
2325

2426
```go
2527
package main
2628

2729
import (
28-
"github.com/gitgitcode/alink"
29-
"fmt"
30-
"log"
31-
"math/rand"
32-
"net/http"
33-
"time"
34-
)
35-
var userAgentList = []string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
36-
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
37-
"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
38-
"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
39-
"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
40-
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
41-
"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
42-
"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
43-
"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
44-
"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
45-
"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
46-
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
47-
"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
48-
"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"}
49-
func GetRandomUserAgent() string{
50-
r := rand.New(rand.NewSource(time.Now().UnixNano()))
51-
return userAgentList[r.Intn(len(userAgentList))]
52-
}
53-
var accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
54-
55-
func main() {
56-
57-
str:="https://google.co.jp"
58-
str1:="https://www.baidu.com"
59-
60-
//fmt.Print(alink.IsValidUrl(str1))
61-
client:= http.Client{Timeout: 2 * time.Second}
62-
req,err := http.NewRequest("GET",str,nil)
63-
req1,err1 := http.NewRequest("GET",str1,nil)
64-
65-
if err != nil{
66-
log.Printf("google is err:%s",err.Error())
67-
}
68-
69-
if err1 != nil{
70-
log.Printf("baidu is err:%s",err1.Error())
71-
}
72-
73-
ReqAdd(req)
74-
ReqAdd(req1)
75-
b,err := client.Do(req)
76-
defer client.CloseIdleConnections()
77-
78-
if err != nil{
79-
log.Printf("request google err %s",err.Error())
80-
b1,err1 := client.Do(req1)
81-
if err1 !=nil{
82-
log.Printf("request baidu err %s",err.Error())
83-
return
84-
}
85-
b = b1
86-
}
87-
88-
body, err := ioutil.ReadAll(b.Body)
89-
if err !=nil{
90-
panic(err)
91-
}
92-
//for read twice create new reader
93-
readerHref := bytes.NewReader(body)
94-
//创建两个新 reader
95-
readerTitle := bytes.NewReader(body)
96-
97-
t,f := alink.Title(readerTitle)
98-
99-
if f !=nil {
100-
log.Print(f)
101-
}
102-
fmt.Printf("title:%s \n",t)
103-
104-
a,bl := alink.Alink(readerHref)
105-
106-
107-
if bl {
108-
for i,v := range *a{
109-
fmt.Printf("index:%d=href:%s\n",i,v)
110-
}
111-
}
112-
113-
114-
115-
//title:百度一下,你就知道
116-
//index:0=href:/
117-
// index:1=href:javascript:;
118-
// index:2=href:https://passport.baidu.com/v2
119-
//or
120-
//title:Google
121-
//index:0=href:/
122-
// index:1=href:javascript:;
123-
// index:2=href:https://wwww.google.com/
124-
125-
}
126-
127-
func ReqAdd(req *http.Request) {
128-
req.Header.Set("Cookie","sug=3; a=1; ORIGIN=0; bdime=21110")
129-
req.Header.Add("User-Agent",GetRandomUserAgent() )
130-
req.Header.Add("Accept",accept)
131-
req.Header.Add("Upgrade-Insecure-Requests","1")
132-
}
133-
30+
"github.com/gitgitcode/alink"
31+
"bytes"
32+
"fmt"
33+
"io/ioutil"
34+
"log"
35+
"math/rand"
36+
"net/http"
37+
"time"
38+
)
39+
var userAgentList = []string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
40+
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
41+
"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
42+
"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
43+
"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
44+
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
45+
"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
46+
"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
47+
"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
48+
"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
49+
"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
50+
"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
51+
"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
52+
"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"}
53+
54+
func GetRandomUserAgent() string{
55+
r := rand.New(rand.NewSource(time.Now().UnixNano()))
56+
return userAgentList[r.Intn(len(userAgentList))]
57+
}
58+
59+
var accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
60+
61+
func ReqAdd(req *http.Request) {
62+
req.Header.Set("Cookie","sug=3; a=1; ORIGIN=0; bdime=21110")
63+
req.Header.Add("User-Agent",GetRandomUserAgent() )
64+
req.Header.Add("Accept",accept)
65+
req.Header.Add("Upgrade-Insecure-Requests","1")
66+
}
67+
68+
func main() {
69+
70+
Response ,_:= GetHttpResponseP()
71+
body, err := ioutil.ReadAll(Response.Body)
72+
if err !=nil{
73+
panic(err)
74+
}
75+
GetWithByte(body)
76+
GetWithBytesReaderCreateTwiceNewReader(body)
77+
78+
}
79+
80+
func GetHttpResponseP() (*http.Response,error){
81+
str:="https://google.co.jp"
82+
str1:="https://www.baidu.com"
83+
84+
//fmt.Print(alink.IsValidUrl(str1))
85+
client:= http.Client{Timeout: 2 * time.Second}
86+
req,err := http.NewRequest("GET",str,nil)
87+
req1,err1 := http.NewRequest("GET",str1,nil)
88+
89+
if err != nil{
90+
log.Printf("google is err:%s",err.Error())
91+
}
92+
93+
if err1 != nil{
94+
log.Printf("baidu is err:%s",err1.Error())
95+
}
96+
97+
ReqAdd(req)
98+
ReqAdd(req1)
99+
b,err := client.Do(req)
100+
defer client.CloseIdleConnections()
101+
if err != nil{
102+
log.Printf("request google err %s",err.Error())
103+
b1,err1 := client.Do(req1)
104+
if err1 !=nil{
105+
log.Printf("request baidu err %s",err.Error())
106+
panic(err1)
107+
}
108+
b = b1
109+
}
110+
return b ,nil
111+
}
112+
func GetWithByte(body []byte) {
113+
114+
title, err:= alink.GetTitleWithByte(body)
115+
if err == nil{
116+
fmt.Println(title)
117+
}else{
118+
fmt.Println("GetWithByte GetTitleWithByte err")
119+
}
120+
src,err := alink.GetImgSrcWithByte(body)
121+
if err == nil{
122+
for _,s :=range *src{
123+
fmt.Println(s)
124+
}
125+
}else{
126+
fmt.Println("GetWithByte GetImgSrcWithByte err")
127+
}
128+
129+
}
130+
131+
func GetWithBytesReaderCreateTwiceNewReader(body []byte){
132+
fmt.Println("<=================>")
133+
//for read twice create new reader
134+
readerHref := bytes.NewReader(body)
135+
//创建两个新 reader
136+
readerImg := bytes.NewReader(body)
137+
138+
t,f := alink.GetHrefWithBytesReader (readerImg)
139+
140+
if f !=nil {
141+
log.Print(f)
142+
}
143+
fmt.Printf("Href:%s \n",t)
144+
145+
a,bl := alink.GetImgSrcWithBytesReader(readerHref)
146+
147+
if bl ==nil{
148+
for i,v := range *a{
149+
fmt.Printf("index:%d=href:%s\n",i,v)
150+
}
151+
}
152+
153+
//title:百度一下,你就知道
154+
//index:0=href:/
155+
// index:1=href:javascript:;
156+
// index:2=href:https://passport.baidu.com/v2
157+
//or
158+
//title:Google
159+
//index:0=href:/
160+
// index:1=href:javascript:;
161+
// index:2=href:https://wwww.google.com/
162+
}
134163
```

0 commit comments

Comments
 (0)