|
5 | 5 | [Go Report Card](https://goreportcard.com/report/github.com/gitgitcode/alink)
|
6 | 6 | - [ZH](#简介)
|
7 | 7 |
|
8 |
| -Golang package to read href,video,title ... tags from an HTML page。 |
| 8 | +Golang package to read href, video, title, img ... tags from an HTML page. |
9 | 9 |
|
10 | 10 |
|
11 | 11 | ## 简介
|
12 | 12 |
|
13 |
| -一个简单的Golang package 主要用来读取HTML页面中的 ``` <title> ,<video>,<a>``` 等元素. |
14 |
| -通过 ```alink.NewRespBody``` 方法处理可以读取 ```http.Get``` 返回的```response.Body```内容。 |
15 |
| -注意如果要多次读取使用io.Reader 要通过 ```body, err := ioutil.ReadAll(b.Body)```读取后再次新建 ``` readerHref := bytes.NewReader(body)``` 的方式来进行。 |
16 |
| -内部使用html.Parse 解析后返回一个字符串数组指针。 |
| 13 | +一个简单的Golang package 主要用来读取HTML页面中的 ``` <title> ,<video>的src,<a>的href,<img>的src``` 等元素的内容. |
| 14 | +在库里提供了两种方式处理 ```http.Get``` 返回的```response.Body```内容,一是通过 ```alink.GetBytesReaderWithIoReader```方法处理可以读取 ```http.Get``` 返回的```response.Body```内容。 |
| 15 | +但是如果要***多次***读取使用io.Reader 要通过 ```body, err := ioutil.ReadAll(b.Body)```读取后再次新建 ``` readerHref := bytes.NewReader(body)``` 的方式来进行。 |
| 16 | +第二种就是使用 ```alink.GetByteWithIoReader``` 方法读取```http.Get``` 返回的```response.Body```,再使用带``WithByte``后缀的方法进行多次读取. |
| 17 | +内部方法使用 html.Parse 解析内容。 |
| 18 | + |
17 | 19 |
|
18 | 20 | ### 例子 Example
|
19 | 21 |
|
20 |
| -- 一个读取google/baidu主页的例子。获取页面的title和全部a连接并打印出来 |
| 22 | +- 一个读取google/baidu主页的例子。获取页面的img和全部a链接并打印出来 |
21 | 23 |
|
22 |
| -- Use http client Get google/baidu Index Page and collect tags title ,href |
| 24 | +- Use an HTTP client to GET the google/baidu index page and collect the img and href values |
23 | 25 |
|
24 | 26 | ```go
|
25 | 27 | package main
|
26 | 28 |
|
27 | 29 | import (
|
28 |
| - "github.com/gitgitcode/alink" |
29 |
| - "fmt" |
30 |
| - "log" |
31 |
| - "math/rand" |
32 |
| - "net/http" |
33 |
| - "time" |
34 |
| -) |
35 |
| -var userAgentList = []string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)", |
36 |
| - "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)", |
37 |
| - "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)", |
38 |
| - "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,", |
39 |
| - "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11", |
40 |
| - "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)", |
41 |
| - "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", |
42 |
| - "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", |
43 |
| - "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", |
44 |
| - "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", |
45 |
| - "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", |
46 |
| - "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)", |
47 |
| - "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", |
48 |
| - "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"} |
49 |
| -func GetRandomUserAgent() string{ |
50 |
| - r := rand.New(rand.NewSource(time.Now().UnixNano())) |
51 |
| - return userAgentList[r.Intn(len(userAgentList))] |
52 |
| -} |
53 |
| -var accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" |
54 |
| - |
55 |
| -func main() { |
56 |
| - |
57 |
| - str:="https://google.co.jp" |
58 |
| - str1:="https://www.baidu.com" |
59 |
| - |
60 |
| - //fmt.Print(alink.IsValidUrl(str1)) |
61 |
| - client:= http.Client{Timeout: 2 * time.Second} |
62 |
| - req,err := http.NewRequest("GET",str,nil) |
63 |
| - req1,err1 := http.NewRequest("GET",str1,nil) |
64 |
| - |
65 |
| - if err != nil{ |
66 |
| - log.Printf("google is err:%s",err.Error()) |
67 |
| - } |
68 |
| - |
69 |
| - if err1 != nil{ |
70 |
| - log.Printf("baidu is err:%s",err1.Error()) |
71 |
| - } |
72 |
| - |
73 |
| - ReqAdd(req) |
74 |
| - ReqAdd(req1) |
75 |
| - b,err := client.Do(req) |
76 |
| - defer client.CloseIdleConnections() |
77 |
| - |
78 |
| - if err != nil{ |
79 |
| - log.Printf("request google err %s",err.Error()) |
80 |
| - b1,err1 := client.Do(req1) |
81 |
| - if err1 !=nil{ |
82 |
| - log.Printf("request baidu err %s",err.Error()) |
83 |
| - return |
84 |
| - } |
85 |
| - b = b1 |
86 |
| - } |
87 |
| - |
88 |
| - body, err := ioutil.ReadAll(b.Body) |
89 |
| - if err !=nil{ |
90 |
| - panic(err) |
91 |
| - } |
92 |
| - //for read twice create new reader |
93 |
| - readerHref := bytes.NewReader(body) |
94 |
| - //创建两个新 reader |
95 |
| - readerTitle := bytes.NewReader(body) |
96 |
| - |
97 |
| - t,f := alink.Title(readerTitle) |
98 |
| - |
99 |
| - if f !=nil { |
100 |
| - log.Print(f) |
101 |
| - } |
102 |
| - fmt.Printf("title:%s \n",t) |
103 |
| - |
104 |
| - a,bl := alink.Alink(readerHref) |
105 |
| - |
106 |
| - |
107 |
| - if bl { |
108 |
| - for i,v := range *a{ |
109 |
| - fmt.Printf("index:%d=href:%s\n",i,v) |
110 |
| - } |
111 |
| - } |
112 |
| - |
113 |
| - |
114 |
| - |
115 |
| - //title:百度一下,你就知道 |
116 |
| - //index:0=href:/ |
117 |
| - // index:1=href:javascript:; |
118 |
| - // index:2=href:https://passport.baidu.com/v2 |
119 |
| - //or |
120 |
| -//title:Google |
121 |
| - //index:0=href:/ |
122 |
| - // index:1=href:javascript:; |
123 |
| - // index:2=href:https://wwww.google.com/ |
124 |
| - |
125 |
| -} |
126 |
| - |
127 |
| -func ReqAdd(req *http.Request) { |
128 |
| - req.Header.Set("Cookie","sug=3; a=1; ORIGIN=0; bdime=21110") |
129 |
| - req.Header.Add("User-Agent",GetRandomUserAgent() ) |
130 |
| - req.Header.Add("Accept",accept) |
131 |
| - req.Header.Add("Upgrade-Insecure-Requests","1") |
132 |
| -} |
133 |
| - |
| 30 | + "github.com/gitgitcode/alink" |
| 31 | + "bytes" |
| 32 | + "fmt" |
| 33 | + "io/ioutil" |
| 34 | + "log" |
| 35 | + "math/rand" |
| 36 | + "net/http" |
| 37 | + "time" |
| 38 | + ) |
| 39 | + var userAgentList = []string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)", |
| 40 | + "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)", |
| 41 | + "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)", |
| 42 | + "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,", |
| 43 | + "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11", |
| 44 | + "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)", |
| 45 | + "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", |
| 46 | + "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", |
| 47 | + "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", |
| 48 | + "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", |
| 49 | + "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", |
| 50 | + "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)", |
| 51 | + "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", |
| 52 | + "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"} |
| 53 | + |
| 54 | + func GetRandomUserAgent() string{ |
| 55 | + r := rand.New(rand.NewSource(time.Now().UnixNano())) |
| 56 | + return userAgentList[r.Intn(len(userAgentList))] |
| 57 | + } |
| 58 | + |
| 59 | + var accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" |
| 60 | + |
| 61 | + func ReqAdd(req *http.Request) { |
| 62 | + req.Header.Set("Cookie","sug=3; a=1; ORIGIN=0; bdime=21110") |
| 63 | + req.Header.Add("User-Agent",GetRandomUserAgent() ) |
| 64 | + req.Header.Add("Accept",accept) |
| 65 | + req.Header.Add("Upgrade-Insecure-Requests","1") |
| 66 | + } |
| 67 | + |
| 68 | + func main() { |
| 69 | + |
| 70 | + Response ,_:= GetHttpResponseP() |
| 71 | + body, err := ioutil.ReadAll(Response.Body) |
| 72 | + if err !=nil{ |
| 73 | + panic(err) |
| 74 | + } |
| 75 | + GetWithByte(body) |
| 76 | + GetWithBytesReaderCreateTwiceNewReader(body) |
| 77 | + |
| 78 | + } |
| 79 | + |
| 80 | + func GetHttpResponseP() (*http.Response,error){ |
| 81 | + str:="https://google.co.jp" |
| 82 | + str1:="https://www.baidu.com" |
| 83 | + |
| 84 | + //fmt.Print(alink.IsValidUrl(str1)) |
| 85 | + client:= http.Client{Timeout: 2 * time.Second} |
| 86 | + req,err := http.NewRequest("GET",str,nil) |
| 87 | + req1,err1 := http.NewRequest("GET",str1,nil) |
| 88 | + |
| 89 | + if err != nil{ |
| 90 | + log.Printf("google is err:%s",err.Error()) |
| 91 | + } |
| 92 | + |
| 93 | + if err1 != nil{ |
| 94 | + log.Printf("baidu is err:%s",err1.Error()) |
| 95 | + } |
| 96 | + |
| 97 | + ReqAdd(req) |
| 98 | + ReqAdd(req1) |
| 99 | + b,err := client.Do(req) |
| 100 | + defer client.CloseIdleConnections() |
| 101 | + if err != nil{ |
| 102 | + log.Printf("request google err %s",err.Error()) |
| 103 | + b1,err1 := client.Do(req1) |
| 104 | + if err1 !=nil{ |
| 105 | + log.Printf("request baidu err %s",err1.Error()) |
| 106 | + panic(err1) |
| 107 | + } |
| 108 | + b = b1 |
| 109 | + } |
| 110 | + return b ,nil |
| 111 | + } |
| 112 | + func GetWithByte(body []byte) { |
| 113 | + |
| 114 | + title, err:= alink.GetTitleWithByte(body) |
| 115 | + if err == nil{ |
| 116 | + fmt.Println(title) |
| 117 | + }else{ |
| 118 | + fmt.Println("GetWithByte GetTitleWithByte err") |
| 119 | + } |
| 120 | + src,err := alink.GetImgSrcWithByte(body) |
| 121 | + if err == nil{ |
| 122 | + for _,s :=range *src{ |
| 123 | + fmt.Println(s) |
| 124 | + } |
| 125 | + }else{ |
| 126 | + fmt.Println("GetWithByte GetImgSrcWithByte err") |
| 127 | + } |
| 128 | + |
| 129 | + } |
| 130 | + |
| 131 | + func GetWithBytesReaderCreateTwiceNewReader(body []byte){ |
| 132 | + fmt.Println("<=================>") |
| 133 | + //for read twice create new reader |
| 134 | + readerHref := bytes.NewReader(body) |
| 135 | + //创建两个新 reader |
| 136 | + readerImg := bytes.NewReader(body) |
| 137 | + |
| 138 | + t,f := alink.GetHrefWithBytesReader (readerImg) |
| 139 | + |
| 140 | + if f !=nil { |
| 141 | + log.Print(f) |
| 142 | + } |
| 143 | + fmt.Printf("Href:%s \n",t) |
| 144 | + |
| 145 | + a,bl := alink.GetImgSrcWithBytesReader(readerHref) |
| 146 | + |
| 147 | + if bl ==nil{ |
| 148 | + for i,v := range *a{ |
| 149 | + fmt.Printf("index:%d=href:%s\n",i,v) |
| 150 | + } |
| 151 | + } |
| 152 | + |
| 153 | + //title:百度一下,你就知道 |
| 154 | + //index:0=href:/ |
| 155 | + // index:1=href:javascript:; |
| 156 | + // index:2=href:https://passport.baidu.com/v2 |
| 157 | + //or |
| 158 | + //title:Google |
| 159 | + //index:0=href:/ |
| 160 | + // index:1=href:javascript:; |
| 161 | + // index:2=href:https://wwww.google.com/ |
| 162 | + } |
134 | 163 | ```
|
0 commit comments