@@ -1179,6 +1179,159 @@ func main() {
1179
1179
}
1180
1180
```
1181
1181
1182
+ `crawler.Crawl()` with a few handlers:
1183
+
1184
+ ``` go
1185
+ package main
1186
+
1187
+ import (
1188
+ " context"
1189
+ " fmt"
1190
+ " html/template"
1191
+ stdlog " log"
1192
+ " net/http"
1193
+ " net/http/httptest"
1194
+ " os"
1195
+ " runtime"
1196
+ " strings"
1197
+ " time"
1198
+
1199
+ " github.com/go-log/log/print"
1200
+ crawler " github.com/thewizardplusplus/go-crawler"
1201
+ " github.com/thewizardplusplus/go-crawler/checkers"
1202
+ " github.com/thewizardplusplus/go-crawler/extractors"
1203
+ " github.com/thewizardplusplus/go-crawler/handlers"
1204
+ " github.com/thewizardplusplus/go-crawler/models"
1205
+ urlutils " github.com/thewizardplusplus/go-crawler/url-utils"
1206
+ htmlselector " github.com/thewizardplusplus/go-html-selector"
1207
+ )
1208
+
1209
// LinkHandler is the link handler used by this example: its HandleLink method
// prints every received link to standard output.
type LinkHandler struct {
	// Name is the label printed in square brackets before each received link;
	// ServerURL is the test server URL that gets replaced in the printed links.
	Name, ServerURL string
}
1213
+
1214
+ func (handler LinkHandler ) HandleLink (
1215
+ ctx context.Context,
1216
+ link models.SourcedLink,
1217
+ ) {
1218
+ fmt.Printf (
1219
+ " [%s ] received link %q from page %q \n " ,
1220
+ handler.Name ,
1221
+ handler.replaceServerURL (link.Link ),
1222
+ handler.replaceServerURL (link.SourceLink ),
1223
+ )
1224
+ }
1225
+
1226
+ // replace the test server URL for reproducibility of the example
1227
+ func (handler LinkHandler ) replaceServerURL (link string ) string {
1228
+ return strings.Replace (link, handler.ServerURL , " http://example.com" , -1 )
1229
+ }
1230
+
1231
// RunServer starts a local HTTP test server that serves a tiny fixed site for
// the crawling example and returns it; the caller should close the returned
// server once it is no longer needed.
//
// Every response is an HTML list of links selected by the request path:
//   - "/" links to "/1", "/2" (twice) and the external "https://golang.org/";
//   - "/1" links to "/1/1" and "/1/2";
//   - "/2" links to "/2/1" and "/2/2";
//   - any other path produces an empty list.
//
// Links that start with "/" are made absolute using the Host of the request.
func RunServer() *httptest.Server {
	// parse the page template once instead of on every request; the template
	// is a constant, so a parsing failure is a programming error
	page := template.Must(template.New("").Parse(
		`<ul>
			{{ range $link := . }}
				<li><a href="{{ $link }}">{{ $link }}</a></li>
			{{ end }}
		</ul>`,
	))

	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		var links []string
		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		}
		for index, link := range links {
			if strings.HasPrefix(link, "/") {
				links[index] = "http://" + request.Host + link
			}
		}

		// render into a buffer first, so that a failure can still be reported
		// with a proper status code instead of a half-written page
		var body strings.Builder
		if err := page.Execute(&body, links); err != nil {
			http.Error(writer, err.Error(), http.StatusInternalServerError)
			return
		}

		// a failed write means the client is gone; there is nothing to recover
		_, _ = writer.Write([]byte(body.String()))
	}))
}
1261
+
1262
+ func main () {
1263
+ server := RunServer ()
1264
+ defer server.Close ()
1265
+
1266
+ logger := stdlog.New (os.Stderr , " " , stdlog.LstdFlags |stdlog.Lmicroseconds )
1267
+ // wrap the standard logger via the github.com/go-log/log package
1268
+ wrappedLogger := print .New (logger)
1269
+
1270
+ crawler.Crawl (
1271
+ context.Background (),
1272
+ crawler.ConcurrencyConfig {
1273
+ ConcurrencyFactor: runtime.NumCPU (),
1274
+ BufferSize: 1000 ,
1275
+ },
1276
+ []string {server.URL },
1277
+ crawler.CrawlDependencies {
1278
+ LinkExtractor: extractors.RepeatingExtractor {
1279
+ LinkExtractor: extractors.DefaultExtractor {
1280
+ TrimLink: urlutils.TrimLink ,
1281
+ HTTPClient: http.DefaultClient ,
1282
+ Filters: htmlselector.OptimizeFilters (htmlselector.FilterGroup {
1283
+ " a" : {" href" },
1284
+ }),
1285
+ },
1286
+ RepeatCount: 5 ,
1287
+ RepeatDelay: time.Second ,
1288
+ Logger: wrappedLogger,
1289
+ SleepHandler: time.Sleep ,
1290
+ },
1291
+ LinkChecker: checkers.HostChecker {
1292
+ ComparisonResult: urlutils.Same ,
1293
+ Logger: wrappedLogger,
1294
+ },
1295
+ LinkHandler: handlers.HandlerGroup {
1296
+ handlers.CheckedHandler {
1297
+ LinkChecker: checkers.HostChecker {
1298
+ ComparisonResult: urlutils.Same ,
1299
+ Logger: wrappedLogger,
1300
+ },
1301
+ LinkHandler: LinkHandler{
1302
+ Name: " inner" ,
1303
+ ServerURL: server.URL ,
1304
+ },
1305
+ },
1306
+ handlers.CheckedHandler {
1307
+ LinkChecker: checkers.HostChecker {
1308
+ ComparisonResult: urlutils.Different ,
1309
+ Logger: wrappedLogger,
1310
+ },
1311
+ LinkHandler: LinkHandler{
1312
+ Name: " outer" ,
1313
+ ServerURL: server.URL ,
1314
+ },
1315
+ },
1316
+ },
1317
+ Logger: wrappedLogger,
1318
+ },
1319
+ )
1320
+
1321
+ // Unordered output:
1322
+ // [inner] received link "http://example.com/1" from page "http://example.com"
1323
+ // [inner] received link "http://example.com/1/1" from page "http://example.com/1"
1324
+ // [inner] received link "http://example.com/1/2" from page "http://example.com/1"
1325
+ // [inner] received link "http://example.com/2" from page "http://example.com"
1326
+ // [inner] received link "http://example.com/2" from page "http://example.com"
1327
+ // [inner] received link "http://example.com/2/1" from page "http://example.com/2"
1328
+ // [inner] received link "http://example.com/2/1" from page "http://example.com/2"
1329
+ // [inner] received link "http://example.com/2/2" from page "http://example.com/2"
1330
+ // [inner] received link "http://example.com/2/2" from page "http://example.com/2"
1331
+ // [outer] received link "https://golang.org/" from page "http://example.com"
1332
+ }
1333
+ ```
1334
+
1182
1335
`crawler.CrawlByConcurrentHandler()`:
1183
1336
1184
1337
``` go
0 commit comments