@@ -206,6 +206,170 @@ func main() {
}
```

+`crawler.Crawl()` with concurrent handling:
+
+```go
+package main
+
+import (
+	"context"
+	"fmt"
+	"html/template"
+	stdlog "log"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"runtime"
+	"strings"
+	"time"
+
+	"github.com/go-log/log/print"
+	crawler "github.com/thewizardplusplus/go-crawler"
+	"github.com/thewizardplusplus/go-crawler/checkers"
+	"github.com/thewizardplusplus/go-crawler/extractors"
+	"github.com/thewizardplusplus/go-crawler/handlers"
+	"github.com/thewizardplusplus/go-crawler/registers"
+	"github.com/thewizardplusplus/go-crawler/sanitizing"
+	htmlselector "github.com/thewizardplusplus/go-html-selector"
+)
+
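+// LinkHandler prints each link the crawler finds; the crawler calls
+// its HandleLink() method for every extracted link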
+type LinkHandler struct {
+	ServerURL string
+}
+
+func (handler LinkHandler) HandleLink(
+	ctx context.Context,
+	link crawler.SourcedLink,
+) {
+	fmt.Printf(
+		"have got the link %q from the page %q\n",
+		handler.replaceServerURL(link.Link),
+		handler.replaceServerURL(link.SourceLink),
+	)
+}
+
+// replace the test server URL for reproducibility of the example
+func (handler LinkHandler) replaceServerURL(link string) string {
+	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
+}
+
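+// RunServer starts a test HTTP server that serves a robots.txt file
+// (forbidding the /2 path to the go-crawler user agent) and a small
+// set of pages that link to each other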
+func RunServer() *httptest.Server {
+	return httptest.NewServer(http.HandlerFunc(func(
+		writer http.ResponseWriter,
+		request *http.Request,
+	) {
+		if request.URL.Path == "/robots.txt" {
+			// nolint: errcheck
+			fmt.Fprint(writer, `
+				User-agent: go-crawler
+				Disallow: /2
+			`)
+
+			return
+		}
+
+		var links []string
+		switch request.URL.Path {
+		case "/":
+			links = []string{"/1", "/2", "/2", "https://golang.org/"}
+		case "/1":
+			links = []string{"/1/1", "/1/2"}
+		case "/2":
+			links = []string{"/2/1", "/2/2"}
+		}
+		// make the relative links absolute, pointing to the test server
+		for index := range links {
+			if strings.HasPrefix(links[index], "/") {
+				links[index] = "http://" + request.Host + links[index]
+			}
+		}
+
+		template, _ := template.New("").Parse( // nolint: errcheck
+			`<ul>
+				{{ range $link := . }}
+					<li><a href="{{ $link }}">{{ $link }}</a></li>
+				{{ end }}
+			</ul>`,
+		)
+		template.Execute(writer, links) // nolint: errcheck
+	}))
+}
+
+func main() {
+	server := RunServer()
+	defer server.Close()
+
+	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
+	// wrap the standard logger via the github.com/go-log/log package
+	wrappedLogger := print.New(logger)
+
+	// this context should be shared between the handler
+	// and the crawler.Crawl() call
+	ctx := context.Background()
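+	// the concurrent handler processes links on background goroutines;
+	// the CheckedHandler wrapper consults the DuplicateChecker first,
+	// so each link is printed only once (the 1000 is presumably the
+	// size of the handler's internal link buffer)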
+	handler := handlers.NewConcurrentHandler(1000, handlers.CheckedHandler{
+		LinkChecker: checkers.DuplicateChecker{
+			LinkRegister: registers.NewLinkRegister(
+				sanitizing.SanitizeLink,
+				wrappedLogger,
+			),
+		},
+		LinkHandler: LinkHandler{
+			ServerURL: server.URL,
+		},
+	})
+	go handler.RunConcurrently(ctx, runtime.NumCPU())
+	// Stop() can also be called immediately after the crawler.Crawl() call
+	// instead of being deferred
+	defer handler.Stop()
+
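+	// crawl the test server, starting from its root URL and passing
+	// every accepted link to the concurrent handler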
+	crawler.Crawl(
+		ctx,
+		runtime.NumCPU(),
+		1000,
+		[]string{server.URL},
+		crawler.CrawlDependencies{
+			LinkExtractor: extractors.RepeatingExtractor{
+				LinkExtractor: extractors.NewDelayingExtractor(
+					time.Second,
+					time.Sleep,
+					extractors.DefaultExtractor{
+						HTTPClient: http.DefaultClient,
+						Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
+							"a": {"href"},
+						}),
+					},
+				),
+				RepeatCount:  5,
+				RepeatDelay:  0,
+				Logger:       wrappedLogger,
+				SleepHandler: time.Sleep,
+			},
+			LinkChecker: checkers.CheckerGroup{
+				checkers.HostChecker{
+					Logger: wrappedLogger,
+				},
+				checkers.DuplicateChecker{
+					// don't reuse the link register from the handler above
+					LinkRegister: registers.NewLinkRegister(
+						sanitizing.SanitizeLink,
+						wrappedLogger,
+					),
+				},
+			},
+			LinkHandler: handler,
+			Logger:      wrappedLogger,
+		},
+	)
+
+	// Unordered output:
+	// have got the link "http://example.com/1" from the page "http://example.com"
+	// have got the link "http://example.com/1/1" from the page "http://example.com/1"
+	// have got the link "http://example.com/1/2" from the page "http://example.com/1"
+	// have got the link "http://example.com/2" from the page "http://example.com"
+	// have got the link "http://example.com/2/1" from the page "http://example.com/2"
+	// have got the link "http://example.com/2/2" from the page "http://example.com/2"
+	// have got the link "https://golang.org/" from the page "http://example.com"
+}
+```
+
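+Note that the handler goroutines are started before the `crawler.Crawl()` call
+and stopped only after it returns, and that the handler and the crawler share
+the same context.
+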
`crawler.HandleLinksConcurrently()`:

```go