Commit a290f46

Add the example for the crawler.Crawl() function with concurrent handling

1 parent: f6c0899

2 files changed: +239 additions, -0 deletions


README.md

Lines changed: 164 additions & 0 deletions
The new example is inserted after the existing `crawler.Crawl()` example and before the `crawler.HandleLinksConcurrently()` section (hunk `@@ -206,6 +206,170 @@`):

`crawler.Crawl()` with concurrent handling:

```go
package main

import (
	"context"
	"fmt"
	"html/template"
	stdlog "log"
	"net/http"
	"net/http/httptest"
	"os"
	"runtime"
	"strings"
	"time"

	"github.com/go-log/log/print"
	crawler "github.com/thewizardplusplus/go-crawler"
	"github.com/thewizardplusplus/go-crawler/checkers"
	"github.com/thewizardplusplus/go-crawler/extractors"
	"github.com/thewizardplusplus/go-crawler/handlers"
	"github.com/thewizardplusplus/go-crawler/registers"
	"github.com/thewizardplusplus/go-crawler/sanitizing"
	htmlselector "github.com/thewizardplusplus/go-html-selector"
)

type LinkHandler struct {
	ServerURL string
}

func (handler LinkHandler) HandleLink(
	ctx context.Context,
	link crawler.SourcedLink,
) {
	fmt.Printf(
		"have got the link %q from the page %q\n",
		handler.replaceServerURL(link.Link),
		handler.replaceServerURL(link.SourceLink),
	)
}

// replace the test server URL for reproducibility of the example
func (handler LinkHandler) replaceServerURL(link string) string {
	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
}

func RunServer() *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		if request.URL.Path == "/robots.txt" {
			// nolint: errcheck
			fmt.Fprint(writer, `
				User-agent: go-crawler
				Disallow: /2
			`)

			return
		}

		var links []string
		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		}
		for index := range links {
			if strings.HasPrefix(links[index], "/") {
				links[index] = "http://" + request.Host + links[index]
			}
		}

		template, _ := template.New("").Parse( // nolint: errcheck
			`<ul>
				{{ range $link := . }}
					<li><a href="{{ $link }}">{{ $link }}</a></li>
				{{ end }}
			</ul>`,
		)
		template.Execute(writer, links) // nolint: errcheck
	}))
}

func main() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	// this context should be shared between the handler
	// and the crawler.Crawl() call
	ctx := context.Background()
	handler := handlers.NewConcurrentHandler(1000, handlers.CheckedHandler{
		LinkChecker: checkers.DuplicateChecker{
			LinkRegister: registers.NewLinkRegister(
				sanitizing.SanitizeLink,
				wrappedLogger,
			),
		},
		LinkHandler: LinkHandler{
			ServerURL: server.URL,
		},
	})
	go handler.RunConcurrently(ctx, runtime.NumCPU())
	// Stop() can be called immediately after the crawler.Crawl() call
	defer handler.Stop()

	crawler.Crawl(
		ctx,
		runtime.NumCPU(),
		1000,
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.RepeatingExtractor{
				LinkExtractor: extractors.NewDelayingExtractor(
					time.Second,
					time.Sleep,
					extractors.DefaultExtractor{
						HTTPClient: http.DefaultClient,
						Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
							"a": {"href"},
						}),
					},
				),
				RepeatCount:  5,
				RepeatDelay:  0,
				Logger:       wrappedLogger,
				SleepHandler: time.Sleep,
			},
			LinkChecker: checkers.CheckerGroup{
				checkers.HostChecker{
					Logger: wrappedLogger,
				},
				checkers.DuplicateChecker{
					// don't reuse the link register from the handler above
					LinkRegister: registers.NewLinkRegister(
						sanitizing.SanitizeLink,
						wrappedLogger,
					),
				},
			},
			LinkHandler: handler,
			Logger:      wrappedLogger,
		},
	)

	// Unordered output:
	// have got the link "http://example.com/1" from the page "http://example.com"
	// have got the link "http://example.com/1/1" from the page "http://example.com/1"
	// have got the link "http://example.com/1/2" from the page "http://example.com/1"
	// have got the link "http://example.com/2" from the page "http://example.com"
	// have got the link "http://example.com/2/1" from the page "http://example.com/2"
	// have got the link "http://example.com/2/2" from the page "http://example.com/2"
	// have got the link "https://golang.org/" from the page "http://example.com"
}
```
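The concurrent-handling wiring above condenses to the following sketch (`innerHandler` and `dependencies` stand in for the `handlers.CheckedHandler` and the `crawler.CrawlDependencies` value built in `main()`; this fragment is for orientation only, not a runnable program from the commit):

```go
// a condensed sketch of the wiring above, not part of the commit;
// innerHandler and dependencies are the values built in the example
handler := handlers.NewConcurrentHandler(1000, innerHandler)

// process handled links on a separate worker pool while the crawl runs
go handler.RunConcurrently(ctx, runtime.NumCPU())
// per the comment in the example, Stop() may be deferred right away,
// immediately after the crawler.Crawl() call below
defer handler.Stop()

// the crawler feeds every accepted link to the concurrent handler
dependencies.LinkHandler = handler
crawler.Crawl(ctx, runtime.NumCPU(), 1000, []string{server.URL}, dependencies)
```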

examples_test.go

Lines changed: 75 additions & 0 deletions
The same example is added as a testable example function, inserted between `ExampleCrawl()` and `ExampleHandleLinksConcurrently()` (hunk `@@ -151,6 +151,81 @@`); the `LinkHandler` type and the `RunServer()` helper are shared with the existing examples in this file:

```go
func ExampleCrawl_withConcurrentHandling() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	// this context should be shared between the handler
	// and the crawler.Crawl() call
	ctx := context.Background()
	handler := handlers.NewConcurrentHandler(1000, handlers.CheckedHandler{
		LinkChecker: checkers.DuplicateChecker{
			LinkRegister: registers.NewLinkRegister(
				sanitizing.SanitizeLink,
				wrappedLogger,
			),
		},
		LinkHandler: LinkHandler{
			ServerURL: server.URL,
		},
	})
	go handler.RunConcurrently(ctx, runtime.NumCPU())
	// Stop() can be called immediately after the crawler.Crawl() call
	defer handler.Stop()

	crawler.Crawl(
		ctx,
		runtime.NumCPU(),
		1000,
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.RepeatingExtractor{
				LinkExtractor: extractors.NewDelayingExtractor(
					time.Second,
					time.Sleep,
					extractors.DefaultExtractor{
						HTTPClient: http.DefaultClient,
						Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
							"a": {"href"},
						}),
					},
				),
				RepeatCount:  5,
				RepeatDelay:  0,
				Logger:       wrappedLogger,
				SleepHandler: time.Sleep,
			},
			LinkChecker: checkers.CheckerGroup{
				checkers.HostChecker{
					Logger: wrappedLogger,
				},
				checkers.DuplicateChecker{
					// don't reuse the link register from the handler above
					LinkRegister: registers.NewLinkRegister(
						sanitizing.SanitizeLink,
						wrappedLogger,
					),
				},
			},
			LinkHandler: handler,
			Logger:      wrappedLogger,
		},
	)

	// Unordered output:
	// have got the link "http://example.com/1" from the page "http://example.com"
	// have got the link "http://example.com/1/1" from the page "http://example.com/1"
	// have got the link "http://example.com/1/2" from the page "http://example.com/1"
	// have got the link "http://example.com/2" from the page "http://example.com"
	// have got the link "http://example.com/2/1" from the page "http://example.com/2"
	// have got the link "http://example.com/2/2" from the page "http://example.com/2"
	// have got the link "https://golang.org/" from the page "http://example.com"
}
```
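Since the new function ends with an `// Unordered output:` comment, `go test` treats it as a verifiable example: the test fails unless the printed lines match the expected ones in some order (the order may vary between runs because crawling and handling run concurrently). The example can be run on its own with `go test -run ExampleCrawl_withConcurrentHandling`.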