Skip to content

Commit 2ff6348

Browse files
Add the example for the crawler.Crawl() function with a few handlers
1 parent d692afc commit 2ff6348

File tree

2 files changed

+233
-1
lines changed

2 files changed

+233
-1
lines changed

README.md

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,159 @@ func main() {
11791179
}
11801180
```
11811181

1182+
`crawler.Crawl()` with a few handlers:
1183+
1184+
```go
1185+
package main
1186+
1187+
import (
1188+
"context"
1189+
"fmt"
1190+
"html/template"
1191+
stdlog "log"
1192+
"net/http"
1193+
"net/http/httptest"
1194+
"os"
1195+
"runtime"
1196+
"strings"
1197+
"time"
1198+
1199+
"github.com/go-log/log/print"
1200+
crawler "github.com/thewizardplusplus/go-crawler"
1201+
"github.com/thewizardplusplus/go-crawler/checkers"
1202+
"github.com/thewizardplusplus/go-crawler/extractors"
1203+
"github.com/thewizardplusplus/go-crawler/handlers"
1204+
"github.com/thewizardplusplus/go-crawler/models"
1205+
urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
1206+
htmlselector "github.com/thewizardplusplus/go-html-selector"
1207+
)
1208+
1209+
// LinkHandler prints every link passed to its HandleLink method to standard output.
type LinkHandler struct {
1210+
// Name is printed in square brackets at the start of each message.
Name string
1211+
// ServerURL is the text that replaceServerURL swaps for "http://example.com".
ServerURL string
1212+
}
1213+
1214+
// HandleLink prints the handler name, the received link and the page the link
// was found on. Both URLs are passed through replaceServerURL before printing.
// The ctx argument is not used.
func (handler LinkHandler) HandleLink(
1215+
ctx context.Context,
1216+
link models.SourcedLink,
1217+
) {
1218+
fmt.Printf(
1219+
"[%s] received link %q from page %q\n",
1220+
handler.Name,
1221+
handler.replaceServerURL(link.Link),
1222+
handler.replaceServerURL(link.SourceLink),
1223+
)
1224+
}
1225+
1226+
// replaceServerURL returns link with every occurrence of handler.ServerURL replaced by "http://example.com"; this keeps the example output reproducible
1227+
func (handler LinkHandler) replaceServerURL(link string) string {
1228+
// the count of -1 means that all occurrences are replaced, not only the first one
return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
1229+
}
1230+
1231+
// RunServer starts an httptest server whose handler answers every request with
// an HTML list of links; the set of links depends on the request path.
// The caller is expected to close the returned server.
func RunServer() *httptest.Server {
1232+
return httptest.NewServer(http.HandlerFunc(func(
1233+
writer http.ResponseWriter,
1234+
request *http.Request,
1235+
) {
1236+
var links []string
1237+
// choose the links by the requested path; any other path leaves the list empty
switch request.URL.Path {
1238+
case "/":
1239+
// "/2" is listed twice on purpose or by accident; the root page is also the only one with an external link
links = []string{"/1", "/2", "/2", "https://golang.org/"}
1240+
case "/1":
1241+
links = []string{"/1/1", "/1/2"}
1242+
case "/2":
1243+
links = []string{"/2/1", "/2/2"}
1244+
}
1245+
// turn the links starting with "/" into absolute URLs on the host taken from the request
for index := range links {
1246+
if strings.HasPrefix(links[index], "/") {
1247+
links[index] = "http://" + request.Host + links[index]
1248+
}
1249+
}
1250+
1251+
// NOTE(review): the local variable below shadows the html/template package
// for the rest of the handler, the template is parsed again on every request,
// and the error returned by Parse is discarded
template, _ := template.New("").Parse( // nolint: errcheck
1252+
`<ul>
1253+
{{ range $link := . }}
1254+
<li><a href="{{ $link }}">{{ $link }}</a></li>
1255+
{{ end }}
1256+
</ul>`,
1257+
)
1258+
// NOTE(review): the error returned by Execute is dropped as well
template.Execute(writer, links) // nolint: errcheck
1259+
}))
1260+
}
1261+
1262+
// main starts the test server from RunServer and calls crawler.Crawl() with the
// server URL as the only start link. The link handler is a handlers.HandlerGroup
// of two handlers.CheckedHandler values wrapping LinkHandler instances named
// "inner" and "outer".
func main() {
1263+
server := RunServer()
1264+
defer server.Close()
1265+
1266+
// the log lines go to stderr, while the link handlers print to stdout
logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
1267+
// wrap the standard logger via the github.com/go-log/log package
1268+
wrappedLogger := print.New(logger)
1269+
1270+
crawler.Crawl(
1271+
context.Background(),
1272+
crawler.ConcurrencyConfig{
1273+
ConcurrencyFactor: runtime.NumCPU(),
1274+
BufferSize: 1000,
1275+
},
1276+
// the start links: only the root page of the test server
[]string{server.URL},
1277+
crawler.CrawlDependencies{
1278+
LinkExtractor: extractors.RepeatingExtractor{
1279+
LinkExtractor: extractors.DefaultExtractor{
1280+
TrimLink: urlutils.TrimLink,
1281+
HTTPClient: http.DefaultClient,
1282+
// select the "href" attribute of the "a" tags
Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
1283+
"a": {"href"},
1284+
}),
1285+
},
1286+
RepeatCount: 5,
1287+
RepeatDelay: time.Second,
1288+
Logger: wrappedLogger,
1289+
SleepHandler: time.Sleep,
1290+
},
1291+
LinkChecker: checkers.HostChecker{
1292+
ComparisonResult: urlutils.Same,
1293+
Logger: wrappedLogger,
1294+
},
1295+
LinkHandler: handlers.HandlerGroup{
1296+
// the "inner" handler: its checker uses urlutils.Same; in the expected output
// below it reports only the links located on the test server
handlers.CheckedHandler{
1297+
LinkChecker: checkers.HostChecker{
1298+
ComparisonResult: urlutils.Same,
1299+
Logger: wrappedLogger,
1300+
},
1301+
LinkHandler: LinkHandler{
1302+
Name: "inner",
1303+
ServerURL: server.URL,
1304+
},
1305+
},
1306+
// the "outer" handler: its checker uses urlutils.Different; in the expected output
// below it reports only the link to https://golang.org/
handlers.CheckedHandler{
1307+
LinkChecker: checkers.HostChecker{
1308+
ComparisonResult: urlutils.Different,
1309+
Logger: wrappedLogger,
1310+
},
1311+
LinkHandler: LinkHandler{
1312+
Name: "outer",
1313+
ServerURL: server.URL,
1314+
},
1315+
},
1316+
},
1317+
Logger: wrappedLogger,
1318+
},
1319+
)
1320+
1321+
// Unordered output:
1322+
// [inner] received link "http://example.com/1" from page "http://example.com"
1323+
// [inner] received link "http://example.com/1/1" from page "http://example.com/1"
1324+
// [inner] received link "http://example.com/1/2" from page "http://example.com/1"
1325+
// [inner] received link "http://example.com/2" from page "http://example.com"
1326+
// [inner] received link "http://example.com/2" from page "http://example.com"
1327+
// [inner] received link "http://example.com/2/1" from page "http://example.com/2"
1328+
// [inner] received link "http://example.com/2/1" from page "http://example.com/2"
1329+
// [inner] received link "http://example.com/2/2" from page "http://example.com/2"
1330+
// [inner] received link "http://example.com/2/2" from page "http://example.com/2"
1331+
// [outer] received link "https://golang.org/" from page "http://example.com"
1332+
}
1333+
```
1334+
11821335
`crawler.CrawlByConcurrentHandler()`:
11831336

11841337
```go

examples_test.go

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,22 @@ import (
2929
)
3030

3131
// LinkHandler is the link handler used by the examples; it prints the links it receives.
type LinkHandler struct {
32+
// Name, when it is not empty, is printed in square brackets before each message.
Name string
3233
// ServerURL is the URL of the test server; it is passed through
// replaceServerURL before the links are printed.
ServerURL string
3334
}
3435

3536
func (handler LinkHandler) HandleLink(
3637
ctx context.Context,
3738
link models.SourcedLink,
3839
) {
40+
var prefix string
41+
if handler.Name != "" {
42+
prefix = fmt.Sprintf("[%s] ", handler.Name)
43+
}
44+
3945
fmt.Printf(
40-
"received link %q from page %q\n",
46+
"%sreceived link %q from page %q\n",
47+
prefix,
4148
handler.replaceServerURL(link.Link),
4249
handler.replaceServerURL(link.SourceLink),
4350
)
@@ -601,6 +608,78 @@ func ExampleCrawl_withSitemap() {
601608
// received link "https://golang.org/" from page "http://example.com"
602609
}
603610

611+
// ExampleCrawl_withFewHandlers calls crawler.Crawl() with a handlers.HandlerGroup
// of two handlers.CheckedHandler values that wrap LinkHandler instances named
// "inner" and "outer". The start link is the URL of the test server from RunServer.
func ExampleCrawl_withFewHandlers() {
612+
server := RunServer()
613+
defer server.Close()
614+
615+
// the log lines go to stderr, so they are not part of the compared example output
logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
616+
// wrap the standard logger via the github.com/go-log/log package
617+
wrappedLogger := print.New(logger)
618+
619+
crawler.Crawl(
620+
context.Background(),
621+
crawler.ConcurrencyConfig{
622+
ConcurrencyFactor: runtime.NumCPU(),
623+
BufferSize: 1000,
624+
},
625+
// the start links: only the root page of the test server
[]string{server.URL},
626+
crawler.CrawlDependencies{
627+
LinkExtractor: extractors.RepeatingExtractor{
628+
LinkExtractor: extractors.DefaultExtractor{
629+
TrimLink: urlutils.TrimLink,
630+
HTTPClient: http.DefaultClient,
631+
// select the "href" attribute of the "a" tags
Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
632+
"a": {"href"},
633+
}),
634+
},
635+
RepeatCount: 5,
636+
RepeatDelay: time.Second,
637+
Logger: wrappedLogger,
638+
SleepHandler: time.Sleep,
639+
},
640+
LinkChecker: checkers.HostChecker{
641+
ComparisonResult: urlutils.Same,
642+
Logger: wrappedLogger,
643+
},
644+
LinkHandler: handlers.HandlerGroup{
645+
// the "inner" handler: its checker uses urlutils.Same; in the expected output
// below it reports only the links located on the test server
handlers.CheckedHandler{
646+
LinkChecker: checkers.HostChecker{
647+
ComparisonResult: urlutils.Same,
648+
Logger: wrappedLogger,
649+
},
650+
LinkHandler: LinkHandler{
651+
Name: "inner",
652+
ServerURL: server.URL,
653+
},
654+
},
655+
// the "outer" handler: its checker uses urlutils.Different; in the expected output
// below it reports only the link to https://golang.org/
handlers.CheckedHandler{
656+
LinkChecker: checkers.HostChecker{
657+
ComparisonResult: urlutils.Different,
658+
Logger: wrappedLogger,
659+
},
660+
LinkHandler: LinkHandler{
661+
Name: "outer",
662+
ServerURL: server.URL,
663+
},
664+
},
665+
},
666+
Logger: wrappedLogger,
667+
},
668+
)
669+
670+
// Unordered output:
671+
// [inner] received link "http://example.com/1" from page "http://example.com"
672+
// [inner] received link "http://example.com/1/1" from page "http://example.com/1"
673+
// [inner] received link "http://example.com/1/2" from page "http://example.com/1"
674+
// [inner] received link "http://example.com/2" from page "http://example.com"
675+
// [inner] received link "http://example.com/2" from page "http://example.com"
676+
// [inner] received link "http://example.com/2/1" from page "http://example.com/2"
677+
// [inner] received link "http://example.com/2/1" from page "http://example.com/2"
678+
// [inner] received link "http://example.com/2/2" from page "http://example.com/2"
679+
// [inner] received link "http://example.com/2/2" from page "http://example.com/2"
680+
// [outer] received link "https://golang.org/" from page "http://example.com"
681+
}
682+
604683
func ExampleCrawlByConcurrentHandler() {
605684
server := RunServer()
606685
defer server.Close()

0 commit comments

Comments
 (0)