Skip to content

Commit 255f299

Browse files
Use the urlutils.CompareLinkHosts() function in the checkers.HostChecker structure
1 parent 2347feb commit 255f299

File tree

4 files changed, with 100 additions and 52 deletions.

README.md

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -188,7 +188,8 @@ func main() {
188188
SleepHandler: time.Sleep,
189189
},
190190
LinkChecker: checkers.HostChecker{
191-
Logger: wrappedLogger,
191+
ComparisonResult: urlutils.Same,
192+
Logger: wrappedLogger,
192193
},
193194
LinkHandler: LinkHandler{
194195
ServerURL: server.URL,
@@ -320,7 +321,8 @@ func main() {
320321
},
321322
LinkChecker: checkers.CheckerGroup{
322323
checkers.HostChecker{
323-
Logger: wrappedLogger,
324+
ComparisonResult: urlutils.Same,
325+
Logger: wrappedLogger,
324326
},
325327
checkers.DuplicateChecker{
326328
LinkRegister: registers.NewLinkRegister(
@@ -458,7 +460,8 @@ func main() {
458460
},
459461
LinkChecker: checkers.CheckerGroup{
460462
checkers.HostChecker{
461-
Logger: wrappedLogger,
463+
ComparisonResult: urlutils.Same,
464+
Logger: wrappedLogger,
462465
},
463466
checkers.DuplicateChecker{
464467
LinkRegister: registers.NewLinkRegister(
@@ -608,7 +611,8 @@ func main() {
608611
},
609612
LinkChecker: checkers.CheckerGroup{
610613
checkers.HostChecker{
611-
Logger: wrappedLogger,
614+
ComparisonResult: urlutils.Same,
615+
Logger: wrappedLogger,
612616
},
613617
checkers.DuplicateChecker{
614618
LinkRegister: registers.NewLinkRegister(
@@ -761,7 +765,8 @@ func main() {
761765
},
762766
LinkChecker: checkers.CheckerGroup{
763767
checkers.HostChecker{
764-
Logger: wrappedLogger,
768+
ComparisonResult: urlutils.Same,
769+
Logger: wrappedLogger,
765770
},
766771
checkers.RobotsTXTChecker{
767772
UserAgent: "go-crawler",
@@ -903,7 +908,8 @@ func main() {
903908
SleepHandler: time.Sleep,
904909
},
905910
LinkChecker: checkers.HostChecker{
906-
Logger: wrappedLogger,
911+
ComparisonResult: urlutils.Same,
912+
Logger: wrappedLogger,
907913
},
908914
LinkHandler: handlers.CheckedHandler{
909915
LinkChecker: checkers.RobotsTXTChecker{
@@ -1129,7 +1135,8 @@ func main() {
11291135
},
11301136
LinkChecker: checkers.CheckerGroup{
11311137
checkers.HostChecker{
1132-
Logger: wrappedLogger,
1138+
ComparisonResult: urlutils.Same,
1139+
Logger: wrappedLogger,
11331140
},
11341141
checkers.DuplicateChecker{
11351142
LinkRegister: registers.NewLinkRegister(
@@ -1282,7 +1289,8 @@ func main() {
12821289
SleepHandler: time.Sleep,
12831290
},
12841291
LinkChecker: checkers.HostChecker{
1285-
Logger: wrappedLogger,
1292+
ComparisonResult: urlutils.Same,
1293+
Logger: wrappedLogger,
12861294
},
12871295
LinkHandler: LinkHandler{
12881296
ServerURL: server.URL,
@@ -1416,7 +1424,8 @@ func main() {
14161424
SleepHandler: time.Sleep,
14171425
},
14181426
LinkChecker: checkers.HostChecker{
1419-
Logger: wrappedLogger,
1427+
ComparisonResult: urlutils.Same,
1428+
Logger: wrappedLogger,
14201429
},
14211430
LinkHandler: LinkHandler{
14221431
ServerURL: server.URL,

checkers/host_checker.go

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,16 @@ package checkers
22

33
import (
44
"context"
5-
"net/url"
65

76
"github.com/go-log/log"
87
"github.com/thewizardplusplus/go-crawler/models"
8+
urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
99
)
1010

1111
// HostChecker ...
1212
type HostChecker struct {
13-
Logger log.Logger
13+
ComparisonResult urlutils.ComparisonResult
14+
Logger log.Logger
1415
}
1516

1617
// CheckLink ...
@@ -20,21 +21,13 @@ func (checker HostChecker) CheckLink(
2021
) bool {
2122
const logPrefix = "host checking"
2223

23-
parsedSourceLink, err := url.Parse(link.SourceLink)
24+
result, err := urlutils.CompareLinkHosts(link.SourceLink, link.Link)
2425
if err != nil {
25-
const logMessage = "%s: unable to parse parent link %q: %s"
26-
checker.Logger.Logf(logMessage, logPrefix, link.SourceLink, err)
26+
const logMessage = "%s: unable to compare link hosts: %s"
27+
checker.Logger.Logf(logMessage, logPrefix, err)
2728

2829
return false
2930
}
3031

31-
parsedLink, err := url.Parse(link.Link)
32-
if err != nil {
33-
const logMessage = "%s: unable to parse link %q: %s"
34-
checker.Logger.Logf(logMessage, logPrefix, link.Link, err)
35-
36-
return false
37-
}
38-
39-
return parsedLink.Host == parsedSourceLink.Host
32+
return result == checker.ComparisonResult
4033
}

checkers/host_checker_test.go

Lines changed: 57 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,19 @@ package checkers
22

33
import (
44
"context"
5-
"errors"
6-
"net/url"
75
"testing"
86

97
"github.com/go-log/log"
108
"github.com/stretchr/testify/assert"
119
"github.com/stretchr/testify/mock"
1210
"github.com/thewizardplusplus/go-crawler/models"
11+
urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
1312
)
1413

1514
func TestHostChecker_CheckLink(test *testing.T) {
1615
type fields struct {
17-
Logger log.Logger
16+
ComparisonResult urlutils.ComparisonResult
17+
Logger log.Logger
1818
}
1919
type args struct {
2020
ctx context.Context
@@ -28,9 +28,10 @@ func TestHostChecker_CheckLink(test *testing.T) {
2828
want assert.BoolAssertionFunc
2929
}{
3030
{
31-
name: "success with different hosts",
31+
name: "success with different hosts (false)",
3232
fields: fields{
33-
Logger: new(MockLogger),
33+
ComparisonResult: urlutils.Same,
34+
Logger: new(MockLogger),
3435
},
3536
args: args{
3637
ctx: context.Background(),
@@ -42,9 +43,40 @@ func TestHostChecker_CheckLink(test *testing.T) {
4243
want: assert.False,
4344
},
4445
{
45-
name: "success with same hosts",
46+
name: "success with different hosts (true)",
4647
fields: fields{
47-
Logger: new(MockLogger),
48+
ComparisonResult: urlutils.Different,
49+
Logger: new(MockLogger),
50+
},
51+
args: args{
52+
ctx: context.Background(),
53+
link: models.SourcedLink{
54+
SourceLink: "http://example1.com/",
55+
Link: "http://example2.com/test",
56+
},
57+
},
58+
want: assert.True,
59+
},
60+
{
61+
name: "success with same hosts (false)",
62+
fields: fields{
63+
ComparisonResult: urlutils.Different,
64+
Logger: new(MockLogger),
65+
},
66+
args: args{
67+
ctx: context.Background(),
68+
link: models.SourcedLink{
69+
SourceLink: "http://example.com/",
70+
Link: "http://example.com/test",
71+
},
72+
},
73+
want: assert.False,
74+
},
75+
{
76+
name: "success with same hosts (true)",
77+
fields: fields{
78+
ComparisonResult: urlutils.Same,
79+
Logger: new(MockLogger),
4880
},
4981
args: args{
5082
ctx: context.Background(),
@@ -58,18 +90,20 @@ func TestHostChecker_CheckLink(test *testing.T) {
5890
{
5991
name: "error with the parent link",
6092
fields: fields{
93+
ComparisonResult: urlutils.Same,
6194
Logger: func() Logger {
62-
err := errors.New("missing protocol scheme")
63-
urlErr := &url.Error{Op: "parse", URL: ":", Err: err}
64-
6595
logger := new(MockLogger)
6696
logger.
6797
On(
6898
"Logf",
69-
"%s: unable to parse parent link %q: %s",
99+
"%s: unable to compare link hosts: %s",
70100
"host checking",
71-
":",
72-
urlErr,
101+
mock.MatchedBy(func(err error) bool {
102+
wantErrMessage := `unable to parse link ":": ` +
103+
`parse :: ` +
104+
"missing protocol scheme"
105+
return err.Error() == wantErrMessage
106+
}),
73107
).
74108
Return()
75109

@@ -88,18 +122,20 @@ func TestHostChecker_CheckLink(test *testing.T) {
88122
{
89123
name: "error with the link",
90124
fields: fields{
125+
ComparisonResult: urlutils.Same,
91126
Logger: func() Logger {
92-
err := errors.New("missing protocol scheme")
93-
urlErr := &url.Error{Op: "parse", URL: ":", Err: err}
94-
95127
logger := new(MockLogger)
96128
logger.
97129
On(
98130
"Logf",
99-
"%s: unable to parse link %q: %s",
131+
"%s: unable to compare link hosts: %s",
100132
"host checking",
101-
":",
102-
urlErr,
133+
mock.MatchedBy(func(err error) bool {
134+
wantErrMessage := `unable to parse link ":": ` +
135+
`parse :: ` +
136+
"missing protocol scheme"
137+
return err.Error() == wantErrMessage
138+
}),
103139
).
104140
Return()
105141

@@ -118,7 +154,8 @@ func TestHostChecker_CheckLink(test *testing.T) {
118154
} {
119155
test.Run(data.name, func(test *testing.T) {
120156
checker := HostChecker{
121-
Logger: data.fields.Logger,
157+
ComparisonResult: data.fields.ComparisonResult,
158+
Logger: data.fields.Logger,
122159
}
123160
got := checker.CheckLink(data.args.ctx, data.args.link)
124161

examples_test.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ func ExampleCrawl() {
178178
SleepHandler: time.Sleep,
179179
},
180180
LinkChecker: checkers.HostChecker{
181-
Logger: wrappedLogger,
181+
ComparisonResult: urlutils.Same,
182+
Logger: wrappedLogger,
182183
},
183184
LinkHandler: LinkHandler{
184185
ServerURL: server.URL,
@@ -231,7 +232,8 @@ func ExampleCrawl_withoutDuplicatesOnExtracting() {
231232
},
232233
LinkChecker: checkers.CheckerGroup{
233234
checkers.HostChecker{
234-
Logger: wrappedLogger,
235+
ComparisonResult: urlutils.Same,
236+
Logger: wrappedLogger,
235237
},
236238
checkers.DuplicateChecker{
237239
LinkRegister: registers.NewLinkRegister(
@@ -289,7 +291,8 @@ func ExampleCrawl_withoutDuplicatesOnHandling() {
289291
},
290292
LinkChecker: checkers.CheckerGroup{
291293
checkers.HostChecker{
292-
Logger: wrappedLogger,
294+
ComparisonResult: urlutils.Same,
295+
Logger: wrappedLogger,
293296
},
294297
checkers.DuplicateChecker{
295298
LinkRegister: registers.NewLinkRegister(
@@ -359,7 +362,8 @@ func ExampleCrawl_withDelayingExtracting() {
359362
},
360363
LinkChecker: checkers.CheckerGroup{
361364
checkers.HostChecker{
362-
Logger: wrappedLogger,
365+
ComparisonResult: urlutils.Same,
366+
Logger: wrappedLogger,
363367
},
364368
checkers.DuplicateChecker{
365369
LinkRegister: registers.NewLinkRegister(
@@ -425,7 +429,8 @@ func ExampleCrawl_withRobotsTXTOnExtracting() {
425429
},
426430
LinkChecker: checkers.CheckerGroup{
427431
checkers.HostChecker{
428-
Logger: wrappedLogger,
432+
ComparisonResult: urlutils.Same,
433+
Logger: wrappedLogger,
429434
},
430435
checkers.RobotsTXTChecker{
431436
UserAgent: "go-crawler",
@@ -479,7 +484,8 @@ func ExampleCrawl_withRobotsTXTOnHandling() {
479484
SleepHandler: time.Sleep,
480485
},
481486
LinkChecker: checkers.HostChecker{
482-
Logger: wrappedLogger,
487+
ComparisonResult: urlutils.Same,
488+
Logger: wrappedLogger,
483489
},
484490
LinkHandler: handlers.CheckedHandler{
485491
LinkChecker: checkers.RobotsTXTChecker{
@@ -552,7 +558,8 @@ func ExampleCrawl_withSitemap() {
552558
},
553559
LinkChecker: checkers.CheckerGroup{
554560
checkers.HostChecker{
555-
Logger: wrappedLogger,
561+
ComparisonResult: urlutils.Same,
562+
Logger: wrappedLogger,
556563
},
557564
checkers.DuplicateChecker{
558565
LinkRegister: registers.NewLinkRegister(
@@ -628,7 +635,8 @@ func ExampleCrawlByConcurrentHandler() {
628635
SleepHandler: time.Sleep,
629636
},
630637
LinkChecker: checkers.HostChecker{
631-
Logger: wrappedLogger,
638+
ComparisonResult: urlutils.Same,
639+
Logger: wrappedLogger,
632640
},
633641
LinkHandler: LinkHandler{
634642
ServerURL: server.URL,
@@ -684,7 +692,8 @@ func ExampleHandleLinksConcurrently() {
684692
SleepHandler: time.Sleep,
685693
},
686694
LinkChecker: checkers.HostChecker{
687-
Logger: wrappedLogger,
695+
ComparisonResult: urlutils.Same,
696+
Logger: wrappedLogger,
688697
},
689698
LinkHandler: LinkHandler{
690699
ServerURL: server.URL,

0 commit comments

Comments
 (0)