-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl.go
80 lines (70 loc) · 1.92 KB
/
crawl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package crawler
import (
"context"
"sync"
"github.com/go-log/log"
"github.com/thewizardplusplus/go-crawler/handlers"
"github.com/thewizardplusplus/go-crawler/models"
syncutils "github.com/thewizardplusplus/go-sync-utils"
)
// ConcurrencyConfig groups the two tuning values consumed by the crawling
// functions of this package.
type ConcurrencyConfig struct {
// ConcurrencyFactor is forwarded by Crawl to HandleLinksConcurrently and by
// CrawlByConcurrentHandler to the concurrent handler's StartConcurrently
// call. Presumably it is the number of concurrent workers; confirm against
// those implementations.
ConcurrencyFactor int
// BufferSize is used by Crawl as the capacity of its link channel and by
// CrawlByConcurrentHandler as the first argument of
// handlers.NewConcurrentHandler.
BufferSize int
}
// CrawlDependencies bundles the collaborators that Crawl hands over, embedded
// in HandleLinkDependencies, to HandleLinksConcurrently.
type CrawlDependencies struct {
// LinkExtractor presumably finds the links contained in a crawled resource;
// confirm against models.LinkExtractor.
LinkExtractor models.LinkExtractor
// LinkChecker presumably decides whether a found link should be crawled;
// confirm against models.LinkChecker.
LinkChecker models.LinkChecker
// LinkHandler receives the found links; CrawlByConcurrentHandler replaces it
// with a concurrent wrapper before calling Crawl.
LinkHandler models.LinkHandler
// Logger is passed through unchanged; its use is not visible in this file.
Logger log.Logger
}
// Crawl queues the initial links into a channel with the capacity of
// concurrencyConfig.BufferSize, passes that channel to
// HandleLinksConcurrently together with the dependencies and a wait group,
// and then blocks until the wait group is released.
//
// The wait group starts with one pending unit per initial link. The handling
// code is expected to keep it up to date while it works (confirm in
// HandleLinksConcurrently). The channel is closed only once the wait group
// has been released.
func Crawl(
	ctx context.Context,
	concurrencyConfig ConcurrencyConfig,
	links []string,
	dependencies CrawlDependencies,
) {
	queue := make(chan string, concurrencyConfig.BufferSize)

	// one pending unit per initial link
	pending := new(sync.WaitGroup)
	pending.Add(len(links))

	for index := range links {
		// the sending is unbounded to avoid a deadlock
		syncutils.UnboundedSend(queue, links[index])
	}

	handleLinkDependencies := HandleLinkDependencies{
		CrawlDependencies: dependencies,
		Waiter:            pending,
	}
	HandleLinksConcurrently(
		ctx,
		concurrencyConfig.ConcurrencyFactor,
		queue,
		handleLinkDependencies,
	)

	pending.Wait()
	// the channel must be closed only after the pending.Wait() call
	close(queue)
}
// CrawlByConcurrentHandler behaves like Crawl, but first wraps
// dependencies.LinkHandler into a concurrent handler created by
// handlers.NewConcurrentHandler.
//
// The wrapper is built with handlerConcurrencyConfig.BufferSize and started
// in a separate goroutine with handlerConcurrencyConfig.ConcurrencyFactor.
// Its Stop method is deferred, so it runs when Crawl returns. The other
// dependencies are passed to Crawl unchanged.
func CrawlByConcurrentHandler(
	ctx context.Context,
	concurrencyConfig ConcurrencyConfig,
	handlerConcurrencyConfig ConcurrencyConfig,
	links []string,
	dependencies CrawlDependencies,
) {
	wrappedHandler := handlers.NewConcurrentHandler(
		handlerConcurrencyConfig.BufferSize,
		dependencies.LinkHandler,
	)
	go wrappedHandler.StartConcurrently(
		ctx,
		handlerConcurrencyConfig.ConcurrencyFactor,
	)
	defer wrappedHandler.Stop()

	// same dependencies, only the link handler is swapped for the wrapper
	crawlDependencies := dependencies
	crawlDependencies.LinkHandler = wrappedHandler

	Crawl(ctx, concurrencyConfig, links, crawlDependencies)
}