Skip to content

Commit e321395

Browse files
authored
Merge pull request #55 from SkynetLabs/ivo/parallel_repins
Parallel repins
2 parents 352dbf3 + 65d38a8 commit e321395

File tree

7 files changed

+157
-85
lines changed

7 files changed

+157
-85
lines changed

conf/configuration.go

+20-7
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,14 @@ import (
2121
// Default configuration values.
2222
// For individual descriptions see Config.
2323
const (
24-
defaultAccountsHost = "10.10.10.70"
25-
defaultAccountsPort = "3000"
26-
defaultLogFile = "" // disabled logging to file
27-
defaultLogLevel = logrus.InfoLevel
28-
defaultSiaAPIHost = "10.10.10.10"
29-
defaultSiaAPIPort = "9980"
30-
defaultMinPinners = 1
24+
defaultAccountsHost = "10.10.10.70"
25+
defaultAccountsPort = "3000"
26+
defaultLogFile = "" // disabled logging to file
27+
defaultLogLevel = logrus.InfoLevel
28+
defaultScannerThreads = 5
29+
defaultSiaAPIHost = "10.10.10.10"
30+
defaultSiaAPIPort = "9980"
31+
defaultMinPinners = 1
3132
)
3233

3334
// Cluster-wide configuration variable names.
@@ -105,6 +106,9 @@ type (
105106
// which a skylink needs in order to not be considered underpinned.
106107
// Anything below this value requires more servers to pin the skylink.
107108
MinPinners int
109+
// ScannerThreads defines the number of scanning threads which might attempt
110+
// to pin an underpinned skylink.
111+
ScannerThreads int
108112
// ServerName holds the name of the current server. This name will be
109113
// used for identifying which servers are pinning a given skylink.
110114
ServerName string
@@ -134,6 +138,7 @@ func LoadConfig() (Config, error) {
134138
LogFile: defaultLogFile,
135139
LogLevel: defaultLogLevel,
136140
MinPinners: defaultMinPinners,
141+
ScannerThreads: defaultScannerThreads,
137142
SiaAPIHost: defaultSiaAPIHost,
138143
SiaAPIPort: defaultSiaAPIPort,
139144
SleepBetweenScans: 0, // This will be ignored by the scanner.
@@ -179,6 +184,14 @@ func LoadConfig() (Config, error) {
179184
}
180185
cfg.LogLevel = lvl
181186
}
187+
if val, ok = os.LookupEnv("PINNER_SCANNER_THREADS"); ok {
188+
// Parse the value as an integer number of scanner threads.
189+
st, err := strconv.ParseInt(val, 0, 0)
190+
if err != nil {
191+
log.Fatalf("PINNER_SCANNER_THREADS has an invalid value of '%s'", val)
192+
}
193+
cfg.ScannerThreads = int(st)
194+
}
182195
if val, ok = os.LookupEnv("PINNER_SLEEP_BETWEEN_SCANS"); ok {
183196
// Check for a bare number and interpret that as seconds.
184197
if _, err := strconv.ParseInt(val, 0, 0); err == nil {

database/skylink.go

+10-9
Original file line numberDiff line numberDiff line change
@@ -262,15 +262,16 @@ func (db *DB) RemoveServerFromSkylinks(ctx context.Context, skylinks []string, s
262262
// the given server.
263263
//
264264
// The MongoDB query is this:
265-
// db.getCollection('skylinks').find({
266-
// "pinned": { "$ne": false }},
267-
// "$expr": { "$lt": [{ "$size": "$servers" }, 2 ]},
268-
// "servers": { "$nin": [ "ro-tex.siasky.ivo.NOPE" ]},
269-
// "$or": [
270-
// { "lock_expires" : { "$exists": false }},
271-
// { "lock_expires" : { "$lt": new Date() }}
272-
// ]
273-
// })
265+
//
266+
// db.getCollection('skylinks').find({
267+
// "pinned": { "$ne": false }},
268+
// "$expr": { "$lt": [{ "$size": "$servers" }, 2 ]},
269+
// "servers": { "$nin": [ "ro-tex.siasky.ivo.NOPE" ]},
270+
// "$or": [
271+
// { "lock_expires" : { "$exists": false }},
272+
// { "lock_expires" : { "$lt": new Date() }}
273+
// ]
274+
// })
274275
func (db *DB) FindAndLockUnderpinned(ctx context.Context, server string, skipSkylinks []string, minPinners int) (skymodules.Skylink, error) {
275276
if skipSkylinks == nil {
276277
skipSkylinks = make([]string, 0)

main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func main() {
4444

4545
// Start the background scanner.
4646
skydClient := skyd.NewClient(cfg.SiaAPIHost, cfg.SiaAPIPort, cfg.SiaAPIPassword, skyd.NewCache(), logger)
47-
scanner := workers.NewScanner(db, logger, cfg.MinPinners, cfg.ServerName, cfg.SleepBetweenScans, skydClient)
47+
scanner := workers.NewScanner(db, logger, cfg.MinPinners, cfg.ScannerThreads, cfg.ServerName, cfg.SleepBetweenScans, skydClient)
4848
err = scanner.Start()
4949
if err != nil {
5050
log.Fatal(errors.AddContext(err, "failed to start Scanner"))

skyd/mock.go

+21-14
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,14 @@ func (c *ClientMock) DiffPinnedSkylinks(skylinks []string) (unknown []string, mi
9292

9393
// FileHealth returns the health of the given skylink.
9494
// Note that the mock will return 1 by default for any path whose health has not been set.
95-
func (c *ClientMock) FileHealth(sl skymodules.SiaPath) (float64, error) {
95+
func (c *ClientMock) FileHealth(sp skymodules.SiaPath) (float64, error) {
9696
c.mu.Lock()
9797
defer c.mu.Unlock()
98-
return c.fileHealth[sl], nil
98+
health, ok := c.fileHealth[sp]
99+
if !ok {
100+
return 1, nil
101+
}
102+
return health, nil
99103
}
100104

101105
// IsPinning checks whether skyd is pinning the given skylink.
@@ -129,10 +133,12 @@ func (c *ClientMock) Pin(skylink string) (skymodules.SiaPath, error) {
129133
return skymodules.SiaPath{}, ErrSkylinkAlreadyPinned
130134
}
131135
c.skylinks[skylink] = struct{}{}
132-
sp := skymodules.SiaPath{
133-
Path: skylink,
136+
var sl skymodules.Skylink
137+
err := sl.LoadString(skylink)
138+
if err != nil {
139+
return skymodules.SiaPath{}, err
134140
}
135-
return sp, nil
141+
return sl.SiaPath()
136142
}
137143

138144
// RebuildCache is a noop mock that takes at least 100ms.
@@ -226,15 +232,16 @@ func (c *ClientMock) SetUnpinError(e error) {
226232
// The mocked structure is the following:
227233
//
228234
// SkynetFolder/ (three dirs, one file)
229-
// dirA/ (two files, one skylink each)
230-
// fileA1 (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_A1)
231-
// fileA2 (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_A2)
232-
// dirB/ (one file, one dir)
233-
// dirC/ (one file, two skylinks)
234-
// fileC (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_C1, C2_uSb3BpGxmSbRAg1xj5T8SdB4hiSFiEW2sEEzxt5MNkg)
235-
// fileB (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q__B)
236-
// dirD/ (empty)
237-
// file (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q___)
235+
//
236+
// dirA/ (two files, one skylink each)
237+
// fileA1 (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_A1)
238+
// fileA2 (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_A2)
239+
// dirB/ (one file, one dir)
240+
// dirC/ (one file, two skylinks)
241+
// fileC (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_C1, C2_uSb3BpGxmSbRAg1xj5T8SdB4hiSFiEW2sEEzxt5MNkg)
242+
// fileB (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q__B)
243+
// dirD/ (empty)
244+
// file (CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q___)
238245
func (c *ClientMock) MockFilesystem() []string {
239246
slR0 := "CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q___"
240247
slA1 := "CAClyosjvI9Fg75N-LRylcfba79bam9Ljp-4qfxS08Q_A1"

test/tester.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,9 @@ func NewTester(dbName string) (*Tester, error) {
8888
// Start the HTTP server in a goroutine and gracefully stop it once the
8989
// cancel function is called and the context is closed.
9090
srv := &http.Server{
91-
Addr: ":" + testPortalPort,
92-
Handler: server,
91+
Addr: ":" + testPortalPort,
92+
Handler: server,
93+
ReadHeaderTimeout: time.Second,
9394
}
9495
go func() {
9596
_ = srv.ListenAndServe()

workers/scanner.go

+90-41
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"github.com/skynetlabs/pinner/lib"
77
"strings"
88
"sync"
9+
"sync/atomic"
910
"time"
1011

1112
"github.com/skynetlabs/pinner/conf"
@@ -109,11 +110,16 @@ type (
109110
Scanner struct {
110111
staticDB *database.DB
111112
staticLogger logger.Logger
113+
staticScannerThreads int
112114
staticServerName string
113115
staticSkydClient skyd.Client
114116
staticSleepBetweenScans time.Duration
115117
staticTG *threadgroup.ThreadGroup
116118

119+
// Stats variables:
120+
atomicCountPinned uint32
121+
scanStart time.Time
122+
117123
dryRun bool
118124
minPinners int
119125
// skipSkylinks is a list of skylinks which we want to skip during this
@@ -124,14 +130,15 @@ type (
124130
)
125131

126132
// NewScanner creates a new Scanner instance.
127-
func NewScanner(db *database.DB, logger logger.Logger, minPinners int, serverName string, customSleepBetweenScans time.Duration, skydClient skyd.Client) *Scanner {
133+
func NewScanner(db *database.DB, logger logger.Logger, minPinners int, threads int, serverName string, customSleepBetweenScans time.Duration, skydClient skyd.Client) *Scanner {
128134
sleep := customSleepBetweenScans
129135
if sleep == 0 {
130136
sleep = sleepBetweenScans
131137
}
132138
return &Scanner{
133139
staticDB: db,
134140
staticLogger: logger,
141+
staticScannerThreads: threads,
135142
staticServerName: serverName,
136143
staticSkydClient: skydClient,
137144
staticSleepBetweenScans: sleep,
@@ -238,7 +245,31 @@ func (s *Scanner) threadedScanAndPin() {
238245
s.staticLogger.Tracef("Start scanning")
239246
s.managedRefreshDryRun()
240247
s.managedRefreshMinPinners()
241-
s.managedPinUnderpinnedSkylinks()
248+
s.managedResetSkippedSkylinks()
249+
s.managedResetStats()
250+
251+
// Start a thread that will print intermediate scanning statistics.
252+
statsCh := make(chan struct{})
253+
err = s.staticTG.Add()
254+
if err != nil {
255+
return // the threadgroup is stopped
256+
}
257+
go s.threadedPrintStats(statsCh)
258+
259+
// Start N threads that will scan for underpinned skylinks and repin
260+
// them. It's possible that at first all of those start pinning skylinks
261+
// without properly respecting the MaxRepairingSkylinks limit. That's
262+
// expected and chosen because of the simplicity of the implementation.
263+
var wg sync.WaitGroup
264+
for i := 0; i < s.staticScannerThreads; i++ {
265+
wg.Add(1)
266+
go func() {
267+
defer wg.Done()
268+
s.managedPinUnderpinnedSkylinks()
269+
}()
270+
}
271+
wg.Wait()
272+
close(statsCh)
242273
s.staticLogger.Tracef("End scanning")
243274

244275
// Schedule the next scan, unless already scheduled:
@@ -249,6 +280,54 @@ func (s *Scanner) threadedScanAndPin() {
249280
}
250281
}
251282

283+
// threadedPrintStats prints regular updates on the scanning process plus a
284+
// final overview of the pinned and skipped skylinks.
285+
func (s *Scanner) threadedPrintStats(stopCh chan struct{}) {
286+
defer s.staticTG.Done()
287+
intermediateStatsTicker := time.NewTicker(printPinningStatisticsPeriod)
288+
defer intermediateStatsTicker.Stop()
289+
290+
select {
291+
case <-intermediateStatsTicker.C:
292+
// Print intermediate statistics.
293+
t1 := lib.Now()
294+
s.mu.Lock()
295+
numSkipped := len(s.skipSkylinks)
296+
startTime := s.scanStart
297+
s.mu.Unlock()
298+
s.staticLogger.Infof("Time %s, runtime %s, pinned skylinks %d, skipped skylinks %d",
299+
t1.Format(conf.TimeFormat), t1.Sub(startTime).String(), atomic.LoadUint32(&s.atomicCountPinned), numSkipped)
300+
case <-stopCh:
301+
// Print final statistics when finishing the method.
302+
t1 := lib.Now()
303+
s.mu.Lock()
304+
skipped := s.skipSkylinks
305+
startTime := s.scanStart
306+
s.mu.Unlock()
307+
s.staticLogger.Infof("Finished at %s, runtime %s, pinned skylinks %d, skipped skylinks %d",
308+
t1.Format(conf.TimeFormat), t1.Sub(startTime).String(), atomic.LoadUint32(&s.atomicCountPinned), len(skipped))
309+
s.staticLogger.Tracef("Skipped %d skylinks: %v", len(skipped), skipped)
310+
case <-s.staticTG.StopChan():
311+
s.staticLogger.Trace("Stop channel closed")
312+
return
313+
}
314+
}
315+
316+
// managedResetSkippedSkylinks resets the skipped skylinks.
317+
func (s *Scanner) managedResetSkippedSkylinks() {
318+
s.mu.Lock()
319+
s.skipSkylinks = []string{}
320+
s.mu.Unlock()
321+
}
322+
323+
// managedResetStats resets the scanning statistics.
324+
func (s *Scanner) managedResetStats() {
325+
s.mu.Lock()
326+
s.scanStart = lib.Now()
327+
s.mu.Unlock()
328+
atomic.StoreUint32(&s.atomicCountPinned, 0)
329+
}
330+
252331
// staticScheduleNextScan attempts to set the time of the next scan until either we
253332
// succeed, another server succeeds, or Scanner's TG is stopped. Returns true
254333
// when Scanner's TG is stopped.
@@ -292,26 +371,6 @@ func (s *Scanner) managedPinUnderpinnedSkylinks() {
292371
s.staticLogger.Trace("Entering managedPinUnderpinnedSkylinks")
293372
defer s.staticLogger.Trace("Exiting managedPinUnderpinnedSkylinks")
294373

295-
// Clear out the skipped skylinks from the previous run.
296-
s.mu.Lock()
297-
s.skipSkylinks = []string{}
298-
s.mu.Unlock()
299-
300-
intermediateStatsTicker := time.NewTicker(printPinningStatisticsPeriod)
301-
defer intermediateStatsTicker.Stop()
302-
countPinned := 0
303-
t0 := lib.Now()
304-
305-
// Print final statistics when finishing the method.
306-
defer func() {
307-
t1 := lib.Now()
308-
s.mu.Lock()
309-
skipped := s.skipSkylinks
310-
s.mu.Unlock()
311-
s.staticLogger.Infof("Finished at %s, runtime %s, pinned skylinks %d, skipped skylinks %d", t1.Format(conf.TimeFormat), t1.Sub(t0).String(), countPinned, len(skipped))
312-
s.staticLogger.Tracef("Skipped %d skylinks: %v", len(skipped), skipped)
313-
}()
314-
315374
for {
316375
// Check for service shutdown before talking to the DB.
317376
select {
@@ -321,21 +380,11 @@ func (s *Scanner) managedPinUnderpinnedSkylinks() {
321380
default:
322381
}
323382

324-
// Print intermediate statistics.
325-
select {
326-
case <-intermediateStatsTicker.C:
327-
t1 := lib.Now()
328-
s.mu.Lock()
329-
numSkipped := len(s.skipSkylinks)
330-
s.mu.Unlock()
331-
s.staticLogger.Infof("Time %s, runtime %s, pinned skylinks %d, skipped skylinks %d", t1.Format(conf.TimeFormat), t1.Sub(t0).String(), countPinned, numSkipped)
332-
default:
333-
}
334-
335383
skylink, sp, continueScanning, err := s.managedFindAndPinOneUnderpinnedSkylink()
336384
if !sp.IsEmpty() {
337-
countPinned++
338-
} else {
385+
atomic.AddUint32(&s.atomicCountPinned, 1)
386+
}
387+
if err != nil {
339388
s.staticLogger.Trace(err)
340389
}
341390
if !continueScanning {
@@ -345,7 +394,7 @@ func (s *Scanner) managedPinUnderpinnedSkylinks() {
345394
// already logged and the only indication it gives us is whether we
346395
// should wait for the file we pinned to become healthy or not. If there
347396
// is an error, then there is nothing to wait for.
348-
if err == nil && !sp.IsEmpty() {
397+
if !sp.IsEmpty() {
349398
// Block until the pinned skylink becomes healthy or until a timeout.
350399
s.staticWaitUntilHealthy(skylink, sp)
351400
continue
@@ -378,7 +427,6 @@ func (s *Scanner) managedFindAndPinOneUnderpinnedSkylink() (skylink skymodules.S
378427
s.mu.Unlock()
379428

380429
ctx := context.TODO()
381-
382430
sl, err := s.staticDB.FindAndLockUnderpinned(ctx, s.staticServerName, skipSkylinks, minPinners)
383431
if database.IsNoSkylinksNeedPinning(err) {
384432
return skymodules.Skylink{}, skymodules.SiaPath{}, false, err
@@ -466,9 +514,9 @@ func (s *Scanner) managedSkipSkylink(sl skymodules.Skylink) {
466514
// we pin another one. It returns a ballpark value.
467515
//
468516
// This method makes some assumptions for simplicity:
469-
// * assumes lazy pinning, meaning that none of the fanout is uploaded
470-
// * all skyfiles are assumed to be large files (base sector + fanout) and the
471-
// metadata is assumed to fill up the base sector (to err on the safe side)
517+
// - assumes lazy pinning, meaning that none of the fanout is uploaded
518+
// - all skyfiles are assumed to be large files (base sector + fanout) and the
519+
// metadata is assumed to fill up the base sector (to err on the safe side)
472520
func (s *Scanner) staticEstimateTimeToFull(skylink skymodules.Skylink) time.Duration {
473521
meta, err := s.staticSkydClient.Metadata(skylink.String())
474522
if err != nil {
@@ -527,7 +575,8 @@ func (s *Scanner) staticEligibleToPin(ctx context.Context) (bool, error) {
527575
pinnedData, err := s.staticDB.ServerLoad(ctx, s.staticServerName)
528576
if errors.Contains(err, database.ErrServerLoadNotFound) {
529577
// We don't know what the server's load is. Get that data.
530-
load, err := s.staticSkydClient.ContractData()
578+
var load uint64
579+
load, err = s.staticSkydClient.ContractData()
531580
if err != nil {
532581
return false, errors.AddContext(err, "failed to fetch server's load")
533582
}

0 commit comments

Comments
 (0)