11package  sync
22
33import  (
4+ 	"crypto/md5" 
5+ 	"encoding/hex" 
46	"fmt" 
57	"io" 
68	"io/ioutil" 
@@ -56,8 +58,11 @@ type DownloadTask struct {
5658	Uri        string 
5759	LocalPath  string 
5860	Uid        string 
61+ 	// uid key is common suffix between local path and remote uri 
62+ 	UidKey  string 
5963}
6064
65+ // parse bucket and key out of remote object URI 
6166func  parseObjectUri (uri  string ) (string , string , error ) {
6267	parts  :=  strings .SplitN (uri , "//" , 2 )
6368	if  len (parts ) !=  2  {
@@ -73,6 +78,27 @@ func parseObjectUri(uri string) (string, string, error) {
7378	return  pathParts [0 ], pathParts [1 ], nil 
7479}
7580
// uidKeyFromLocalPath derives the uid cache key for localPath by expressing
// it relative to localDir.
//
// The result and error are exactly those of filepath.Rel(localDir, localPath).
func uidKeyFromLocalPath(localDir string, localPath string) (string, error) {
	key, err := filepath.Rel(localDir, localPath)
	return key, err
}
84+ 
// uidFromLocalPath computes the UID of the file at localPath: the hex-encoded
// MD5 digest of the file content wrapped in double quotes, which is the
// quoted-hex form used by AWS S3 ETags.
//
// It returns an empty string and a non-nil error when the file cannot be
// opened or read. The underlying error is wrapped, so callers can inspect it
// with errors.Is / errors.As.
func uidFromLocalPath(localPath string) (string, error) {
	f, err := os.Open(localPath)
	if err != nil {
		return "", fmt.Errorf("Invalid file path for checksum calculation: %s, err: %w", localPath, err)
	}
	defer f.Close()

	// stream the content through the hash so large files are not held in memory
	h := md5.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", fmt.Errorf("Failed to calculate checksum for file: %s, err: %w", localPath, err)
	}

	// AWS S3 ETag is a quoted hex string; emit exactly `"<hex>"` with no
	// padding so the value can be compared byte-for-byte.
	return `"` + hex.EncodeToString(h.Sum(nil)) + `"`, nil
}
101+ 
76102func  (self  * Puller ) downloadHandler (task  DownloadTask , downloader  GenericDownloader ) {
77103	l  :=  zap .S ()
78104
@@ -120,9 +146,19 @@ func (self *Puller) downloadHandler(task DownloadTask, downloader GenericDownloa
120146
121147	// update cache with new object ID 
122148	self .uidLock .Lock ()
123- 	l .Debugw ("Updaing uid cache" , "key" , task .Uri , "val" , task .Uid )
124- 	self .uidCache [task .Uri ] =  task .Uid 
125- 	defer  self .uidLock .Unlock ()
149+ 	l .Debugw ("Updaing uid cache" , "key" , task .UidKey , "val" , task .Uid )
150+ 	self .uidCache [task .UidKey ] =  task .Uid 
151+ 	self .uidLock .Unlock ()
152+ }
153+ 
154+ func  (self  * Puller ) isPathExcluded (path  string ) bool  {
155+ 	for  _ , pattern  :=  range  self .exclude  {
156+ 		matched , _  :=  doublestar .Match (pattern , path )
157+ 		if  matched  {
158+ 			return  true 
159+ 		}
160+ 	}
161+ 	return  false 
126162}
127163
128164func  (self  * Puller ) handlePageList (
@@ -153,16 +189,9 @@ func (self *Puller) handlePageList(
153189			continue 
154190		}
155191		// ignore file that matches exclude rules 
156- 		shouldSkip  :=  false 
157- 		for  _ , pattern  :=  range  self .exclude  {
158- 			matched , _  :=  doublestar .Match (pattern , relPath )
159- 			if  matched  {
160- 				l .Debugf ("skipped %s due to exclude pattern: %s" , uri , pattern )
161- 				shouldSkip  =  true 
162- 				break 
163- 			}
164- 		}
192+ 		shouldSkip  :=  self .isPathExcluded (relPath )
165193		if  shouldSkip  {
194+ 			l .Debugf ("skipped %s due to exclude pattern" , uri )
166195			continue 
167196		}
168197
@@ -178,10 +207,11 @@ func (self *Puller) handlePageList(
178207
179208		self .fileListedCnt  +=  1 
180209
210+ 		uidKey  :=  relPath 
181211		self .uidLock .Lock ()
182- 		oldUid , ok  :=  self .uidCache [uri ]
212+ 		oldUid , ok  :=  self .uidCache [uidKey ]
183213		self .uidLock .Unlock ()
184- 		l .Debugf ("Comparing object UID: %s <> %s = %v " , oldUid ,  newUid ,  oldUid   ==  newUid )
214+ 		l .Debugf ("Comparing object UID: %s <> %s" , oldUid , newUid )
185215		if  ok  &&  oldUid  ==  newUid  {
186216			// skip update if uid is the same 
187217			continue 
@@ -192,6 +222,7 @@ func (self *Puller) handlePageList(
192222			Uri :       uri ,
193223			LocalPath : localPath ,
194224			Uid :       newUid ,
225+ 			UidKey :    uidKey ,
195226		}
196227	}
197228	return  true 
@@ -219,8 +250,10 @@ type Puller struct {
219250	filePulledCnt  int 
220251}
221252
222- func  (self  * Puller ) AddExcludePattern (pattern  string ) {
223- 	self .exclude  =  append (self .exclude , pattern )
253+ func  (self  * Puller ) AddExcludePatterns (patterns  []string ) {
254+ 	for  _ , pattern  :=  range  patterns  {
255+ 		self .exclude  =  append (self .exclude , pattern )
256+ 	}
224257}
225258
226259func  (self  * Puller ) Pull (remoteUri  string , localDir  string ) string  {
@@ -321,6 +354,76 @@ func (self *Puller) Pull(remoteUri string, localDir string) string {
321354	}
322355}
323356
357+ func  (self  * Puller ) PopulateChecksum (localDir  string ) {
358+ 	l  :=  zap .S ()
359+ 
360+ 	setFileChecksum  :=  func (path  string ) {
361+ 		f , err  :=  os .Open (path )
362+ 		if  err  !=  nil  {
363+ 			l .Errorf ("Invalid file path for checksum calculation: %s, err: %s" , path , err )
364+ 		}
365+ 		defer  f .Close ()
366+ 
367+ 		h  :=  md5 .New ()
368+ 		if  _ , err  :=  io .Copy (h , f ); err  !=  nil  {
369+ 			l .Errorf ("Failed to calculate checksum for file: %s, err: %s" , path , err )
370+ 		}
371+ 
372+ 		uidKey , err  :=  uidKeyFromLocalPath (localDir , path )
373+ 		if  err  !=  nil  {
374+ 			l .Errorf ("Failed to calculate uidKey for file: %s under dir: %s, err: %s" , path , localDir , err )
375+ 			return 
376+ 		}
377+ 
378+ 		uid , err  :=  uidFromLocalPath (path )
379+ 		if  err  !=  nil  {
380+ 			l .Errorf ("Failed to calculate UID: %s" , err )
381+ 			return 
382+ 		}
383+ 
384+ 		self .uidLock .Lock ()
385+ 		self .uidCache [uidKey ] =  uid 
386+ 		self .uidLock .Unlock ()
387+ 	}
388+ 
389+ 	err  :=  filepath .Walk (localDir , func (path  string , info  os.FileInfo , err  error ) error  {
390+ 		if  err  !=  nil  {
391+ 			return  err 
392+ 		}
393+ 
394+ 		// ignore file that matches exclude rules 
395+ 		shouldSkip  :=  false 
396+ 		relPath , err  :=  filepath .Rel (localDir , path )
397+ 		if  err  !=  nil  {
398+ 			l .Errorf ("Got invalid path from filepath.Walk: %s, err: %s" , path , err )
399+ 			shouldSkip  =  true 
400+ 		} else  {
401+ 			if  info .IsDir () {
402+ 				// this is so that pattern `foo/**` also matches `foo` 
403+ 				relPath  +=  "/" 
404+ 			}
405+ 			shouldSkip  =  self .isPathExcluded (relPath )
406+ 		}
407+ 
408+ 		if  info .IsDir () {
409+ 			if  shouldSkip  {
410+ 				return  filepath .SkipDir 
411+ 			}
412+ 		} else  {
413+ 			if  shouldSkip  {
414+ 				return  nil 
415+ 			}
416+ 
417+ 			setFileChecksum (path )
418+ 		}
419+ 		return  nil 
420+ 	})
421+ 
422+ 	if  err  !=  nil  {
423+ 		l .Errorf ("Failed to walk directory for populating file checksum, err: %s" , err )
424+ 	}
425+ }
426+ 
324427func  NewPuller () * Puller  {
325428	return  & Puller {
326429		workerCnt : 5 ,
0 commit comments