@@ -17,17 +17,17 @@ var crawl,
     crawler,
     cron_expression,
     rendered_sitemaps_folder = __dirname + '/rendered_sitemaps',
+    cronJob,
+    dir_perm = 0766,
     conditionID;
 
 /**
  * Determine if directory can be written
  */
-mkDir(rendered_sitemaps_folder, 0766)
+mkDir(rendered_sitemaps_folder, dir_perm)
     .then(null, function(err) {
-        if (err.errno === 47) {
-            console.log('/rendered_sitemaps already exists');
-        } else {
-            console.log(err);
+        if (err.errno !== 47) {
+            throw new Error(err);
         }
     });
 
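The mkDir helper this hunk calls is defined elsewhere in the file, not in this diff. For context, a minimal sketch of what such a helper presumably looks like, assuming the same Q deferred pattern the file already uses in loadBody, and noting that on Node builds of this era an existing directory rejected with errno 47 (EEXIST), which is why the caller above swallows that code:

    // Hypothetical promise wrapper over fs.mkdir; the real mkDir is outside this diff.
    var fs = require('fs');
    var Q = require('q');

    function mkDir(path, mode) {
        var deferred = Q.defer();
        fs.mkdir(path, mode, function(err) {
            if (err) {
                deferred.reject(err); // err.errno === 47 meant EEXIST on older Node
            } else {
                deferred.resolve(path);
            }
        });
        return deferred.promise;
    }
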
@@ -48,14 +48,19 @@ if (index_sitemap === false) {
 }
 
 if (argv.cron_schedule === undefined) {
-    console.log('You are able to set-up a custom cron-schedule via the command line for this process' + '\n'
-        + 'Using the following flag: --cron_schedule' + '\n');
+    console.log('\nYou are able to set-up a custom cron-schedule via the command line for this process' + '\n'
+        + 'Using the following flag: --cron_schedule=' + '\n');
 } else {
     try {
         cron_parser.parseExpression(argv.cron_schedule);
-        cron_expression = argv.cron_schedule
+        if (argv.cron_schedule) {
+            cron_expression = argv.cron_schedule;
+            console.log('Cron set to: ', cron_expression);
+        } else {
+            console.log('\nCron Expression is empty, please submit a proper cron expression.\n');
+        }
     } catch (err) {
-        console.log('Error parsing cron. Cron expression submitted is not properly formatted: ', err);
+        throw new Error('Error parsing cron. Cron expression submitted is not properly formatted: ' + err);
     }
 }
 
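For reference, cron-parser's parseExpression() throws on a malformed expression, which is exactly what this try/catch leans on. A quick standalone sketch (the flag value shown is illustrative):

    var cron_parser = require('cron-parser');

    try {
        // e.g. invoked as: node app.js --cron_schedule="*/10 * * * *"
        var interval = cron_parser.parseExpression('*/10 * * * *');
        console.log('Next run:', interval.next().toString());
    } catch (err) {
        console.log('Invalid cron expression:', err.message);
    }
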
@@ -112,18 +117,39 @@ var loadBody = function(res) {
     return deferred.promise;
 }
 
+/**
+ * Strips the ending of the file_name within the sitemap to whatever other
+ * extension you want. Does not actually change the file itself in any way.
+ *
+ * @param string sitemap_filename
+ * @param string format
+ * @access public
+ * @return string
+ */
 function format_file(sitemap_filename, format) {
     if (format !== 'xml') {
         sitemap_filename = sitemap_filename.replace(/xml/g, format);
     }
     return sitemap_filename;
 }
 
+/**
+ * Hits the URL, awaits a response, then parses the response
+ *
+ * @param string url_feed
+ * @param string format
+ * @access public
+ * @return void
+ */
 function jobCrawler(url_feed, format) {
     if (format == 'xml') {
         httpGet(url_feed).then(function(res) {
             res.setEncoding('utf8');
-            return res;
+            if (res.statusCode === 200) {
+                return res;
+            } else {
+                throw new Error('Status code was ' + res.statusCode + '. Not parsing because it was not a 200 OK.');
+            }
         }).then(function(res) {
             loadBody(res)
             .then(function(body) {
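
A quick usage sketch of format_file's contract; because the replace uses the global /xml/g pattern, any other occurrence of "xml" in the name is rewritten too:

    format_file('sitemap-products.xml', 'json'); // -> 'sitemap-products.json'
    format_file('sitemap.xml', 'xml');           // -> 'sitemap.xml' (unchanged)
    format_file('xml-news.xml', 'html');         // -> 'html-news.html' (both occurrences)
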
@@ -132,7 +158,7 @@ function jobCrawler(url_feed, format) {
             var dir_path = rendered_sitemaps_folder + '/' + host_name;
             var file_path = dir_path + file_name;
 
-            mkDir(dir_path, 0766)
+            mkDir(dir_path, dir_perm)
             .then(function() {
                 console.log('Directory ' + dir_path + ' has been written');
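
The httpGet helper jobCrawler chains from is also defined outside this diff; presumably it is a thin promise wrapper over http.get that resolves with the raw response, which is what lets the new code above inspect res.statusCode before handing the stream to loadBody. A hypothetical sketch:

    var http = require('http');
    var Q = require('q');

    function httpGet(url) {
        var deferred = Q.defer();
        http.get(url, function(res) {
            deferred.resolve(res); // resolve with the IncomingMessage itself
        }).on('error', function(err) {
            deferred.reject(err);
        });
        return deferred.promise;
    }
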
@@ -158,21 +184,32 @@ function jobCrawler(url_feed, format) {
     }
 };
 
-crawler = Crawler.crawl(index_sitemap);
-crawler.interval = 2000; // 1000 = 1 second
+function initiate_crawl(crawler, Crawler, index_sitemap, format) {
+    crawler = Crawler.crawl(index_sitemap);
+    crawler.interval = 2000; // 1000 = 1 second
 
-// Crawler engage
-crawler.on("fetchcomplete", function(queueItem, resBuffer, response) {
-    console.log("crawler has received %s (%d bytes)", queueItem.url, resBuffer.length);
-    jobCrawler(queueItem.url, format);
-});
+    crawler.on("fetchcomplete", function(queueItem, resBuffer, response) {
+        console.log("crawler has received %s (%d bytes)", queueItem.url, resBuffer.length);
+        jobCrawler(queueItem.url, format);
+    });
 
-// Only parse XML documents and ignore all other links
-conditionID = crawler.addFetchCondition(function(parsedURL) {
-    return parsedURL.path.match(/\.xml$/i);
-});
+    // Only parse XML documents and ignore all other links
+    conditionID = crawler.addFetchCondition(function(parsedURL) {
+        return parsedURL.path.match(/\.xml$/i);
+    });
+
+    crawler.on("complete", function(err, response) {
+        if (err) throw err;
+        console.log("Crawler has completed crawling the sitemap index.");
+    });
+}
+
+if (cron_expression) {
+    cronJob = schedule.scheduleJob(cron_expression, function() {
+        initiate_crawl(crawler, Crawler, index_sitemap, format);
+    });
+} else {
+    initiate_crawl(crawler, Crawler, index_sitemap, format);
+}
 
-crawler.on("complete", function(err, response) {
-    if (err) throw err;
-    console.log("Crawler has completed crawling the sitemap index.");
-});
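
The schedule object is required elsewhere in the file; its scheduleJob(expression, callback) signature matches the node-schedule package, so the scheduling pattern presumably boils down to this standalone sketch (the cron string is illustrative):

    var schedule = require('node-schedule');

    // Fires on the cron cadence; cancel with job.cancel() when done.
    var job = schedule.scheduleJob('*/30 * * * *', function() {
        console.log('Kicking off a crawl at', new Date().toISOString());
        // initiate_crawl(crawler, Crawler, index_sitemap, format);
    });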