
Commit dfe4274

Setting up crawler options in a function; it can be executed via a cron expression as well.
1 parent c126de3 commit dfe4274

2 files changed, +66 −26 lines changed

.gitignore

+3
@@ -29,3 +29,6 @@ node_modules
 
 # Created Rendered Sitemaps
 rendered_sitemaps
+
+# Ignore VIM files
+*.swp

app.js

+63 −26 lines changed
@@ -17,17 +17,17 @@ var crawl,
     crawler,
     cron_expression,
     rendered_sitemaps_folder = __dirname + '/rendered_sitemaps',
+    cronJob,
+    dir_perm = 0766,
     conditionID;
 
 /**
  * Determine if directory can be written
  */
-mkDir(rendered_sitemaps_folder, 0766)
+mkDir(rendered_sitemaps_folder, dir_perm)
 .then(null, function(err){
-    if(err.errno === 47) {
-        console.log('/rendered_sitemaps already exists');
-    } else {
-        console.log(err);
+    if(err.errno !== 47) {
+        throw new Error(err);
     }
 });
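For context, a minimal stand-alone sketch of the pattern this hunk relies on: a promise-returning mkDir helper whose caller ignores the "directory already exists" error (errno 47 in this codebase) and rethrows everything else. The helper body and the use of the native Promise are assumptions for illustration; only the call shape comes from the diff.

var fs = require('fs');
var dir_perm = 0766; // same permission mask the commit introduces

// Hypothetical promise wrapper; app.js's actual mkDir is not shown in this diff.
function mkDir(dir_path, mode) {
    return new Promise(function(resolve, reject) {
        fs.mkdir(dir_path, mode, function(err) {
            if (err) { reject(err); } else { resolve(dir_path); }
        });
    });
}

// Mirrors the updated error handling: swallow "already exists", rethrow the rest.
mkDir(__dirname + '/rendered_sitemaps', dir_perm)
    .then(null, function(err) {
        if (err.errno !== 47) {
            throw new Error(err);
        }
    });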

@@ -48,14 +48,19 @@ if (index_sitemap === false) {
 }
 
 if (argv.cron_schedule === undefined) {
-    console.log('You are able to set-up a custom cron-schedule via the command line for this process' + '\n'
-        + 'Using the following flag: --cron_schedule' + '\n');
+    console.log('\nYou are able to set-up a custom cron-schedule via the command line for this process' + '\n'
+        + 'Using the following flag: --cron_schedule=' + '\n');
 } else {
     try {
         cron_parser.parseExpression(argv.cron_schedule);
-        cron_expression = argv.cron_schedule
+        if (argv.cron_schedule) {
+            cron_expression = argv.cron_schedule
+            console.log('Cron set to: ', cron_expression);
+        } else {
+            console.log('\n Cron Expression is empty, please submit a proper cron expression. \n');
+        }
     } catch (err) {
-        console.log('Error parsing cron. Cron expression submitted is not properly formatted: ', err);
+        throw new Error('Error parsing cron. Cron expression submitted is not properly formatted: ', err);
     }
 }
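As a rough illustration of the validation path above (not part of the commit), cron-parser's parseExpression throws on a malformed expression, which is what the try/catch turns into a hard error. The helper name and the sample expression below are made up.

var cron_parser = require('cron-parser');

function validate_cron(expression) {
    try {
        cron_parser.parseExpression(expression); // throws if the expression is malformed
        console.log('Cron set to: ', expression);
        return expression;
    } catch (err) {
        throw new Error('Error parsing cron. Cron expression submitted is not properly formatted: ' + err.message);
    }
}

// e.g. node app.js --cron_schedule="*/30 * * * *"   (crawl every 30 minutes)
validate_cron('*/30 * * * *');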

@@ -112,18 +117,39 @@ var loadBody = function(res) {
     return deferred.promise;
 }
 
+/**
+ * Strips the ending of the file_name within the sitemap to whatever other
+ * extension you want. Does not actually change the file itself in any way.
+ *
+ * @param sitemap_filename $sitemap_filename
+ * @param format $format
+ * @access public
+ * @return string
+ */
 function format_file(sitemap_filename, format) {
     if (format !== 'xml') {
         sitemap_filename = sitemap_filename.replace(/xml/g, format);
     }
     return sitemap_filename;
 }
 
+/**
+ * Hits the URL, awaits a response, then parses the response
+ *
+ * @param url_feed $url_feed
+ * @param format $format
+ * @access public
+ * @return void
+ */
 function jobCrawler(url_feed, format) {
     if (format == 'xml') {
         httpGet(url_feed).then(function(res) {
             res.setEncoding('utf8');
-            return res;
+            if (res.statusCode === 200) {
+                return res;
+            } else {
+                throw new Error('Status code was ' + res.statusCode + '. Not parsing because it was not a 200 OK.')
+            }
         }).then(function(res) {
             loadBody(res)
             .then(function(body) {
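A quick worked example of the newly documented format_file helper (the filenames are made up): because the replacement uses /xml/g, every occurrence of "xml" in the name is rewritten, not just the extension.

format_file('sitemap-products.xml', 'html'); // -> 'sitemap-products.html'
format_file('sitemap-products.xml', 'xml');  // -> 'sitemap-products.xml' (unchanged)
format_file('xml-index.xml', 'json');        // -> 'json-index.json' (note: the prefix is rewritten too)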
@@ -132,7 +158,7 @@ function jobCrawler(url_feed, format) {
                 var dir_path = rendered_sitemaps_folder+'/'+host_name;
                 var file_path = dir_path+file_name;
 
-                mkDir(dir_path, 0766)
+                mkDir(dir_path, dir_perm)
                 .then(function() {
                     console.log('Directory ' + dir_path + ' has been written');

@@ -158,21 +184,32 @@ function jobCrawler(url_feed, format) {
     }
 };
 
-crawler = Crawler.crawl(index_sitemap);
-crawler.interval = 2000; // 1000 = 1 second
+function initiate_crawl(crawler, Crawler, index_sitemap, format) {
+    crawler = Crawler.crawl(index_sitemap);
+    crawler.interval = 2000; // 1000 = 1 second
 
-// Crawler engage
-crawler.on("fetchcomplete", function(queueItem, resBuffer, response) {
-    console.log("crawler has received %s (%d bytes)",queueItem.url,resBuffer.length);
-    jobCrawler(queueItem.url, format);
-});
+    crawler.on("fetchcomplete", function(queueItem, resBuffer, response) {
+        console.log("crawler has received %s (%d bytes)",queueItem.url,resBuffer.length);
+        jobCrawler(queueItem.url, format);
+    });
 
-// Only parse XML documents and ignore all other links
-conditionID = crawler.addFetchCondition(function(parsedURL) {
-    return parsedURL.path.match(/\.xml$/i);
-});
+    // Only parse XML documents and ignore all other links
+    conditionID = crawler.addFetchCondition(function(parsedURL) {
+        return parsedURL.path.match(/\.xml$/i);
+    });
+
+    crawler.on("complete", function(err, response) {
+        if (err) throw err;
+        console.log("Crawler has completed crawling the sitemap index.");
+    });
+}
+
+
+if (cron_expression) {
+    cronJob = schedule.scheduleJob(cron_expression, function() {
+        initiate_crawl(crawler, Crawler, index_sitemap, format);
+    });
+} else {
+    initiate_crawl(crawler, Crawler, index_sitemap, format);
+}
 
-crawler.on("complete", function(err, response) {
-    if (err) throw err;
-    console.log("Crawler has completed crawling the sitemap index.");
-});
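To make the new control flow concrete, here is a minimal sketch (not part of the commit) of the scheduled path. The scheduleJob(expression, callback) call matches node-schedule's API, which is presumably what `schedule` is required as; the cron string is only an example.

var schedule = require('node-schedule'); // assumption: the `schedule` used in app.js

var cron_expression = '0 3 * * *'; // example: every day at 03:00

var cronJob = schedule.scheduleJob(cron_expression, function() {
    // In app.js this callback calls initiate_crawl(...), re-crawling the sitemap index.
    console.log('Scheduled crawl fired at', new Date());
});

// Without --cron_schedule, app.js skips scheduling and calls initiate_crawl(...) once, immediately.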
