
Commit 1a8844b

massive refactoring of the crawler to make it into a separate object from app.js.
1 parent b5615ca commit 1a8844b

File tree

2 files changed: +206 −207 lines changed


app.js

+3 −207
@@ -1,208 +1,4 @@
-var http = require('http'),
-    XmlStream = require('xml-stream'),
-    url = require('url'),
-    fs = require('fs'),
-    validate = require('url-validator'),
-    argv = require('minimist')(process.argv.slice(2)),
-    Q = require('q'),
-    Crawler = require('simplecrawler'),
-    schedule = require('node-schedule'),
-    cron_parser = require('cron-parser'),
-    msg = require('./messages.js'),
-    _fs = require('./filesystem.js');
-
-var crawl,
-    index_sitemap,
-    format,
-    crawler,
-    cron_expression,
-    rendered_sitemaps_folder = __dirname + '/rendered_sitemaps',
-    cronJob,
-    dir_perm = 0766,
-    conditionID;
-
-/**
- * Determine if directory can be written
- */
-_fs.mkDir(rendered_sitemaps_folder, dir_perm);
-
-/**
- * The Index Sitemap that will be crawled for more links
- * @type {String} URL to the sitemap.
- */
-if (argv.sitemap_index_url === undefined) {
-    throw new Error(msg.undefined_sitemap);
-}
-
-// Validate if it is a valid URL
-index_sitemap = validate(argv.sitemap_index_url);
-if (index_sitemap === false) {
-    throw new Error(msg.improper_sitemap_url);
-}
-
-if (argv.cron_schedule === undefined) {
-    console.log(msg.cron_schedule);
-} else {
-    try {
-        cron_parser.parseExpression(argv.cron_schedule);
-        if (argv.cron_schedule) {
-            cron_expression = argv.cron_schedule;
-            console.log(msg.cron_set(cron_expression));
-        }
-    } catch (err) {
-        throw new Error(msg.error_cron);
-    }
-}
-
-switch (argv.format) {
-    case undefined:
-        format = 'xml';
-        break;
-    case 'xml':
-        format = 'xml';
-        break;
-}
-
-console.log("Format to save is set to " + format);
-console.log("Index sitemap to crawl: " + index_sitemap + '\n');
-
-var httpGet = function(opts) {
-    var deferred = Q.defer();
-
-    http.get(opts, function(res) {
-        deferred.resolve(res);
-    }).on('error', function(e) {
-        deferred.reject(e);
-    });
-
-    return deferred.promise;
-};
-
-var write_dir = function(dir_name) {
-    var deferred = Q.defer();
-    if (!fs.existsSync(dir_name)) {
-        fs.mkdirSync(dir_name, 0766, function(err) {
-            if (err) {
-                deferred.reject(err);
-            } else {
-                deferred.resolve();
-            }
-        });
-    } else {
-        deferred.resolve();
-    }
-
-    return deferred.promise;
-}
-
-var loadBody = function(res) {
-    var deferred = Q.defer();
-    var body = '';
-    res.on("data", function(chunk) {
-        body += chunk;
-    });
-    res.on("end", function() {
-        deferred.resolve(body);
-    });
-    return deferred.promise;
-}
-
-/**
- * Strips the ending of the file_name within the sitemap to whatever other
- * extension you want. Does not actually change the file itself in any way.
- *
- * @param sitemap_filename $sitemap_filename
- * @param format $format
- * @access public
- * @return string
- */
-function format_file(sitemap_filename, format) {
-    if (format !== 'xml') {
-        sitemap_filename = sitemap_filename.replace(/xml/g, format);
-    }
-    return sitemap_filename;
-}
-
-/**
- * Hits the URL, awaits a response, then parses the response
- *
- * @param url_feed $url_feed
- * @param format $format
- * @access public
- * @return void
- */
-function jobCrawler(url_feed, format) {
-    if (format == 'xml') {
-        httpGet(url_feed).then(function(res) {
-            res.setEncoding('utf8');
-            if (res.statusCode === 200) {
-                return res;
-            } else {
-                throw new Error(msg.error_status_code(res.statusCode));
-            }
-        })
-        .then(function(res) {
-            loadBody(res)
-            .then(function(body) {
-                var host_name = url.parse(url_feed).hostname;
-                var file_name = format_file(url.parse(url_feed).pathname, format);
-                var dir_path = rendered_sitemaps_folder + '/' + host_name;
-                var file_path = dir_path + file_name;
-
-                _fs.mkDir(dir_path, dir_perm, file_path, body)
-                .then(function(file_path, file_body) {
-                    var file_s = {
-                        file_path: file_path,
-                        body: body
-                    };
-                    return file_s;
-                }, function(err) {
-                    if (err.errno === 47) { // File already exists
-                        var file_s = {
-                            file_path: file_path,
-                            body: body
-                        };
-                        return file_s;
-                    } else {
-                        throw new Error(err);
-                    }
-                })
-                .then(function(file_info) {
-                    _fs.mkFile(file_info.file_path, file_info.body);
-                }, function(err) {
-                    throw new Error(err);
-                });
-            }, console.error);
-        }, console.error);
-    }
-};
-
-function initiate_crawl(crawler, Crawler, index_sitemap, format) {
-    crawler = Crawler.crawl(index_sitemap);
-    crawler.interval = 2000; // 1000 = 1 second
-
-    crawler.on("fetchcomplete", function(queueItem, resBuffer, response) {
-        console.log("crawler has received %s (%d bytes)", queueItem.url, resBuffer.length);
-        jobCrawler(queueItem.url, format);
-    });
-
-    // Only parse XML documents and ignore all other links
-    conditionID = crawler.addFetchCondition(function(parsedURL) {
-        return parsedURL.path.match(/\.xml$/i);
-    });
-
-    crawler.on("complete", function(err, response) {
-        if (err) throw err;
-        console.log("Crawler has completed crawling the sitemap index.");
-    });
-}
-
-
-if (cron_expression) {
-    cronJob = schedule.scheduleJob(cron_expression, function() {
-        initiate_crawl(crawler, Crawler, index_sitemap, format);
-    });
-} else {
-    initiate_crawl(crawler, Crawler, index_sitemap, format);
-}
+var crawler = require('./crawler');
 
+// Start the crawling application
+crawler.init();
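The second file in this commit, crawler.js, is not shown in the excerpt above; only its exported init() is implied by the new app.js. A minimal sketch of what that module might look like, reusing the simplecrawler setup removed from app.js (the sitemap URL is a placeholder, and everything beyond the init() export is an assumption):

// crawler.js — hypothetical sketch; the actual committed file is not shown here.
// Only the init() export is implied by the new app.js; the body below simply
// mirrors the initiate_crawl() logic deleted from app.js in this commit.
var Crawler = require('simplecrawler');

module.exports = {
    init: function() {
        // Placeholder URL; the real module presumably still reads
        // --sitemap_index_url from the command line, as the old app.js did.
        var crawler = Crawler.crawl('http://example.com/sitemap-index.xml');
        crawler.interval = 2000; // 1000 = 1 second

        // Only fetch XML documents, as the old addFetchCondition did
        crawler.addFetchCondition(function(parsedURL) {
            return parsedURL.path.match(/\.xml$/i);
        });

        crawler.on('fetchcomplete', function(queueItem, resBuffer) {
            console.log('crawler has received %s (%d bytes)', queueItem.url, resBuffer.length);
        });

        crawler.on('complete', function() {
            console.log('Crawler has completed crawling the sitemap index.');
        });
    }
};

Keeping app.js as a two-line entry point and hiding the crawl wiring behind a single init() call is what makes the +3 −207 split above so lopsided.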
