@@ -1,207 +1,4 @@
-var http = require('http'),
-    XmlStream = require('xml-stream'),
-    url = require('url'),
-    fs = require('fs'),
-    validate = require('url-validator'),
-    argv = require('minimist')(process.argv.slice(2)),
-    Q = require('q'),
-    Crawler = require('simplecrawler'),
-    schedule = require('node-schedule'),
-    cron_parser = require('cron-parser'),
-    msg = require('./messages.js'),
-    _fs = require('./filesystem.js');
-
-var crawl,
-    index_sitemap,
-    format,
-    crawler,
-    cron_expression,
-    rendered_sitemaps_folder = __dirname + '/rendered_sitemaps',
-    cronJob,
-    dir_perm = 0766,
-    conditionID;
-
-/**
- * Determine if directory can be written
- */
-_fs.mkDir(rendered_sitemaps_folder, dir_perm);
-
-/**
- * The Index Sitemap that will be crawled for more links
- * @type {String} URL to the sitemap.
- */
-if (argv.sitemap_index_url === undefined) {
-    throw new Error(msg.undefined_sitemap);
-}
-
-// Validate if it is a valid URL
-index_sitemap = validate(argv.sitemap_index_url);
-if (index_sitemap === false) {
-    throw new Error(msg.improper_sitemap_url);
-}
-
-if (argv.cron_schedule === undefined) {
-    console.log(msg.cron_schedule);
-} else {
-    try {
-        cron_parser.parseExpression(argv.cron_schedule);
-        if (argv.cron_schedule) {
-            cron_expression = argv.cron_schedule
-            console.log(msg.cron_set(cron_expression));
-        }
-    } catch (err) {
-        throw new Error(msg.error_cron);
-    }
-}
-
-switch (argv.format) {
-    case undefined:
-        format = 'xml';
-        break;
-    case 'xml':
-        format = 'xml';
-        break;
-}
-
-console.log("Format to save is set to " + format);
-console.log("Index sitemap to crawl: " + index_sitemap + '\n');
-
-var httpGet = function(opts) {
-    var deferred = Q.defer();
-
-    http.get(opts, function(res) {
-        deferred.resolve(res);
-    }).on('error', function(e) {
-        deferred.reject(e);
-    });
-
-    return deferred.promise;
-};
-
-var write_dir = function(dir_name) {
-    var deferred = Q.defer();
-    if (!fs.existsSync(dir_name)) {
-        fs.mkdirSync(dir_name, 0766, function(err) {
-            if (err) {
-                deferred.reject(err);
-            } else {
-                deferred.resolve();
-            }
-        });
-    } else {
-        deferred.resolve();
-    }
-
-    return deferred.promise;
-}
-
-var loadBody = function(res) {
-    var deferred = Q.defer();
-    var body = '';
-    res.on("data", function(chunk) {
-        body += chunk;
-    });
-    res.on("end", function() {
-        deferred.resolve(body);
-    });
-    return deferred.promise;
-}
-
-/**
- * Strips the ending of the file_name within the sitemap to whatever other
- * extension you want. Does not actually change the file itself in any way.
- *
- * @param sitemap_filename $sitemap_filename
- * @param format $format
- * @access public
- * @return string
- */
-function format_file(sitemap_filename, format) {
-    if (format !== 'xml') {
-        sitemap_filename = sitemap_filename.replace(/xml/g, format);
-    }
-    return sitemap_filename;
-}
-
-/**
- * Hits the URL, awaits a response, then parses the response
- *
- * @param url_feed $url_feed
- * @param format $format
- * @access public
- * @return void
- */
-function jobCrawler(url_feed, format) {
-    if (format == 'xml') {
-        httpGet(url_feed).then(function(res) {
-            res.setEncoding('utf8');
-            if (res.statusCode === 200) {
-                return res;
-            } else {
-                throw new Error(msg.error_status_code(res.statusCode));
-            }
-        })
-        .then(function(res) {
-            loadBody(res)
-            .then(function(body) {
-                var host_name = url.parse(url_feed).hostname;
-                var file_name = format_file(url.parse(url_feed).pathname, format);
-                var dir_path = rendered_sitemaps_folder + '/' + host_name;
-                var file_path = dir_path + file_name;
-
-                _fs.mkDir(dir_path, dir_perm, file_path, body)
-                .then(function(file_path, file_body) {
-                    var file_s = {
-                        file_path: file_path,
-                        body: body
-                    };
-                    return file_s;
-                }, function(err) {
-                    if (err.errno === 47) { // File already exists
-                        var file_s = {
-                            file_path: file_path,
-                            body: body
-                        };
-                        return file_s;
-                    } else {
-                        throw new Error(err);
-                    }
-                })
-                .then(function(file_info) {
-                    _fs.mkFile(file_info.file_path, file_info.body);
-                }, function(err) {
-                    throw new Error(err);
-                });
-            }, console.error);
-        }, console.error);
-    }
-};
-
-function initiate_crawl(crawler, Crawler, index_sitemap, format) {
-    crawler = Crawler.crawl(index_sitemap);
-    crawler.interval = 2000; // 1000 = 1 second
-
-    crawler.on("fetchcomplete", function(queueItem, resBuffer, response) {
-        console.log("crawler has received %s (%d bytes)", queueItem.url, resBuffer.length);
-        jobCrawler(queueItem.url, format);
-    });
-
-    // Only parse XML documents and ignore all other links
-    conditionID = crawler.addFetchCondition(function(parsedURL) {
-        return parsedURL.path.match(/\.xml$/i);
-    });
-
-    crawler.on("complete", function(err, response) {
-        if (err) throw err;
-        console.log("Crawler has completed crawling the sitemap index.");
-    });
-}
-
-
-if (cron_expression) {
-    cronJob = schedule.scheduleJob(cron_expression, function() {
-        initiate_crawl(crawler, Crawler, index_sitemap, format);
-    });
-} else {
-    initiate_crawl(crawler, Crawler, index_sitemap, format);
-}
+var crawler = require('./crawler');
+
+// Start the crawling application
+crawler.init();
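
The new entry point delegates everything to a ./crawler module that is not included in this diff. Below is a minimal sketch of what that module's interface might look like, assuming init() simply wraps the argument parsing, scheduling, and crawl setup removed above; the structure is a guess based on the deleted code, not the actual committed module.

// crawler.js -- hypothetical sketch, not the module shipped in this commit
var Crawler = require('simplecrawler'),
    schedule = require('node-schedule'),
    argv = require('minimist')(process.argv.slice(2)),
    validate = require('url-validator');

// Start a crawl of the sitemap index, fetching only .xml URLs,
// mirroring the initiate_crawl() removed above.
function initiate_crawl(index_sitemap) {
    var crawler = Crawler.crawl(index_sitemap);
    crawler.interval = 2000; // milliseconds between requests

    // Skip any link that is not an XML document
    crawler.addFetchCondition(function(parsedURL) {
        return parsedURL.path.match(/\.xml$/i);
    });

    crawler.on("fetchcomplete", function(queueItem, resBuffer) {
        console.log("crawler has received %s (%d bytes)", queueItem.url, resBuffer.length);
    });
}

module.exports = {
    init: function() {
        // Validate the sitemap index URL, as the old top-level code did
        var index_sitemap = validate(argv.sitemap_index_url);
        if (index_sitemap === false) {
            throw new Error('Invalid sitemap index URL');
        }
        if (argv.cron_schedule) {
            // Re-run the crawl on the given cron schedule
            schedule.scheduleJob(argv.cron_schedule, function() {
                initiate_crawl(index_sitemap);
            });
        } else {
            initiate_crawl(index_sitemap);
        }
    }
};

Whatever its exact shape, moving the top-level logic behind an exported init() keeps the entry point trivial and lets the crawl flow be required and tested in isolation.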