+# encoding: utf-8
 require 'fileutils'
+require 'open-uri'
+require 'rss'

 gem 'spidr', '~> 0.4'
 require 'spidr'

 gem 'kramdown', '~> 0.13'
 require 'kramdown'

+HOST = 'www.ruby-lang.org'
 OUTPUT_DIR = '_import'
-LAYOUTS = {
-  :default => 'page',
-  :post => 'news_post'
-}

-desc 'Spiders ruby-lang.org and converts HTML to Markdown'
-task :import do
-  Spidr.site('http://www.ruby-lang.org/index.html') do |agent|
-    agent.ignore_links_like /\/cgi-bin\//
-    agent.ignore_links_like /\.cgi[\/]?$/
-    agent.ignore_links_like /\/[a-z_]+\/old-man\//
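+# Maps a spidered URL to a local file path under OUTPUT_DIR:
+# `.html` pages become `.md`, and directory URLs gain an `index.md`.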
+def url_to_path(url)
+  local_path = File.join(OUTPUT_DIR,url.path[1..-1])

-    agent.every_ok_page do |page|
-      path = page.url.path[1..-1]
+  case File.extname(local_path)
+  when '.html'
+    local_path.chomp!('.html') << '.md'
+  when ''
+    local_path << '/' unless local_path.end_with?('/')
+    local_path << 'index.md'
+  end
+
+  return local_path
+end
+
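+# Scrubs a content element (comments, page anchors, span.caps wrappers,
+# pre/code formatting) and converts its inner HTML to Markdown.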
+def html_to_markdown(content_div)
+  # remove all comments
+  content_div.traverse do |node|
+    node.remove if node.comment?
+  end
+
+  # remove all page anchors
+  content_div.search('//a[@id]').remove

-      layout = :default
+  # replace all caps spans with their text
+  content_div.search('span.caps').each do |span|
+    span.replace(span.inner_text)
+  end

-      if path =~ %r{^[a-z_-]+/news/\d{4}/\d{1,2}/\d{1,2}/[^/]+/$}
-        # map news posts in to news/_posts/
-        dirs = path.split('/')
-        local_path = File.join(OUTPUT_DIR,dirs[0,2],'_posts',dirs[2..-1].join('-')) + '.md'
+  # remove the 'class' attribute from all pre tags
+  content_div.search('pre').remove_attr('class')

-        layout = :post
+  # map all code elements to their inner_text
+  content_div.search('pre > code').each do |code|
+    code.replace(code.children.map { |node|
+      if node.name == 'br'
+        $/
       else
-        # normal page
-        local_path = File.join(OUTPUT_DIR,path)
-
-        case File.extname(local_path)
-        when '.html'
-          local_path.gsub!(/\.html$/,'.md')
-        when ''
-          local_path += '/' unless local_path.end_with?('/')
-          local_path += 'index.md'
+        node.inner_text
+      end
+    }.join)
+  end
+
+  # replace the #extended div with its children
+  if (extended_div = content_div.at('#extended'))
+    extended_div.replace(extended_div.inner_html)
+  end
+
+  # convert from HTML to Markdown
+  return Kramdown::Document.new(
+    content_div.inner_html,
+    :input => :html
+  ).to_kramdown
+end
+
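+# import:pages spiders the static site; import:news crawls the posts
+# discovered through each language's RSS feed.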
+namespace :import do
+  desc 'Spiders ruby-lang.org and converts HTML to Markdown'
+  task :pages do
+    Spidr.site("http://#{HOST}/index.html") do |agent|
+      agent.ignore_links_like(/\/cgi-bin\//)
+      agent.ignore_links_like(/\.cgi[\/]?$/)
+      agent.ignore_links_like(/\/[a-z_]+\/feeds\//)
+      agent.ignore_links_like(/\/[a-z_]+\/news\//)
+      agent.ignore_links_like(/\/[a-z_]+\/old-man\//)
+
+      agent.every_ok_page do |page|
+        local_path = url_to_path(page.url)
+
+        # ensure the parent directory exists
+        mkdir_p File.dirname(local_path)
+
+        # don't overwrite existing files
+        unless File.exist?(local_path)
+          puts "Importing #{page.url} -> #{local_path} ..."
+
+          File.open(local_path,'w') do |file|
+            if page.html?
+              title = page.title.strip
+              lang = page.url.path[1..-1].split('/',2).first
+
+              # add the YAML front matter
+              file.puts(
+                '---',
+                "layout: default",
+                "title: #{title.inspect}",
+                "lang: #{lang}",
+                '---',
+                ''
+              )
+
+              if (content_div = page.at('#content'))
+                file.puts(html_to_markdown(content_div))
+              end
+            else
+              file.write(page.body)
+            end
+          end
         end
       end
+    end
+  end

-      # ensure the parent directory exists
-      FileUtils.mkdir_p(File.dirname(local_path))
+  desc "Imports news posts from the RSS feed"
+  task :news do
+    languages = %w[bg de en es fr id it ja ko pl pt tr zh_TW zh_cn]
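+    # per-language byline patterns, used to extract each post's author
+    # from the monthly archive pages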
+    by_lines = {
+      'bg'    => /Публикувана от (.+) на/,
+      'de'    => /Geschrieben von (.+) am/,
+      'en'    => /Posted by (.+) on/,
+      'es'    => /Publicado por (.+) Caro el/,
+      'fr'    => /par (.+)/,
+      'id'    => /Ditulis oleh (.+) tanggal/,
+      'it'    => /Inserito da (.+) il/,
+      'ja'    => /Posted by (.+) on/,
+      'ko'    => /작성자 (.+) \(/,
+      'pl'    => /Zamieszczone przez (.+) \d+/,
+      'pt'    => /Escrito por (.+) em/,
+      'tr'    => /Posted by (.+) on/,
+      'zh_TW' => /Posted by (.+) on/,
+      'zh_cn' => /由 (.+) 发表于/
+    }
+
+    Spidr.host(HOST) do |agent|
+      languages.each do |lang|
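+        # the Portuguese site localizes its news directory and feed names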
+        feed, news_dir = case lang
+                         when 'pt' then ['noticias', 'noticias-recentes']
+                         else ['news', 'news']
+                         end
+
+        agent.visit_urls_like do |url|
+          url.path.start_with?("/#{lang}/#{news_dir}/")
+        end

-      # don't overwrite existing files
-      unless File.exist?(local_path)
-        puts "Saving #{page.url} -> #{local_path} ..."
+        agent.enqueue("http://#{HOST}/#{lang}/#{news_dir}/")

-        File.open(local_path,'w') do |file|
-          if page.html?
-            title = page.title.strip
-            lang = path.split('/',2).first
+        begin
+          rss = RSS::Parser.parse(open("http://#{HOST}/#{lang}/feeds/#{feed}.rss"))
+          rss.items.each do |item|
+            puts "Queuing #{item.link} ..."
+            agent.enqueue(item.link)
+          end
+        rescue OpenURI::HTTPError
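+          # not every language publishes a feed; skip 404s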
+        end
+      end

+      agent.every_ok_page do |page|
+        lang, news_dir, year, month, day, slug = page.url.path[1..-2].split('/')
+        title = page.title.strip
+
+        if page.url.path =~ /^\/#{lang}\/#{news_dir}\/\d{4}\/\d{2}\/\d{2}\//
+          # news post
+          local_path = File.join(OUTPUT_DIR,lang,news_dir,'_posts',"#{year}-#{month}-#{day}-#{slug}.md")
+          layout = 'news_post'
+          author = nil
+
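+          # post pages carry no byline, so recover the author from the
+          # monthly archive page that links back to this post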
+          archive_url = URI("http://#{HOST}/#{lang}/#{news_dir}/#{year}/#{month}/")
+          begin
+            agent.get_page(archive_url) do |archive|
+              if archive.is_ok?
+                if (post_div = archive.at("//div[@class='post']/h3/a[@href=#{page.url.path.dump}]/../.."))
+                  post_info = post_div.at("//p[@class='post-info']").inner_text
+
+                  author = if (match = post_info.match(by_lines[lang]))
+                             match[1]
+                           else
+                             ''
+                           end
+                end
+              end
+            end
+          rescue Net::HTTPNotFound
+          end
+        else
+          # archive page
+          local_path = url_to_path(page.url)
+          layout = 'default'
+        end
+
+        # ensure the parent directory exists
+        FileUtils.mkdir_p File.dirname(local_path)
+
+        unless File.exist?(local_path)
+          puts "Importing #{page.url} -> #{local_path} ..."
+
+          File.open(local_path,'w') do |file|
             # add the YAML front matter
             file.puts(
               '---',
-              "layout: #{LAYOUTS[layout]}",
-              "title: #{title.inspect}",
+              "layout: #{layout}",
+              "title: #{title.inspect}"
+            )
+
+            if author
+              file.puts "author: #{author.inspect}"
+            end
+
+            file.puts(
               "lang: #{lang}",
               '---',
               ''
             )

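+            # news posts render in div.post; other pages fall back to #content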
-            if (content_div = page.at('#content'))
-              # remove all comments
-              content_div.traverse do |node|
-                node.remove if node.comment?
-              end
-
-              # remove all page anchors
-              content_div.search('//a[@id]').remove
+            content_div = page.at('div.post') || page.at('#content')

-              # replace all caps spans with their text
-              content_div.search('span.caps').each do |span|
-                span.replace(span.inner_text)
-              end
-
-              # remove the 'class' attribute from all pre tags
-              content_div.search('pre').remove_attr('class')
-
-              # map all code elements to their inner_text
-              content_div.search('pre > code').each do |code|
-                code.replace(code.children.map { |node|
-                  if node.name == 'br'
-                    $/
-                  else
-                    node.inner_text
-                  end
-                }.join)
-              end
-
-              # replace the #extended div with it's children
-              if (extended_div = content_div.at('#extended'))
-                extended_div.replace(extended_div.inner_html)
-              end
-
-              # convert from HTML to Markdown
-              content = Kramdown::Document.new(
-                content_div.inner_html,
-                :input => :html
-              ).to_kramdown
-
-              file.puts(content)
-            end
-          else
-            file.write(page.body)
+            file.puts(html_to_markdown(content_div))
           end
         end
       end
+    end
+  end
+end