Ported scrAPI to Ruby 1.9.3 (unfortunately 1.9.2 will not work because of a bug in Ruby itself).

Christoph Lupprich · Christoph Lupprich · commit 22d49012a202 · 2010-11-10T17:23:48.000+01:00
diff --git a/README.rdoc b/README.rdoc
@@ -40,13 +40,16 @@ To get the latest source code with regular updates:
 
 svn co http://labnotes.org/svn/public/ruby/scrapi
 
+== Version of Ruby
+
+Currently scrAPI does not run on Ruby 1.9.2; it requires a development version of Ruby 1.9.3. This is due to a bug in Ruby's visibility context handling (see changelog #29578 and bug #3406 on the official Ruby site). Installing the most recent development version of Ruby is easy with RVM (http://rvm.beginrescueend.com/).
 
 == Using TIDY
 
-By default scrAPI uses Tidy to cleanup the HTML.
+By default scrAPI uses Tidy (actually Tidy-FFI) to clean up the HTML.
 
 You need to install the Tidy Gem for Ruby:
-  gem install tidy
+  gem install tidy_ffi
 
 And the Tidy binary libraries, available here:
 
@@ -56,15 +59,15 @@ By default scrAPI looks for the Tidy DLL (Windows) or shared library (Linux) in
 
 Alternatively, just point Tidy to the library with:
 
-  Tidy.path = "...."
+  TidyFFI.library_path = "...."
 
 On Linux this would probably be:
 
-  Tidy.path = "/usr/local/lib/libtidy.so"
+  TidyFFI.library_path = "/usr/local/lib/libtidy.so"
 
 On OS/X this would probably be:
 
-  Tidy.path = “/usr/lib/libtidy.dylib”
+  TidyFFI.library_path = "/usr/lib/libtidy.dylib"
 
 For testing purposes, you can also use the built in HTML parser. It's useful for testing and getting up to grabs with scrAPI, but it doesn't deal well with broken HTML. So for testing only:
 
@@ -86,3 +89,5 @@ HTML DOM extracted from Rails, Copyright (c) 2004 David Heinemeier Hansson. Unde
 
 HTML parser by Takahiro Maebashi and Katsuyuki Komatsu, Ruby license.
 http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html
+
+Porting to Ruby 1.9.x by Christoph Lupprich, http://lupprich.info
diff --git a/Rakefile b/Rakefile
@@ -1,6 +1,5 @@
 require "benchmark"
 require "rubygems"
-Gem::manage_gems
 require "rake"
 require "rake/testtask"
 require "rake/rdoctask"
diff --git a/lib/scraper/base.rb b/lib/scraper/base.rb
@@ -906,10 +906,10 @@ def request(url, options)
     #   end
     def skip(elements = nil)
       case elements
-      when Array: @skip.concat elements
-      when HTML::Node: @skip << elements
-      when nil: @skip << true
-      when true, false: @skip << elements
+      when Array then @skip.concat elements
+      when HTML::Node then @skip << elements
+      when nil then @skip << true
+      when true, false then @skip << elements
       end
       # Calling skip(element) as the last statement is
       # redundant by design.
diff --git a/lib/scraper/reader.rb b/lib/scraper/reader.rb
@@ -10,7 +10,7 @@
 require "net/https"
 begin
   require "rubygems"
-  require "tidy"
+  require "tidy_ffi"
 rescue LoadError
 end
 
@@ -95,6 +95,7 @@ def to_s
     # * :redirect_limit -- Number of redirects allowed (default is 3).
     # * :user_agent -- The User-Agent header to send.
     # * :timeout -- HTTP open connection/read timeouts (in second).
+    # * :ssl_verify_mode -- SSL verification mode (default is OpenSSL::SSL::VERIFY_NONE, i.e. no certificate verification).
     #
     # It returns a hash with the following information:
     # * :url -- The URL of the requested page (may change by permanent redirect)
@@ -123,6 +124,7 @@ def read_page(url, options = nil)
       begin
         http = Net::HTTP.new(uri.host, uri.port)
         http.use_ssl = (uri.scheme == "https")
+        http.verify_mode = options[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
         http.close_on_empty_response = true
         http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
         path = uri.path.dup # required so we don't modify path
@@ -202,10 +204,8 @@ def parse_page(content, encoding = nil, options = nil, parser = :tidy)
           find_tidy
           options = (options || {}).update(TIDY_OPTIONS)
           options[:input_encoding] = encoding.gsub("-", "").downcase
-          document = Tidy.open(options) do |tidy|
-            html = tidy.clean(content)
-            HTML::Document.new(html).find(:tag=>"html")
-          end
+          html = TidyFFI::Tidy.with_options(options).clean(content)
+          document = HTML::Document.new(html).find(:tag=>"html")
         when :html_parser
           document = HTML::HTMLParser.parse(content).root
         else
@@ -223,14 +223,14 @@ def parse_page(content, encoding = nil, options = nil, parser = :tidy)
   module_function
 
     def find_tidy()
-      return if Tidy.path
+      return if TidyFFI.library_path
       begin
-        Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
+        TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
       rescue LoadError
         begin
-          Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
+          TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
         rescue LoadError
-          Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
+          TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
         end
       end
     end
diff --git a/scrapi.gemspec b/scrapi.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = 'scrapi'
-  spec.version = '1.2.1'
+  spec.version = '1.2.2'
   spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
   spec.description = <<-EOF
 scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
@@ -13,10 +13,10 @@ EOF
   spec.files = Dir['{test,lib}/**/*', 'README.rdoc', 'CHANGELOG', 'Rakefile', 'MIT-LICENSE']
   spec.require_path = 'lib'
   spec.autorequire = 'scrapi.rb'
-  spec.requirements << 'Tidy'
+  spec.requirements << 'Tidy_ffi'
   spec.has_rdoc = true
   spec.rdoc_options << '--main' << 'README.rdoc' << '--title' <<  "scrAPI toolkit for Ruby" << '--line-numbers'
   spec.extra_rdoc_files = ['README.rdoc']
 
-  spec.add_dependency 'tidy', '>=1.1.0'
+  spec.add_dependency 'tidy_ffi', '>=0.1.2'
 end
diff --git a/test/node_ext_test.rb b/test/node_ext_test.rb
@@ -7,7 +7,7 @@
 
 require "rubygems"
 require "test/unit"
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./lib/scrapi"
 
 
 class NodeExtTest < Test::Unit::TestCase
diff --git a/test/reader_test.rb b/test/reader_test.rb
@@ -12,8 +12,8 @@
 require "webrick/https"
 require "logger"
 require "stringio"
-require File.join(File.dirname(__FILE__), "mock_net_http")
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./test/mock_net_http"
+require "./lib/scrapi"
 
 
 class ReaderTest < Test::Unit::TestCase
@@ -239,38 +239,38 @@ def test_should_handle_encoding_correctly
     # Test content encoding returned from HTTP server.
     with_webrick do |server, params|
       server.mount_proc "/test.html" do |req,resp|
-        resp["Content-Type"] = "text/html; charset=my-encoding"
+        resp["Content-Type"] = "text/html; charset=ASCII"
         resp.body = "Content comes here"
       end
       page = Reader.read_page(WEBRICK_TEST_URL)
       page = Reader.parse_page(page.content, page.encoding)
-      assert_equal "my-encoding", page.encoding
+      assert_equal "ASCII", page.encoding
     end
     # Test content encoding in HTML http-equiv header
     # that overrides content encoding returned in HTTP.
     with_webrick do |server, params|
       server.mount_proc "/test.html" do |req,resp|
-        resp["Content-Type"] = "text/html; charset=my-encoding"
+        resp["Content-Type"] = "text/html; charset=ASCII"
         resp.body = %Q{
 <html>
 <head>
-<meta http-equiv="content-type" value="text/html; charset=other-encoding">
+<meta http-equiv="content-type" value="text/html; charset=UTF-8">
 </head>
 <body></body>
 </html>
         }
       end
       page = Reader.read_page(WEBRICK_TEST_URL)
       page = Reader.parse_page(page.content, page.encoding)
-      assert_equal "other-encoding", page.encoding
+      assert_equal "UTF-8", page.encoding
     end
   end
 
   def test_should_support_https
     begin
       options = WEBRICK_OPTIONS.dup.update(
         :SSLEnable=>true,
-        :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
+        :SSLVerifyClient => OpenSSL::SSL::VERIFY_NONE,
         :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
       )
       server = WEBrick::HTTPServer.new(options)
diff --git a/test/scraper_test.rb b/test/scraper_test.rb
@@ -8,8 +8,8 @@
 require "rubygems"
 require "time"
 require "test/unit"
-require File.join(File.dirname(__FILE__), "mock_net_http")
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./test/mock_net_http"
+require "./lib/scrapi"
 
 
 class ScraperTest < Test::Unit::TestCase
@@ -301,8 +301,8 @@ def test_skip_from_extractor
     assert_equal "this", scraper.this2
 
     scraper = new_scraper(html) do
-      process "#1", :this1=>:text, :skip=>true do
-        false
+      process "#1", :this1=>:text, :skip=>true do |element|
+        element
       end
       process "#1", :this2=>:text
     end
@@ -351,7 +351,7 @@ def test_accessors
         [response, <<-EOF
           <html>
             <head>
-              <meta http-equiv="content-type" value="text/html; charset=other-encoding">
+              <meta http-equiv="content-type" value="text/html; charset=ASCII">
             </head>
             <body>
               <div id="x"/>
@@ -371,7 +371,7 @@ def test_accessors
     assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
     assert_equal time, scraper.page_info.last_modified
     assert_equal "etag", scraper.page_info.etag
-    assert_equal "other-encoding", scraper.page_info.encoding
+    assert_equal "ASCII", scraper.page_info.encoding
   end
 
 
@@ -721,7 +721,7 @@ def test_prepare_and_result
     # Extracting the attribute skips the second match.
     scraper = new_scraper(DIVS123) do
       process("div") { |element| @count +=1 }
-      define_method(:prepare) { @count = 1 }
+      define_method(:prepare) { |element| @count = 1 }
       define_method(:result) { @count }
     end
     result = scraper.scrape
diff --git a/test/selector_test.rb b/test/selector_test.rb
@@ -4,7 +4,7 @@
 # Developed for http://co.mments.com
 # Code and documention: http://labnotes.org
 
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./lib/scrapi"
 
 
 class SelectorTest < Test::Unit::TestCase