From 5795835b8e83556ae7ad1bc298905d1c96c44797 Mon Sep 17 00:00:00 2001 From: iavivai <18yukitaka@gmail.com> Date: Fri, 1 Nov 2019 23:59:58 +0900 Subject: [PATCH 1/5] Fix of sample code. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dddc3f8..cd15a19 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ class GithubSpider < Kimurai::Base } def parse(response, url:, data: {}) - response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a| + response.xpath("//ul[@class='repo-list']//a[@class='v-align-middle']").each do |a| request_to :parse_repo_page, url: absolute_url(a[:href], base: url) end @@ -51,7 +51,7 @@ class GithubSpider < Kimurai::Base item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text item[:repo_url] = url item[:description] = response.xpath("//span[@itemprop='about']").text.squish - item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish } + item[:tags] = response.xpath("//div[starts-with(@class, 'list-topics-container')]/a").map { |a| a.text.squish } item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish From 4aa720fca9f5b287f0be6186a702a18089088a94 Mon Sep 17 00:00:00 2001 From: Dusan Orlovic Date: Thu, 14 Nov 2019 12:33:54 +0100 Subject: [PATCH 2/5] Use config argument on parse! 
to set config --- README.md | 4 +++- lib/kimurai/base.rb | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cd15a19..082124a 100644 --- a/README.md +++ b/README.md @@ -1359,7 +1359,7 @@ end # => So what if you're don't care about stats and just want to process request to a particular spider method and get the returning value from this method? Use `.parse!` instead: -#### `.parse!(:method_name, url:)` method +#### `.parse!(:method_name, url:, config: {})` method `.parse!` (class method) creates a new spider instance and performs a request to given method with a given url. Value from the method will be returned back: @@ -1376,6 +1376,8 @@ end ExampleSpider.parse!(:parse, url: "https://example.com/") # => "Example Domain" +# Example: pass `config:` to override the spider's config for this call +ExampleSpider.parse!(:parse, url: "https://example.com/", config: { before_request: { clear_and_set_cookies: true } } ) ``` Like `.crawl!`, `.parse!` method takes care of a browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` method can be called from different threads at the same time: diff --git a/lib/kimurai/base.rb b/lib/kimurai/base.rb index f8a4b4f..1d2ae87 100644 --- a/lib/kimurai/base.rb +++ b/lib/kimurai/base.rb @@ -154,7 +154,13 @@ def self.crawl!(exception_on_fail: true) end def self.parse!(handler, *args, **request) - spider = self.new + if request.has_key? :config + config = request[:config] + request.delete :config + else + config = {} + end + spider = self.new config: config if args.present? 
spider.public_send(handler, *args) From fd96802658920eb2acebc48a4d3823b2a9072d9e Mon Sep 17 00:00:00 2001 From: John Phamvan Date: Wed, 13 May 2020 11:25:22 -0700 Subject: [PATCH 3/5] Switch to Addressable.URI.escape away from obsolete URI.escape; updated some development gems --- kimurai.gemspec | 5 +++-- lib/kimurai/base_helper.rb | 6 ++++-- lib/kimurai/version.rb | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kimurai.gemspec b/kimurai.gemspec index 07c4d55..748add4 100644 --- a/kimurai.gemspec +++ b/kimurai.gemspec @@ -37,12 +37,13 @@ Gem::Specification.new do |spec| spec.add_dependency "headless" spec.add_dependency "pmap" + spec.add_dependency "addressable" spec.add_dependency "whenever" spec.add_dependency "rbcat", "~> 0.2" spec.add_dependency "pry" - spec.add_development_dependency "bundler", "~> 1.16" - spec.add_development_dependency "rake", "~> 10.0" + spec.add_development_dependency "bundler", "~> 2.1" + spec.add_development_dependency "rake", "~> 13.0" spec.add_development_dependency "minitest", "~> 5.0" end diff --git a/lib/kimurai/base_helper.rb b/lib/kimurai/base_helper.rb index cff59d2..633208d 100644 --- a/lib/kimurai/base_helper.rb +++ b/lib/kimurai/base_helper.rb @@ -1,16 +1,18 @@ +require 'addressable/uri' + module Kimurai module BaseHelper private def absolute_url(url, base:) return unless url - URI.join(base, URI.escape(url)).to_s + URI.join(base, Addressable::URI.escape(url)).to_s end def escape_url(url) uri = URI.parse(url) rescue URI::InvalidURIError => e - URI.parse(URI.escape url).to_s rescue url + URI.parse(Addressable::URI.escape(url)).to_s rescue url else url end diff --git a/lib/kimurai/version.rb b/lib/kimurai/version.rb index ed8ce2a..52e0c33 100644 --- a/lib/kimurai/version.rb +++ b/lib/kimurai/version.rb @@ -1,3 +1,3 @@ module Kimurai - VERSION = "1.4.0" + VERSION = "1.4.1" end From 7f5e9f1682efdf148a32b7c47ab9e4fc73f63e34 Mon Sep 17 00:00:00 2001 From: John Phamvan Date: Wed, 13 May 2020 12:27:51 -0700 
Subject: [PATCH 4/5] Fixed Ruby 2.7 warning for keyword arguments --- lib/kimurai/base.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/kimurai/base.rb b/lib/kimurai/base.rb index 1d2ae87..6e09270 100644 --- a/lib/kimurai/base.rb +++ b/lib/kimurai/base.rb @@ -207,7 +207,9 @@ def request_to(handler, delay = nil, url:, data: {}, response_type: :html) visited = delay ? browser.visit(url, delay: delay) : browser.visit(url) return unless visited - public_send(handler, browser.current_response(response_type), { url: url, data: data }) + options = { url: url, data: data } + + public_send(handler, browser.current_response(response_type), **options) end def console(response = nil, url: nil, data: {}) From eb715d7bf2c4be62badcda8d143698b4c4539649 Mon Sep 17 00:00:00 2001 From: John Phamvan Date: Sat, 2 Jan 2021 17:45:42 -0800 Subject: [PATCH 5/5] Working Cuprite driver using Ferrum - Chrome CDP --- kimurai.gemspec | 1 + .../browser_builder/cuprite_builder.rb | 201 ++++++++++++++++++ lib/kimurai/capybara_ext/cuprite/driver.rb | 9 + lib/kimurai/capybara_ext/mechanize/driver.rb | 2 +- lib/kimurai/version.rb | 2 +- 5 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 lib/kimurai/browser_builder/cuprite_builder.rb create mode 100644 lib/kimurai/capybara_ext/cuprite/driver.rb diff --git a/kimurai.gemspec b/kimurai.gemspec index 748add4..637fad0 100644 --- a/kimurai.gemspec +++ b/kimurai.gemspec @@ -33,6 +33,7 @@ Gem::Specification.new do |spec| spec.add_dependency "capybara-mechanize" spec.add_dependency "poltergeist" spec.add_dependency "selenium-webdriver" + spec.add_dependency "cuprite" spec.add_dependency "headless" spec.add_dependency "pmap" diff --git a/lib/kimurai/browser_builder/cuprite_builder.rb b/lib/kimurai/browser_builder/cuprite_builder.rb new file mode 100644 index 0000000..0c85191 --- /dev/null +++ b/lib/kimurai/browser_builder/cuprite_builder.rb @@ -0,0 +1,201 @@ +require 'capybara' +require 'capybara/cuprite' 
+require_relative '../capybara_configuration' +require_relative '../capybara_ext/cuprite/driver' +require_relative '../capybara_ext/session' + +module Kimurai::BrowserBuilder + class CupriteBuilder + class << self + attr_accessor :virtual_display + end + + attr_reader :logger, :spider + + def initialize(config, spider:) + @config = config + @spider = spider + @logger = spider.logger + end + + def build + # Register driver + Capybara.register_driver :cuprite do |app| + # Create driver options + # opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] } + + # Provide custom chrome browser path: + # if chrome_path = Kimurai.configuration.selenium_chrome_path + # opts.merge!(binary: chrome_path) + # end + + # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html + # driver_options = Selenium::WebDriver::Chrome::Options.new(opts) + + # Window size + # if size = @config[:window_size].presence + # driver_options.args << "--window-size=#{size.join(',')}" + # logger.debug "BrowserBuilder (cuprite): enabled window_size" + # end + + # Proxy + # if proxy = @config[:proxy].presence + # proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip + # ip, port, type, user, password = proxy_string.split(":") + # + # if %w(http socks5).include?(type) + # if user.nil? && password.nil? 
+ # driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}" + # logger.debug "BrowserBuilder (cuprite): enabled #{type} proxy, ip: #{ip}, port: #{port}" + # else + # logger.error "BrowserBuilder (cuprite): proxy with authentication doesn't supported by selenium, skipped" + # end + # else + # logger.error "BrowserBuilder (cuprite): wrong type of proxy: #{type}, skipped" + # end + # end + # + # if proxy_bypass_list = @config[:proxy_bypass_list].presence + # if proxy + # driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}" + # logger.debug "BrowserBuilder (cuprite): enabled proxy_bypass_list" + # else + # logger.error "BrowserBuilder (cuprite): provide `proxy` to set proxy_bypass_list, skipped" + # end + # end + + # SSL + # if @config[:ignore_ssl_errors].present? + # driver_options.args << "--ignore-certificate-errors" + # driver_options.args << "--allow-insecure-localhost" + # logger.debug "BrowserBuilder (cuprite): enabled ignore_ssl_errors" + # end + + # Disable images + # if @config[:disable_images].present? + # driver_options.prefs["profile.managed_default_content_settings.images"] = 2 + # logger.debug "BrowserBuilder (cuprite): enabled disable_images" + # end + + # Headers + # if @config[:headers].present? + # logger.warn "BrowserBuilder: (selenium_chrome): custom headers doesn't supported by selenium, skipped" + # end + + # if user_agent = @config[:user_agent].presence + # user_agent_string = (user_agent.class == Proc ? 
user_agent.call : user_agent).strip + # driver_options.args << "--user-agent='#{user_agent_string}'" + # logger.debug "BrowserBuilder (cuprite): enabled custom user_agent" + # end + + # Headless mode + # if ENV["HEADLESS"] != "false" + # if @config[:headless_mode] == :virtual_display + # if Gem::Platform.local.os == "linux" + # unless self.class.virtual_display + # require 'headless' + # self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false) + # self.class.virtual_display.start + # end + # + # logger.debug "BrowserBuilder (cuprite): enabled virtual_display headless_mode" + # else + # logger.error "BrowserBuilder (cuprite): virtual_display headless_mode works only " \ + # "on Linux platform. Browser will run in normal mode. Set `native` mode instead." + # end + # else + # driver_options.args << "--headless" + # logger.debug "BrowserBuilder (cuprite): enabled native headless_mode" + # end + # end + + # chromedriver_path = Kimurai.configuration.chromedriver_path || "/usr/local/bin/chromedriver" + # service = Selenium::WebDriver::Service.chrome(path: chromedriver_path) + # Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service) + # Capybara::Cuprite::Driver.new(app, window_size: window_size[1200, 800]) + Capybara::Cuprite::Driver.new(app) + end + + # Create browser instance (Capybara session) + @browser = Capybara::Session.new(:cuprite) + @browser.spider = spider + logger.debug "BrowserBuilder (cuprite): created browser instance" + + # if @config[:extensions].present? 
+ # logger.error "BrowserBuilder (cuprite): `extensions` option not supported by Selenium, skipped" + # end + + # Cookies + if cookies = @config[:cookies].presence + @browser.config.cookies = cookies + logger.debug "BrowserBuilder (cuprite): enabled custom cookies" + end + + # Browser instance options + # skip_request_errors + if skip_errors = @config[:skip_request_errors].presence + @browser.config.skip_request_errors = skip_errors + logger.debug "BrowserBuilder (cuprite): enabled skip_request_errors" + end + + # retry_request_errors + if retry_errors = @config[:retry_request_errors].presence + @browser.config.retry_request_errors = retry_errors + logger.debug "BrowserBuilder (cuprite): enabled retry_request_errors" + end + + # restart_if + if requests_limit = @config.dig(:restart_if, :requests_limit).presence + @browser.config.restart_if[:requests_limit] = requests_limit + logger.debug "BrowserBuilder (cuprite): enabled restart_if.requests_limit >= #{requests_limit}" + end + + # if memory_limit = @config.dig(:restart_if, :memory_limit).presence + # @browser.config.restart_if[:memory_limit] = memory_limit + # logger.debug "BrowserBuilder (cuprite): enabled restart_if.memory_limit >= #{memory_limit}" + # end + + # before_request clear_cookies + if @config.dig(:before_request, :clear_cookies) + @browser.config.before_request[:clear_cookies] = true + logger.debug "BrowserBuilder (cuprite): enabled before_request.clear_cookies" + end + + # before_request clear_and_set_cookies + if @config.dig(:before_request, :clear_and_set_cookies) + if cookies = @config[:cookies].presence + @browser.config.cookies = cookies + @browser.config.before_request[:clear_and_set_cookies] = true + logger.debug "BrowserBuilder (cuprite): enabled before_request.clear_and_set_cookies" + else + logger.error "BrowserBuilder (cuprite): cookies should be present to enable before_request.clear_and_set_cookies, skipped" + end + end + + # before_request change_user_agent + if 
@config.dig(:before_request, :change_user_agent) + logger.error "BrowserBuilder (cuprite): before_request.change_user_agent option not supported by Selenium, skipped" + end + + # before_request change_proxy + if @config.dig(:before_request, :change_proxy) + logger.error "BrowserBuilder (cuprite): before_request.change_proxy option not supported by Selenium, skipped" + end + + # before_request delay + if delay = @config.dig(:before_request, :delay).presence + @browser.config.before_request[:delay] = delay + logger.debug "BrowserBuilder (cuprite): enabled before_request.delay" + end + + # encoding + if encoding = @config[:encoding] + @browser.config.encoding = encoding + logger.debug "BrowserBuilder (cuprite): enabled encoding: #{encoding}" + end + + # return Capybara session instance + @browser + end + end +end diff --git a/lib/kimurai/capybara_ext/cuprite/driver.rb b/lib/kimurai/capybara_ext/cuprite/driver.rb new file mode 100644 index 0000000..935431e --- /dev/null +++ b/lib/kimurai/capybara_ext/cuprite/driver.rb @@ -0,0 +1,9 @@ +require_relative '../driver/base' + +module Capybara::Cuprite + class Driver + def current_memory + nil + end + end +end diff --git a/lib/kimurai/capybara_ext/mechanize/driver.rb b/lib/kimurai/capybara_ext/mechanize/driver.rb index be8a22b..61cab0c 100644 --- a/lib/kimurai/capybara_ext/mechanize/driver.rb +++ b/lib/kimurai/capybara_ext/mechanize/driver.rb @@ -2,7 +2,7 @@ require_relative '../driver/base' class Capybara::Mechanize::Driver - # Extend capybara-mechnize to support Poltergeist-like methods + # Extend capybara-mechanize to support Poltergeist-like methods # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver def set_proxy(ip, port, type, user = nil, password = nil) diff --git a/lib/kimurai/version.rb b/lib/kimurai/version.rb index 52e0c33..cdd4a7d 100644 --- a/lib/kimurai/version.rb +++ b/lib/kimurai/version.rb @@ -1,3 +1,3 @@ module Kimurai - VERSION = "1.4.1" + VERSION = "1.5" end