require 'logger'
require 'nokogiri'
require 'typhoeus'

# Fetches one index page per letter 'A'..'Z' and reports every city name
# found on those pages to a caller-supplied callback.
#
# The page URL for each letter is derived from +base_map_url+ by replacing
# the literal token "{FIRST_LETTER}" with the letter. All 26 requests are
# queued on a single Typhoeus::Hydra and parsed with Nokogiri as they complete.
class MapCrawler
  # Token in the base URL that is substituted with each letter.
  FIRST_LETTER_PLACEHOLDER = '{FIRST_LETTER}'.freeze

  # CSS selector for the candidate lists on an index page.
  # NOTE(review): the class name suggests a MediaWiki-rendered page - confirm.
  CITY_LIST_SELECTOR = 'div.mw-content-ltr ul'.freeze

  # Zero-based position of the list holding the cities, i.e. the fourth
  # list matched by CITY_LIST_SELECTOR.
  CITY_LIST_INDEX = 3

  # CSS selector, relative to the city list, for the links whose text is
  # taken as the city name.
  CITY_LINK_SELECTOR = 'li a'.freeze

  # NOTE(review): request_pool is exposed here but never assigned anywhere
  # in this class; kept so existing callers of the accessor keep working.
  attr_accessor :base_map_url, :log, :request_pool

  # @param base_map_url [String] URL template containing the
  #   "{FIRST_LETTER}" token.
  #
  # Sets up an INFO-level logger writing to ../log/map-crawler.log relative
  # to this file's directory, with a "time - severity - message" line format.
  def initialize(base_map_url)
    @base_map_url = base_map_url
    @log = Logger.new(
      File.expand_path('../log/map-crawler.log', File.dirname(__FILE__))
    )
    @log.level = Logger::INFO
    @log.formatter = lambda do |severity, time, _prog, message|
      "#{time} - #{severity} - #{message}\n"
    end
  end

  # Queues one request per letter 'A'..'Z' and runs them through Hydra.
  #
  # City names are delivered to, in order of precedence: the positional
  # +city_name_callback+, the given block, or a default that prints each
  # name with +puts+.
  #
  # @param city_name_callback [#call, nil] invoked with each stripped
  #   city name.
  # @yield [name] optional block used when no positional callback is given.
  def crawl(city_name_callback = nil, &block)
    city_name_callback ||= block || lambda { |n| puts n }

    hydra = Typhoeus::Hydra.new(:initial_pool_size => 26)
    ('A'..'Z').each do |letter|
      letter_request = Typhoeus::Request.new(
        @base_map_url.gsub(FIRST_LETTER_PLACEHOLDER, letter)
      )
      letter_request.on_complete do |response|
        handle_cities(response, city_name_callback)
      end
      hydra.queue(letter_request)
    end
    hydra.run
  end

  # Parses one index page and reports each city name it contains.
  #
  # @param response [#body, #effective_url] completed HTTP response.
  # @param city_name_callback [#call] invoked with each stripped city name.
  # @return [nil] when the page has no city list; otherwise the result of
  #   iterating over the matched links.
  def handle_cities(response, city_name_callback)
    @log.info("Handling cities at url #{response.effective_url}")

    doc = Nokogiri::HTML(response.body)
    city_list = doc.css(CITY_LIST_SELECTOR)[CITY_LIST_INDEX]

    # A page with fewer lists than expected (error page, empty body, changed
    # layout) indexes to nil. Skip it instead of raising from inside the
    # on_complete callback, which would abort the whole crawl.
    if city_list.nil?
      @log.warn("No city list found at url #{response.effective_url}")
      return
    end

    city_list.css(CITY_LINK_SELECTOR).each do |anchor|
      # Strip once so the log line and the callback see the same value.
      city_name = anchor.text.strip
      @log.info("Found city: #{city_name}")
      city_name_callback.call(city_name)
    end
  end
end