50 lines
1.2 KiB
Ruby
50 lines
1.2 KiB
Ruby
|
require 'logger'
|
||
|
|
||
|
require 'nokogiri'
|
||
|
require 'typhoeus'
|
||
|
|
||
|
|
||
|
# Crawls a set of alphabetical index pages (one per letter 'A'..'Z') and
# reports every city name found on them to a callback.
#
# The page URL for each letter is derived from +base_map_url+ by replacing
# the literal placeholder "{FIRST_LETTER}" with that letter.
class MapCrawler
  # Placeholder inside the base URL that is swapped for each letter.
  FIRST_LETTER_PLACEHOLDER = '{FIRST_LETTER}'.freeze

  # CSS selector for the candidate lists on an index page.
  # NOTE(review): `mw-content-ltr` looks like MediaWiki markup - confirm
  # against the crawled site.
  CITY_LIST_SELECTOR = 'div.mw-content-ltr ul'.freeze

  # Zero-based position of the list that holds the city links (the 4th
  # matching <ul>).
  CITY_LIST_INDEX = 3

  # CSS selector for the city links inside the chosen list.
  CITY_LINK_SELECTOR = 'li a'.freeze

  # NOTE: +request_pool+ is exposed for callers but never assigned by the
  # code in this class.
  attr_accessor :base_map_url, :log, :request_pool

  # @param base_map_url [String] URL template containing "{FIRST_LETTER}"
  def initialize(base_map_url)
    @base_map_url = base_map_url

    # The log file lives in a `log` directory that is a sibling of this
    # file's directory.
    @log = Logger.new(
      File.expand_path('../log/map-crawler.log', File.dirname(__FILE__))
    )
    @log.level = Logger::INFO
    @log.formatter = lambda do |severity, time, _prog, message|
      "#{time} - #{severity} - #{message}\n"
    end
  end

  # Queues one request per letter 'A'..'Z' and runs them through a single
  # Typhoeus::Hydra.
  #
  # @param city_name_callback [#call, nil] invoked with each stripped city
  #   name; when nil, names are printed with +puts+
  def crawl(city_name_callback = nil)
    city_name_callback ||= lambda { |n| puts n }

    hydra = Typhoeus::Hydra.new(:initial_pool_size => 26)

    ('A'..'Z').each do |letter|
      letter_request = Typhoeus::Request.new(
        @base_map_url.gsub(FIRST_LETTER_PLACEHOLDER, letter)
      )
      letter_request.on_complete do |response|
        handle_cities(response, city_name_callback)
      end

      hydra.queue(letter_request)
    end

    hydra.run
  end

  # Parses one index page and passes every city name on it to the callback.
  #
  # Pages that do not contain the expected list (e.g. an empty or error
  # response, or changed markup) are logged and skipped instead of raising
  # NoMethodError on nil, which would otherwise abort the whole crawl from
  # inside the request's completion callback.
  #
  # @param response [#effective_url, #body] completed HTTP response
  # @param city_name_callback [#call] invoked with each stripped city name
  def handle_cities(response, city_name_callback)
    @log.info("Handling cities at url #{response.effective_url}")
    doc = Nokogiri::HTML(response.body)

    city_list = doc.css(CITY_LIST_SELECTOR)[CITY_LIST_INDEX]
    if city_list.nil?
      @log.warn("No city list found at url #{response.effective_url}")
      return
    end

    city_list.css(CITY_LINK_SELECTOR).each do |anchor|
      @log.info("Found city: #{anchor.text}")
      city_name_callback.call(anchor.text.strip)
    end
  end
end
|