Crawling actual map *names*, but not yet pulling in maps
This commit is contained in:
parent
6d9ae87557
commit
607a4158e1
1
rails/map-mash/.gitignore
vendored
1
rails/map-mash/.gitignore
vendored
@ -2,3 +2,4 @@ log/*.log
|
|||||||
tmp/*
|
tmp/*
|
||||||
db/*.sqlite3
|
db/*.sqlite3
|
||||||
config/redis.conf
|
config/redis.conf
|
||||||
|
db/*.sqlite3-journal
|
||||||
|
@ -1,2 +1,5 @@
|
|||||||
# ActiveRecord model for a map record, looked up by its +name+ attribute.
class Map < ActiveRecord::Base
  # Ensures a Map row exists for the given city name.
  #
  # Finds the record whose name matches +city_name+, or builds a new unsaved
  # one, then saves it with +save!+ (which raises if the save fails).
  #
  # @param city_name [String] name to look up or create
  # @return the result of +save!+ on the found or newly built record
  def self.from_city_name(city_name)
    record = find_or_initialize_by_name(city_name)
    record.save!
  end
end
|
||||||
|
@ -3,3 +3,6 @@ resque_web:
|
|||||||
|
|
||||||
redis:
|
redis:
|
||||||
port: 16379
|
port: 16379
|
||||||
|
|
||||||
|
map:
|
||||||
|
base_url: 'http://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_{FIRST_LETTER}'
|
||||||
|
49
rails/map-mash/lib/map_crawler.rb
Normal file
49
rails/map-mash/lib/map_crawler.rb
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
require 'logger'
|
||||||
|
|
||||||
|
require 'nokogiri'
|
||||||
|
require 'typhoeus'
|
||||||
|
|
||||||
|
|
||||||
|
# Crawls a set of per-letter index pages and reports every city name found.
#
# The base URL must contain the literal placeholder +{FIRST_LETTER}+, which is
# replaced by each letter 'A'..'Z' to build the page URLs. Each city name is
# handed to a caller-supplied callback.
class MapCrawler
  # Placeholder in the base URL that is swapped for each letter.
  FIRST_LETTER_PLACEHOLDER = '{FIRST_LETTER}'

  # Zero-based position of the <ul> (within div.mw-content-ltr) that holds the
  # city links on each index page.
  CITY_LIST_INDEX = 3

  attr_accessor :base_map_url, :log, :request_pool

  # @param base_map_url [String] URL template containing +{FIRST_LETTER}+
  def initialize(base_map_url)
    @base_map_url = base_map_url

    # Log file lives in the +log+ directory one level above this file's directory.
    @log = Logger.new(
      File.expand_path('../log/map-crawler.log', File.dirname(__FILE__))
    )
    @log.level = Logger::INFO
    @log.formatter = lambda do |severity, time, _prog, message|
      "#{time} - #{severity} - #{message}\n"
    end
  end

  # Queues one request per letter 'A'..'Z' and runs them through a Typhoeus
  # hydra, passing every city name found to +city_name_callback+.
  #
  # @param city_name_callback [#call, nil] invoked with each stripped city
  #   name; when nil, names are printed with +puts+
  def crawl(city_name_callback = nil)
    city_name_callback ||= lambda { |n| puts n }

    hydra = Typhoeus::Hydra.new(:initial_pool_size => 26)

    ('A'..'Z').each do |letter|
      letter_request = Typhoeus::Request.new(
        @base_map_url.gsub(FIRST_LETTER_PLACEHOLDER, letter)
      )
      letter_request.on_complete do |response|
        handle_cities(response, city_name_callback)
      end

      hydra.queue(letter_request)
    end

    hydra.run
  end

  # Parses one index page and calls +city_name_callback+ for each city link.
  #
  # A page that lacks the expected list (for example an error page or a changed
  # layout) is logged and skipped, so one bad page does not abort the crawl.
  #
  # @param response [#effective_url, #body] completed HTTP response
  # @param city_name_callback [#call] invoked with each stripped city name
  def handle_cities(response, city_name_callback)
    @log.info("Handling cities at url #{response.effective_url}")
    doc = Nokogiri::HTML(response.body)

    city_list = doc.css('div.mw-content-ltr ul')[CITY_LIST_INDEX]
    if city_list.nil?
      @log.warn("No city list found at url #{response.effective_url}")
      return
    end

    city_list.css('li a').each do |anchor|
      city_name = anchor.text.strip
      # Anchors without visible text carry no usable name.
      next if city_name.empty?

      @log.info("Found city: #{city_name}")
      city_name_callback.call(city_name)
    end
  end
end
|
6
rails/map-mash/lib/tasks/maps.rake
Normal file
6
rails/map-mash/lib/tasks/maps.rake
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
namespace :maps do
  desc 'Index the maps!'
  # Crawls the configured map index URL and hands each city name found to
  # Map.from_city_name.
  task :index => :environment do
    base_url = Setting.map(:base_url)
    persist_city = Map.method(:from_city_name)

    MapCrawler.new(base_url).crawl(persist_city)
  end
end
|
16
rails/map-mash/spec/lib/map_crawler_spec.rb
Normal file
16
rails/map-mash/spec/lib/map_crawler_spec.rb
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
|
||||||
|
describe MapCrawler do
  subject { MapCrawler.new(Setting.map(:base_url)) }

  # Integration example: performs a real crawl against the configured base URL.
  describe 'when crawling for actual maps', :integration => true do
    before(:each) do
      Map.destroy_all
    end

    it 'should create Map entries for each map found' do
      # crawl only prints names unless given a callback, so pass the one that
      # persists Map records; otherwise Map.count can never change.
      expect {
        subject.crawl(Map.method(:from_city_name))
      }.to change { Map.count }.by_at_least(1)
    end
  end
end
|
Loading…
Reference in New Issue
Block a user