From 607a4158e1943e9ed8527d0bbbb5774d026a98fe Mon Sep 17 00:00:00 2001
From: Dan Buch
Date: Mon, 5 Mar 2012 09:42:09 -0500
Subject: [PATCH] Crawling actual map *names*, but not yet pulling in maps

---
Review notes (this section is ignored when the patch is applied):
 * map_crawler.rb: pages without a city list are logged and skipped
   instead of raising NoMethodError and aborting the crawl.
 * map_crawler_spec.rb: the crawl is given the persisting callback, since
   the default callback only prints names and never creates a Map.
 * The index post-image hashes of the amended files (map.rb,
   map_crawler.rb, map_crawler_spec.rb) were not regenerated.

 rails/map-mash/.gitignore                   |  1 +
 rails/map-mash/app/models/map.rb            |  5 ++
 rails/map-mash/config/settings/default.yml  |  3 +
 rails/map-mash/lib/map_crawler.rb           | 68 +++++++++++++++++++++
 rails/map-mash/lib/tasks/maps.rake          |  6 ++
 rails/map-mash/spec/lib/map_crawler_spec.rb | 20 ++++++
 6 files changed, 103 insertions(+)
 create mode 100644 rails/map-mash/lib/map_crawler.rb
 create mode 100644 rails/map-mash/lib/tasks/maps.rake
 create mode 100644 rails/map-mash/spec/lib/map_crawler_spec.rb

diff --git a/rails/map-mash/.gitignore b/rails/map-mash/.gitignore
index de19c29..6dc2924 100644
--- a/rails/map-mash/.gitignore
+++ b/rails/map-mash/.gitignore
@@ -2,3 +2,4 @@ log/*.log
 tmp/*
 db/*.sqlite3
 config/redis.conf
+db/*.sqlite3-journal
diff --git a/rails/map-mash/app/models/map.rb b/rails/map-mash/app/models/map.rb
index e7d4ce2..1c285f3 100644
--- a/rails/map-mash/app/models/map.rb
+++ b/rails/map-mash/app/models/map.rb
@@ -1,2 +1,7 @@
 class Map < ActiveRecord::Base
+  # Finds the Map named city_name or builds a new one, then saves it.
+  # save! raises if the record cannot be saved.
+  def self.from_city_name(city_name)
+    find_or_initialize_by_name(city_name).save!
+  end
 end
diff --git a/rails/map-mash/config/settings/default.yml b/rails/map-mash/config/settings/default.yml
index 7af3cff..a8abf5d 100644
--- a/rails/map-mash/config/settings/default.yml
+++ b/rails/map-mash/config/settings/default.yml
@@ -3,3 +3,6 @@ resque_web:
 
 redis:
   port: 16379
+
+map:
+  base_url: 'http://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_{FIRST_LETTER}'
diff --git a/rails/map-mash/lib/map_crawler.rb b/rails/map-mash/lib/map_crawler.rb
new file mode 100644
index 0000000..6e1248c
--- /dev/null
+++ b/rails/map-mash/lib/map_crawler.rb
@@ -0,0 +1,68 @@
+require 'logger'
+
+require 'nokogiri'
+require 'typhoeus'
+
+
+# Crawls the per-letter city listing pages built from a URL template and
+# hands every city name found to a callback.
+class MapCrawler
+  attr_accessor :base_map_url, :log, :request_pool
+
+  # base_map_url - URL template; every "{FIRST_LETTER}" placeholder is
+  #                replaced with a letter from 'A' to 'Z' when crawling.
+  def initialize(base_map_url)
+    @base_map_url = base_map_url
+
+    # Resolves to log/map-crawler.log beside this file's directory.
+    @log = Logger.new(
+      File.expand_path('../log/map-crawler.log', File.dirname(__FILE__))
+    )
+    @log.level = Logger::INFO
+    @log.formatter = lambda do |severity, time, _prog, message|
+      "#{time} - #{severity} - #{message}\n"
+    end
+  end
+
+  # Queues one request per letter 'A'..'Z' and runs them all, passing each
+  # city name found to city_name_callback (any object responding to #call).
+  # Without a callback, names are printed with puts.
+  def crawl(city_name_callback = nil)
+    city_name_callback ||= lambda { |n| puts n }
+
+    hydra = Typhoeus::Hydra.new(:initial_pool_size => 26)
+
+    ('A'..'Z').each do |letter|
+      letter_request = Typhoeus::Request.new(
+        @base_map_url.gsub(/\{FIRST_LETTER\}/, letter)
+      )
+      letter_request.on_complete do |response|
+        handle_cities(response, city_name_callback)
+      end
+
+      hydra.queue(letter_request)
+    end
+
+    hydra.run
+  end
+
+  # Parses one letter page and calls city_name_callback with the stripped
+  # text of every link in the fourth <ul> matched inside div.mw-content-ltr.
+  def handle_cities(response, city_name_callback)
+    @log.info("Handling cities at url #{response.effective_url}")
+    doc = Nokogiri::HTML(response.body)
+    city_list = doc.css('div.mw-content-ltr ul')[3]
+
+    # A failed request or a page with fewer lists yields nil here; skip the
+    # page instead of raising NoMethodError and aborting the whole crawl.
+    unless city_list
+      @log.warn("No city list found at url #{response.effective_url}")
+      return
+    end
+
+    city_list.css('li a').each do |anchor|
+      @log.info("Found city: #{anchor.text}")
+      city_name_callback.call(anchor.text.strip)
+    end
+  end
+end
diff --git a/rails/map-mash/lib/tasks/maps.rake b/rails/map-mash/lib/tasks/maps.rake
new file mode 100644
index 0000000..3c25617
--- /dev/null
+++ b/rails/map-mash/lib/tasks/maps.rake
@@ -0,0 +1,6 @@
+namespace :maps do
+  desc 'Index the maps!'
+  task :index => :environment do
+    MapCrawler.new(Setting.map(:base_url)).crawl(Map.method(:from_city_name))
+  end
+end
diff --git a/rails/map-mash/spec/lib/map_crawler_spec.rb b/rails/map-mash/spec/lib/map_crawler_spec.rb
new file mode 100644
index 0000000..75ac602
--- /dev/null
+++ b/rails/map-mash/spec/lib/map_crawler_spec.rb
@@ -0,0 +1,20 @@
+require 'spec_helper'
+
+
+describe MapCrawler do
+  subject { MapCrawler.new(Setting.map(:base_url)) }
+
+  describe 'when crawling for actual maps', :integration => true do
+    before(:each) do
+      Map.destroy_all
+    end
+
+    it 'should create Map entries for each map found' do
+      # crawl only prints names unless given a callback, so pass the one
+      # that persists each city as a Map.
+      expect {
+        subject.crawl(Map.method(:from_city_name))
+      }.to change{ Map.count }.by_at_least(1)
+    end
+  end
+end