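# NOTE: a stray "Report abuse" line (web-page copy/paste residue) used to sit here; it is kept only as this comment so the file loads as Ruby.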

require 'anemone'
require 'nokogiri'


# ActiveRecord model for one crawled web page.
#
# Attributes used in this file: title, url, page_source, breadcrumb, content
# and parent_id (the tree link managed through acts_as_tree).
#
# The class can crawl a site with Anemone (see Page.crawl_bcc), store each
# page, and afterwards derive a parent/child hierarchy from the breadcrumb
# HTML saved with every page (see Page.setup_hierarchy).
class Page < ActiveRecord::Base

  # Scheme and host prepended to every breadcrumb href.
  # NOTE(review): this assumes breadcrumb hrefs are site-relative paths
  # (e.g. "/foo") - confirm against real breadcrumb markup.
  SITE_ROOT = 'http://birmingham.gov.uk'

  # A link whose URL matches any of these patterns is removed from the set of
  # links the crawler follows.
  EXCLUDED_LINK_PATTERNS = [
    /c=Page&childpagename=SystemAdmin/,
    /pagename=BCC%252FCommon%252FWrapper%252FWrapper/,
    /Common%252FWrapper%252FInlineWrapper/,
    /MungoBlob/
  ].freeze

  acts_as_tree :order => "title"

  # Sets this page's parent from the last link found in its breadcrumb HTML.
  #
  # If a Page with that URL already exists it becomes the parent. Otherwise
  # the URL is fetched and stored via Page.create_from_anemone_page and, when
  # that succeeds, the new record becomes the parent. Nothing happens when the
  # breadcrumb contains no usable link.
  def set_parent_page_from_last_breadcrumb_item
    the_link = last_breadcrumb_url
    puts "Result of link finding: #{the_link}"
    return if the_link.blank?

    possible_parent = Page.find_by_url(the_link)
    if possible_parent
      puts "Found parent by url: #{possible_parent.title}"

      # A page must never become its own parent: that would create a cycle
      # in the tree.
      return if possible_parent.id == self.id

      self.parent_id = possible_parent.id
      save
      puts 'Saved new parent id'
    else
      puts "No page found by url: #{the_link}"

      new_page = Page.create_from_anemone_page(Anemone::Page.fetch(the_link))
      if new_page
        self.parent_id = new_page.id
        save
      end
    end
  end

  # Returns the stored Page for the site's root URL, or nil if there is none.
  def self.root_page
    Page.find_by_url(SITE_ROOT + '/')
  end

  # Runs set_parent_page_from_last_breadcrumb_item on every stored page.
  # You may need to run this several times manually - sorry.
  def self.setup_hierarchy
    Page.find(:all).each do |page|
      page.set_parent_page_from_last_breadcrumb_item
    end
  end

  # Builds and saves a Page record from a fetched Anemone page.
  #
  # page - the fetched page; this method uses its #doc (parsed document) and
  #        #url.
  #
  # Returns the saved Page, or false when the page is skipped (no document,
  # no <html> element, URL already stored, missing title, or a title that
  # contains "404" / "error") or when saving fails.
  def self.create_from_anemone_page(page)
    puts 'Create from anemone page'
    return false if page.blank? || page.doc.blank?

    html = page.doc.at('html')
    return false if html.blank?

    url = page.url.to_s
    if Page.find_by_url(url)
      puts "Already indexed. Ignoring: #{url}"
      return false
    end

    title = page.doc.at('title')
    if title.blank?
      puts "Blank title. Ignoring: #{url}"
      return false
    end

    title_text = title.inner_html
    if title_text.include?('404') || title_text.downcase.include?('error')
      puts "404 found. Ignoring: #{url}"
      return false
    end

    record = Page.new
    record.title = title_text
    record.url = url
    record.page_source = html.inner_html

    # Breadcrumb and main content are optional; they stay unset when the
    # document has no matching element.
    breadcrumb = page.doc.css('#breadcrumb')
    record.breadcrumb = breadcrumb.first.inner_html unless breadcrumb.blank?

    page_content = page.doc.css('#content')
    record.content = page_content.first.inner_html unless page_content.blank?

    if record.save
      puts "Saved: #{record.title} : #{record.url}"
      record
    else
      puts "Save failed: #{record.url}"
      false
    end
  end

  # Crawls the site starting at start_url, stores every page that
  # Page.create_from_anemone_page accepts, and prints how many were saved.
  def self.crawl_bcc(start_url)
    counter = 0
    Anemone.crawl(start_url) do |anemone|

      anemone.focus_crawl do |page|
        #page.links.delete_if { |x| !Page.find_by_url(x.to_s).blank? }

        # delete_if returns the (filtered) links array, which is the value
        # this block hands back to Anemone as the links to follow.
        page.links.delete_if do |link|
          link_url = link.to_s
          EXCLUDED_LINK_PATTERNS.any? { |pattern| link_url =~ pattern }
        end
      end

      anemone.on_every_page do |page|
        # The fetched page must be passed on; calling the method without an
        # argument raises ArgumentError.
        counter += 1 if Page.create_from_anemone_page(page)
      end
    end
    puts "#{counter} pages saved"
  end

  private

  # Returns the absolute URL built from the last <a> in the breadcrumb HTML
  # that carries an href, or nil when there is none. Each link's text and
  # href are printed as it is visited.
  def last_breadcrumb_url
    the_link = nil
    # breadcrumb may be nil (pages saved without a #breadcrumb element);
    # to_s lets Nokogiri parse an empty document instead.
    Nokogiri::HTML(breadcrumb.to_s).search('a').each do |link|
      href = link['href']
      puts link.content
      puts href
      # Anchors without an href cannot name a parent; skip them rather than
      # failing on String + nil.
      next if href.blank?
      the_link = SITE_ROOT + href
    end
    the_link
  end

end