Report abuse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
require 'anemone'
require 'nokogiri'


# A crawled page of the Birmingham City Council site, stored with its raw
# source, extracted content and breadcrumb markup. Pages are arranged in a
# tree (acts_as_tree) whose parent links are derived from the breadcrumb.
class Page < ActiveRecord::Base

  # Scheme and host prepended to the hrefs found in breadcrumb markup.
  SITE_ROOT = 'http://birmingham.gov.uk'

  # Links whose URL matches any of these patterns are not followed by the crawler.
  EXCLUDED_LINK_PATTERN = Regexp.union(
    /c=Page&childpagename=SystemAdmin/,
    /pagename=BCC%252FCommon%252FWrapper%252FWrapper/,
    /Common%252FWrapper%252FInlineWrapper/,
    /MungoBlob/
  )

  acts_as_tree :order => "title"

  # Sets this page's parent to the page addressed by the last link in the
  # stored breadcrumb HTML. If no page with that URL is stored yet, the URL is
  # fetched and indexed first. Does nothing when the breadcrumb is blank,
  # contains no usable link, or the parent page cannot be created.
  def set_parent_page_from_last_breadcrumb_item
    return if breadcrumb.blank?

    the_link = nil
    Nokogiri::HTML(breadcrumb).search('a').each do |link|
      href = link['href']
      puts link.content
      puts href
      # Anchors without an href cannot be turned into a URL; skip them
      # instead of raising on String + nil.
      next if href.blank?
      the_link = SITE_ROOT + href
    end
    puts "Result of link finding: #{the_link}"
    return if the_link.blank?

    parent = Page.find_by_url(the_link)
    if parent
      puts "Found parent by url: #{parent.title}"
      # A page must never become its own parent; that would create a cycle
      # in the tree.
      if parent.id == id
        puts "Breadcrumb points at the page itself. Ignoring: #{the_link}"
        return
      end
    else
      puts "No page found by url: #{the_link}"
      parent = Page.create_from_anemone_page(Anemone::Page.fetch(the_link))
      return unless parent
    end

    self.parent_id = parent.id
    puts 'Saved new parent id' if save
  end

  # Returns the stored page for the site's home URL, or nil if it has not
  # been indexed.
  def self.root_page
    Page.find_by_url(SITE_ROOT + '/')
  end

  # Derives the parent of every stored page from its breadcrumb.
  # You may need to run this several times manually - sorry.
  def self.setup_hierarchy
    Page.find(:all).each do |stored_page|
      stored_page.set_parent_page_from_last_breadcrumb_item
    end
  end

  # Builds and saves a Page from a fetched Anemone page.
  #
  # Returns the saved Page on success. Returns false when the fetched page has
  # no parsed document or <html> element, when its URL is already indexed,
  # when its title is missing or contains "404"/"error", or when the save fails.
  def self.create_from_anemone_page(page)
    puts 'Create from anemone page'
    return false if page.blank? || page.doc.blank?

    html = page.doc.at('html')
    return false if html.blank?

    url = page.url.to_s
    if Page.find_by_url(url)
      puts "Already indexed. Ignoring: #{url}"
      return false
    end

    title = page.doc.at('title')
    if title.blank?
      puts "Blank title. Ignoring: #{url}"
      return false
    end

    title_text = title.inner_html
    # Error pages are recognised by their title text.
    if title_text.include?('404') || title_text.downcase.include?('error')
      puts "404 found. Ignoring: #{url}"
      return false
    end

    record = Page.new
    record.title = title_text
    record.url = url
    record.page_source = html.inner_html

    breadcrumb = page.doc.css('#breadcrumb')
    record.breadcrumb = breadcrumb.first.inner_html unless breadcrumb.blank?

    page_content = page.doc.css('#content')
    record.content = page_content.first.inner_html unless page_content.blank?

    unless record.save
      puts "Save failed: #{url}"
      return false
    end

    puts "Saved: #{record.title} : #{record.url}"
    record
  end

  # Crawls the site starting at start_url, storing every page that
  # create_from_anemone_page accepts, and prints how many pages were saved.
  def self.crawl_bcc(start_url)
    counter = 0
    Anemone.crawl(start_url) do |anemone|
      # delete_if returns the (mutated) links array, which is the block's
      # result. Already-indexed URLs are not filtered out here;
      # create_from_anemone_page ignores them when they are visited.
      anemone.focus_crawl do |page|
        page.links.delete_if { |link| link.to_s =~ EXCLUDED_LINK_PATTERN }
      end

      anemone.on_every_page do |page|
        # The fetched page has to be handed over explicitly; calling the
        # method without it raises ArgumentError.
        counter += 1 if Page.create_from_anemone_page(page)
      end
    end
    puts "#{counter} pages saved"
  end

end