Wrap text
Report abuse
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
require 'anemone'
require 'nokogiri'
# ActiveRecord model for one crawled page of the birmingham.gov.uk site.
#
# Rows are created from Anemone crawl results (see .crawl_bcc and
# .create_from_anemone_page) and arranged into a tree (acts_as_tree) by
# following the last link found in each page's stored breadcrumb HTML.
class Page < ActiveRecord::Base
  acts_as_tree :order => "title"

  # Scheme and host prepended to site-relative breadcrumb hrefs.
  SITE_ROOT = 'http://birmingham.gov.uk'

  # Matches hrefs that already carry their own scheme and host.
  ABSOLUTE_URL = %r{\Ahttps?://}i

  # Links whose URL matches any of these patterns are dropped by .crawl_bcc
  # before the crawler follows them.
  EXCLUDED_LINK_PATTERNS = [
    /c=Page&childpagename=SystemAdmin/,
    /pagename=BCC%252FCommon%252FWrapper%252FWrapper/,
    /Common%252FWrapper%252FInlineWrapper/,
    /MungoBlob/
  ].freeze

  # Sets this page's parent to the page addressed by the last link in its
  # breadcrumb HTML and saves the record.
  #
  # If no stored page has that URL, the URL is fetched with Anemone and
  # indexed first. Nothing is changed when the breadcrumb yields no link,
  # when the parent page cannot be indexed, or when the link resolves to
  # this very record.
  def set_parent_page_from_last_breadcrumb_item
    the_link = last_breadcrumb_url
    puts 'Result of link finding: ' + the_link.to_s
    return if the_link.blank?

    possible_parent = Page.find_by_url(the_link)
    if possible_parent
      puts "Found parent by url: #{possible_parent.title}"
      # A page must never become its own parent: that would put a cycle
      # into the tree.
      if possible_parent.id == id
        puts 'Breadcrumb links to the page itself. Ignoring: ' + the_link
        return
      end
      self.parent_id = possible_parent.id
      save
      puts 'Saved new parent id'
    else
      puts 'No page found by url: ' + the_link
      new_page = Page.create_from_anemone_page(Anemone::Page.fetch(the_link))
      if new_page
        self.parent_id = new_page.id
        save
      end
    end
  end

  # Returns the stored page for the site's home URL, or nil if it has not
  # been indexed.
  def self.root_page
    Page.find_by_url('http://birmingham.gov.uk/')
  end

  # Assigns a parent to every stored page from its breadcrumb.
  def self.setup_hierarchy
    Page.all.each do |page|
      page.set_parent_page_from_last_breadcrumb_item
    end
  end

  # Builds and saves a Page from a fetched Anemone page.
  #
  # The page is skipped when it has no parsed document or <html> element,
  # when its URL is already stored, when it has no <title>, or when the
  # title contains "404" or (case-insensitively) "error".
  #
  # page - the Anemone page object to index.
  #
  # Returns the saved Page, or false when the page was skipped or the save
  # failed.
  def self.create_from_anemone_page(page)
    puts 'Create from anemone page'
    return false if page.blank? || page.doc.blank?

    html = page.doc.at('html')
    return false if html.blank?

    url = page.url.to_s
    if Page.find_by_url(url)
      puts('Already indexed. Ignoring: ' + url)
      return false
    end

    title = page.doc.at('title')
    if title.blank?
      puts('Blank title. Ignoring: ' + url)
      return false
    end

    title_text = title.inner_html
    if title_text.include?('404') || title_text.downcase.include?('error')
      puts('404 found. Ignoring: ' + url)
      return false
    end

    record = Page.new
    record.title = title_text
    record.url = url
    record.page_source = html.inner_html

    # Breadcrumb and content are optional; they stay unset when the page
    # has no matching element.
    breadcrumb_nodes = page.doc.css('#breadcrumb')
    record.breadcrumb = breadcrumb_nodes.first.inner_html unless breadcrumb_nodes.blank?

    content_nodes = page.doc.css('#content')
    record.content = content_nodes.first.inner_html unless content_nodes.blank?

    if record.save
      puts("Saved: #{record.title} : #{record.url}")
      return record
    end

    puts("Save failed: " + record.url.to_s)
    false
  end

  # Crawls the site starting at start_url, indexing every page visited and
  # printing how many pages were saved.
  #
  # start_url - the URL handed to Anemone.crawl as the starting point.
  def self.crawl_bcc(start_url)
    counter = 0
    Anemone.crawl(start_url) do |anemone|
      anemone.focus_crawl do |page|
        # delete_if returns the (filtered) link list itself, which is what
        # this block hands back to the crawler.
        page.links.delete_if do |link|
          EXCLUDED_LINK_PATTERNS.any? { |pattern| link.to_s =~ pattern }
        end
      end
      anemone.on_every_page do |page|
        counter += 1 if Page.create_from_anemone_page(page)
      end
    end
    puts("#{counter} pages saved")
  end

  private

  # Returns the URL of the last usable <a> in the breadcrumb HTML, or nil
  # when the breadcrumb is blank or holds no link with an href.
  # Site-relative hrefs are prefixed with SITE_ROOT; absolute ones are
  # returned unchanged.
  def last_breadcrumb_url
    return nil if breadcrumb.blank?

    the_link = nil
    Nokogiri::HTML(breadcrumb).search('a').each do |link|
      href = link['href']
      puts link.content
      puts href
      # Anchors without an href cannot name a parent page.
      next if href.blank?
      the_link = (href =~ ABSOLUTE_URL) ? href : SITE_ROOT + href
    end
    the_link
  end
end
|