|
|
# == Schema Information
# Schema version: 42
#
# Table name: referers
#
# id :integer(11) not null, primary key
# created_at :datetime
# host :string(255)
# page :string(255)
# domain_id :integer(11)
# raw_hits_count :integer(11)
#
# A referring URL recorded for a domain.
#
# Besides storing the page/host and counting saves, the model can classify
# its +page+ against a table of known search engines and webmail hosts and
# extract the search terms (or other identifying query value) from the URL.
class Referer < ActiveRecord::Base
  belongs_to :domain
  has_many :stats

  # Domain suffixes accepted after "google.", "yahoo." and "msn." when
  # building the host patterns in Referer.engines.
  GOOGLE_TLD = %w{ ae as at az be bg bi by ca cd cg ch ci cl co.cr co.hu co.id co.il co.in co.jp co.kr co.ls co.nz co.th co.uk co.ve co.yu com com.ar com.au com.br com.co com.cu com.do com.eg com.ec com.fj com.gi com.gr com.hk com.ly com.mt com.mx com.my com.na com.nf com.ni com.np com.pa com.pe com.ph com.pk com.pr com.py com.sa com.sg com.sv com.tr com.tw com.ua com.uy com.vc com.vn de dj dk ee es fi fm fr ge gg gl gm hn ie it je kz li lt lu lv ms mu mw nl no pl pn pt ro ru rw se sh sk sm td tt uz vg }.freeze
  YAHOO_TLD = %w{ com com.au co.uk }.freeze
  MSN_TLD = %w{ ca com es com.au }.freeze

  # Normalises +page+ and derives +host+ from it before the row is inserted.
  #
  # Spaces are percent-encoded and a trailing "%" followed only by digits
  # (e.g. a cut-off escape sequence) is stripped so that URI.parse accepts
  # the value. A nil page is left alone. URI::InvalidURIError is still
  # raised for pages that cannot be parsed.
  def before_create
    # Bare return yields nil, which (unlike false) does not halt callbacks.
    return if page.nil?

    self.page = page.gsub(' ', '%20').gsub(/\%\d*$/, '')
    self.host = URI.parse(page).host
  end

  # Increments raw_hits_count on every save; a nil counter counts as zero.
  def before_save
    self.raw_hits_count = raw_hits_count.to_i + 1
  end

  # Result of Referer.parse_terms for this record's page, memoised per
  # instance. A nil result is not cached and is recomputed on each call.
  def search_result
    @search_result ||= Referer.parse_terms(page)
  end

  # NOTE: the class methods below are public. The original file had a bare
  # `private` here, but that keyword does not apply to `def self.` methods,
  # so they were always callable from outside (search_result relies on it).

  # Table of recognised referrers, built once and cached on the class.
  #
  # Each entry is a hash with:
  #   :uri   - String (exact host) or Regexp matched against the host
  #   :path  - String (exact path without the leading "/") or Regexp
  #   :query - name of the query-string parameter holding the value
  #   :name  - label for the engine
  #
  # Order matters: the first entry whose host and path both match wins.
  # Names are kept exactly as originally written (including spelling) in
  # case they are referenced elsewhere.
  def self.engines
    @engines ||= begin
      google = tld_alternation(GOOGLE_TLD)
      yahoo  = tld_alternation(YAHOO_TLD)
      msn    = tld_alternation(MSN_TLD)
      [
        # IMAGE SEARCH
        { :query => 'prev', :name => 'google (image)', :path => 'imgres', :uri => %r{^images\.google\.(#{google})$} },
        { :query => 'prev', :name => 'google (image)', :path => 'imgres', :uri => %r{^www\.google\.(#{google})$} },
        { :query => 'query', :name => 'aol (image)', :path => 'aolcom/imageDetails', :uri => %r{aolsearcht\d\.search\.aol\.com$} },
        { :query => 'back', :name => 'yahoo (imaage)', :path => 'search/images/view', :uri => "images.search.yahoo.com" },
        { :query => 'back', :name => 'yahoo (imaage)', :path => 'search/images/view', :uri => 'uk.search.yahoo.com' },
        # REGULAR SEARCH
        { :query => 'q', :name => 'google', :path => 'search', :uri => %r{^www\.google\.com$} },
        { :query => 'q', :name => 'google', :path => 'search', :uri => %r{^www\.google\.(#{google})$} },
        { :query => 'w', :name => 'seznam', :path => 'searchGoogleScreen', :uri => "search1.seznam.cz" },
        { :query => 'query', :name => 'aol', :path => 'web', :uri => %r{^aolsearch\.aol\.co\.uk$} },
        { :query => 'query', :name => 'aol', :path => 'aol/search', :uri => %r{aolsearch\.aol\.com$} },
        { :query => 'p', :name => 'yahoo', :path => 'search', :uri => %r{yahoo\.(#{yahoo})$} },
        { :query => 'q', :name => 'msn', :path => %r{\/(previewx|(sp)?results)\.aspx}, :uri => %r{^search(\.latam)?\.(nine|sympatico\.)?msn\.(#{msn})$} },
        { :query => 'q', :name => 'live', :path => %r{\/(previewx|(sp)?results)\.aspx}, :uri => %r{^search\.live\.com$} },
        { :query => 'q', :name => 'comcast', :path => "", :uri => 'search.comcast.net' },
        { :query => 'q', :name => 'blingo', :path => 'search', :uri => 'blingo.com' },
        { :query => 'kw', :name => 'voila', :path => 'S/voila', :uri => 'search.ke.voila.fr' },
        { :query => 'query', :name => 'lycos', :path => 'cgi-bin/pursuit', :uri => 'suche.lycos.de' },
        { :query => 'q', :name => 'picsearch', :path => 'info.cgi', :uri => 'www.picsearch.com' },
        { :query => 'q', :name => 'altavsita', :path => 'image/results', :uri => 'www.altavista.com' },
        { :query => 'q', :name => 'myway', :path => 'search/GGmain.jhtml', :uri => 'search.myway.com' },
        { :query => 'q', :name => 'bbc', :path => 'cgi-bin/search/results.pl', :uri => 'search.bbc.co.uk' },
        ## MAIL ##
        { :query => 'auth', :name => 'gmail (email)', :path => 'mail/', :uri => %r{^mail\.google\.com$} },
        { :query => 'len', :name => 'hotmail (email)', :path => 'cgi-bin/getmsg', :uri => %r{bay\d+\.hotmail\.msn\.com$} },
        { :query => 'hash', :name => 'ms-live (email)', :path => 'mail/ApplicationMain_11.10.0000.0095.aspx', :uri => %r{bay\d+\.mail\.live\.com$} },
        { :query => 'MsgId', :name => 'yahoo! (email)', :path => %r{ym\/(\d\.com\/)?ShowLetter}, :uri => %r{\d+\.mail\.yahoo\.com$} },
        # String hosts are compared with ==, so they must be plain host names
        # (no regexp-style backslash escapes).
        { :query => 'MLS', :name => 'fastmail (email)', :path => 'mail', :uri => 'www.fastmail.fm' },
        { :query => 'msgId', :name => 'bigpond (email)', :path => 'webedge/do/mail/message/view', :uri => 'webedge.bigpond.com' },
        { :query => 'q', :name => 'mywestnet', :path => 'search.aspx', :uri => 'search.mywestnet.com.au' }
        # { :query => '', :name => '', :path => '', :uri => %r{} },
      ]
    end
  end

  # Classifies a referring URL and extracts its search terms.
  #
  # ref - the referring URL as a String (spaces are tolerated).
  #
  # Returns:
  #   "blank"    when ref is blank
  #   "nil path" when the parsed URI has no path component
  #   the unescaped terms, or "??" when the engine's parameter is absent
  #   nil        when no engine matches or the URL cannot be parsed
  def self.parse_terms(ref)
    return "blank" if ref.blank?

    uri = URI.parse(ref.gsub(' ', "%20"))
    return "nil path" if uri.path.nil?

    engine = engine_for(uri)
    return nil if engine.nil?

    find_terms_from_uri(uri.query, engine[:query], engine[:name])
  rescue URI::InvalidURIError
    # Unparseable referer: treat as "no known engine".
    nil
  end

  # Pulls the value of parameter +key+ out of a raw query string.
  #
  # query  - raw query string, or nil when the URL had none
  # key    - name of the parameter to read
  # engine - engine label; accepted for interface compatibility, unused
  #
  # "+" and an encoded comma (%2C, any case) are turned into spaces before
  # the value is CGI-unescaped. Returns "??" when the parameter is missing
  # or empty.
  def self.find_terms_from_uri(query, key, engine)
    raw = parameterize(query)[key]
    return "??" if raw.nil? # Couldn't determine search terms

    CGI.unescape(raw.split(/\+|%2C/i).join(" "))
  end

  # Splits a raw query string into a Hash of parameter name => raw value.
  #
  # Values are not unescaped. A parameter with no value or an empty value
  # maps to nil; when a name repeats, the last occurrence wins. A nil query
  # yields an empty Hash. Only the first "=" separates name from value, so
  # values containing "=" are kept whole.
  def self.parameterize(query)
    query.to_s.split("&").inject({}) do |params, pair|
      key, value = pair.split("=", 2)
      unless key.nil?
        params[key] = (value.nil? || value.empty?) ? nil : value
      end
      params
    end
  end

  # Regexp alternation source for a list of literal domain suffixes, with
  # each suffix escaped so that "." only matches a dot.
  def self.tld_alternation(tlds)
    tlds.map { |tld| Regexp.escape(tld) }.join('|')
  end

  # First engine whose host and path constraints both match +uri+, or nil.
  # A URI without a host (relative or opaque) never matches.
  def self.engine_for(uri)
    return nil if uri.host.nil?

    host = uri.host.downcase # host names are case-insensitive
    engines.detect do |engine|
      host_matches?(engine[:uri], host) && path_matches?(engine[:path], uri.path)
    end
  end

  # True when +host+ equals a String pattern or matches a Regexp pattern.
  def self.host_matches?(pattern, host)
    pattern.is_a?(Regexp) ? !(host =~ pattern).nil? : host == pattern
  end

  # True when +path+ matches a Regexp pattern, or equals "/" followed by a
  # String pattern.
  def self.path_matches?(pattern, path)
    pattern.is_a?(Regexp) ? !(path =~ pattern).nil? : path == "/#{pattern}"
  end

  private_class_method :tld_alternation, :engine_for, :host_matches?, :path_matches?
end
|