Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
# twitter_friends_of_friends.rb
# by Greg Houston
# blog: http://ghouston.blogspot.com
# wiki: http://ghouston.wiki.zoho.com/
#
# this script is posted online at http://pastie.caboo.se/195011
#
# special thanks to Yoan Blanc
#   http://yoan.dosimple.ch/blog/2007/05/17/
# which put this idea in my head. this code is based on his script
#   http://yoan.dosimple.ch/blog/2007/05/17/graph2.rb
#
# twitter_friends_of_friends.rb looks at the friends and the friends of
# friends (i.e. 2 levels deep) of a starting nick to produce a Long Tail
# report.
#
# the report is output as an HTML fragment which I plan to post
# on my wiki.
#
# usage: ruby twitter_friends_of_friends.rb nick
#   where nick is the starting nick.
#   the results are written to twitter_friends_of_friends.html
#
# notes:
#   hCard.find returns an empty collection if there
#   was an error page received from Twitter.
#
#   the script uses a cache folder to store the downloaded html. it also
#   saves the yaml for each Twitt after parsing the html. If this script is
#   run multiple times, it will use the cache to avoid extra work. This makes
#   things go much faster while developing this script. You will have to
#   clear the cache directory if you want to get the latest from Twitter.
#
#   getting the img_url can be slow (especially since this is single threaded),
#   so generating the output can take a long time. please be patient. If you
#   want to speed things up, look at the lines near the end which follow #NOTE:
#   comments. You can change between the Fast option which doesn't get img_urls
#   for friends of friends. Or the Slow option which provides a nicer output.
#
require 'rubygems'
require 'cgi'
require 'fileutils'
require 'singleton'
require 'mofo'
require 'hpricot'
require 'pp'
require 'rio'
require 'yaml'

# Prints a progress message and flushes STDOUT so it shows up immediately.
def say(msg)
  puts msg
  STDOUT.flush # get it displayed faster.
end

# Escapes a scraped value before it is interpolated into the HTML report.
# nil is rendered as an empty string.
def html_escape(value)
  CGI.escapeHTML(value.to_s)
end

FileUtils.mkpath('cache')

# Process-wide registry of every Twitt seen so far, keyed by nick, so each
# nick is represented by exactly one object (and one score counter).
class TwittSpace
  include Singleton

  # Number of Twitts currently registered.
  def self.size
    instance.everyone.size
  end

  # Convenience wrapper around TwittSpace.instance.get.
  def self.get(nick)
    instance.get(nick)
  end

  # The nick => Twitt hash of the singleton instance.
  def self.everyone
    instance.everyone
  end

  attr_reader :everyone

  def initialize
    @everyone = {}
  end

  # Returns the Twitt for +nick+. Lookup order: in-memory registry, then the
  # YAML cache file cache/<nick>.yaml, and finally a brand new (unloaded)
  # Twitt.
  def get(nick)
    return @everyone[nick] if @everyone.include?(nick)

    yaml_file_path = 'cache/' + nick + '.yaml'
    if File.exist?(yaml_file_path)
      say " loading #{yaml_file_path} (#{TwittSpace.size})"
      # NOTE: YAML.load revives arbitrary objects; only ever point it at
      # cache files written by this script. The block form closes the file.
      twitt = File.open(yaml_file_path) { |io| YAML.load(io) }
    else
      twitt = Twitt.new(nick)
    end
    @everyone[twitt.nick] = twitt
    twitt
  end
end

# One Twitter user. The profile page is downloaded and parsed lazily, the
# first time an attribute that needs it is requested.
class Twitt
  # Instance variables that are per-run state and must not be written to the
  # YAML cache: @following would drag the whole graph into every file, and
  # @score has to start from zero on every run.
  TRANSIENT_IVARS = %w[@following @loaded_following @score].freeze

  attr_writer :score
  attr_reader :url, :nick, :address, :full_name

  def initialize(nick)
    @nick = nick
    @score = 0
    @url = 'http://twitter.com/' + @nick
  end

  # Number of times this Twitt was referenced while scoring. Defaults to 0
  # because objects revived from the YAML cache never run #initialize and
  # the score is deliberately not cached.
  def score
    @score ||= 0
  end

  # True once the profile page has been loaded (from the web or the cache).
  def loaded
    @loaded_base ? true : false
  end

  # Fetches the profile page, trying up to 3 times. On success the html is
  # stored in +cache_file_path+. Returns the page, or '' when every attempt
  # failed or produced a 500 error page.
  def download(cache_file_path)
    page = ''
    3.times do
      begin
        page = rio(@url).read
      rescue
        page = ''
        next
      end
      break unless page =~ /Status: 500 Internal Server Error/
      page = ''
    end
    rio(cache_file_path) < page if page.size > 0
    page
  end

  # Hook consulted by older YAML engines: the instance variables to dump.
  # Names are compared via to_s because instance_variables returns Strings
  # on Ruby 1.8 but Symbols on later versions.
  def to_yaml_properties
    persistent = instance_variables.reject { |ivar| TRANSIENT_IVARS.include?(ivar.to_s) }
    persistent.sort_by { |ivar| ivar.to_s }
  end

  # Hook used by newer Psych versions, which ignore to_yaml_properties.
  # Dumps the same set of attributes (keys without the leading '@').
  def encode_with(coder)
    to_yaml_properties.each do |ivar|
      coder[ivar.to_s.sub(/\A@/, '')] = instance_variable_get(ivar)
    end
  end

  # optimization: lazy load the basic attributes
  def load
    return if @loaded_base
    @loaded_base = true

    page = read_page
    @contacts = hCard.find(:text => page)
    @contacts = [@contacts] unless @contacts.instance_of?(Array)
    parse_profile(Hpricot(page))

    # Do not cache a failed download, otherwise the failure would be
    # served from the yaml cache on every later run.
    rio('cache/' + @nick + '.yaml') < to_yaml unless page.empty?
  end

  # optimization: lazy load the following attribute
  # this attribute is not included in the yaml cache
  # it has to be loaded separately from the basic
  # attributes to avoid getting into recursion
  def load_following
    return if @loaded_following
    @loaded_following = true

    @following = {}
    contacts.each do |contact| # contacts triggers #load when needed
      url = contact.url
      next unless url =~ %r{^http://twitter\.com/} || url =~ %r{http://explore\.twitter\.com/}

      nick = url.slice(%r{[^/]*$})
      next if nick.nil? || nick.empty? # e.g. a link to the twitter home page
      @following[nick] = TwittSpace.get(nick)
    end
  end

  # hCards found on the profile page (loads the page on first use).
  def contacts
    self.load
    @contacts
  end

  # URL of the profile image, or nil when the page did not contain one.
  def img_url
    self.load
    @img_url
  end

  # nick => Twitt hash built from the twitter.com links among the contacts.
  def following
    self.load
    load_following
    @following
  end

  private

  # Returns the profile html from the cache folder, downloading it first
  # when it has not been cached yet.
  def read_page
    cache_file_path = 'cache/' + @nick + '.html'
    if File.exist?(cache_file_path)
      say " loading #{@nick}.html (#{TwittSpace.size})"
      rio(cache_file_path).read
    else
      say " downloading #{@nick} (#{TwittSpace.size})"
      download(cache_file_path)
    end
  end

  # Extracts image url, full name and address from the parsed page.
  def parse_profile(doc)
    # An error page (or a failed download) has no profile image; leave
    # @img_url nil instead of crashing the whole run.
    image = (doc / "h2//img[@id='profile-image']")[0]
    @img_url = image && image.attributes['src']
    @full_name = (doc / "span[@class='fn']").inner_html
    @address = (doc / "span[@class='adr']").inner_html
  end
end

if __FILE__ == $0
  say "starting..."
  root_nick = ARGV[0] || 'neversleep360'
  root = TwittSpace.get(root_nick)

  level1 = root.following # load level 1

  say "scoring friends..."
  level2 = {}
  level1.each do |nick, twitt|
    say " #{nick}"
    twitt.following.each do |nick2, twitt2|
      twitt2.score += 1
      level2[nick2] = twitt2 unless level2.include?(nick2) || level1.include?(nick2)
    end
  end

  say "scoring friends of friends..."
  level2.each do |nick, twitt|
    say " #{nick}"
    twitt.following.each do |_nick2, twitt2|
      twitt2.score += 1
    end
  end

  say "\n\ncalculating rank..."
  ranking = level1.values + level2.values
  say " #{ranking.size} nicks"
  # highest score first; ties are ordered by nick, ignoring case
  ranking.sort! do |a, b|
    result = -(a.score <=> b.score)
    result == 0 ? a.nick.casecmp(b.nick) : result
  end

  say "\n\nreporting results..."
  out = rio("twitter_friends_of_friends.html")
  out < "<table border=1>"
  current_score = nil
  column = 0
  ranking.each do |buddy|
    if current_score != buddy.score
      # a new score starts a new outer row holding its own inner table
      out << "</tr></table></td></tr>" unless current_score.nil?
      current_score = buddy.score
      out << "<tr><td valign='top'><h3>#{buddy.score}</h3></td><td><table><tr>"
      column = 1
    end
    if column > 6 # at most 6 buddies per inner row
      out << "</tr><tr>"
      column = 1
    end
    url = html_escape(buddy.url)
    nick = html_escape(buddy.nick)
    #NOTE: fast... but not all images
    #out << "<td><a href='#{url}'>#{buddy.loaded ? "<img src='#{html_escape(buddy.img_url)}'/>" : ""}#{nick}</a></td>"
    #NOTE: slow... has to load each buddy to get img_url
    out << "<td><a href='#{url}'><img src='#{html_escape(buddy.img_url)}'/>#{nick}</a></td>"
    column += 1
  end
  # only close the inner table when at least one row was opened
  out << "</tr></table></td></tr>" unless current_score.nil?
  out << "</table>"
end
This paste will be private.
From the Design Piracy series on my blog: