from lxml import etree
import urllib2

# Maps each language's link text to the href found on the index page.
urls = {}
# Collected (popularity, language name) tuples, filled by the analysis step.
ranks = []

# One HTML parser instance is shared by every etree.parse() call in the script.
parser = etree.HTMLParser()

# Parse the saved index page; the `with` block guarantees the file handle is
# closed once lxml has finished reading it.
with open("languages.html", "r") as f:
    tree = etree.parse(f, parser)

# Every anchor nested anywhere under <div class="all_languages">.
languages = tree.findall("//div[@class='all_languages']//a")

def norm(name):
    """Return *name* with every space and forward slash replaced by '_'.

    The result is used as a file name under the ``languages/`` directory, so
    a '/' in a language name must not be interpreted as a path separator.
    """
    name = name.replace(' ', '_')
    name = name.replace('/', '_')
    return name

# Create the urls mapping: link text -> href for every language anchor.
for link in languages:
    name = link.text
    url = link.get('href')
    # An anchor with no text (e.g. one wrapping only child elements) or no
    # href cannot be turned into a file name or a download URL, so skip it
    # instead of storing a None/empty entry that would fail later.
    if not name or not url:
        continue
    urls[name] = url

# Download the files: one page per language, saved under languages/.
for name in urls:
    url = urls[name]
    filename = 'languages/' + norm(name)
    print(filename)
    # The output file is created before the download is attempted, so a failed
    # download still leaves an (empty) file behind for the analysis step to
    # open. The `with` block flushes and closes it on every path.
    with open(filename, 'w') as fo:
        try:
            response = urllib2.urlopen("http://www.github.com" + url)
            try:
                fo.write(response.read())
            finally:
                # Release the connection even if reading or writing fails.
                response.close()
        except Exception:
            # Best effort: report the failure and carry on with the next
            # language. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit stop the script.
            print("Error getting language %s" % name)

# Analyze them: extract a popularity number from each downloaded page.
for name in urls:
    filename = 'languages/' + norm(name)
    # Opening stays outside the try so a missing file is still a hard error;
    # the `with` block closes the handle on every path.
    with open(filename, 'r') as fo:
        try:
            tree = etree.parse(fo, parser)
            # Take the third whitespace-separated token of the <em> text
            # inside the page header and drop its first character before
            # converting the remainder to an int.
            pop = int(tree.find("//div[@class='pagehead']/h1/em").text.split()[2][1:])
            ranks.append((pop, name))
        except Exception:
            # Best effort: pages that are empty or do not have the expected
            # header are skipped, but the skip is reported instead of being
            # hidden. `Exception` lets KeyboardInterrupt/SystemExit through.
            print("Error parsing language %s" % name)

print(ranks)