import feedparser
import re
import numpy
from numpy import *
# RSS/Atom feeds that getarticlewords() downloads and parses.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which would merge two URLs into one.
feedlist = [
    'http://feeds.reuters.com/reuters/topNews',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.reuters.com/reuters/worldNews',
    'http://feeds2.feedburner.com/time/world',
    'http://feeds2.feedburner.com/time/business',
    'http://feeds2.feedburner.com/time/politics',
    'http://rss.cnn.com/rss/edition.rss',
    'http://rss.cnn.com/rss/edition_world.rss',
    'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml',
    'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/World.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/Economy.xml',
]
def getarticlewords():
    """Download every feed in ``feedlist`` and count the words of each article.

    For each entry of each parsed feed, the title and the HTML-stripped
    description are joined, split into words with ``separatewords`` and
    counted.

    Returns:
        A tuple ``(allwords, articlewords, articletitles)`` where
        ``allwords`` maps word -> total occurrences over all articles,
        ``articlewords`` is a list with one word -> count dict per article,
        and ``articletitles`` is the list of article titles in the same
        order as ``articlewords``.
    """
    articletitles = []
    articlewords = []
    allwords = {}
    for feed in feedlist:
        parsed = feedparser.parse(feed)
        for article in parsed.entries:
            articletitles.append(article.title)
            text = article.title + " " + stripHtml(article.description)
            counts = {}
            for word in separatewords(text):
                # dict.get() works on Python 2 and 3; dict.has_key() does not
                # exist on Python 3.
                allwords[word] = allwords.get(word, 0) + 1
                counts[word] = counts.get(word, 0) + 1
            articlewords.append(counts)
    return allwords, articlewords, articletitles
def makematrix(allw, articlew, min_count=3, max_fraction=0.6):
    """Build the article/word count matrix from the collected word counts.

    A word is kept when it occurs more than ``min_count`` times overall and
    appears in at most ``max_fraction`` of the articles.

    Args:
        allw: dict mapping word -> total number of occurrences.
        articlew: list of dicts, one per article, mapping word -> count.
        min_count: a word must occur strictly more often than this overall.
        max_fraction: a word is dropped when the share of articles that
            contain it is strictly greater than this value.

    Returns:
        A tuple ``(words, matrix)``: ``words`` is the list of kept words (in
        the iteration order of ``allw``) and ``matrix`` is a list with one
        row per article holding that article's count for every kept word
        (0 when the word does not occur in the article).
    """
    total_articles = len(articlew)
    words = []
    for word, count in allw.items():
        if count <= min_count:
            continue
        # Count over ALL articles before testing the ratio.
        containing = sum(1 for article in articlew if word in article)
        # float() forces true division on Python 2 as well; the truthiness
        # check avoids dividing by zero when there are no articles.
        if total_articles and float(containing) / total_articles > max_fraction:
            continue
        words.append(word)

    matrix = [[article.get(word, 0) for word in words] for article in articlew]
    return words, matrix
def stripHtml(h):
    """Return ``h`` with everything between '<' and '>' removed.

    The result always starts with a single space, and every closing '>'
    contributes one space so that words separated only by a tag do not
    run together.
    """
    pieces = [' ']
    inside_tag = False
    for ch in h:
        if ch == '<':
            inside_tag = True
        elif ch == '>':
            inside_tag = False
            pieces.append(' ')
        elif not inside_tag:
            pieces.append(ch)
    return ''.join(pieces)
def separatewords(text):
    """Split ``text`` on non-word characters and return the long words.

    Returns:
        A list of the lower-cased tokens that are longer than three
        characters, in their original order.
    """
    # '\W+' (one or more) instead of '\W*': a pattern that can match the
    # empty string makes re.split() on Python 3.7+ cut between every single
    # character, so no token would ever be longer than three characters.
    return [s.lower() for s in re.split(r'\W+', text) if len(s) > 3]
def cost(A, B):
    """Return the sum of squared element-wise differences between A and B.

    Args:
        A: 2-D matrix/array.
        B: matrix/array of the same shape as ``A``.

    Returns:
        ``sum((A[i, j] - B[i, j]) ** 2)`` as a float; 0.0 when both
        matrices are identical.
    """
    # The squared difference is diff ** 2, i.e. pow(diff, 2) - not
    # pow(2, diff), which would yield 1 for every matching element.
    diff = numpy.asarray(A, dtype=float) - numpy.asarray(B, dtype=float)
    return float(numpy.sum(diff * diff))
def nnmf(A, m, it):
    """Factorize ``A`` into non-negative ``W`` and ``H`` with ``W * H ~ A``.

    Starts from random matrices and applies multiplicative update rules
    for at most ``it`` iterations, printing the iteration number each
    round. Stops early when the reconstruction is exact.

    Args:
        A: 2-D matrix (rows x columns) to factorize.
        m: requested number of features. When it exceeds the number of
            columns of ``A`` it is reduced to ``columns - 1`` (at least 1).
        it: maximum number of iterations.

    Returns:
        A tuple ``(W, H)`` of ``numpy.matrix`` objects with shapes
        (rows, m) and (m, columns).
    """
    A = numpy.asmatrix(A)
    rows, columns = A.shape
    # Clamp BEFORE allocating W and H, otherwise the clamp has no effect.
    if m > columns:
        m = max(columns - 1, 1)
    H = numpy.matrix(numpy.random.rand(m, columns))
    W = numpy.matrix(numpy.random.rand(rows, m))
    eps = 1e-9  # keeps the denominators non-zero (avoids NaN/inf factors)

    ic = 0
    while ic < it:
        print("Iteration  %d" % ic)
        residual = A - W * H
        # Squared reconstruction error; nothing left to improve at zero.
        if numpy.sum(numpy.multiply(residual, residual)) == 0:
            break

        # Update H with W fixed: H <- H * (W^T A) / (W^T W H), element-wise.
        Wt = W.transpose()
        H = numpy.multiply(H, (Wt * A) / (Wt * W * H + eps))

        # Update W with the new H: W <- W * (A H^T) / (W H H^T), element-wise.
        Ht = H.transpose()
        W = numpy.multiply(W, (A * Ht) / (W * H * Ht + eps))

        ic = ic + 1
    return W, H
def showfeatures(w, h, titles, wordvec):
    """Print the top words of every feature and the top features per article.

    First prints, for each row of ``h``, the (up to) six words with the
    largest weights. Then prints, for each row of ``w``, the title of the
    article followed by the words of its (up to) three strongest features.

    Args:
        w: matrix indexed as ``w[article, feature]``.
        h: matrix indexed as ``h[feature, word]``.
        titles: sequence of article titles, indexed like the rows of ``w``.
        wordvec: sequence of words, indexed like the columns of ``h``.

    Returns:
        None; the function only writes to standard output.
    """
    feature_count, word_count = h.shape
    print("Alle Merkmale")

    # Top six (weight, word) pairs of every feature, strongest first.
    merkmale = []
    for row in range(feature_count):
        weighted = sorted(
            ((h[row, column], wordvec[column]) for column in range(word_count)),
            reverse=True)
        top_words = weighted[:6]
        tokens = [str(row), ":"]
        for _, word in top_words:
            tokens.append(",")
            tokens.append("%s" % word)
        # Single-argument print() behaves the same on Python 2 and 3; the
        # trailing " \n" yields the blank line after each feature.
        print(" ".join(tokens) + " \n")
        merkmale.append(top_words)

    article_count, feature_columns = w.shape
    for row in range(article_count):
        ranked = sorted(
            ((w[row, column], merkmale[column]) for column in range(feature_columns)),
            reverse=True)
        print("Merkmale von  %s" % titles[row])
        # Slicing instead of range(3): there may be fewer than three features.
        for _, feature in ranked[:3]:
            words = " ".join("%s" % word for _, word in feature)
            print(words + " \n" if feature else "\n")
        print("\n")
def deleteNull(wordvec, titles):
    """Remove the rows whose entries sum to zero, and their titles.

    Args:
        wordvec: nested sequence (one row per article) convertible to a
            ``numpy.matrix``.
        titles: list of titles, indexed like the rows of ``wordvec``. It is
            updated in place so that it stays aligned with the result.

    Returns:
        A tuple ``(A, titles)``: the matrix without the zero-sum rows and
        the (same, mutated) titles list without the matching entries.
    """
    A = numpy.matrix(wordvec)
    row_sums = numpy.asarray(A.sum(axis=1)).ravel()
    empty_rows = [index for index, total in enumerate(row_sums) if total == 0]

    # Filter by ORIGINAL index. Popping while walking the rows would shift
    # the remaining titles and remove the wrong ones after the first pop.
    dropped = set(empty_rows)
    titles[:] = [title for index, title in enumerate(titles)
                 if index not in dropped]

    A = numpy.delete(A, empty_rows, 0)
    return A, titles