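# -*- coding:utf-8 -*-
# NOTE: a stray "Report abuse" line (paste-site page text, not Python) used to
# precede the encoding declaration; it is kept here only as a comment.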

import feedparser
import re
import numpy
from numpy import *

# RSS feeds whose articles are downloaded and analysed.
# Every entry ends with a comma so that two adjacent string literals can never
# be silently concatenated into a single (invalid) URL.
feedlist = [
    'http://feeds.reuters.com/reuters/topNews',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.reuters.com/reuters/worldNews',
    'http://feeds2.feedburner.com/time/world',
    'http://feeds2.feedburner.com/time/business',
    'http://feeds2.feedburner.com/time/politics',
    'http://rss.cnn.com/rss/edition.rss',
    'http://rss.cnn.com/rss/edition_world.rss',
    'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml',
    'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/World.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/Economy.xml',
]


#feedlist = ['http://feedproxy.google.com/TechCrunch']

# Exercise 2.2.1
def getarticlewords():
    """Download every feed in ``feedlist`` and count the words of each article.

    The text of an article is its title followed by its description with
    the HTML tags removed (``stripHtml``); it is tokenised by
    ``separatewords``.

    Returns:
        A tuple ``(allwords, articlewords, articletitles)``:

        * ``allwords`` -- dict mapping each word to its number of
          occurrences over all articles.
        * ``articlewords`` -- list with one dict per article mapping each
          word to its number of occurrences in that article.
        * ``articletitles`` -- list of article titles, in the same order as
          ``articlewords``.
    """
    articletitles = []
    articlewords = []
    allwords = {}
    for feed in feedlist:
        parsed = feedparser.parse(feed)
        for article in parsed.entries:
            articletitles.append(article.title)
            # An entry without a description contributes only its title
            # instead of aborting the whole download with AttributeError.
            description = getattr(article, 'description', '')
            text = article.title + " " + stripHtml(description)
            counts = {}
            for word in separatewords(text):
                # dict.get replaces the deprecated dict.has_key lookups.
                allwords[word] = allwords.get(word, 0) + 1
                counts[word] = counts.get(word, 0) + 1
            articlewords.append(counts)
    return allwords, articlewords, articletitles

# Exercise 2.2.2
def makematrix(allw, articlew, min_count=3, max_fraction=0.6):
    """Build the article/word count matrix for the relevant words.

    A word is relevant when it occurs more than ``min_count`` times over all
    articles and appears in at most ``max_fraction`` of the articles.

    Args:
        allw: dict mapping word -> total number of occurrences.
        articlew: list with one dict per article mapping word -> number of
            occurrences in that article.
        min_count: a word must occur strictly more often than this in total.
        max_fraction: a word is dropped when the share of articles that
            contain it is strictly greater than this value.

    Returns:
        A tuple ``(wordvec, wordInArt)``. ``wordvec`` is the list of relevant
        words; ``wordInArt`` is a list with one row per article, where
        ``wordInArt[i][j]`` is the count of ``wordvec[j]`` in article ``i``.
    """
    article_total = len(articlew)
    wordvec = []
    for word, occurrences in allw.items():
        if occurrences <= min_count:
            continue
        if article_total:
            # Count over ALL articles before deciding; len() of a list is
            # used because the file's ``from numpy import *`` shadows sum().
            containing = len([1 for article in articlew if word in article])
            # float() keeps this a true division on Python 2 as well.
            if float(containing) / article_total > max_fraction:
                continue
        wordvec.append(word)

    wordInArt = [[article.get(word, 0) for word in wordvec]
                 for article in articlew]
    return wordvec, wordInArt
    
# Helper functions
def stripHtml(h):
    """Return ``h`` without the text enclosed in ``<`` ... ``>``.

    The result always starts with one space, and every ``>`` character is
    replaced by a space. Characters after an unclosed ``<`` are dropped.
    """
    pieces = [' ']
    inside_tag = False
    for ch in h:
        if ch == '<':
            inside_tag = True
        elif ch == '>':
            inside_tag = False
            pieces.append(' ')
        elif not inside_tag:
            pieces.append(ch)
    return ''.join(pieces)
    
# Helper functions
def separatewords(text):
    """Split ``text`` into lower-cased words of more than three characters.

    Words are separated by runs of one or more non-word characters. The
    pattern is ``\\W+`` rather than ``\\W*``: a pattern that can match the
    empty string makes ``split`` cut between every single character on
    Python 3.7+, which would leave no token longer than three characters.
    """
    return [token.lower() for token in re.split(r'\W+', text)
            if len(token) > 3]

# Exercise 2.3.3, cost function
def cost(A, B):
    """Return the sum of squared element-wise differences of ``A`` and ``B``.

    Args:
        A: two-dimensional numpy matrix or array.
        B: matrix or array with the same shape as ``A``.

    Returns:
        ``sum((A[i, j] - B[i, j]) ** 2)`` over all elements as a float;
        0.0 exactly when both inputs are element-wise equal.
    """
    # Work on plain arrays so that ``*``-style matrix semantics of
    # numpy.matrix cannot leak into the element-wise computation.
    diff = numpy.asarray(A, dtype=float) - numpy.asarray(B, dtype=float)
    return float(numpy.sum(numpy.multiply(diff, diff)))

# Exercise from 2.3.3, nnmf
def nnmf(A, m, it):
    """Factorise ``A`` into ``W * H`` with multiplicative update rules.

    Args:
        A: two-dimensional numpy matrix (rows x columns).
        m: requested number of features. When ``m`` is greater than the
            number of columns of ``A`` it is reduced to ``columns - 1``.
        it: maximum number of update iterations.

    Returns:
        A tuple ``(W, H)`` of numpy matrices with shapes ``(rows, m)`` and
        ``(m, columns)``, both initialised with uniform random values.
    """
    rows, columns = A.shape
    # Apply the cap before allocating, so that it actually determines the
    # shapes of the two factors.
    if m > columns:
        m = columns - 1
    H = numpy.matrix(numpy.random.rand(m, columns))
    W = numpy.matrix(numpy.random.rand(rows, m))

    for ic in range(it):
        # Single-string form prints the same text on Python 2 and Python 3.
        print("Iteration  %d" % ic)

        # Nothing left to improve once the product reproduces A exactly.
        if cost(A, W * H) == 0:
            break

        # Update H: element-wise H * (W^T A) / (W^T W H).
        WT = W.transpose()
        H = numpy.multiply(H, (WT * A) / (WT * W * H))

        # Update W with the new H: element-wise W * (A H^T) / (W H H^T).
        HT = H.transpose()
        W = numpy.multiply(W, (A * HT) / (W * H * HT))

    return W, H
    
# Exercise 2.4.3, output of the results
def showfeatures(w, h, titles, wordvec):
    """Print the top words of every feature and the top features per article.

    Args:
        w: weight matrix indexed as ``w[article, feature]``.
        h: feature matrix indexed as ``h[feature, word]``.
        titles: article titles; ``titles[i]`` belongs to row ``i`` of ``w``.
        wordvec: words; ``wordvec[j]`` belongs to column ``j`` of ``h``.

    For each feature the (up to) six highest-weighted words are printed.
    For each article the words of its (up to) three highest-weighted
    features are printed; fewer are shown when fewer features exist.
    Returns nothing.
    """
    # Each print below emits one pre-built string, which yields the same
    # bytes on Python 2 and Python 3.
    print("Alle Merkmale")

    feature_count, word_count = h.shape
    top_words = []
    for feature in range(feature_count):
        weighted = [(h[feature, column], wordvec[column])
                    for column in range(word_count)]
        weighted.sort()
        weighted.reverse()
        best = weighted[0:6]
        top_words.append(best)
        listing = "".join([" , %s" % (word,) for _, word in best])
        print("%d :%s \n" % (feature, listing))

    article_count, feature_count = w.shape
    for article in range(article_count):
        ranked = [(w[article, feature], top_words[feature])
                  for feature in range(feature_count)]
        ranked.sort()
        ranked.reverse()
        print("Merkmale von  %s" % (titles[article],))
        # Slicing instead of indexing 0..2 avoids an IndexError when the
        # factorisation has fewer than three features.
        for _, words in ranked[:3]:
            line = "".join(["%s " % (word,) for _, word in words])
            print(line + "\n")
        print("\n")

# Exercise 2.3.1 / delete the all-zero rows
def deleteNull(wordvec, titles):
    """Drop the rows whose entries sum to zero, together with their titles.

    Args:
        wordvec: article/word count matrix as nested lists (anything
            ``numpy.matrix`` accepts); one row per article.
        titles: list of article titles; ``titles[i]`` belongs to row ``i``.
            The list is updated in place.

    Returns:
        A tuple ``(A, titles)`` where ``A`` is a numpy matrix holding the
        remaining rows and ``titles`` is the same list object, now holding
        only the titles of the remaining rows.
    """
    A = numpy.matrix(wordvec)
    row_sums = numpy.asarray(A.sum(axis=1)).ravel()
    delete = [index for index, total in enumerate(row_sums) if total == 0]
    A = numpy.delete(A, delete, 0)

    # Filter by the ORIGINAL row index. Popping while scanning would shift
    # the later indices and remove the wrong titles.
    dropped = set(delete)
    titles[:] = [title for index, title in enumerate(titles)
                 if index not in dropped]
    return A, titles