Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
# -*- coding:utf-8 -*-
"""Topic extraction from news feeds via non-negative matrix factorisation.

Pipeline:
    1. ``getarticlewords`` downloads the feeds and counts words per article.
    2. ``makematrix`` builds the article/word count matrix.
    3. ``deleteNull`` drops articles that contain none of the kept words.
    4. ``nnmf`` factorises the matrix into weights ``W`` and features ``H``.
    5. ``showfeatures`` prints the features and the top features per article.

All ``print`` calls use a single parenthesised argument so the module behaves
the same under Python 2 and Python 3.
"""
import re

import numpy
# Star import kept for backward compatibility with code relying on it.
# NOTE: it shadows builtins such as ``sum``/``any``/``all``, so this module
# always calls the numpy functions through the ``numpy.`` prefix.
from numpy import *  # noqa: F401,F403

try:
    import feedparser
except ImportError:
    # Only getarticlewords() needs feedparser; every other function is pure
    # computation and stays usable without it.
    feedparser = None

feedlist = [
    'http://feeds.reuters.com/reuters/topNews',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.reuters.com/reuters/worldNews',
    'http://feeds2.feedburner.com/time/world',
    'http://feeds2.feedburner.com/time/business',
    'http://feeds2.feedburner.com/time/politics',
    'http://rss.cnn.com/rss/edition.rss',
    'http://rss.cnn.com/rss/edition_world.rss',
    'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml',
    'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml',
    # The comma after the next entry was missing, which silently glued the
    # two nytimes URLs into a single invalid one.
    'http://www.nytimes.com/services/xml/rss/nyt/World.xml',
    'http://www.nytimes.com/services/xml/rss/nyt/Economy.xml',
]
# feedlist = ['http://feedproxy.google.com/TechCrunch']

# One or more non-word characters. The former pattern '\W*' can match the
# empty string, which makes re.split() cut between every single character on
# Python 3.7+.
_WORD_SPLITTER = re.compile(r'\W+')

# Added to the denominators of the multiplicative updates to avoid 0/0.
_EPSILON = 1e-9


# Exercise 2.2.1
def getarticlewords():
    """Download every feed in ``feedlist`` and count the words per article.

    Returns:
        A tuple ``(allwords, articlewords, articletitles)``:
        ``allwords`` maps each word to its number of occurrences over all
        articles, ``articlewords`` holds one word -> count dict per article,
        and ``articletitles`` holds the article titles in the same order.

    Raises:
        ImportError: if the ``feedparser`` package is not installed.
    """
    if feedparser is None:
        raise ImportError("getarticlewords() requires the 'feedparser' package")

    articletitles = []
    articlewords = []
    allwords = {}
    for feed in feedlist:
        parsed = feedparser.parse(feed)
        for article in parsed.entries:
            counts = {}
            articletitles.append(article.title)
            # Some entries come without a description; treat it as empty.
            description = article.get('description', '')
            text = article.title + " " + stripHtml(description)
            for word in separatewords(text):
                allwords[word] = allwords.get(word, 0) + 1
                counts[word] = counts.get(word, 0) + 1
            articlewords.append(counts)
    return allwords, articlewords, articletitles


# Exercise 2.2.2
def makematrix(allw, articlew, min_count=3, max_share=0.6):
    """Build the article/word count matrix from the word statistics.

    A word is kept when it occurs more than ``min_count`` times overall and
    appears in at most ``max_share`` (fraction) of the articles.

    Args:
        allw: dict mapping word -> total number of occurrences.
        articlew: list with one word -> count dict per article.
        min_count: a word must occur strictly more often than this.
        max_share: largest allowed fraction of articles containing the word.

    Returns:
        A tuple ``(words, matrix)``: ``words`` is the list of kept words and
        ``matrix`` is a list with one row per article, where
        ``matrix[i][j]`` is the count of ``words[j]`` in article ``i``.
    """
    article_total = len(articlew)
    words = []
    for word in allw:
        if allw[word] <= min_count:
            continue
        containing = 0
        for article in articlew:
            if word in article:
                containing += 1
        # float() forces true division on Python 2 as well.
        if article_total and float(containing) / article_total > max_share:
            continue
        words.append(word)

    matrix = [[article.get(word, 0) for word in words] for article in articlew]
    return words, matrix


# Helper functions
def stripHtml(h):
    """Return ``h`` with every ``<...>`` tag replaced by a single space.

    The result always starts with one extra space.
    """
    pieces = [' ']
    inside_tag = False
    for char in h:
        if char == '<':
            inside_tag = True
        elif char == '>':
            inside_tag = False
            pieces.append(' ')
        elif not inside_tag:
            pieces.append(char)
    return ''.join(pieces)


# Helper functions
def separatewords(text):
    """Split ``text`` on non-word characters.

    Returns the lower-cased words that are longer than three characters.
    """
    return [s.lower() for s in _WORD_SPLITTER.split(text) if len(s) > 3]


# Exercise 2.3.3, cost function
def cost(A, B):
    """Return the sum of squared element-wise differences between A and B.

    Args:
        A: 2-D matrix/array (or nested list) of numbers.
        B: matrix/array of the same shape as ``A``.

    Returns:
        The squared Euclidean distance between ``A`` and ``B`` as a float.
    """
    diff = numpy.asarray(A, dtype=float) - numpy.asarray(B, dtype=float)
    return float(numpy.sum(diff * diff))


# Exercise 2.3.3, NNMF
def nnmf(A, m, it):
    """Factorise ``A`` into ``W * H`` using multiplicative update rules.

    Progress is printed once per iteration.

    Args:
        A: 2-D matrix/array of shape (rows, columns).
        m: number of features. A value larger than the number of columns
            is reduced to ``columns - 1`` (but never below 1).
        it: maximum number of iterations.

    Returns:
        A tuple ``(W, H)`` of ``numpy.matrix`` objects with shapes
        (rows, m) and (m, columns).
    """
    A = numpy.asarray(A, dtype=float)
    rows, columns = A.shape

    # Clamp *before* allocating W and H, otherwise the clamp has no effect.
    if m > columns:
        m = columns - 1
    if m < 1:
        m = 1

    H = numpy.random.rand(m, columns)
    W = numpy.random.rand(rows, m)

    for ic in range(it):
        print("Iteration  %d" % ic)

        # A perfect factorisation cannot be improved any further.
        if cost(A, numpy.dot(W, H)) == 0:
            break

        # Update H: H <- H * (W^T A) / (W^T W H), element-wise.
        WT = W.T
        WTA = numpy.dot(WT, A)
        WTWH = numpy.dot(numpy.dot(WT, W), H)
        H = H * (WTA / (WTWH + _EPSILON))

        # Update W with the new H: W <- W * (A H^T) / (W H H^T).
        HT = H.T
        AHT = numpy.dot(A, HT)
        WHHT = numpy.dot(numpy.dot(W, H), HT)
        W = W * (AHT / (WHHT + _EPSILON))

    return numpy.matrix(W), numpy.matrix(H)


# Exercise 2.4.3, output of the results
def showfeatures(w, h, titles, wordvec):
    """Print the six strongest words per feature and the top features per article.

    Args:
        w: weight matrix of shape (articles, features).
        h: feature matrix of shape (features, words).
        titles: article titles, one per row of ``w``.
        wordvec: words, one per column of ``h``.
    """
    w = numpy.asarray(w)
    h = numpy.asarray(h)

    print("Alle Merkmale")
    merkmale = []
    for index in range(h.shape[0]):
        weighted = sorted(
            ((h[index, column], wordvec[column])
             for column in range(h.shape[1])),
            reverse=True)
        top_words = weighted[:6]
        listing = " ".join(", %s" % item[1] for item in top_words)
        print("%s : %s\n" % (index, listing))
        merkmale.append(top_words)

    for index in range(w.shape[0]):
        ranked = sorted(
            ((w[index, column], merkmale[column])
             for column in range(w.shape[1])),
            reverse=True)
        print("Merkmale von  %s" % titles[index])
        # Slicing copes with fewer than three features; indexing by
        # range(3) raised IndexError in that case.
        for entry in ranked[:3]:
            print(" ".join(item[1] for item in entry[1]) + "\n")
        print("\n")


# Exercise 2.3.1 / removal of the all-zero rows
def deleteNull(wordvec, titles):
    """Remove all-zero rows from the matrix together with their titles.

    Args:
        wordvec: article/word count matrix as a nested list (one row per
            article).
        titles: list of article titles, one per row. It is updated in
            place so that it matches the remaining rows.

    Returns:
        A tuple ``(A, titles)``: ``A`` is a ``numpy.matrix`` without the
        all-zero rows and ``titles`` is the (same, updated) title list.
    """
    A = numpy.matrix(wordvec)
    row_sums = numpy.asarray(A.sum(axis=1)).ravel()
    empty_rows = [index for index, total in enumerate(row_sums) if total == 0]
    if empty_rows:
        dropped = set(empty_rows)
        # Filter by the ORIGINAL indices. Popping while counting upwards
        # shifts the remaining titles and removes the wrong ones.
        titles[:] = [title for index, title in enumerate(titles)
                     if index not in dropped]
        A = numpy.delete(A, empty_rows, 0)
    return A, titles
This paste will be private.
From the Design Piracy series on my blog: