Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
#!/usr/bin/python """ Author: Philipp Keller Context: http://www.pui.ch/phred/archives/2006/07/automated_tag_clustering.html """ import urllib, sys, os def dict_concat(dict1, dict2): """ >>> dict_concat(dict(a=1), dict(b=2)) {'a': 1, 'b': 2} """ result = {} result.update(dict1) result.update(dict2) return result class graph(object): edges = {} total_weight = 0 def add(self, edge1, edge2, weight=1): self.edges[edge1] = dict_concat(self.edges.get(edge1, {}), {edge2:weight}) self.edges[edge2] = dict_concat(self.edges.get(edge2, {}), {edge1:weight}) self.total_weight += weight def get_nodes(self): return self.edges.keys() def get_edges(self): """ >>> g = graph() >>> g.add('a', 'b') >>> g.add('b', 'c') >>> g.add('c', 'a') >>> sorted(g.get_edges()) [('a', 'b'), ('a', 'c'), ('b', 'c')] """ edges = [] for node in self.edges.keys(): for neighbour in self.edges[node].keys(): if node < neighbour: edges.append((node, neighbour)) return edges def node_weight(self, node): """ >>> g = graph() >>> g.add('a', 'b', 3) >>> g.add('a', 'c', 2) >>> g.node_weight('a') 5 """ weight = 0 for neighbour in self.edges[node].keys(): weight += self.edges[node][neighbour] return weight def edge_weight_dice(self, node1, node2): """computes the dice-similarity-number see http://www.pui.ch/phred/automated_tag_clustering/figures/similarity_measures.png returns int(dice * 1000) >>> g = graph() >>> g.add('a', 'b', 3) >>> g.add('b', 'c', 6) >>> g.edge_weight_dice('a', 'b') 500 >>> g.edge_weight_dice('b', 'c') 800 """ edge_weight = self.edges[node1][node2] node1_weight = self.node_weight(node1) node2_weight = self.node_weight(node2) dice_weight = (2.0 * edge_weight) / (node1_weight + node2_weight) return int(1000 * dice_weight) def edge_weight_matching(self, node1, node2): return self.edges[node1][node2] def num_edges(self): num = 0 for node in self.edges.keys(): num += len(self.edges[node]) return num / 2 def reachable(self, node, reached=[]): """find all nodes that are reachable from a starting node >>> g = graph() >>> g.add('a', 'b') >>> g.add('b', 'c') >>> g.add('d', 'e') >>> sorted(g.reachable('a')) ['a', 'b', 'c'] """ reached.append(node) for neighbour in self.edges[node]: if neighbour not in reached: reached = self.reachable(neighbour, reached) return reached def cluster_metis(self, n, weight_method=None): """cluster the graph using metis (http://glaros.dtc.umn.edu/gkhome/views/metis) into n clusters n: number of clusters to do weight_method: g.edge_weight_matching or g.edge_weight_dice (default) metis requires the nodes to be numeric -> rewrite the graph to their format and back """ if weight_method is None: weight_method = self.edge_weight_dice metis_filename = '/tmp/my.graph' name2number = {} number2name = [None] i = 1 metis_file = open(metis_filename, 'w') metis_file.write("%s %s 1\n" % (len(self.get_nodes()), self.num_edges())) for node in self.get_nodes(): number2name.append(node) name2number[node] = i i += 1 for i in range(1, len(number2name)): node = number2name[i] for neighbour_node in self.edges[node].keys(): neighbour_node_number = name2number[neighbour_node] weight = weight_method(node, neighbour_node) metis_file.write("%s %s " % (neighbour_node_number, weight)) metis_file.write("\n") metis_file.close() metis_output = os.popen('kmetis %s %s' % (metis_filename, n)).read().strip() clusters = {} i = 1 for line in open(metis_filename + '.part.%s' % n): nodename = number2name[i] cluster_number = line.strip() clusters[cluster_number] = clusters.get(cluster_number, []) + [nodename] i += 1 return clusters def cluster_quality(self, clusters, weight_method=None): """computes the quality of the cluster according to "modularity function" Q described here: http://www.datalab.uci.edu/papers/siam_graph_clustering.pdf clusters: output from cluster_metis weight_method: the weight you used for clustering: graph.edge_weight_dice (default) or graph.edge_weight_matching """ if weight_method is None: weight_method = self.edge_weight_dice total_weight = 0 for edge in self.get_edges(): total_weight += weight_method(edge[0], edge[1]) quality = 0.0 for cluster in clusters: cluster_nodes = clusters[cluster] edge_in_cluster_weight = 0.0 node_in_cluster_weight = 0.0 for edge in self.get_edges(): node1 = edge[0] node2 = edge[1] if node1 in cluster_nodes or node2 in cluster_nodes: weight = weight_method(node1, node2) node_in_cluster_weight += weight if node1 in cluster_nodes and node2 in cluster_nodes: edge_in_cluster_weight += weight # print "%s: %s/%s - (%s/%s)^2" % (cluster, edge_in_cluster_weight, total_weight, node_in_cluster_weight, total_weight) quality += ((edge_in_cluster_weight / total_weight) - (node_in_cluster_weight / total_weight)) ** 2 quality += (edge_in_cluster_weight / total_weight) - ((node_in_cluster_weight / total_weight) ** 2) return quality def graphviz_export(self, filename, clusters={}, only_weights_bigger_than=None): """stores graph into a file that can be visualized by graphviz (graphviz.org) clusters: output from cluster_metis. If given, the nodes are colored according to the cluster they are in only_weights_bigger_than: many graphs are too big to be visualized: if this number is set, only the nodes with a dice-weight bigger than this number are written. A good number to start: 15 """ f = open(filename, 'w') f.write("""graph G {\n node[style=filled];\n""") if not clusters == {}: # 24 colors should be enough for everyone colors = ['greenyellow', 'salmon2', 'deepskyblue', 'firebrick', 'cyan', 'bisque4', 'goldenrod2', 'burlywood2', 'gold1', 'darkseagreen', 'dodgerblue1', 'thistle2', 'darkolivegreen3', 'chocolate', 'turquoise3', 'steelblue3', 'coral', 'coral3', 'lemonchiffon', 'darkorange1', 'lightyellow1', 'darkorchid4', 'mistyrose2', 'crimson'] for node in self.get_nodes(): for cluster in clusters.keys(): if node in clusters[cluster]: draw = True if only_weights_bigger_than is not None: draw = False for neighbour in self.edges[node].keys(): if self.edge_weight_dice(node, neighbour) > only_weights_bigger_than: draw = True if draw: color = colors[clusters.keys().index(cluster)] f.write(' "%s" [color=%s];\n' % (node, color)) for node in self.edges.keys(): for neighbour in self.edges[node].keys(): if node < neighbour: draw = True weight = self.edge_weight_matching(node, neighbour) weight_dice = self.edge_weight_dice(node, neighbour) if only_weights_bigger_than is not None and weight_dice <= only_weights_bigger_than: draw = False if draw: f.write(' "%s"--"%s" [ label = "%s (%s)" ]' % (node, neighbour, weight, weight_dice)) f.write(";\n") f.write("}") f.close() # the code in this method is from memory, I did not try to run it # Therefore it could contain minor flaws def build_graph(filename): """ build graph from a file that each line has tag1 tag2. """ tag_count = {} connections = {} i = 0 for line in open(filename): [tag1, tag2] = sorted(line.strip().split(' ')) key = tag1 + "+" + tag2 connections[key] = connections.get(key, 0) + 1 gr = graph() for key,weight in connections.iteritems(): if weight > 2: first,second = key.split('+') gr.add(first, second, weight=weight) return gr # the code in this method is from memory, I did not try to run it # Therefore it could contain minor flaws def usage(): g = build_graph('tags.txt') num_clusters = 2 q_old = -1 while True: clusters = g.cluster_metis(num_clusters) q = g.cluster_quality(clusters) print "%s clusters: Quality = %s" (num_clusters, q) # this simple method just finds the local maximum # you should find a better way to find the global maxima if q < q_old: print "local maximum found with quality=%s" print clusters break num_clusters += 1 if __name__ == "__main__": import doctest doctest.testmod()
This paste will be private.
From the Design Piracy series on my blog: