"""
Author: Philipp Keller
Context: http://www.pui.ch/phred/archives/2006/07/automated_tag_clustering.html
"""
import urllib, sys, os
def dict_concat(dict1, dict2):
"""
>>> dict_concat(dict(a=1), dict(b=2))
{'a': 1, 'b': 2}
"""
result = {}
result.update(dict1)
result.update(dict2)
return result
class graph(object):
edges = {}
total_weight = 0
def add(self, edge1, edge2, weight=1):
self.edges[edge1] = dict_concat(self.edges.get(edge1, {}), {edge2:weight})
self.edges[edge2] = dict_concat(self.edges.get(edge2, {}), {edge1:weight})
self.total_weight += weight
def get_nodes(self):
return self.edges.keys()
def get_edges(self):
"""
>>> g = graph()
>>> g.add('a', 'b')
>>> g.add('b', 'c')
>>> g.add('c', 'a')
>>> sorted(g.get_edges())
[('a', 'b'), ('a', 'c'), ('b', 'c')]
"""
edges = []
for node in self.edges.keys():
for neighbour in self.edges[node].keys():
if node < neighbour:
edges.append((node, neighbour))
return edges
def node_weight(self, node):
"""
>>> g = graph()
>>> g.add('a', 'b', 3)
>>> g.add('a', 'c', 2)
>>> g.node_weight('a')
5
"""
weight = 0
for neighbour in self.edges[node].keys():
weight += self.edges[node][neighbour]
return weight
def edge_weight_dice(self, node1, node2):
"""computes the dice-similarity-number
see http://www.pui.ch/phred/automated_tag_clustering/figures/similarity_measures.png
returns int(dice * 1000)
>>> g = graph()
>>> g.add('a', 'b', 3)
>>> g.add('b', 'c', 6)
>>> g.edge_weight_dice('a', 'b')
500
>>> g.edge_weight_dice('b', 'c')
800
"""
edge_weight = self.edges[node1][node2]
node1_weight = self.node_weight(node1)
node2_weight = self.node_weight(node2)
dice_weight = (2.0 * edge_weight) / (node1_weight + node2_weight)
return int(1000 * dice_weight)
def edge_weight_matching(self, node1, node2):
return self.edges[node1][node2]
def num_edges(self):
num = 0
for node in self.edges.keys():
num += len(self.edges[node])
return num / 2
def reachable(self, node, reached=[]):
"""find all nodes that are reachable from a starting node
>>> g = graph()
>>> g.add('a', 'b')
>>> g.add('b', 'c')
>>> g.add('d', 'e')
>>> sorted(g.reachable('a'))
['a', 'b', 'c']
"""
reached.append(node)
for neighbour in self.edges[node]:
if neighbour not in reached:
reached = self.reachable(neighbour, reached)
return reached
def cluster_metis(self, n, weight_method=None):
"""cluster the graph using metis (http://glaros.dtc.umn.edu/gkhome/views/metis) into n clusters
n: number of clusters to do
weight_method: g.edge_weight_matching or g.edge_weight_dice (default)
metis requires the nodes to be numeric -> rewrite the graph to their format and back
"""
if weight_method is None:
weight_method = self.edge_weight_dice
metis_filename = '/tmp/my.graph'
name2number = {}
number2name = [None]
i = 1
metis_file = open(metis_filename, 'w')
metis_file.write("%s %s 1\n" % (len(self.get_nodes()), self.num_edges()))
for node in self.get_nodes():
number2name.append(node)
name2number[node] = i
i += 1
for i in range(1, len(number2name)):
node = number2name[i]
for neighbour_node in self.edges[node].keys():
neighbour_node_number = name2number[neighbour_node]
weight = weight_method(node, neighbour_node)
metis_file.write("%s %s " % (neighbour_node_number, weight))
metis_file.write("\n")
metis_file.close()
metis_output = os.popen('kmetis %s %s' % (metis_filename, n)).read().strip()
clusters = {}
i = 1
for line in open(metis_filename + '.part.%s' % n):
nodename = number2name[i]
cluster_number = line.strip()
clusters[cluster_number] = clusters.get(cluster_number, []) + [nodename]
i += 1
return clusters
def cluster_quality(self, clusters, weight_method=None):
"""computes the quality of the cluster according to "modularity function" Q
described here: http://www.datalab.uci.edu/papers/siam_graph_clustering.pdf
clusters: output from cluster_metis
weight_method: the weight you used for clustering:
graph.edge_weight_dice (default) or graph.edge_weight_matching
"""
if weight_method is None:
weight_method = self.edge_weight_dice
total_weight = 0
for edge in self.get_edges():
total_weight += weight_method(edge[0], edge[1])
quality = 0.0
for cluster in clusters:
cluster_nodes = clusters[cluster]
edge_in_cluster_weight = 0.0
node_in_cluster_weight = 0.0
for edge in self.get_edges():
node1 = edge[0]
node2 = edge[1]
if node1 in cluster_nodes or node2 in cluster_nodes:
weight = weight_method(node1, node2)
node_in_cluster_weight += weight
if node1 in cluster_nodes and node2 in cluster_nodes:
edge_in_cluster_weight += weight
quality += ((edge_in_cluster_weight / total_weight) - (node_in_cluster_weight / total_weight)) ** 2
quality += (edge_in_cluster_weight / total_weight) - ((node_in_cluster_weight / total_weight) ** 2)
return quality
def graphviz_export(self, filename, clusters={}, only_weights_bigger_than=None):
"""stores graph into a file that can be visualized by graphviz (graphviz.org)
clusters: output from cluster_metis. If given, the nodes are colored according to the cluster they are in
only_weights_bigger_than: many graphs are too big to be visualized:
if this number is set, only the nodes with a dice-weight
bigger than this number are written.
A good number to start: 15
"""
f = open(filename, 'w')
f.write("""graph G {\n node[style=filled];\n""")
if not clusters == {}:
colors = ['greenyellow', 'salmon2', 'deepskyblue', 'firebrick', 'cyan', 'bisque4', 'goldenrod2', 'burlywood2', 'gold1', 'darkseagreen', 'dodgerblue1',
'thistle2', 'darkolivegreen3', 'chocolate', 'turquoise3', 'steelblue3', 'coral', 'coral3', 'lemonchiffon',
'darkorange1', 'lightyellow1', 'darkorchid4', 'mistyrose2', 'crimson']
for node in self.get_nodes():
for cluster in clusters.keys():
if node in clusters[cluster]:
draw = True
if only_weights_bigger_than is not None:
draw = False
for neighbour in self.edges[node].keys():
if self.edge_weight_dice(node, neighbour) > only_weights_bigger_than:
draw = True
if draw:
color = colors[clusters.keys().index(cluster)]
f.write(' "%s" [color=%s];\n' % (node, color))
for node in self.edges.keys():
for neighbour in self.edges[node].keys():
if node < neighbour:
draw = True
weight = self.edge_weight_matching(node, neighbour)
weight_dice = self.edge_weight_dice(node, neighbour)
if only_weights_bigger_than is not None and weight_dice <= only_weights_bigger_than:
draw = False
if draw:
f.write(' "%s"--"%s" [ label = "%s (%s)" ]' % (node, neighbour, weight, weight_dice))
f.write(";\n")
f.write("}")
f.close()
def build_graph(filename):
"""
build graph from a file that each line has tag1 tag2.
"""
tag_count = {}
connections = {}
i = 0
for line in open(filename):
[tag1, tag2] = sorted(line.strip().split(' '))
key = tag1 + "+" + tag2
connections[key] = connections.get(key, 0) + 1
gr = graph()
for key,weight in connections.iteritems():
if weight > 2:
first,second = key.split('+')
gr.add(first, second, weight=weight)
return gr
def usage():
g = build_graph('tags.txt')
num_clusters = 2
q_old = -1
while True:
clusters = g.cluster_metis(num_clusters)
q = g.cluster_quality(clusters)
print "%s clusters: Quality = %s" (num_clusters, q)
if q < q_old:
print "local maximum found with quality=%s"
print clusters
break
num_clusters += 1
if __name__ == "__main__":
import doctest
doctest.testmod()