Report abuse

#!/usr/bin/python

"""
Author: Philipp Keller
Context: http://www.pui.ch/phred/archives/2006/07/automated_tag_clustering.html
"""

import urllib, sys, os

def dict_concat(dict1, dict2):
    """
    >>> dict_concat(dict(a=1), dict(b=2))
    {'a': 1, 'b': 2}
    """
    result = {}
    result.update(dict1)
    result.update(dict2)
    return result

class graph(object):
    edges = {}
    total_weight = 0
    def add(self, edge1, edge2, weight=1):
        self.edges[edge1] = dict_concat(self.edges.get(edge1, {}), {edge2:weight})
        self.edges[edge2] = dict_concat(self.edges.get(edge2, {}), {edge1:weight})
        self.total_weight += weight

    def get_nodes(self):
        return self.edges.keys()

    def get_edges(self):
        """
        >>> g = graph()
        >>> g.add('a', 'b')
        >>> g.add('b', 'c')
        >>> g.add('c', 'a')
        >>> sorted(g.get_edges())
        [('a', 'b'), ('a', 'c'), ('b', 'c')]
        """
        edges = []
        for node in self.edges.keys():
            for neighbour in self.edges[node].keys():
                if node < neighbour:
                    edges.append((node, neighbour))
        return edges

    def node_weight(self, node):
        """
        >>> g = graph()
        >>> g.add('a', 'b', 3)
        >>> g.add('a', 'c', 2)
        >>> g.node_weight('a')
        5
        """
        weight = 0
        for neighbour in self.edges[node].keys():
            weight += self.edges[node][neighbour]
        return weight

    def edge_weight_dice(self, node1, node2):
        """computes the dice-similarity-number
        see http://www.pui.ch/phred/automated_tag_clustering/figures/similarity_measures.png
        returns int(dice * 1000)

        >>> g = graph()
        >>> g.add('a', 'b', 3)
        >>> g.add('b', 'c', 6)
        >>> g.edge_weight_dice('a', 'b')
        500
        >>> g.edge_weight_dice('b', 'c')
        800
        """
        edge_weight = self.edges[node1][node2]
        node1_weight = self.node_weight(node1)
        node2_weight = self.node_weight(node2)
        dice_weight = (2.0 * edge_weight) / (node1_weight + node2_weight)
        return int(1000 * dice_weight)

    def edge_weight_matching(self, node1, node2):
        return self.edges[node1][node2]

    def num_edges(self):
        num = 0
        for node in self.edges.keys():
            num += len(self.edges[node])
        return num / 2

    def reachable(self, node, reached=[]):
        """find all nodes that are reachable from a starting node
        
        >>> g = graph()
        >>> g.add('a', 'b')
        >>> g.add('b', 'c')
        >>> g.add('d', 'e')
        >>> sorted(g.reachable('a'))
        ['a', 'b', 'c']
        """
        reached.append(node)
        for neighbour in self.edges[node]:
            if neighbour not in reached:
                reached = self.reachable(neighbour, reached)
        return reached

    def cluster_metis(self, n, weight_method=None):
        """cluster the graph using metis (http://glaros.dtc.umn.edu/gkhome/views/metis) into n clusters

        n: number of clusters to do
        weight_method: g.edge_weight_matching or g.edge_weight_dice (default)
        
        metis requires the nodes to be numeric -> rewrite the graph to their format and back
        """
        if weight_method is None:
            weight_method = self.edge_weight_dice
        metis_filename = '/tmp/my.graph'
        name2number = {}
        number2name = [None]
        i = 1
        metis_file = open(metis_filename, 'w')
        metis_file.write("%s %s 1\n" % (len(self.get_nodes()), self.num_edges()))
        for node in self.get_nodes():
            number2name.append(node)
            name2number[node] = i
            i += 1

        for i in range(1, len(number2name)):
            node = number2name[i]
            for neighbour_node in self.edges[node].keys():
                neighbour_node_number = name2number[neighbour_node]
                weight = weight_method(node, neighbour_node)
                metis_file.write("%s %s " % (neighbour_node_number, weight))
            metis_file.write("\n")

        metis_file.close()
        metis_output = os.popen('kmetis %s %s' % (metis_filename, n)).read().strip()
        clusters = {}
        i = 1
        for line in open(metis_filename + '.part.%s' % n):
            nodename = number2name[i]
            cluster_number = line.strip()
            clusters[cluster_number] = clusters.get(cluster_number, []) + [nodename]
            i += 1
        return clusters

    def cluster_quality(self, clusters, weight_method=None):
        """computes the quality of the cluster according to "modularity function" Q
        described here: http://www.datalab.uci.edu/papers/siam_graph_clustering.pdf
        
        clusters: output from cluster_metis
        weight_method: the weight you used for clustering:
                       graph.edge_weight_dice (default) or graph.edge_weight_matching
        
        """
        if weight_method is None:
            weight_method = self.edge_weight_dice

        total_weight = 0
        for edge in self.get_edges():
            total_weight += weight_method(edge[0], edge[1])

        quality = 0.0
        for cluster in clusters:
            cluster_nodes = clusters[cluster]
            edge_in_cluster_weight = 0.0
            node_in_cluster_weight = 0.0
            for edge in self.get_edges():
                node1 = edge[0]
                node2 = edge[1]
                if node1 in cluster_nodes or node2 in cluster_nodes:
                    weight = weight_method(node1, node2)
                    node_in_cluster_weight += weight
                    if node1 in cluster_nodes and node2 in cluster_nodes:
                        edge_in_cluster_weight += weight

            # print "%s: %s/%s - (%s/%s)^2" % (cluster, edge_in_cluster_weight, total_weight, node_in_cluster_weight, total_weight)
            quality += ((edge_in_cluster_weight / total_weight) - (node_in_cluster_weight / total_weight)) ** 2
            quality += (edge_in_cluster_weight / total_weight) - ((node_in_cluster_weight / total_weight) ** 2)
        return quality

    def graphviz_export(self, filename, clusters={}, only_weights_bigger_than=None):
        """stores graph into a file that can be visualized by graphviz (graphviz.org)

        clusters: output from cluster_metis. If given, the nodes are colored according to the cluster they are in
        only_weights_bigger_than: many graphs are too big to be visualized:
                                  if this number is set, only the nodes with a dice-weight
                                  bigger than this number are written.
                                  A good number to start: 15
        
        """
        f = open(filename, 'w')
        f.write("""graph G {\n  node[style=filled];\n""")
        if not clusters == {}:
            # 24 colors should be enough for everyone
            colors = ['greenyellow', 'salmon2', 'deepskyblue', 'firebrick', 'cyan', 'bisque4', 'goldenrod2', 'burlywood2', 'gold1', 'darkseagreen', 'dodgerblue1',
                      'thistle2', 'darkolivegreen3', 'chocolate', 'turquoise3', 'steelblue3', 'coral', 'coral3', 'lemonchiffon',
                      'darkorange1', 'lightyellow1', 'darkorchid4', 'mistyrose2', 'crimson']
            for node in self.get_nodes():
                for cluster in clusters.keys():
                    if node in clusters[cluster]:
                        draw = True
                        if only_weights_bigger_than is not None:
                            draw = False
                            for neighbour in self.edges[node].keys():
                                if self.edge_weight_dice(node, neighbour) > only_weights_bigger_than:
                                    draw = True
                        if draw:
                            color = colors[clusters.keys().index(cluster)]
                            f.write('  "%s" [color=%s];\n' % (node, color))
        for node in self.edges.keys():
            for neighbour in self.edges[node].keys():
                if node < neighbour:
                    draw = True
                    weight = self.edge_weight_matching(node, neighbour)
                    weight_dice = self.edge_weight_dice(node, neighbour)
                    if only_weights_bigger_than is not None and weight_dice <= only_weights_bigger_than:
                        draw = False
                    if draw:
                        f.write('  "%s"--"%s" [ label = "%s (%s)" ]' % (node, neighbour, weight, weight_dice))
                        f.write(";\n")
        f.write("}")
        f.close()


# the code in this method is from memory, I did not try to run it
# Therefore it could contain minor flaws
def build_graph(filename):
    """
    build graph from a file that each line has tag1 tag2.
    """
    tag_count = {}
    connections = {}

    i = 0
    for line in open(filename):
        [tag1, tag2] = sorted(line.strip().split(' '))
        key = tag1 + "+" + tag2
        connections[key] = connections.get(key, 0) + 1

    gr = graph()
    for key,weight in connections.iteritems():
        if weight > 2:
            first,second = key.split('+')
            gr.add(first, second, weight=weight)
    return gr

# the code in this method is from memory, I did not try to run it
# Therefore it could contain minor flaws
def usage():
    g = build_graph('tags.txt')
    num_clusters = 2
    q_old = -1
    while True:
        clusters = g.cluster_metis(num_clusters)
        q = g.cluster_quality(clusters)
        print "%s clusters: Quality = %s" (num_clusters, q)
        # this simple method just finds the local maximum
        # you should find a better way to find the global maxima
        if q < q_old:
            print "local maximum found with quality=%s"
            print clusters
            break
        num_clusters += 1

if __name__ == "__main__":
    import doctest
    doctest.testmod()