graph_kernels/gk_wl.py at master · svegapons/graph_kernels · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Weisfeiler_Lehman graph kernel.

Python implementation of Nino Shervashidze Matlab code at:
http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/Graphkernels/

Author : Sandro Vega Pons

"""

import numpy as np
import networkx as nx
import copy
from gk_base import GK_Base
import pdb


class GK_WL(GK_Base):
    """
    Weisfeiler_Lehman graph kernel.
    """

    def __init__(self, th=0, h=1):
        """
        Parameters
        ----------
        th: float
            Threshold to edge weights
        h : int
            Number of iterations
        nl : boolean
            Whether to use original node labels. True for using node labels
            saved in the attribute 'node_label'. False for using the node
            degree of each node as node attribute.
        """
        self.th = th
        self.h = h


    def compare_pairwise(self, graph_list, normalize=True):
        """Compute the all-pairs kernel values for a list of graphs.

        Parameters
        ----------
        graph_list: list of ndarray
                    A list of graphs (adjacency matrices)

        Return
        ------
        K: ndarray, shape = (len(graph_list), len(graph_list))
            The similarity matrix of all graphs in graph_list.
        """
        #Thresholding and computing the networkx graphs
        graph_list = [nx.from_numpy_matrix(np.where(mat > self.th, 1, 0))
                      for mat in graph_list]
        n = len(graph_list)
        lists = [0] * n
        k = [0]*(self.h + 1)
        n_nodes = 0
        n_max = 0

        #Compute adjacency lists and n_nodes, the total number of nodes in
        #the dataset.
        for i in range(n):
            #the graph should be a networkx graph or having the same methods.
            lists[i] = graph_list[i].adjacency_list()
            n_nodes = n_nodes + graph_list[i].number_of_nodes()

            #Computing the maximum number of nodes in the graphs. It will be
            #used in the computation of vectorial representation.
            if(n_max < graph_list[i].number_of_nodes()):
                n_max = graph_list[i].number_of_nodes()

        phi = np.zeros((n_max, n))
        #each column j of phi will be the explicit feature representation
        # for the graph j.
        #n_max is enough to store all possible labels

        #INITIALIZATION
        #initialize the nodes labels for each graph with their labels or
        #with degrees (for unlabeled graphs)

        labels = [0] * n
        label_lookup = {}
        label_counter = 0

        # label_lookup is an associative array, which will contain the
        # mapping from multiset labels (strings) to short labels (integers)

        #Node degrees are used as node labels
        for i in range(n):
            labels[i] = np.array(graph_list[i].degree().values())
            for j in range(len(labels[i])):
                phi[labels[i][j], i] += 1

        k = np.dot(phi.transpose(), phi)


        ### MAIN LOOP
        it = 0
        new_labels = copy.deepcopy(labels)


        while it < self.h:
            # create an empty lookup table
            label_lookup = {}
            label_counter = 0

            phi = np.zeros((n_nodes, n))
            for i in range(n):
                for v in range(len(lists[i])):
                    # form a multiset label of the node v of the i'th graph
                    # and convert it to a string
#                    pdb.set_trace()
                    long_label = np.concatenate((np.array([labels[i][v]]),
                                                 np.sort(labels[i]
                                                 [lists[i][v]])))
                    long_label_string = str(long_label)
                    # if the multiset label has not yet occurred, add it to the
                    # lookup table and assign a number to it
                    if not label_lookup.has_key(long_label_string):
                        label_lookup[long_label_string] = label_counter
                        new_labels[i][v] = label_counter
                        label_counter += 1
                    else:
                        new_labels[i][v] = label_lookup[long_label_string]
                # fill the column for i'th graph in phi
                aux = np.bincount(new_labels[i])
                phi[new_labels[i], i] += aux[new_labels[i]]

            k += np.dot(phi.transpose(), phi)
            labels = copy.deepcopy(new_labels)
            it = it + 1

        if normalize:
            k = self._normalize(k)

        return k