dim_red_zoo/support_func.py at master · Shekhale/dim_red_zoo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
import random

try:
    import faiss
    hasfaiss = True
except:
    hasfaiss = False

import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import time
import itertools

from data import write_fvecs, write_ivecs


def get_nearestneighbors_faiss(xq, xb, k, device, needs_exact=True, verbose=False):
    """Find the k nearest base vectors of every query with a Faiss index.

    A flat L2 index is used when ``needs_exact`` is true or ``device`` is
    ``'cuda'``; otherwise an ``"HNSW32"`` index with ``efSearch = 64`` is
    built. On ``'cuda'`` the index is moved to all GPUs.

    Args:
        xq: Query vectors; ``xq.shape[1]`` is taken as the dimensionality.
        xb: Base vectors added to the index.
        k: Number of neighbours to retrieve per query.
        device: Either ``"cpu"`` or ``"cuda"``.
        needs_exact: Request exact (flat) search.
        verbose: Print progress and timing information.

    Returns:
        The index matrix produced by ``index.search(xq, k)``.
    """
    assert device in ["cpu", "cuda"]

    if verbose:
        print("Computing nearest neighbors (Faiss)")

    dim = xq.shape[1]
    on_gpu = device == 'cuda'
    if not needs_exact and not on_gpu:
        index = faiss.index_factory(dim, "HNSW32")
        index.hnsw.efSearch = 64
    else:
        index = faiss.IndexFlatL2(dim)
    if on_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    # The reported time covers both adding the base vectors and searching.
    t0 = time.time()
    index.add(xb)
    neighbor_ids = index.search(xq, k)[1]
    if verbose:
        print("  NN search (%s) done in %.2f s" % (
            device, time.time() - t0))

    return neighbor_ids


def cdist2(A, B):
    """Return the matrix of squared Euclidean distances between rows.

    Uses the expansion ``|a|^2 - 2 a.b + |b|^2``; entry ``[i, j]`` relates
    row ``i`` of ``A`` to row ``j`` of ``B``.
    """
    sq_norms_a = A.pow(2).sum(1, keepdim=True)
    cross = torch.mm(A, B.t())
    sq_norms_b = B.pow(2).sum(1, keepdim=True).t()
    return sq_norms_a - 2 * cross + sq_norms_b


def top_dist(A, B, k):
    """Return, for each row of A, the indices of the k closest rows of B.

    Indices are ordered from the smallest squared distance upwards.
    """
    sq_dists = cdist2(A, B)
    _, nearest = sq_dists.topk(k, dim=1, largest=False, sorted=True)
    return nearest


def get_nearestneighbors_torch(xq, xb, k, device, needs_exact=False, verbose=False):
    """Find the k nearest base vectors of every query with plain torch ops.

    Queries are processed in batches of 500 rows; every batch, including a
    trailing partial one, is searched against the whole base set.

    Args:
        xq: NumPy array of query vectors.
        xb: NumPy array of base vectors.
        k: Number of neighbours to retrieve per query.
        device: Either ``"cpu"`` or ``"cuda"``.
        needs_exact: Unused here; kept so the signature matches the Faiss
            variant.
        verbose: Print progress and timing information.

    Returns:
        NumPy integer array with one row of k neighbour indices per query.
    """
    if verbose:
        print("Computing nearest neighbors (torch)")

    assert device in ["cpu", "cuda"]
    start = time.time()
    xb, xq = torch.from_numpy(xb), torch.from_numpy(xq)
    xb, xq = xb.to(device), xq.to(device)
    bs = 500
    n_queries = xq.size(0)
    if n_queries == 0:
        # torch.cat rejects an empty list, so answer the empty query directly.
        return np.empty((0, k), dtype=np.int64)
    # Step by batch start so that the final, possibly shorter, batch is kept.
    I = torch.cat([top_dist(xq[i0:i0 + bs], xb, k)
                   for i0 in range(0, n_queries, bs)], dim=0)
    if verbose:
        print("  NN search done in %.2f s" % (time.time() - start))
    I = I.cpu()
    return I.numpy()


# Use the Faiss backend when the import succeeded, otherwise the torch one.
get_nearestneighbors = (
    get_nearestneighbors_faiss if hasfaiss else get_nearestneighbors_torch
)


def sanitize(x):
    """Return ``x`` as a C-contiguous float32 NumPy array."""
    contiguous = np.ascontiguousarray(x, dtype=np.float32)
    return contiguous


def get_transform(xb, net, step, args):
    """Run ``net`` over ``xb`` in batches and collect the outputs.

    The network is switched to eval mode and evaluated under
    ``torch.no_grad()`` so no autograd graph is kept for the batches.

    Args:
        xb: NumPy array of input rows.
        net: Module applied to each float batch.
        step: Number of rows per batch.
        args: Object providing ``dout`` (output width) and ``device``.

    Returns:
        C-contiguous float32 NumPy array of shape ``(xb.shape[0], args.dout)``.
    """
    n_rows = xb.shape[0]
    # Allocate the result directly as contiguous float32 instead of building a
    # float64 buffer and converting it afterwards.
    xb_lat = np.zeros((n_rows, args.dout), dtype=np.float32)
    net.eval()
    with torch.no_grad():
        for i0 in range(0, n_rows, step):
            i1 = min(i0 + step, n_rows)
            batch = torch.from_numpy(xb[i0:i1, :]).to(args.device)
            xb_lat[i0:i1, :] = net(batch.float()).cpu().numpy()

    return xb_lat


def normalize_numpy(x, args):
    """Scale each row of ``x`` to unit L2 norm (along the last axis).

    The division is carried out as a torch tensor on ``args.device`` and the
    result is returned as a NumPy array.
    """
    tensor = torch.from_numpy(x).to(args.device)
    row_norms = tensor.norm(dim=-1, keepdim=True)
    return (tensor / row_norms).detach().cpu().numpy()


def get_set_difference(pos, neg):
    """Drop the positives from each negative list and keep the first few.

    For every row, entries of ``neg[i]`` that also occur in ``pos[i]`` are
    removed (duplicates collapsed) and the first ``len(pos[0])`` survivors
    are kept in their original order in ``neg[i]``.

    Args:
        pos: Sequence of rows of positive ids.
        neg: Sequence of rows of candidate negative ids; each row must be at
            least twice as long as a positive row.

    Returns:
        A list of lists of negative ids, or ``None`` (after printing a
        message) when the rows of ``neg`` are too short.
    """
    if 2 * len(pos[0]) > len(neg[0]):
        print("Wrong pos and neg lenghts")
        return None
    k_pos = len(pos[0])
    neg_dif = []
    for i, neg_row in enumerate(neg):
        pos_ids = set(pos[i])
        # dict.fromkeys de-duplicates while preserving the order of neg_row;
        # slicing a plain set here would return an arbitrary subset.
        kept = dict.fromkeys(item for item in neg_row if item not in pos_ids)
        neg_dif.append(list(kept)[:k_pos])

    return neg_dif


def calc_permutation(x, y, k):
    """Average, over rows, of ``|set(x[i]) & set(y[i])| / k``.

    Args:
        x: 2-D array of ids; its row count defines the number of rows used.
        y: 2-D array of ids, indexed by the same row numbers as ``x``.
        k: Divisor applied to every per-row overlap count.
    """
    n_rows = x.shape[0]
    total = 0
    for row in range(n_rows):
        shared = set(x[row]).intersection(set(y[row]))
        total += len(shared) / k
    return total / n_rows


def l2_dist(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    The squared differences are summed along the first axis, so 1-D inputs
    give a scalar and 2-D inputs give one distance per column.

    Args:
        x: Array-like of numbers.
        y: Array-like broadcastable against ``x``.
    """
    diff = np.asarray(x) - np.asarray(y)
    # np.sum reduces in native code; the builtin sum would iterate the array
    # element by element in Python.
    return np.sqrt(np.sum(diff ** 2, axis=0))


def forward_pass(net, xall, bs=128, device=None):
    """Evaluate ``net`` on ``xall`` in batches and stack the outputs.

    The network is put in eval mode and run under ``torch.no_grad()`` so no
    autograd graph is built for the batches.

    Args:
        net: Module whose call returns a single tensor.
        xall: NumPy array of input rows.
        bs: Number of rows per batch.
        device: Target device; defaults to the device of the first parameter
            of ``net``.

    Returns:
        NumPy array with the per-batch outputs stacked vertically.
    """
    if device is None:
        device = next(net.parameters()).device
    net.eval()
    outputs = []
    with torch.no_grad():
        for i0 in range(0, xall.shape[0], bs):
            batch = torch.from_numpy(xall[i0:i0 + bs]).to(device)
            outputs.append(net(batch).cpu().numpy())

    return np.vstack(outputs)


def forward_pass_enc(enc, xall, bs=128, device=None):
    """Evaluate ``enc`` on ``xall`` in batches and stack its first output.

    ``enc`` must return a pair; only the first element is kept. The module
    is put in eval mode and run under ``torch.no_grad()`` so no autograd
    graph is built for the batches.

    Args:
        enc: Module whose call returns a 2-tuple of tensors.
        xall: NumPy array of input rows.
        bs: Number of rows per batch.
        device: Target device; defaults to the device of the first parameter
            of ``enc``.

    Returns:
        NumPy array with the first outputs of all batches stacked vertically.
    """
    if device is None:
        device = next(enc.parameters()).device
    enc.eval()
    outputs = []
    with torch.no_grad():
        for i0 in range(0, xall.shape[0], bs):
            batch = torch.from_numpy(xall[i0:i0 + bs]).to(device)
            res, _ = enc(batch)
            outputs.append(res.cpu().numpy())

    return np.vstack(outputs)


def save_transform(ds, model, path, device, enc=False,
                   base_dir="/mnt/data/shekhale/data/"):
    """Transform ``ds`` with ``model`` and write the result with write_fvecs.

    Args:
        ds: NumPy array of input rows.
        model: Module passed to ``forward_pass`` (or ``forward_pass_enc``
            when ``enc`` is true).
        path: File name appended to ``base_dir``.
        device: Device used for the row normalisation done when ``enc`` is
            true; the forward helpers pick the model's own device.
        enc: When true, rows are L2-normalised first and the model is run
            through ``forward_pass_enc``.
        base_dir: Prefix concatenated with ``path`` to form the output file
            name. Defaults to the location that used to be hard-coded.
    """
    if enc:
        xb_var = torch.from_numpy(ds).to(device)
        xb_var = xb_var / xb_var.norm(dim=-1, keepdim=True)
        ds = xb_var.detach().cpu().numpy()
        del xb_var
        ds = forward_pass_enc(model, ds, 1024)
    else:
        ds = forward_pass(model, ds, 1024)

    # Plain concatenation is kept so existing callers get the same file name.
    file_for_write_base = base_dir + path
    write_fvecs(file_for_write_base, ds)


def loss_permutation(x, y, args, k, size=10**4):
    """Print and return the k-NN overlap between two representations.

    Up to ``size`` randomly chosen rows are used as queries. Their k nearest
    neighbours are searched in ``x`` and, for the same rows, in ``y``; the
    two neighbour lists are compared with ``calc_permutation``.

    Args:
        x: First representation of the points.
        y: Second representation, indexed by the same row numbers.
        args: Object providing ``device``.
        k: Number of neighbours compared per query.
        size: Maximum number of sampled query rows.
    """
    sample = np.random.permutation(x.shape[0])[:size]
    nn_in_x = get_nearestneighbors(x[sample], x, k, args.device, needs_exact=True)
    nn_in_y = get_nearestneighbors(y[sample], y, k, args.device, needs_exact=True)
    overlap = calc_permutation(nn_in_x, nn_in_y, k)
    print('top %d permutation is %.3f' % (k, overlap))
    return overlap


def loss_top_1_in_lat_top_k(xs, x, ys, y, args, kx, ky, size, name, fake_args=False):
    """Fraction of queries whose kx-th neighbour in x is among ky in y.

    Up to ``size`` random rows are used as queries. For each one, the last
    of its ``kx`` neighbours found in ``x`` is looked up among the ``ky``
    neighbours found for the same row in ``y``.

    Args:
        xs: Query rows for the first space.
        x: Base rows for the first space.
        ys: Query rows for the second space, same row numbering as ``xs``.
        y: Base rows for the second space.
        args: Object providing ``device``.
        kx: Neighbours retrieved in the first space.
        ky: Neighbours retrieved in the second space.
        size: Maximum number of sampled query rows.
        name: Label used in the printed report.
        fake_args: Unused.

    Returns:
        The hit ratio, which is also printed.
    """
    if xs.shape[0] != ys.shape[0]:
        # Only reported; processing continues regardless.
        print("wrong data")
    sample = np.random.permutation(xs.shape[0])[:size]
    nn_x = get_nearestneighbors(xs[sample], x, kx, args.device, needs_exact=True)
    nn_y = get_nearestneighbors(ys[sample], y, ky, args.device, needs_exact=True)
    n_rows = nn_x.shape[0]
    hits = sum(1 for i in range(n_rows) if nn_x[i, -1] in nn_y[i])
    print('%s: Part of top1_x in gt_lat_ %d = %.4f' % (name, ky, hits / len(nn_x)))
    return hits / n_rows


def loss_top_1_in_lat_top_k_new(gt, yb, yq, args, k):
    """Report how often the first ground-truth id is retrieved.

    ``2 * k`` neighbours of every query in ``yq`` are searched in ``yb``;
    ``gt[i, 0]`` is then looked up in the first ``k`` and in all ``2 * k``
    results.

    Args:
        gt: 2-D array whose first column holds the reference id per query.
        yb: Base rows.
        yq: Query rows.
        args: Object providing ``device``.
        k: Size of the narrower result list.

    Returns:
        Tuple ``(ratio within k, ratio within 2 * k)``; both are printed.
    """
    wide = 2 * k
    retrieved = get_nearestneighbors(yq, yb, wide, args.device, needs_exact=True)
    n_queries = len(yq)
    hits_k = 0
    hits_wide = 0
    for i in range(n_queries):
        target = gt[i, 0]
        row = retrieved[i]
        hits_wide += int(target in row)
        hits_k += int(target in row[:k])
    print('QUERY: Part of gt in gt_lat_%d = %.4f' % (k, hits_k / n_queries))
    print('QUERY: Part of gt in gt_lat_%d = %.4f' % (wide, hits_wide / n_queries))

    return hits_k / n_queries, hits_wide / n_queries


def show_neighbours_distr(x, k, args, hist_steps=10, print_in_file=False, file_name=""):
    """Histogram of the distance from each point to its last retrieved neighbour.

    The k nearest neighbours of every row of ``x`` are searched in ``x``
    itself; the distance to the last one is binned into ``hist_steps``
    equal-width bins between the smallest and largest observed distance.

    Args:
        x: 2-D array of points.
        k: Number of neighbours retrieved per point.
        args: Object providing ``device``.
        hist_steps: Number of histogram bins.
        print_in_file: Also append the histogram to ``file_name``.
        file_name: File appended to when ``print_in_file`` is true.

    Returns:
        List of ``hist_steps`` bin frequencies (fractions of the points).
    """
    n = x.shape[0]
    knn = get_nearestneighbors(x, x, k, args.device, needs_exact=True)
    last_neighbor = knn[:, -1]
    distances = [np.sqrt(np.sum((x[i] - x[last_neighbor[i]]) ** 2))
                 for i in range(n)]
    dist_min = np.min(distances)
    dist_max = np.max(distances)
    # The small offset keeps the largest distance inside the last bin.
    delta = dist_max - dist_min + 0.000001
    counts = [0] * hist_steps
    for dist in distances:
        counts[int(hist_steps * (dist - dist_min) / delta)] += 1
    hist = [count / n for count in counts]
    print("Histogram neig:", hist)

    # Left edge of every bin.
    hist_values = [dist_min + step * delta / hist_steps
                   for step in range(hist_steps)]
    if print_in_file:
        with open(file_name, "a") as rfile:
            rfile.writelines([
                "Top %d neigbour distance distribution \n" %(k-1),
                " ".join(str(item)[:5] for item in hist) + "\n",
                " ".join(str(item)[:5] for item in hist_values) + "\n",
            ])

    return hist


def get_weights(x, k, args):
    """Weight each point by the inverse distance to its last retrieved neighbour.

    The k nearest neighbours of every row of ``x`` are searched in ``x``
    itself, and the weight of a point is the reciprocal of its distance to
    the last of them. Weights are rescaled so that their mean is 1.

    Args:
        x: 2-D NumPy array of points.
        k: Number of neighbours retrieved per point.
        args: Object providing ``device``.

    Returns:
        1-D NumPy array with one weight per row of ``x``.
    """
    n = x.shape[0]
    knn = get_nearestneighbors(x, x, k, args.device, needs_exact=True)
    kth_neighbors_ind = knn[:, -1]
    # The distances were previously left at zero, which made every reciprocal
    # below fail; compute them as in show_neighbours_distr.
    distances = np.sqrt(np.sum((x - x[kth_neighbors_ind]) ** 2, axis=1))

    # NOTE(review): a point whose last neighbour lies at distance 0 still
    # yields an infinite weight here - confirm how callers want that handled.
    weights = 1.0 / distances
    weights = weights / (weights.sum() / n)
    return weights


def calc_clasterization_coeff(graph, size=10**3, file_name=""):
    """Estimate how strongly neighbour lists overlap in a k-NN graph.

    ``size`` nodes are drawn at random (with replacement). For each one, the
    sizes of the intersections between its neighbour list and the lists of
    its neighbours (itself excluded) are summed, divided by ``k * k`` and
    binned into 10 equal-width bins.

    Args:
        graph: 2-D array of shape ``(n, k)``; row ``i`` lists the neighbours
            of node ``i``.
        size: Number of random draws.
        file_name: When non-empty, the printed line is also appended there.

    Returns:
        List of 10 bin frequencies.
    """
    n, k = graph.shape
    n_bins = 10
    distr = [0] * n_bins
    for _ in range(size):
        ind = random.randint(0, n - 1)
        own = set(graph[ind])
        inters = sum(len(own & set(graph[nb])) for nb in graph[ind] if nb != ind)

        # inters can reach k * k (every neighbour shares the full list), which
        # would index one past the last bin; clamp it into the top bin.
        place = min(int(n_bins * inters / (k * k)), n_bins - 1)
        distr[place] += 1 / size

    summary = "Claster distribution: " + " ".join(str(x)[:5] for x in distr)
    print(summary)

    if file_name != "":
        with open(file_name, "a") as rfile:
            rfile.write(summary)
            rfile.write("\n")

    return distr


def generate_uniform(n, d, nq, device, save=False):
    """Generate unit-norm random base and query vectors with ground truth.

    Rows are drawn from a standard normal distribution and divided by their
    L2 norm. The 100 nearest base vectors of every query are then searched
    exactly.

    Args:
        n: Number of base vectors.
        d: Dimensionality.
        nq: Number of query vectors.
        device: Device used for generation and passed to the search.
        save: When true, base, queries and ground truth are written with
            ``write_fvecs`` / ``write_ivecs``.

    Returns:
        Tuple ``(xb, xq, gt)``: base vectors as a torch tensor on ``device``,
        queries as a NumPy array, and the neighbour index matrix.
    """
    xb = torch.randn(size=(n, d)).to(device)
    xb = xb / xb.norm(dim=-1, keepdim=True)

    xq = torch.randn(size=(nq, d)).to(device)
    xq = xq / xq.norm(dim=-1, keepdim=True)

    # The base copy must come from xb; it used to be taken from xq, so the
    # ground truth was searched among the queries themselves.
    x = xb.detach().cpu().numpy()
    xq = xq.detach().cpu().numpy()

    gt = get_nearestneighbors(xq, x, 100, device, needs_exact=True)
    if save:
        file_for_write_base = "/uniform_base_" + str(d) + ".fvecs"
        # Write the NumPy copy, the same kind of object passed for the queries.
        write_fvecs(file_for_write_base, x)
        file_for_write_q = "/uniform_query_" + str(d) + ".fvecs"
        write_fvecs(file_for_write_q, xq)
        file_for_write_gt = "/uniform_gt_" + str(d) + ".ivecs"
        write_ivecs(file_for_write_gt, gt)

    return xb, xq, gt


def get_nearestneighbors_partly(xq, xb, k, device, bs=10**5, needs_exact=True):
    """Search nearest neighbours for the queries in chunks of ``bs`` rows.

    Each chunk of ``xq`` is searched against the whole of ``xb`` and the
    per-chunk results are stacked vertically.
    """
    chunk_results = [
        get_nearestneighbors(xq[i0:i0 + bs], xb, k, device, needs_exact)
        for i0 in range(0, xq.shape[0], bs)
    ]
    return np.vstack(chunk_results)


def _merge_chunk_candidates(xq, xb, cand, k, rows=64):
    """Keep, per query, the k candidate ids with the smallest L2 distance."""
    merged = np.empty((cand.shape[0], k), dtype=cand.dtype)
    for r0 in range(0, cand.shape[0], rows):
        ids = cand[r0:r0 + rows]
        queries = xq[r0:r0 + rows]
        valid = ids >= 0
        # Missing results (-1) are looked up as row 0 and masked out below.
        points = xb[np.where(valid, ids, 0)]
        dist = ((points - queries[:, None, :]) ** 2).sum(axis=2)
        dist[~valid] = np.inf
        order = np.argsort(dist, axis=1, kind="stable")[:, :k]
        merged[r0:r0 + rows] = np.take_along_axis(ids, order, axis=1)
    return merged


def get_nearestneighbors_brute(xq, xb, device, bs=10**6, needs_exact=True):
    """Search the 100 nearest neighbours with both sets split into chunks.

    Every query chunk is searched against every base chunk. Chunk-local ids
    are shifted to positions in the full ``xb``, and the candidates from all
    base chunks are merged per query by exact distance, so the result has
    one row per query. Previously the per-chunk results were stacked as
    additional rows and still carried chunk-local ids.

    Args:
        xq: 2-D NumPy array of queries.
        xb: 2-D NumPy array of base vectors.
        device: Passed through to ``get_nearestneighbors``.
        bs: Chunk size used for both queries and base vectors.
        needs_exact: Passed through to ``get_nearestneighbors``.

    Returns:
        Integer array of shape ``(xq.shape[0], 100)`` indexing rows of ``xb``.
    """
    k = 100
    knn = []
    for i in range(0, xq.shape[0], bs):
        xq_p = xq[i:i + bs]
        candidates = []
        for j in range(0, xb.shape[0], bs):
            print(i, j)
            xb_p = xb[j:j + bs]
            res = np.asarray(get_nearestneighbors(xq_p, xb_p, k, device, needs_exact))
            # Shift to global ids; negative entries mark missing results.
            candidates.append(np.where(res >= 0, res + j, -1))
        if len(candidates) == 1:
            knn.append(candidates[0])
        else:
            knn.append(_merge_chunk_candidates(xq_p, xb, np.hstack(candidates), k))

    return np.vstack(knn)


def GetIntrinsicDimension(x, knn, calc_size=10**4):
    """Estimate the intrinsic dimension from neighbour distance ratios.

    For each neighbourhood size ``edge`` in (10, 15, 20) and each sampled
    point, the logs of ``dist(edge-th neighbour) / dist(j-th neighbour)``
    are summed over ``j = 1 .. edge - 1`` (zero distances skipped), and
    ``(edge - 1) / sum`` is the point's estimate. Estimates are averaged
    over the sampled points and then over the three sizes.

    Args:
        x: 2-D NumPy array of points.
        knn: Per-point neighbour ids with at least 21 columns. Column 0 is
            never read (presumably the point itself - confirm with caller).
        calc_size: Maximum number of randomly sampled points.

    Returns:
        The averaged estimate, which is also printed together with the
        per-size values and the number of points with a non-positive sum.

    Raises:
        IndexError: If a neighbour row has fewer than ``edge + 1`` entries.
    """
    edges = [10, 15, 20]
    dims = []
    perm = np.random.permutation(x.shape[0])
    sample = perm[:calc_size]
    # Average over the points actually sampled; dividing by calc_size
    # understated the result whenever x had fewer rows than calc_size.
    n_samples = max(len(sample), 1)
    problems = 0
    for edge in edges:
        d_low = 0
        for i in sample:
            row = np.asarray(knn[i])
            if row.shape[0] <= edge:
                raise IndexError(
                    "knn row %d has %d entries, need at least %d"
                    % (i, row.shape[0], edge + 1))
            # Distances to neighbours 1..edge in one vectorised step.
            dists = np.sqrt(np.sum((x[row[1:edge + 1]] - x[i]) ** 2, axis=1))
            dist_k = dists[-1]
            closer = dists[:-1]
            closer = closer[closer > 0]
            d_low_cur = np.sum(np.log(dist_k / closer))
            if d_low_cur > 0:
                d_low += (edge - 1) / d_low_cur
            else:
                problems += 1

        d_low /= n_samples
        dims.append(d_low)

    d_low = np.mean(dims)
    print(dims)
    print("Intrinsic dimension is  %.3f" % (d_low))
    print("problems  %d" % (problems))
    return d_low


class Normalize(nn.Module):
    """Module that rescales its input to unit L2 norm along dimension 1."""

    def forward(self, x):
        unit = F.normalize(x, p=2.0, dim=1)
        return unit


def repeat(l, r):
    """Return a list in which every element of ``l`` appears ``r`` times in a row."""
    return [item for item in l for _ in range(r)]


def pairwise_NNs_inner(x):
    """Return, for each row of ``x``, the index of the other row with the
    largest inner product.

    The diagonal of the Gram matrix is overwritten with -1 so a row is not
    matched with itself.
    """
    gram = torch.mm(x, x.t())
    gram.fill_diagonal_(-1)
    # Largest inner product; for unit-norm rows that is the nearest row.
    return torch.max(gram, 1).indices