-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
84 lines (64 loc) · 3.13 KB
/
utils.py
File metadata and controls
84 lines (64 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import numpy as np
import torch
from tqdm import tqdm
def retrieve_topk(query_b, doc_b, topK, batch_size=100):
    """Retrieve, for every query code, the nearest documents by Hamming distance.

    Streams over the document codes in batches, keeping a running buffer of
    ``topK + batch_size`` best candidates per query: each batch's distances are
    written into the tail of the buffer, then the buffer is re-sorted ascending.

    Args:
        query_b: (n_test, n_bits) binary codes for the queries.
        doc_b: (n_train, n_bits) binary codes for the documents.
        topK: number of neighbours the caller ultimately wants.
        batch_size: documents scored per iteration.

    Returns:
        LongTensor of shape (n_test, topK + batch_size) on the same device as
        the inputs, holding document indices sorted by increasing Hamming
        distance; callers slice ``[:, :topK]``.
    """
    n_bits = doc_b.size(1)
    n_train = doc_b.size(0)
    n_test = query_b.size(0)
    device = doc_b.device  # follow the inputs instead of hard-coding CUDA

    # Sentinel distance n_bits + 1 is worse than any real Hamming distance.
    # Use int64, not uint8: a byte buffer silently overflows once n_bits >= 255.
    topScores = torch.full((n_test, topK + batch_size), n_bits + 1,
                           dtype=torch.long, device=device)
    topIndices = torch.zeros(n_test, topK + batch_size,
                             dtype=torch.long, device=device)

    testBinmat = query_b.unsqueeze(2)  # (n_test, n_bits, 1)

    try:
        # Progress bar is cosmetic; fall back to a plain range without tqdm.
        from tqdm import tqdm as _progress
        batches = _progress(range(0, n_train, batch_size), ncols=0, leave=False)
    except ImportError:
        batches = range(0, n_train, batch_size)

    for s_idx in batches:
        e_idx = min(s_idx + batch_size, n_train)
        numCandidates = e_idx - s_idx

        # (1, n_bits, numCandidates) broadcast against every query.
        trainBinmat = doc_b[s_idx:e_idx].unsqueeze(0).permute(0, 2, 1)
        trainBinmat = trainBinmat.expand(n_test, n_bits, numCandidates)
        testBinmatExpand = testBinmat.expand_as(trainBinmat)

        # Hamming distance = popcount of XOR along the bit dimension.
        scores = (trainBinmat ^ testBinmatExpand).sum(dim=1).long()
        indices = torch.arange(s_idx, e_idx, dtype=torch.long, device=device)
        indices = indices.unsqueeze(0).expand(n_test, numCandidates)

        # Overwrite the worst tail of the buffer, then restore sorted order.
        topScores[:, -numCandidates:] = scores
        topIndices[:, -numCandidates:] = indices
        topScores, newIndices = topScores.sort(dim=1)
        topIndices = torch.gather(topIndices, 1, newIndices)
    return topIndices
def compute_precision_at_k(retrieved_indices, query_labels, doc_labels, topK, is_single_label):
    """Compute mean precision@K over all queries.

    Args:
        retrieved_indices: (n_test, >=topK) LongTensor of retrieved doc indices,
            best first (e.g. the output of ``retrieve_topk``).
        query_labels: (n_test,) class ids when ``is_single_label`` is True,
            otherwise (n_test, n_classes) multi-hot label vectors.
        doc_labels: labels of the corpus, shaped like ``query_labels`` but with
            n_train rows.
        topK: cutoff K.
        is_single_label: selects exact-match vs. any-shared-label relevance.

    Returns:
        Scalar FloatTensor: mean over queries of (#relevant in top K) / K.
    """
    n_test = query_labels.size(0)
    topIdx = retrieved_indices[:, :topK]

    # Labels of the retrieved docs: (n_test, topK) or (n_test, topK, n_classes).
    topTrainLabels = doc_labels[topIdx]

    if is_single_label:
        test_labels = query_labels.unsqueeze(1).expand(n_test, topK)
        relevances = (test_labels == topTrainLabels)
    else:
        # Multi-label: relevant iff the query and doc share at least one label.
        topTrainLabels = topTrainLabels.short()
        test_labels = query_labels.unsqueeze(1).expand(
            n_test, topK, topTrainLabels.size(-1)).short()
        relevances = (topTrainLabels & test_labels).sum(dim=2) > 0

    true_positive = relevances.sum(dim=1).float()
    # BUG FIX: the divisor was hard-coded to 100; precision@K divides by K
    # (identical result only when topK == 100).
    prec_at_k = (true_positive / topK).mean()
    return prec_at_k
class TopDoc(object):
    """Lookup table mapping a document id to its pre-computed ranked id list.

    The backing file has one record per line: ``<docId>:<id1>,<id2>,...``.
    In training mode the first retrieved id is dropped — presumably it is the
    document itself (TODO confirm against the file producer).
    """

    def __init__(self, data_fn, is_train=False):
        # data_fn: path to the ranking file; is_train: drop the first hit.
        self.data_fn = data_fn
        self.is_train = is_train
        self.db = self.load(data_fn, is_train)

    def load(self, fn, is_train):
        """Parse ``fn`` into a dict {docId: [ranked ids]}.

        Robustness fix: blank lines (e.g. a trailing newline at EOF) are now
        skipped instead of raising ValueError, and only the first ':' splits
        the record so stray colons in the tail cannot break unpacking.
        """
        db = {}
        with open(fn) as in_data:
            for line in in_data:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                first, rest = line.split(':', 1)
                topk = list(map(int, rest.split(',')))
                docId = int(first)
                db[docId] = topk[1:] if is_train else topk
        return db

    def getTopK(self, docId, topK):
        """Return the first ``topK`` ranked ids for ``docId`` (KeyError if absent)."""
        return self.db[docId][:topK]