-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathBM25.cpp
More file actions
59 lines (51 loc) · 1.61 KB
/
BM25.cpp
File metadata and controls
59 lines (51 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#include "./header/BM25.hpp"

#include <cmath>
// Average document length, defined at file scope rather than as a class member.
// Having static storage duration it starts at 0.0; calc_avg_length() assigns it
// and score() reads it, so every BM25 instance in the program shares one value.
// NOTE(review): this assumes BM25.hpp does not also declare a member named
// avg_length (which would shadow this global inside the member functions) — confirm.
double avg_length;
/// Builds the scorer's corpus statistics.
///
/// @param queries  Queries whose tokens get a document-frequency entry.
/// @param docs     Document collection the statistics are computed over.
BM25::BM25(const std::vector<Text>& queries, const std::vector<Document>& docs) {
    // First the average document length, then the per-token document counts.
    this->calc_avg_length(docs);
    this->init_fqi_all(queries, docs);
}
void BM25::calc_avg_length(const std::vector<Document>& docs) {
avg_length = 0.0;
for (const auto& doc : docs) {
avg_length += doc.get_length();
}
avg_length /= docs.size();
}
/// Fills fqi_all with, for every token appearing in `queries`, the number of
/// documents in `docs` whose token table contains that token.
///
/// Tokens that occur in no document get no entry at all; idf() treats a
/// missing entry as a count of zero.
///
/// @param queries  Queries supplying the tokens to count.
/// @param docs     Documents that are searched for each token.
void BM25::init_fqi_all(const std::vector<Text>& queries, const std::vector<Document>& docs) {
    for (const auto& query : queries) {
        for (const auto& entry : query.tokens) {
            const auto& term = entry.first;

            // Already counted while handling an earlier query token.
            if (fqi_all.find(term) != fqi_all.end()) {
                continue;
            }

            // Count the documents that contain the term.
            size_t containing_docs = 0;
            for (const auto& doc : docs) {
                if (doc.tokens.find(term) != doc.tokens.end()) {
                    ++containing_docs;
                }
            }

            // Only create an entry when at least one document matched.
            if (containing_docs > 0) {
                fqi_all[term] = containing_docs;
            }
        }
    }
}
/// Inverse document frequency of `tok`:
///   log(1 + (N - n + 0.5) / (n + 0.5))
/// where N is `n_of_docs` and n is the count stored for `tok` in fqi_all
/// (0 when the token has no entry).
///
/// @param tok        Token to look up.
/// @param n_of_docs  Total number of documents, N.
/// @return The idf weight as a double.
double BM25::idf(const std::string& tok, const size_t& n_of_docs) const {
    const auto entry = fqi_all.find(tok);

    // Work in double from the start: the unsigned expression N - n wraps
    // around to a huge value whenever n exceeds the N supplied by the caller.
    const double containing_docs =
        (entry != fqi_all.end()) ? static_cast<double>(entry->second) : 0.0;
    const double total_docs = static_cast<double>(n_of_docs);

    return std::log(1.0 + (total_docs - containing_docs + 0.5) / (containing_docs + 0.5));
}
/// Looks up `tok` in the document's token table.
///
/// @param tok  Token to look up.
/// @param doc  Document whose `tokens` table is searched.
/// @return The value stored for `tok`, or 0 when the token is absent.
size_t BM25::fqi(const std::string& tok, const Document& doc) const {
    const auto found = doc.tokens.find(tok);
    if (found == doc.tokens.end()) {
        return 0;
    }
    return found->second;
}
/// BM25 score of `doc` for `query`: the sum over the query's tokens of
///   idf(tok) * f * (k + 1) / (f + k * (1 - b + b * len / avg_length))
/// where f is the token's frequency in `doc` and len is doc.get_length().
///
/// @param doc        Document being scored.
/// @param query      Query whose tokens are summed over.
/// @param n_of_docs  Total number of documents, forwarded to idf().
/// @return The accumulated score; 0.0 when no query token occurs in `doc`.
double BM25::score(const Document& doc, const Text& query, const size_t& n_of_docs) const {
    // The length normalisation depends only on the document, so compute it once.
    const auto length_norm = k * (1 - b + b * doc.get_length() / avg_length);

    double total = 0.0;
    for (const auto& entry : query.tokens) {
        const auto& tok = entry.first;  // bind by reference, no per-token string copy
        const auto term_freq = fqi(tok, doc);

        // A token absent from the document contributes exactly zero. Skipping it
        // saves the idf lookup and avoids 0/0 when the denominator degenerates
        // (e.g. avg_length is 0).
        if (term_freq == 0) {
            continue;
        }

        total += idf(tok, n_of_docs) * (term_freq * (k + 1)) / (term_freq + length_norm);
    }
    return total;
}