-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsolution.py
More file actions
98 lines (80 loc) · 3.7 KB
/
solution.py
File metadata and controls
98 lines (80 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import recordlinkage as rl;
from os.path import join;
from recordlinkage.base import BaseCompareFeature;
import pandas as pd;
import spacy;
import Levenshtein as lev;
from functools import lru_cache
import sys
from time import time
_nlp = spacy.load("en_core_web_sm")
# for progress bar loading
total = -1
start_time = 0
@lru_cache(3000)
def nlp(s):
return _nlp(s)
def jaccard_similarity(x, y):
x = set(x.lower().split())
y = set(y.lower().split())
return len(x.intersection(y)) / max(len(x), len(y))
def levenshtein_distance(x, y):
x = x.lower()
y = y.lower()
return lev.distance(x, y)
class NLP_POS_Comparator(BaseCompareFeature):
def __inner_compute_vectorized(self, s1, s2):
self.counter += 1
sys.stdout.write("\rCompleted %3.2f%% [%8s/%8s] in %d seconds." % (self.counter/total*100, self.counter, total, time() - start_time))
sys.stdout.flush()
try:
document1 = nlp(s1)
document2 = nlp(s2)
final_score = 0
for ent1 in set(document1.ents):
for ent2 in set(document2.ents):
score = max((int(0.32 * len(ent1.text + ent2.text)) - levenshtein_distance(ent1.text, ent2.text)) / int(
0.32 * len(ent1.text + ent2.text)), 0) + jaccard_similarity(ent1.text, ent2.text)
if score > 0.65 and ent1.label_ == ent2.label__:
final_score+=3.0
final_score += score
for token1 in document1:
for token2 in document2:
score = max((int(0.32*len(token1.text+token2.text))-levenshtein_distance(token1.text, token2.text))/int(0.32*len(token1.text+token2.text)), 0) + jaccard_similarity(token1.text, token2.text)
if score > 0.9 and token1.pos_ == token2.pos_:
final_score += 2.0
final_score += score
return final_score
except:
return 0
def _compute_vectorized(self, s1, s2):
self.counter = 0
return pd.DataFrame(data = {'l': s1, 'r': s2}).apply(lambda p: self.__inner_compute_vectorized(p['l'], p['r']), axis=1)
def main():
# load in dataset
global total, start_time
ltable = pd.read_csv(join('data', "ltable.csv"));
rtable = pd.read_csv(join('data', "rtable.csv"));
indexer = rl.Index();
indexer.sortedneighbourhood(left_on='brand', right_on='brand');
blocked = indexer.index(ltable, rtable);
total = len(blocked)
print("Total number of tuples to process: %s tuples" % total);
compare = rl.Compare();
compare.add(NLP_POS_Comparator('title', 'title', label='custom'))
start_time = time()
features = compare.compute(blocked, ltable, rtable);
print()
features.sum(axis=1).value_counts().sort_index(ascending=False);
potential_matches = features.reset_index()
potential_matches = pd.merge(potential_matches, ltable, how="left", left_on="level_0", right_on='id');
potential_matches = pd.merge(potential_matches, rtable, how="left", left_on="level_1", right_on='id');
train = pd.read_csv('data/train.csv')
potential_matches.merge(train, how="outer", left_on=["level_0", "level_1"], right_on=["ltable_id", "rtable_id"], indicator=True).query("_merge=='left_only'")
potential_matches[potential_matches['custom'] > 20][['level_0', 'level_1', 'custom']].sort_values(by=['custom'],
ascending=False)[
['level_0', 'level_1']].rename(
{'level_0': 'ltable_id', 'level_1': 'rtable_id'}, inplace = False).head(
650).to_csv('data/output.csv', index=False)
if __name__ == '__main__':
main();