forked from zyocum/dedup
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlist_equivalence_classes.py
More file actions
executable file
·70 lines (57 loc) · 2.25 KB
/
list_equivalence_classes.py
File metadata and controls
executable file
·70 lines (57 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
'''Produce a TSV of equivalence classes given a TSV of duplicates produced by dedup.py.'''
import csv
import sys
def main(filename, threshold, display_first_marker):
    '''Write the equivalence classes found in a duplicates TSV to stdout.

    Each pair of documents yielded by filter_rows() for the given threshold
    is treated as an "is equivalent to" link, and documents connected through
    any chain of such links end up in the same class.

    Output is tab-separated, one row per document: (class index, document),
    or (marker, class index, document) when display_first_marker is true,
    where the marker is '*' for the first row of each class and '' otherwise.
    Classes are numbered in the order they are first seen in the input and
    the documents within a class are written in sorted order.

    filename: path to the duplicates TSV file
    threshold: passed through to filter_rows() to select the pairs to link
    display_first_marker: whether to prepend the marker column
    '''
    # Maps every document seen so far to the (shared) set holding its class.
    equivalents = {}
    # newline='' is what the csv module asks for when reading from a file.
    with open(filename, mode='r', newline='') as f:
        for first, second in filter_rows(csv.reader(f, delimiter='\t'), threshold):
            first_class = equivalents.setdefault(first, {first})
            second_class = equivalents.setdefault(second, {second})
            if first_class is second_class:
                continue
            # The pair links two distinct classes: merge them, otherwise the
            # members of one class would keep pointing at a stale set and the
            # same documents would be reported in several classes.
            if len(first_class) < len(second_class):
                # Re-point the smaller class; less work, same result.
                first_class, second_class = second_class, first_class
            first_class |= second_class
            for document in second_class:
                equivalents[document] = first_class

    # Every member of a class maps to the same set object, so keying on
    # identity keeps one entry per class, in first-seen order.
    classes = {id(members): members for members in equivalents.values()}

    writer = csv.writer(sys.stdout, dialect=csv.excel_tab)
    for i, members in enumerate(classes.values()):
        # Sorting makes the "first" (marked) document reproducible.
        for j, document in enumerate(sorted(members)):
            if display_first_marker:
                row = ('' if j > 0 else '*', i, document)
            else:
                row = (i, document)
            writer.writerow(row)
def filter_rows(rows, threshold):
    '''Yield the (a, b) pairs of the rows whose difference is within threshold.

    rows: iterable of 3-item rows (a, b, bitwise_difference), where
        bitwise_difference is anything int() accepts
    threshold: a row is kept when int(bitwise_difference) <= threshold

    Empty rows are skipped. A non-empty row that does not have exactly three
    items raises ValueError, as does a difference that int() cannot parse.
    '''
    for row in rows:
        if not row:
            # csv.reader yields an empty list for a blank line; a stray
            # blank (e.g. trailing) line should not abort the whole run.
            continue
        a, b, bitwise_difference = row
        if int(bitwise_difference) <= threshold:
            yield (a, b)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__
    )
    parser.add_argument(
        'input',
        help='path to duplicates TSV file'
    )
    # Pairs are kept when their difference is <= this value, so it is an
    # upper bound (the default of 0 keeps only pairs with no difference).
    parser.add_argument(
        '-t',
        '--threshold',
        type=int,
        default=0,
        help='maximum bitwise difference threshold for considering two LSHs equivalent'
    )
    parser.add_argument(
        '-d',
        '--display-first-marker',
        action='store_true',
        help='mark the first entry in each equivalence class with an asterisk in the first column'
    )
    args = parser.parse_args()
    main(args.input, args.threshold, args.display_first_marker)