forked from Moonshile/ChineseWordSegmentation
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhashtree.py
More file actions
104 lines (85 loc) · 3.06 KB
/
hashtree.py
File metadata and controls
104 lines (85 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#coding=utf-8
"""
A simple implementation of Hash Tree
Author: 段凯强
"""
class HashTreeNode(object):
def __init__(self, name=''):
self.val = 0
self.name = name
self.level = 0
self.children = {}
def addBag(self, bag):
"""
Note that bag must be sorted
"""
if bag:
node = self.children.get(bag[0], HashTreeNode(name=bag[0]))
node.addBag(bag[1:])
self.children[bag[0]] = node
self.level = len(bag)
def count(self, transaction):
"""
count the child who matches bag, suppose that current node matches
"""
if self.level == 0:
self.val += 1
elif self.level == 1:
for t in transaction:
if t in self.children: self.children[t].val += 1
else:
for i in range(0, len(transaction)):
t = transaction[i]
if t in self.children:
self.children[t].count(transaction[i:])
def get(self, theta):
return map(lambda items: map(lambda c: c.name, items), self.getNodes(theta))
"""
if self.level == 0:
return [[self.name]] if self.val >= theta else None
else:
children_res = [self.children[i].get(theta) for i in sorted(self.children.keys())]
total = reduce(lambda res, x: res + x, filter(lambda x: x, children_res), [])
return map(lambda c: [self.name] + c, total)
"""
def getNodes(self, theta):
if self.level == 0:
return [[self]] if self.val >= theta else None
else:
children_res = [self.children[i].getNodes(theta) for i in sorted(self.children.keys())]
total = reduce(lambda res, x: res + x, filter(lambda x: x, children_res), [])
return map(lambda c: [self] + c, total)
def __str__(self):
return '(%s : %s)'%(self.name, '; '.join([str(i) for i in self.children.values()]))
def sameNode(node1, node2):
return node1.name == node2.name
def sameNodes(nodes1, nodes2):
func = lambda n: n.name
return map(func, nodes1) == map(func, nodes2)
class HashTree(object):
"""
Note that all bags must be sorted
"""
def __init__(self, bags):
self.root = HashTreeNode()
self.root.val = 0
for b in bags:
if b: self.root.addBag(b)
def count(self, transactions):
for t in transactions: self.root.count(t)
def get(self, theta):
res = map(lambda c: c[1:], self.root.get(theta))
return [] if res == [[]] else res
def getNodes(self, theta):
res = map(lambda c: c[1:], self.root.getNodes(theta))
return [] if res == [[]] else res
def __str__(self):
return str(self.root)
if __name__ == '__main__':
to_count = [[1,2], [2,4], [1,3], [1,5], [3,4], [2,7], [6,8]]
tree = HashTree(to_count)
transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7]]
tree.count(transactions)
print 'Frequency with transactions', transactions
print tree.get(2)
print tree.get(1)