-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.py
More file actions
60 lines (52 loc) · 2.13 KB
/
models.py
File metadata and controls
60 lines (52 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
import itertools
# A "word" starts with a word character and continues with further word
# characters, optionally joined by single hyphens (e.g. "well-known").  A
# trailing or doubled hyphen is not part of the word.
WORD_PATTERN = r'\w(\w|-\w)*'
word_re = re.compile(WORD_PATTERN)

# Size, in words, of the context window built around each indexed word (the
# indexed word itself included).
CONTEXT_LENGTH = 11


class Concordance(object):
    """Index of every word occurrence in a text, with its surrounding context.

    Attributes:
        words: dict mapping each word (as matched by ``word_re``) to a list
            of spans, one per occurrence, in order of appearance.
        text: the text most recently passed to ``FromText``.

    A span is a 4-tuple of character offsets into ``text``:
    ``(context_start, w_start, w_end, context_end)``.
    """

    def __init__(self):
        self.words = {}
        self.text = ''

    def span_prefix(self, span):
        """Return the context that precedes the word described by *span*."""
        context_start, w_start, _, _ = span
        return self.text[context_start:w_start]

    def span_suffix(self, span):
        """Return the context that follows the word described by *span*."""
        _, _, w_end, context_end = span
        return self.text[w_end:context_end]

    def FromText(self, text):
        """Tokenize *text* and record a context span for every word in it.

        For each word, the context starts at the beginning of the word
        ``CONTEXT_LENGTH // 2`` positions earlier (or at offset 0 when there
        are not that many preceding words) and ends at the end of the word
        the remaining window size later (or at ``len(text)`` when there are
        not that many following words).

        ``self.text`` is replaced by *text*.  Spans are appended to
        ``self.words``; entries from earlier calls are kept, but their
        offsets refer to the text of the call that recorded them.

        Text containing no words (including the empty string) records
        nothing and returns immediately.
        """
        self.text = text
        tokens = [(m.group(), m.start(), m.end())
                  for m in word_re.finditer(text)]

        # How many words of context sit before / after the indexed word
        # inside a window of CONTEXT_LENGTH words.
        before = CONTEXT_LENGTH // 2
        after = CONTEXT_LENGTH - 1 - before

        text_end = len(text)
        last = len(tokens) - 1
        for i, (word, w_start, w_end) in enumerate(tokens):
            # Near either edge of the text the window is clamped to the
            # text boundaries instead of being padded with sentinel words,
            # so the loop is bounded by the number of real tokens.
            if i >= before:
                context_start = tokens[i - before][1]
            else:
                context_start = 0
            if i + after <= last:
                context_end = tokens[i + after][2]
            else:
                context_end = text_end
            self.words.setdefault(word, []).append(
                (context_start, w_start, w_end, context_end))