# TATIA/genProba.py at master · ThomasPortet/TATIA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Output format:
# <word> <number of entries that follow>
# <next word> <number of occurrences>
# ...
# The start of a title is written _start_
# The end of a title is written _end_

import argparse
import nltk
from nltk.tokenize import RegexpTokenizer

# Tokenizer that extracts runs of word characters (\w+) from a line.
tokenizer = RegexpTokenizer(r'\w+')

# startwords:  first word of a line -> number of lines starting with it.
# middlewords: word -> {following word (or "_end_") -> number of occurrences}.
startwords = dict()
middlewords = dict()

# Command line: two positional arguments, the input path and the output path.
parser = argparse.ArgumentParser(
	description='Parse and preprocess data for the program.',
)
parser.add_argument('input', help='input file')
parser.add_argument('output', help='output file')
args = parser.parse_args()

def addStart(wd):
	"""Count one more line that begins with the word `wd`.

	Updates the module-level `startwords` table in place.
	"""
	if wd in startwords:
		startwords[wd] += 1
	else:
		startwords[wd] = 1

def addBigram(wd1, wd2):
	"""Count one more occurrence of `wd2` directly following `wd1`.

	Updates the module-level `middlewords` table in place, creating the
	inner follower table for `wd1` the first time it is seen.
	"""
	followers = middlewords.setdefault(wd1, {})
	followers[wd2] = followers.get(wd2, 0) + 1

def addEnd(wd):
	"""Record that a line ended on the word `wd`.

	The end of a line is stored as a bigram whose second element is the
	"_end_" sentinel.
	"""
	end_marker = "_end_"
	addBigram(wd, end_marker)

#Parse
# Read the input one line at a time and accumulate the start-word, bigram
# and end-word counts.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default applies -- confirm this matches how the input file was produced.
with open(args.input, "r") as infile:
	for line in infile:
		line = line.rstrip()
		# After rstrip() a whitespace-only line is the empty string, for which
		# str.isspace() is False; test for emptiness instead so the skip
		# actually triggers.
		if not line:
			continue
		words = tokenizer.tokenize(line)
		# Lines made only of non-word characters (e.g. punctuation) yield no
		# tokens and contribute nothing.
		if not words:
			continue
		addStart(words[0])
		addEnd(words[-1])
		# Count every pair of consecutive words on the line.
		for first, second in nltk.bigrams(words):
			addBigram(first, second)

#Save formatted data
with open(args.output, "w") as out:
	# Start-word section: a "_start_" header carrying the number of distinct
	# start words, followed by one "<word> <count>" line per start word.
	out.write('_start_ ' + str(len(startwords)) + '\n')
	out.writelines(
		word + ' ' + str(count) + '\n'
		for word, count in startwords.items()
	)
	# One section per word: a header carrying the number of distinct
	# followers, followed by one "<follower> <count>" line per follower.
	for head, followers in middlewords.items():
		out.write(head + ' ' + str(len(followers)) + '\n')
		out.writelines(
			nxt + ' ' + str(count) + '\n'
			for nxt, count in followers.items()
		)