-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatrixgenerator.py
More file actions
124 lines (115 loc) · 3.49 KB
/
matrixgenerator.py
File metadata and controls
124 lines (115 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from collections import Counter
from numpy import *
import re
import numpy
from tempfile import mkdtemp
import os.path as path
# Path of the file that backs the memory-mapped matrix created in main().
hello = path.join("./", 'xmas.dat')
def openfile_and_replace(filename):
    """Read *filename* and return its text with separators turned into spaces.

    Periods, commas, newlines and the literal token ``&comma`` are each
    replaced by one space, then every run of spaces is collapsed to a single
    space.  Other punctuation (';', ':', '!', '?') is left in place.

    Args:
        filename: Path of the text file to read.

    Returns:
        The normalised file contents as one string.
    """
    # Read-only access is sufficient, and the context manager closes the
    # handle even when read() raises.
    with open(filename, "r") as fh:
        text = fh.read()
    for separator in (".", ",", "\n", "&comma"):
        text = text.replace(separator, " ")
    # re.sub returns a new string; it must be kept for the collapsing of
    # repeated spaces to take effect.
    return re.sub(' +', ' ', text)
def openfile_and_split(filename):
    """Read *filename* and split its text into pieces at every period.

    Commas, newlines and the literal token ``&comma`` are each replaced by
    one space and runs of spaces are collapsed to a single space before the
    text is split on '.'.  Other punctuation (';', ':', '!', '?') is left in
    place.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, one per '.'-delimited piece.  The last element is
        whatever follows the final period (an empty string when the text
        ends with a period).
    """
    # Read-only access is sufficient, and the context manager closes the
    # handle even when read() raises.
    with open(filename, "r") as fh:
        text = fh.read()
    for separator in (",", "\n", "&comma"):
        text = text.replace(separator, " ")
    # re.sub returns a new string; it must be kept for the collapsing of
    # repeated spaces to take effect.
    text = re.sub(' +', ' ', text)
    return text.split('.')
def getwordbins(words):
    """Count how many times each item occurs in *words*.

    Args:
        words: Iterable of hashable items (typically word strings).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its number
        of occurrences.
    """
    # Counter does the tallying itself.  iter() makes sure the argument is
    # always consumed element by element, exactly like an explicit loop
    # (a mapping is counted by its keys rather than merged as counts).
    return Counter(iter(words))
def main(filename):
    """Build and print the word co-occurrence matrix for the text in *filename*.

    The vocabulary is every distinct non-empty word of the file, in the order
    ``Counter.most_common()`` returns them.  Words are separated by spaces,
    commas, newlines, the literal token ``&comma``, periods and question
    marks; periods and question marks additionally end a sentence.

    For each terminated sentence, and each pair of distinct words ``a`` and
    ``b`` occurring in it ``ca`` and ``cb`` times, the cells ``[a, b]`` and
    ``[b, a]`` grow by ``ca * cb``; the diagonal cell ``[a, a]`` grows by
    ``ca * (ca - 1)``.  Text after the last sentence terminator contributes
    to the vocabulary but not to the counts.

    The matrix is a ``numpy.memmap`` backed by the module-level path
    ``hello``, which is created or overwritten.  When the file contains no
    words an in-memory 0x0 array is used instead and ``hello`` is not
    touched.

    Args:
        filename: Path of the text file to analyse.

    Returns:
        The square integer matrix that was printed.
    """
    # ---- Pass 1: vocabulary ------------------------------------------
    # openfile_and_replace already turns '.', ',', '\n' and '&comma' into
    # spaces.  '?' ends a sentence just like '.', so it has to separate
    # words here as well; mapping it to '.' would leave tokens such as
    # "why." in the vocabulary that never match the per-sentence words.
    txt = openfile_and_replace(filename).replace("?", " ")
    bins = getwordbins([word for word in txt.split(' ') if word])
    vocabulary = [word for word, _count in bins.most_common()]
    # word -> row/column index; a dict lookup replaces the linear
    # list.index() scan that used to run inside the pair loop.
    position = {word: row for row, word in enumerate(vocabulary)}
    size = len(vocabulary)

    if size:
        x = numpy.memmap(hello, dtype='int', mode='w+', shape=(size, size))
    else:
        # A zero-byte file cannot be memory-mapped on many NumPy releases,
        # so an empty vocabulary gets a plain empty array.
        x = numpy.zeros((0, 0), dtype='int')

    # ---- Pass 2: per-sentence co-occurrence counts --------------------
    # Read the file that was asked for (not a hard-coded path) and tokenise
    # it with the same separators as pass 1, so every sentence word is
    # guaranteed to be in the vocabulary.
    with open(filename) as f:
        raw = f.read()
    for separator in (",", "\n", "&comma"):
        raw = raw.replace(separator, " ")
    # Split on both terminators.  The final piece is the text after the last
    # terminator, i.e. an unfinished sentence, which is not counted.
    sentences = raw.replace("?", ".").split(".")[:-1]

    for sentence in sentences:
        counter = getwordbins([word for word in sentence.split(' ') if word])
        entries = [(position[word], freq) for word, freq in counter.items()]
        for offset, (ai, freq1) in enumerate(entries):
            # A word paired with its other occurrences in the same sentence.
            x[ai, ai] += freq1 * (freq1 - 1)
            for aj, freq2 in entries[offset + 1:]:
                product = freq1 * freq2
                # The matrix is symmetric: update both halves.
                x[ai, aj] += product
                x[aj, ai] += product

    # Single-argument print() gives the same output on Python 2 and 3.
    print(x)
    return x
# Script entry point: this call runs unconditionally whenever the module is
# loaded (there is no ``__main__`` guard), processing
# 'input_matrixgenerator.txt' resolved against the current working directory.
# NOTE(review): importing this module therefore also triggers the run --
# confirm that is intended before adding a guard.
main('input_matrixgenerator.txt')