-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMarkov_chain.cpp
More file actions
119 lines (111 loc) · 3.74 KB
/
Copy pathMarkov_chain.cpp
File metadata and controls
119 lines (111 loc) · 3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// - - - - - - - - - - - - - - - - - - - -
// Name : Jeremy Choo, Siyuan Liu
// ID : 1602380, 1589879
// CMPUT 275, Winter 2019
//
// project: Text Editor
// - - - - - - - - - - - - - - - - - - - -
#include "Markov_chain.h"
using namespace std;
void Markov::create(string file, unsigned int keyLen) {
/*
This function will create the whole Markov chain by reading the training text file
with the inquired key length (number of string as a key). To do this, it will call
the function createDictionary after reading all the text in the file and the file
is not empty.
Argument:
file: the training text file name
keyLen: number of string to be as a key
*/
ifstream f(file.c_str(), ios_base::in);
fileBuffer = string((istreambuf_iterator<char>(f)),
istreambuf_iterator<char>());
f.close();
if (fileBuffer.length() < 1) return;
createDictionary(keyLen);
}
string Markov::predict_next(string key) {
/*
This function weill get the maximum frequency that appear behind the two string (key)
by getting the heap of that key and get the max as the predicted next word.
Argument:
key: the two string
return:
astring of word that appear next
*/
BinaryHeap<string, int> heap = dictionary[key];
if (heap.size() == 0)
return "";
return heap.max().first;
}
string Markov::get_word(string str) {
/*
This funtion is to filter the string so that they are all lowercase and no character
other than alphabets and numbers. However, no more than one symbol is allowed and the
only symbol can only appear at the end of the string.
Agument:
str: string to be processed
return:
str new string that is processed
*/
locale loc;
string alpha;
string::size_type i = 0;
while (i < str.length()) {
alpha.push_back(tolower(str[i], loc));
i++;
}
// handle symbol (eg. . ! ?)
char last_char = str[str.length() - 1];
if (!isalnum(last_char)) {
if (last_char == '.' || last_char == '!' ||
last_char == '?' || last_char == ',')
alpha.pop_back();
}
return alpha;
}
void Markov::createDictionary(unsigned int kl) {
/*
This function will get every word by separating the text file with spaces.
Insert the new word into the dictionary if it does not exist and update the word
if it exist. Every key will have a heap and is built to mantain the maximum at the
root.
Argument:
kl: keylength of the dictionary
*/
string w1, key;
size_t wc = 0, pos, next;
next = fileBuffer.find_first_not_of(SPACE, 0);
if (next == string::npos)
return;
// get key with inquired number of word(s)
while (wc < kl) {
pos = fileBuffer.find_first_of(' ', next);
w1 = fileBuffer.substr(next, pos - next);
w1 = get_word(w1);
key += w1 + " ";
next = fileBuffer.find_first_not_of(SPACE, pos + 1);
if (next == string::npos)
return;
wc++;
}
// remove the extra space at the end
key = key.substr(0, key.size() - 1);
while (true) {
next = fileBuffer.find_first_not_of(SPACE, pos + 1);
if (next == string::npos)
return;
pos = fileBuffer.find_first_of(SPACE, next);
w1 = fileBuffer.substr(next, pos - next);
if (w1.size() < 1)
break;
w1 = get_word(w1);
// key as processed word (without upper case, number and symbol)
int index = dictionary[key].find(w1);
if (index >= dictionary[key].size())
dictionary[key].insert(w1, 1);
else
dictionary[key].update(index);
key = key.substr(key.find_first_of(SPACE) + 1) + " " + w1;
}
}