-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcs.cpp
More file actions
103 lines (102 loc) · 2.43 KB
/
cs.cpp
File metadata and controls
103 lines (102 loc) · 2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
// Corpus-free Chinese word segmentation, implemented in C++
//author:@DGideas,@sunlanchang
using namespace std;
#include<iostream>
#include<string>
#include<cmath>
#include<vector>
#include<locale>
#include"cs.h"
// Replaces every character of `text` that also occurs in `symbols` with a
// space, so that all punctuation becomes one uniform segment separator.
static void blank_out_symbols(std::wstring &text, const std::wstring &symbols)
{
    for (std::wstring::size_type i = 0; i < text.size(); ++i)
    {
        if (symbols.find(text[i]) != std::wstring::npos)
        {
            text[i] = L' ';
        }
    }
}

// Splits `text` on spaces and returns the non-empty pieces in their original
// order. A final piece that is not followed by a space is returned as well,
// and runs of consecutive spaces do not produce empty pieces.
static std::vector<std::wstring> split_on_spaces(const std::wstring &text)
{
    std::vector<std::wstring> pieces;
    std::wstring current;
    for (std::wstring::size_type i = 0; i < text.size(); ++i)
    {
        if (text[i] != L' ')
        {
            current += text[i];
        }
        else if (!current.empty())
        {
            pieces.push_back(current);
            current.clear();
        }
    }
    if (!current.empty())
    {
        pieces.push_back(current); // text did not end with a separator
    }
    return pieces;
}

// Entry point of the segmentation demo.
//
// Takes a hard-coded sample sentence, replaces punctuation with spaces, splits
// it into segments, feeds every substring of length >= 2 of each segment to
// L_freq and every single character to freq, then prints, for each entry of
// L_freq, the term -log10(f) * f, followed by a table of word, frequency and
// that term. Command-line arguments are ignored. Always returns 0.
int main(int argc,char *argv[])
{
    (void)argc;
    (void)argv;

    // Adopt the locale configured in the environment so that wcout can
    // encode the wide (non-ASCII) characters printed below.
    setlocale(LC_ALL, "");

    std::wstring str=L"十四是十四,四十是四十。十四不是四十,四十不是十四。"; // text to segment
    const std::wstring symbol=L",。!—;【】:《》"; // punctuation treated as separators
    word_freq L_freq,freq; // frequency tables: multi-character substrings, single characters

    blank_out_symbols(str, symbol);
    const std::vector<std::wstring> segments = split_on_spaces(str);

    for (std::size_t s = 0; s < segments.size(); ++s)
    {
        const std::wstring &segment = segments[s];

        // Every contiguous substring of length >= 2 goes into L_freq.
        for (std::size_t start = 0; start < segment.size(); ++start)
        {
            std::wstring ngram;
            for (std::size_t end = start; end < segment.size(); ++end)
            {
                ngram += segment[end];
                if (ngram.size() > 1)
                {
                    L_freq.add(ngram);
                }
            }
        }

        // Single characters are counted in their own pass, outside the
        // substring loops, so each occurrence is added exactly once rather
        // than once per enclosing substring start.
        for (std::size_t k = 0; k < segment.size(); ++k)
        {
            std::wstring single_char(1, segment[k]);
            freq.add(single_char);
        }
    }

    // The length passed here still includes the separator positions, because
    // punctuation was replaced by spaces rather than removed.
    // NOTE(review): calculate() presumably turns counts into relative
    // frequencies -- confirm against cs.h.
    L_freq.calculate(str.size());

    // Per-word term -log10(f) * f, printed one per line as it is computed.
    // NOTE(review): if a frequency can be 0 this evaluates inf * 0 = NaN.
    const std::size_t word_count = L_freq.word.size();
    std::vector<double> entropy_terms;
    entropy_terms.reserve(word_count);
    for (std::size_t i = 0; i < word_count; ++i)
    {
        const double information = -std::log10(L_freq.freq[i]);
        entropy_terms.push_back(information * L_freq.freq[i]);
        std::wcout << entropy_terms[i] << L'\n';
    }

    // Summary table: word immediately followed by its frequency, a space,
    // then the term computed above.
    for (std::size_t i = 0; i < word_count; ++i)
    {
        std::wcout << L_freq.word[i];
        std::wcout << L_freq.freq[i] << L" ";
        std::wcout << entropy_terms[i] << L'\n';
    }
    return 0;
}