-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain_a4_extra.py
More file actions
110 lines (96 loc) · 3.79 KB
/
main_a4_extra.py
File metadata and controls
110 lines (96 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""The idea of tagging the universal tag is very simple and similar to the one before."""
"""I only add one new function to it. This function is to chanage the original training"""
"""tagged sample to univseral tag. And then, train the model with new tag and predict the"""
"""outcome with new tags as well. In addition, the testing sample was changed as well."""
import nltk
import requests
import sys
from nltk.corpus import brown
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
_BROWN_TO_UNIVERSAL_URL = (
    'https://raw.githubusercontent.com/slavpetrov/universal-pos-tags/master/en-brown.map'
)
# Filled on first use so the mapping file is downloaded at most once per process.
_TAG_MAP_CACHE = {}


def _parse_tag_map(text):
    """Parse tab-separated ``source_tag<TAB>universal_tag`` lines into a dict."""
    mapping = {}
    for line in text.splitlines():
        fields = [field.strip() for field in line.split('\t')]
        # Skip blank or malformed lines instead of failing with IndexError.
        if len(fields) < 2 or not fields[0]:
            continue
        mapping[fields[0]] = fields[1]
    return mapping


def change_to_uni(target):
    """Convert the tags of tagged sentences to universal tags.

    The mapping is downloaded from ``_BROWN_TO_UNIVERSAL_URL`` on the first
    call and reused afterwards. Tags that have no entry in the mapping are
    kept unchanged.

    Args:
        target: iterable of sentences, each an iterable of items whose
            element 0 is the word and element 1 is its tag.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples carrying
        the mapped tag.

    Raises:
        requests.RequestException: if the mapping cannot be downloaded
            (network failure, timeout, or a non-success HTTP status).
    """
    if not _TAG_MAP_CACHE:
        # A timeout keeps the script from hanging forever on a dead
        # connection; raise_for_status stops an HTTP error page from being
        # parsed as if it were the mapping file.
        response = requests.get(_BROWN_TO_UNIVERSAL_URL, timeout=30)
        response.raise_for_status()
        _TAG_MAP_CACHE.update(_parse_tag_map(response.text))

    tag_dic = _TAG_MAP_CACHE
    return [
        [(group[0], tag_dic.get(group[1], group[1])) for group in sent]
        for sent in target
    ]
"""Descision Tree Method"""
def get_word_feature(sent, word_pos):
    """Build the feature dictionary for one word of a sentence.

    Args:
        sent: sequence of word strings making up the sentence.
        word_pos: index into ``sent`` of the word to describe.

    Returns:
        A dict of features: the word itself, position flags, casing flags,
        prefixes and suffixes of up to three characters, the neighbouring
        words ('' at a sentence boundary), and hyphen / digit indicators.
    """
    word = sent[word_pos]
    is_first = word_pos == 0
    is_last = word_pos == len(sent) - 1
    # Slices instead of word[0] / word[-1]: identical for non-empty words,
    # but an empty token yields '' rather than raising IndexError.
    first_char = word[:1]
    return {
        'word': word,
        'first': is_first,
        'last': is_last,
        'capitalized': first_char.upper() == first_char,
        'all_caps': word.upper() == word,
        'all_lower': word.lower() == word,
        'first_1': first_char,
        'first_2': word[:2],
        'first_3': word[:3],
        'last_1': word[-1:],
        'last_2': word[-2:],
        'last_3': word[-3:],
        'prev_word': '' if is_first else sent[word_pos - 1],
        'next_word': '' if is_last else sent[word_pos + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:],
    }
def get_words(tagged_sent):
    """Return the words of a tagged sentence, dropping the tags.

    ``tagged_sent`` is an iterable of ``(word, tag)`` pairs; the result is a
    list of the words in their original order.
    """
    words = []
    for word, _tag in tagged_sent:
        words.append(word)
    return words
def to_dataset(tagged_sent):
    """Turn tagged sentences into parallel feature and label lists.

    Args:
        tagged_sent: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        A tuple ``(x, y)`` where ``x`` holds one feature dict per word (from
        ``get_word_feature``) and ``y`` holds the matching tags, both in
        corpus order.
    """
    x = []
    y = []
    for sent in tagged_sent:
        # Extract the word list once per sentence rather than once per word.
        words = get_words(sent)
        for i, (_, tag) in enumerate(sent):
            x.append(get_word_feature(words, i))
            y.append(tag)
    return x, y
def get_trained_model(x_train, y_train):
    """Fit and return the tagging pipeline.

    The pipeline vectorizes the feature dicts in ``x_train`` with a sparse
    ``DictVectorizer`` and feeds them to an entropy-criterion
    ``DecisionTreeClassifier`` trained against the labels in ``y_train``.
    """
    vectorizer = DictVectorizer(sparse=True)
    tree = DecisionTreeClassifier(criterion='entropy')
    steps = [
        ('vectorize', vectorizer),
        ('classifier', tree),
    ]
    # Pipeline.fit returns the fitted pipeline itself.
    return Pipeline(steps).fit(x_train, y_train)
def get_pos_tag(model, word_list):
    """Tag every word of ``word_list`` with ``model``.

    Builds one feature dict per word, asks ``model.predict`` for the tags,
    and returns a list of ``(word, predicted_tag)`` tuples in sentence order.
    """
    features = []
    for position in range(len(word_list)):
        features.append(get_word_feature(word_list, position))
    predicted = model.predict(features)
    return [(word, tag) for word, tag in zip(word_list, predicted)]
if __name__ == "__main__":
    # Command-line usage:
    #   train                -> train the tagger on the Brown 'news' category
    #   run "<sentence>"     -> train, then print (word, tag) pairs for the sentence
    #   test <category>      -> train, then print accuracy on that Brown category
    cli_args = sys.argv[1:]
    method = cli_args[0] if cli_args else ''

    wants_train = len(cli_args) == 1 and 'train' in method
    wants_run = len(cli_args) == 2 and 'run' in method
    wants_test = len(cli_args) == 2 and not wants_run and 'test' in method

    # split() without a separator collapses repeated whitespace, so doubled
    # spaces in the sentence do not produce empty tokens.
    sents = cli_args[1].split() if wants_run else []

    # Validate before doing any expensive work (corpus load, mapping
    # download, training) so bad input is reported immediately.
    if not (wants_train or wants_run or wants_test) or (wants_run and not sents):
        print('Please check the input')
    else:
        news = change_to_uni(brown.tagged_sents(categories='news'))
        train_x, train_y = to_dataset(news)
        # NOTE: the trained model is not saved anywhere, so 'train' on its
        # own only checks that training completes.
        model1 = get_trained_model(train_x, train_y)

        if wants_run:
            print(get_pos_tag(model1, sents))
        elif wants_test:
            data_tag = change_to_uni(brown.tagged_sents(categories=cli_args[1]))
            test_x, test_y = to_dataset(data_tag)
            print(model1.score(test_x, test_y))