"""
The classifier and partitioner that make up the thread detection model.
"""
from copy import deepcopy

import numpy as np
from sklearn import linear_model

from featurize import Dataset, FeatureMaker

class ConvClassifier:
    """
    Train a pairwise same-conversation classifier over featurized utterance pairs.
    """

    def __init__(self):
        self.dataset = Dataset('IRC/dev/linux-dev-0X.annot', 'IRC/pilot/linux-pilot-0X.annot')
        self.featurizer = FeatureMaker(self.dataset)

    def train_from_file(self, features_path_train, features_path_dev):
        # Load tab-separated feature files; the gold label is the last column.
        self.train_data = np.loadtxt(features_path_train, delimiter='\t')
        self.dev_data = np.loadtxt(features_path_dev, delimiter='\t')
        self.train_X = self.train_data[:, :-1]
        self.train_y = self.train_data[:, -1]
        self.dev_X = self.dev_data[:, :-1]
        self.dev_y = self.dev_data[:, -1]
        self.model = linear_model.LogisticRegression(solver='newton-cg').fit(self.train_X, self.train_y)
        y_hat = self.model.predict(self.train_X)
        score = np.mean(y_hat == self.train_y)
        print('Training accuracy: {:.3f}'.format(score))
        dev_y_hat = self.model.predict(self.dev_X)
        dev_score = np.mean(dev_y_hat == self.dev_y)
        print('Dev accuracy: {:.3f}'.format(dev_score))

    def create_feature_files(self, save_train_path, save_dev_path, is_baseline):
        # Build and save feature files for the training and dev splits.
        train_pairs = self.get_pairs(self.dataset.train_set)
        self.featurize(train_pairs, save_train_path, is_baseline)
        dev_pairs = self.get_pairs(self.dataset.dev_set)
        self.featurize(dev_pairs, save_dev_path, is_baseline)

    def get_pairs_for(self, b, dset):
        # Find all eligible example pairs for a given utterance: another
        # utterance is eligible if it occurs within 50 seconds of this one.
        pairs = []
        exs = deepcopy(dset)
        exs.remove(b)
        for e in exs:
            if abs(b[1] - e[1]) < 50:
                pairs.append((b, e))
        return pairs

    def get_pairs(self, dset):
        # Collect the eligible pairs for every example in a dataset.
        pairs = []
        for b in dset:
            pairs.extend(self.get_pairs_for(b, dset))
        return pairs

    def featurize(self, pairs, save_txt, is_baseline):
        # Create a feature vector for each pair, stack them into one matrix,
        # and save it to a tab-separated file.
        rows = []
        for n, p in enumerate(pairs, start=1):
            rows.append(self.featurizer.get_feature_vector(*p, baseline=is_baseline))
            print('Featurizing {} of {}'.format(n, len(pairs)))
        arr = np.vstack(rows)
        np.savetxt(save_txt, arr, delimiter='\t', fmt='%.3f')
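

# A minimal usage sketch for the classifier ('train.tsv' and 'dev.tsv' are
# hypothetical paths, not fixed by this module). The feature files are
# tab-separated with the gold label in the last column, which is the layout
# train_from_file expects:
#
#     clf = ConvClassifier()
#     clf.create_feature_files('train.tsv', 'dev.tsv', is_baseline=False)
#     clf.train_from_file('train.tsv', 'dev.tsv')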


class Partitioner:
    """
    Partition a dataset of example-tuples into multiple conversations.
    Parameters: a reference to a trained ConvClassifier, and a bool indicating
    whether only baseline features should be used.
    """

    def __init__(self, classf, is_baseline):
        self.featurizer = classf.featurizer
        self.linear_model = classf.model
        self.test_data = classf.dataset.test_set
        self.gold_conv = self.get_gold_clusters()
        self.is_baseline = is_baseline
        # Partition the test data with the model, then score the result against the gold clusters.
        sys_convos = self.partition()
        print('Test set average overlap: {:.3f}'.format(self.conv_overlap(sys_convos)))

    def partition(self):
        # Attempt to completely segment the test set; requires the classifier's
        # trained linear model.
        convos = {}
        for idx, entry in enumerate(self.test_data, start=1):
            print('Entry {} of {}'.format(idx, len(self.test_data)))
            # If no conversations exist yet, start a new one with this entry.
            if not convos:
                convos[len(convos)] = [entry]
            # Otherwise, test the entry against the last entry of every conversation.
            else:
                # Get the positive-class probability for each conversation and pick the max;
                # if no probability exceeds 0.5, start a new conversation instead.
                votes = np.zeros(len(convos))
                for i in range(len(convos)):
                    # Slice off the trailing gold label from the feature vector.
                    pair = self.featurizer.get_feature_vector(convos[i][-1], entry, baseline=self.is_baseline)[:-1]
                    proba = self.linear_model.predict_proba(pair.reshape(1, -1))
                    votes[i] = proba[0][1]
                votes = votes - 0.5
                if np.any(votes > 0):
                    convos[np.argmax(votes)].append(entry)
                else:
                    convos[len(convos)] = [entry]
        return convos

    def get_gold_clusters(self):
        # Build a dictionary representing the "gold standard" conversation
        # clustering, keyed by each entry's annotated conversation label.
        gold_convos = {}
        for entry in self.test_data:
            if entry[0] not in gold_convos:
                gold_convos[entry[0]] = [entry]
            else:
                gold_convos[entry[0]].append(entry)
        return gold_convos

    def conv_overlap(self, sys):
        # Evaluate the model's partitioning accuracy: for each system conversation,
        # find the gold conversation with the highest overlap, then return the
        # average of those maxima over all system conversations.
        n_convos = 0
        tot_max_overlap = 0
        for sys_entry in sys.values():
            max_overlap = 0
            for gold_entry in self.gold_conv.values():
                overlap = set(sys_entry).intersection(set(gold_entry))
                overlap = len(overlap) / len(gold_entry)
                if overlap > max_overlap:
                    max_overlap = overlap
            n_convos += 1
            tot_max_overlap += max_overlap
        return tot_max_overlap / n_convos
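

# A minimal end-to-end sketch (an assumed wiring, not part of the original
# module; the feature-file paths are hypothetical):
if __name__ == '__main__':
    clf = ConvClassifier()
    clf.create_feature_files('train.tsv', 'dev.tsv', is_baseline=False)
    clf.train_from_file('train.tsv', 'dev.tsv')
    # The Partitioner segments the test set and prints its average overlap on construction.
    Partitioner(clf, is_baseline=False)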