-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutil.py
More file actions
83 lines (72 loc) · 2.25 KB
/
util.py
File metadata and controls
83 lines (72 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from __future__ import division
import sys
import csv
import os
import re
import random
# extracts features from a given body of text
def features(text, latex=True):
    """Build a token-count dictionary for ``text``.

    Every token produced by ``tokenize(text)`` becomes a key whose value is
    the number of times that token occurs.

    When ``latex`` is true, the extra key ``'latex_symbol'`` is always
    present and counts the tokens that contain a ``%`` character. A token
    that is literally ``'latex_symbol'`` shares that same key.

    Returns the resulting ``dict``.
    """
    counts = {}
    if latex:
        # seeded before the loop so the key exists even when text has no tokens
        counts['latex_symbol'] = 0
    for word in tokenize(text):
        # bag-of-words count: first sighting starts at 0, then increments
        counts[word] = counts.get(word, 0) + 1
        # LaTeX marker count
        if latex and '%' in word:
            counts['latex_symbol'] += 1
    return counts
# converts a string to a list of lowercased tokens
def tokenize(text):
    """Lowercase ``text``, delete simple markup tags, split on whitespace.

    A "tag" here is ``<`` followed by any run of word characters or ``/``
    and then ``>`` (e.g. ``<p>``, ``</p>``, ``<br/>``). Tags are removed
    without inserting a space, so text on either side of a tag is joined.
    Tags carrying attributes do not match the pattern and are left in place.

    Returns the list of whitespace-separated tokens (empty for empty text).
    """
    untagged = re.sub(r'<[\w/]*>', '', text.lower())
    return untagged.split()
# converts label tag strings into a list of labels
def extract_labels(labels):
    """Split a tag string such as ``'<a><b>'`` into ``['a', 'b']``.

    Every ``<`` and ``>`` is replaced by a space and the result is split on
    whitespace, so an empty string yields an empty list.
    """
    spaced = re.sub(r'<|>', ' ', labels)
    return spaced.split()
# parses csv into shuffled dataset with features already extracted
def parse_data(subdir, fname, single_label=False, extract_features=False):
    """Read a two-column CSV of (post, tags) rows into a shuffled dataset.

    Parameters:
        subdir: directory containing the file.
        fname: file name inside ``subdir``.
        single_label: when true (and ``extract_features`` is true), keep
            only the first label of each row instead of the full list.
        extract_features: when true, ``x`` is the dict returned by
            ``features(post)``; otherwise ``x`` is the token list returned
            by ``tokenize(post)``.

    Returns:
        A list of ``(x, y)`` tuples in random order, where ``y`` comes from
        ``extract_labels(tags)``.

    Raises:
        ValueError: if a row does not have exactly two columns (tuple
            unpacking in the loop).
        IndexError: if ``single_label`` is applied to a row with no labels.
    """
    # read in the data set; `with` guarantees the handle is closed even if
    # reading fails (the file was previously left open)
    with open(os.path.join(subdir, fname)) as csvfile:
        raw_data = list(csv.reader(csvfile, delimiter=','))

    # collect features and label to form dataset
    dataset = []
    for post, tags in raw_data:
        y = extract_labels(tags)
        if extract_features:
            x = features(post)
            # NOTE(review): single_label is only honoured on the
            # extract_features path — confirm this nesting is intended.
            if single_label:
                y = y[0]
        else:
            x = tokenize(post)
        dataset.append((x, y))

    # randomize the data cases
    random.shuffle(dataset)
    return dataset
# number of bit flips to get from prediction to gold standard (total error)
def hamming_error(gold, pred):
    """Count the positions at which ``gold`` and ``pred`` differ.

    Only the first ``len(gold)`` positions are compared. ``pred`` is indexed
    at each of those positions, so extra trailing entries in ``pred`` are
    ignored, while a ``pred`` that is too short raises on the out-of-range
    index (``IndexError`` for a list) rather than being silently truncated.

    Returns the number of mismatching positions as an ``int``.
    """
    # pred[i] (not zip) keeps the failure on a too-short prediction visible
    return sum(1 for i, g in enumerate(gold) if g != pred[i])
# number of tags missed by OVR classifier for given gold standard and prediction
def recall_error(gold, pred):
    """Count positions where ``gold`` is 1 but ``pred`` is 0.

    Iterates over the positions of ``gold``; ``pred`` is indexed only at
    positions where the gold value equals 1, so entries of ``pred`` beyond
    ``len(gold)`` are never examined.

    Returns the number of such positions as an ``int``.
    """
    # gold is tested first so pred[i] is only touched where gold is 1
    return sum(1 for i, g in enumerate(gold) if g == 1 and pred[i] == 0)
# number of tags wrongly predicted by OVR classifier given gold standard and prediction
def precision_error(gold, pred):
    """Count positions where ``gold`` is 0 but ``pred`` is 1.

    Iterates over the positions of ``gold``; ``pred`` is indexed only at
    positions where the gold value equals 0, so entries of ``pred`` beyond
    ``len(gold)`` are never examined.

    Returns the number of such positions as an ``int``.
    """
    # gold is tested first so pred[i] is only touched where gold is 0
    return sum(1 for i, g in enumerate(gold) if g == 0 and pred[i] == 1)