-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproppr.py
More file actions
78 lines (63 loc) · 2.73 KB
/
proppr.py
File metadata and controls
78 lines (63 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
import os.path as path
import re
import subprocess

from config import PROPPR_DIR, PROPPR_PROGRAM_DIR
import util
def generate_facts(data_set):
    """Write the ProPPR .cfacts file for *data_set*.

    Emits one ``isLabel\tlN`` fact per label, for labels 1..num_labels.
    """
    num_labels = util.get_num_labels(data_set)
    facts_path = path.join(PROPPR_PROGRAM_DIR, data_set + '.cfacts')
    with open(facts_path, 'w') as out:
        out.writelines('isLabel\tl%d\n' % i
                       for i in xrange(1, num_labels + 1))
def generate_data(data_set):
    """Write the ProPPR .data (query/answer) file for *data_set*.

    Each seeds line ``doc<TAB>label<TAB>weight`` becomes one row: the
    query ``predict(dDOC,Y)`` followed by one solution per label, marked
    '+' for the seed's own label and '-' for every other label.
    """
    num_labels = util.get_num_labels(data_set)
    labels = [str(i) for i in xrange(1, num_labels + 1)]
    out_path = path.join(PROPPR_PROGRAM_DIR, data_set + '.data')
    with util.open_seeds_file(data_set, 'r') as seeds, \
            open(out_path, 'w') as out:
        for row in seeds:
            doc, label, weight = row.split('\t')
            solutions = ['%spredict(d%s,l%s)' %
                         ('+' if label == l else '-', doc, l)
                         for l in labels]
            out.write('predict(d%s,Y)\t' % doc
                      + '\t'.join(solutions) + '\n')
def generate_graph(data_set):
    """Write the ProPPR .graph file of hasWord edges for *data_set*.

    Each data line ``doc feature weight`` becomes ``hasWord\tdDOC\twFEAT``.
    NOTE(review): the weight column is read but discarded — confirm that
    unweighted edges are intended here.
    """
    graph_path = path.join(PROPPR_PROGRAM_DIR, data_set + '.graph')
    with util.open_data_file(data_set) as data, \
            open(graph_path, 'w') as out:
        for record in data:
            doc, feature, weight = record.split()
            out.write('hasWord\td%s\tw%s\n' % (doc, feature))
def run_proppr(data_set):
    """Regenerate the .data file, then compile and ground the ProPPR program.

    Shells out twice: once to ProPPR's compiler script (textcat.ppr ->
    textcat.wam) and once to the Java Grounder, which grounds the queries
    in ``<data_set>.data`` against the .graph and .cfacts files into
    ``<data_set>.cooked``.

    NOTE(review): both commands run with shell=True on interpolated paths,
    so PROPPR_DIR / PROPPR_PROGRAM_DIR and data_set must come from trusted
    configuration — confirm they are never user-supplied.
    """
    commands = {
        # Serialize the logic program into WAM form for the grounder.
        'compile': '''
        cd {0} && python src/scripts/compiler.py serialize \
        {1}/textcat.ppr > {1}/textcat.wam
        '''.format(PROPPR_DIR, PROPPR_PROGRAM_DIR),
        # Ground every query; --programFiles lists WAM + graph + facts.
        'train': '''
        java -cp {0}/bin:{0}/lib/*:{0}/conf/ edu.cmu.ml.proppr.Grounder \
        --queries {1}/{2}.data --grounded {1}/{2}.cooked \
        --programFiles {1}/textcat.wam:{1}/{2}.graph:{1}/{2}.cfacts \
        --threads 24
        '''.format(PROPPR_DIR, PROPPR_PROGRAM_DIR, data_set)
    }
    # NOTE(review): only the .data file is regenerated here; generate_facts
    # and generate_graph are presumably invoked elsewhere — confirm.
    generate_data(data_set)
    subprocess.call(commands['compile'], shell=True)
    subprocess.call(commands['train'], shell=True)
def parse_cooked(cooked):
def split(l, sep=','):
return l.split(sep) if l else []
def parse_edge(edge):
src, dest, raw_feats = re.match(r'(\d*)->(\d*):([\d,]*)', edge).groups()
return int(src), int(dest), [int(f) for f in split(raw_feats)]
doc_nodes = {}
for line in cooked:
tokens = line.strip().split('\t')
doc = int(re.match(r'predict\(d(\w*),.*\)', tokens[0]).group(1))
query_vec_keys = tokens[1].split(',') if tokens[1] else []
pos_nodes = [int(n) for n in split(tokens[2])]
neg_nodes = [int(n) for n in split(tokens[3])]
node_count = int(tokens[4])
edge_count = int(tokens[5])
features = split(tokens[6], ':')
edges = [parse_edge(e) for e in tokens[7:]]
doc_nodes[doc] = {
'pos': pos_nodes,
'neg': neg_nodes
}