-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproppr.py
More file actions
78 lines (63 loc) · 2.73 KB
/
proppr.py
File metadata and controls
78 lines (63 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
import os.path as path
import re
import subprocess

from config import PROPPR_DIR, PROPPR_PROGRAM_DIR
import util
def generate_facts(data_set):
    """Write the ProPPR .cfacts file for *data_set*.

    Emits one ``isLabel\tlN`` fact per label, for labels 1..num_labels.
    """
    num_labels = util.get_num_labels(data_set)
    facts_path = path.join(PROPPR_PROGRAM_DIR, data_set + '.cfacts')
    with open(facts_path, 'w') as out:
        out.writelines('isLabel\tl%d\n' % i
                       for i in xrange(1, num_labels + 1))
def generate_data(data_set):
    """Write the ProPPR .data (query/answer) file for *data_set*.

    Each seeds line ``doc<TAB>label<TAB>weight`` becomes one row: the
    query ``predict(dDOC,Y)`` followed by one solution per label, marked
    '+' for the seed's own label and '-' for every other label.
    """
    num_labels = util.get_num_labels(data_set)
    labels = [str(i) for i in xrange(1, num_labels + 1)]
    out_path = path.join(PROPPR_PROGRAM_DIR, data_set + '.data')
    with util.open_seeds_file(data_set, 'r') as seeds, \
            open(out_path, 'w') as out:
        for row in seeds:
            doc, label, weight = row.split('\t')
            solutions = ['%spredict(d%s,l%s)' %
                         ('+' if label == l else '-', doc, l)
                         for l in labels]
            out.write('predict(d%s,Y)\t' % doc
                      + '\t'.join(solutions) + '\n')
def generate_graph(data_set):
    """Write the ProPPR .graph file of hasWord edges for *data_set*.

    Each data line ``doc feature weight`` becomes ``hasWord\tdDOC\twFEAT``.
    NOTE(review): the weight column is read but discarded — confirm that
    unweighted edges are intended here.
    """
    graph_path = path.join(PROPPR_PROGRAM_DIR, data_set + '.graph')
    with util.open_data_file(data_set) as data, \
            open(graph_path, 'w') as out:
        for record in data:
            doc, feature, weight = record.split()
            out.write('hasWord\td%s\tw%s\n' % (doc, feature))
def run_proppr(data_set):
    """Regenerate the .data file, then compile and ground the ProPPR program.

    Shells out twice: once to ProPPR's compiler script (textcat.ppr ->
    textcat.wam) and once to the Java Grounder, which grounds the queries
    in ``<data_set>.data`` against the .graph and .cfacts files into
    ``<data_set>.cooked``.

    NOTE(review): both commands run with shell=True on interpolated paths,
    so PROPPR_DIR / PROPPR_PROGRAM_DIR and data_set must come from trusted
    configuration — confirm they are never user-supplied.
    """
    commands = {
        # Serialize the logic program into WAM form for the grounder.
        'compile': '''
        cd {0} && python src/scripts/compiler.py serialize \
        {1}/textcat.ppr > {1}/textcat.wam
        '''.format(PROPPR_DIR, PROPPR_PROGRAM_DIR),
        # Ground every query; --programFiles lists WAM + graph + facts.
        'train': '''
        java -cp {0}/bin:{0}/lib/*:{0}/conf/ edu.cmu.ml.proppr.Grounder \
        --queries {1}/{2}.data --grounded {1}/{2}.cooked \
        --programFiles {1}/textcat.wam:{1}/{2}.graph:{1}/{2}.cfacts \
        --threads 24
        '''.format(PROPPR_DIR, PROPPR_PROGRAM_DIR, data_set)
    }
    # NOTE(review): only the .data file is regenerated here; generate_facts
    # and generate_graph are presumably invoked elsewhere — confirm.
    generate_data(data_set)
    subprocess.call(commands['compile'], shell=True)
    subprocess.call(commands['train'], shell=True)
def parse_cooked(cooked):
def split(l, sep=','):
return l.split(sep) if l else []
def parse_edge(edge):
src, dest, raw_feats = re.match(r'(\d*)->(\d*):([\d,]*)', edge).groups()
return int(src), int(dest), [int(f) for f in split(raw_feats)]
doc_nodes = {}
for line in cooked:
tokens = line.strip().split('\t')
doc = int(re.match(r'predict\(d(\w*),.*\)', tokens[0]).group(1))
query_vec_keys = tokens[1].split(',') if tokens[1] else []
pos_nodes = [int(n) for n in split(tokens[2])]
neg_nodes = [int(n) for n in split(tokens[3])]
node_count = int(tokens[4])
edge_count = int(tokens[5])
features = split(tokens[6], ':')
edges = [parse_edge(e) for e in tokens[7:]]
doc_nodes[doc] = {
'pos': pos_nodes,
'neg': neg_nodes
}