dfr-analysis/diagnostic_xml2csv.py at master · agoldst/dfr-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python

# USAGE
#   python diagnostic_xml2csv.py file.xml outfile1.csv outfile2.csv
#
# file.xml:     diagnostic results produced by mallet
# outfile1.csv: table of mallet's diagnostics on topics, one row per topic
#               (topic ids are re-indexed to start from 1)
# outfile2.csv: table of mallet's diagnostics on the top keywords in each
#               topic, one row per (topic, word) pair

from xml.dom import minidom
import csv


def _open_csv_output(path):
    """Open *path* for writing in the mode the csv module expects.

    csv.writer emits its own "\\r\\n" row terminators, so the underlying file
    must not translate newlines again: that means ``newline=""`` on Python 3
    and binary mode on Python 2.
    """
    import sys  # local import: the module top level only imports minidom/csv

    if sys.version_info[0] >= 3:
        return open(path, "w", newline="")
    return open(path, "wb")


def main(script, xml_filename, topic_file, word_diagnostic_file):
    """Convert a mallet diagnostics XML file into two CSV tables.

    script: the script name (sys.argv[0]); accepted so that the function can
        be called as ``main(*sys.argv)``, otherwise unused.
    xml_filename: path of the XML diagnostics file to read.
    topic_file: path of the CSV to write with one row per <topic> element,
        holding the attributes listed in ``topic_headers``.
    word_diagnostic_file: path of the CSV to write with one row per <word>
        element, holding the topic id, the word text, and the attributes
        listed in ``word_headers``.

    Attributes missing from an element are written as empty strings (that is
    what ``getAttribute`` returns for them). Raises ValueError if a topic's
    ``id`` attribute is not an integer.
    """
    topic_headers = ["id", "tokens", "document_entropy", "word-length",
                     "coherence", "uniform_dist", "corpus_dist",
                     "eff_num_words", "token-doc-diff", "rank_1_docs",
                     "allocation_ratio", "allocation_count"]
    word_headers = ["rank", "count", "prob", "cumulative", "docs",
                    "word-length", "coherence", "uniform_dist", "corpus_dist",
                    "token-doc-diff"]

    # Parse before opening any output so that an unreadable or malformed
    # input does not leave empty CSV files behind.
    doc = minidom.parse(xml_filename)
    topics = doc.getElementsByTagName('topic')

    # The with-blocks guarantee both files are flushed and closed, even if
    # a row fails part-way through.
    with _open_csv_output(topic_file) as f:
        with _open_csv_output(word_diagnostic_file) as g:
            topic_writer = csv.writer(f, delimiter=",")
            topic_writer.writerow(topic_headers)

            word_writer = csv.writer(g, delimiter=",")
            word_writer.writerow(["topic", "word"] + word_headers)

            for t in topics:
                attrs = [t.getAttribute(a) for a in topic_headers]
                attrs[0] = int(attrs[0]) + 1   # re-index topics from 1, not 0
                topic_writer.writerow(attrs)

                for w in t.getElementsByTagName('word'):
                    wattrs = [w.getAttribute(a) for a in word_headers]
                    # An empty <word/> element has no child node; write an
                    # empty word rather than failing on None.nodeValue.
                    text_node = w.firstChild
                    word = text_node.nodeValue if text_node is not None else ""
                    word_writer.writerow([attrs[0], word] + wattrs)

if __name__ == '__main__':
    import sys

    # main() takes the script name plus exactly three file arguments. Check
    # the count up front so a wrong invocation prints the usage line instead
    # of an opaque TypeError traceback from the argument unpacking.
    if len(sys.argv) != 4:
        sys.stderr.write(
            "usage: %s file.xml outfile1.csv outfile2.csv\n" % sys.argv[0]
        )
        sys.exit(2)
    main(*sys.argv)