-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathdiagnostic_xml2csv.py
More file actions
39 lines (30 loc) · 1.49 KB
/
diagnostic_xml2csv.py
File metadata and controls
39 lines (30 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python
# USAGE
# diagnostic_xml2csv file.xml outfile1.csv outfile2.csv
# file.xml: diagnostic results produced by mallet
# outfile1.csv: table of mallet's diagnostics on topics
# outfile2.csv: table of mallet's diagnostics on top keywords in each topic
from xml.dom import minidom
import csv
def main(script, xml_filename, topic_file, word_diagnostic_file):
    """Convert a MALLET diagnostics XML file into two CSV tables.

    One row is written to ``topic_file`` for every ``<topic>`` element and
    one row to ``word_diagnostic_file`` for every ``<word>`` element nested
    inside a topic.  Attributes that are absent from an element come back
    from minidom as empty strings and are therefore written as empty cells.

    Args:
        script: Name of the invoking script (``sys.argv[0]``).  Unused; it
            is accepted only so the function can be called as
            ``main(*sys.argv)``.
        xml_filename: Path of the diagnostics XML file to read.
        topic_file: Path of the per-topic CSV file to create/overwrite.
        word_diagnostic_file: Path of the per-word CSV file to
            create/overwrite.

    Raises:
        ValueError: If a ``<topic>`` element has no integer ``id`` attribute.
    """
    topic_headers = [
        "id", "tokens", "document_entropy", "word-length", "coherence",
        "uniform_dist", "corpus_dist", "eff_num_words", "token-doc-diff",
        "rank_1_docs", "allocation_ratio", "allocation_count",
    ]
    word_headers = [
        "rank", "count", "prob", "cumulative", "docs", "word-length",
        "coherence", "uniform_dist", "corpus_dist", "token-doc-diff",
    ]

    # Parse before opening the outputs so a bad input creates no files.
    doc = minidom.parse(xml_filename)
    topics = doc.getElementsByTagName('topic')

    # newline="" hands line-ending control to the csv module (otherwise
    # Windows output gets "\r\r\n" row terminators); the with-block
    # guarantees both files are flushed and closed, even on error.
    with open(topic_file, "w", newline="") as topic_out, \
            open(word_diagnostic_file, "w", newline="") as word_out:
        topic_writer = csv.writer(topic_out, delimiter=",")
        topic_writer.writerow(topic_headers)

        word_writer = csv.writer(word_out, delimiter=",")
        word_writer.writerow(["topic", "word"] + word_headers)

        for t in topics:
            attrs = [t.getAttribute(a) for a in topic_headers]
            attrs[0] = int(attrs[0]) + 1  # re-index topics from 1, not 0
            topic_writer.writerow(attrs)

            for w in t.getElementsByTagName('word'):
                wattrs = [w.getAttribute(a) for a in word_headers]
                # An element with no text content has no child node at
                # all; emit an empty cell instead of crashing.
                first = w.firstChild
                word = first.nodeValue if first is not None else ""
                word_writer.writerow([attrs[0], word] + wattrs)
if __name__ == '__main__':
    import sys

    # main() takes the script name plus exactly three paths; report a
    # usage line on a wrong argument count rather than letting the call
    # fail with an opaque TypeError traceback.
    if len(sys.argv) != 4:
        sys.exit("usage: %s file.xml outfile1.csv outfile2.csv" % sys.argv[0])
    main(*sys.argv)