#!/usr/bin/python
#-*- coding: utf-8 -*-
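"""Topic modeling pipeline for the ICAAD document collection.

Loads and preprocesses the document collection, builds a filtered dictionary
and bag-of-words corpus, trains an LDA (or LSI) model, and uses interactively
selected topics to predict Sexual Assault (SA) and Domestic Violence (DV)
cases, writing the dictionary, topics, and evaluation results to HTML reports.
"""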
from lda import Collection, Dictionary, Model, Info, Viewer, utils, Word2Vec, ImagePlotter
from lda.docLoader import loadCategories
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import names
from gensim.models import TfidfModel
import os.path
from lda import dataframeUtils as df
import csv
import pandas as pd
import pdb
def TopicModeling_ICAAD():
    info = Info()

    # Categories and keywords
    info.categories = loadCategories('Documents/categories.txt')[0]  # 0 - human rights categories, 1 - scientific paper categories
    keywordFile = 'Documents/ICAAD/CategoryLists.csv'
    keywords_df = pd.read_csv(keywordFile).astype(str)
    keywords = list(df.toListMultiColumns(keywords_df, keywords_df.columns))

    #### PARAMETERS ####
    word2vec = Word2Vec()
    info.data = 'ICAAD'  # 'ICAAD' 'NIPS' 'scifibooks' 'HRC'

    # Preprocessing #
    info.preprocess = 0
    info.startDoc = 0
    info.numberDoc = None
    info.specialChars = set(u'''[,\.\'\`=\":\\\/_+]''')
    info.includeEntities = 0
    info.bigrams = 1
    numbers = [str(nr) for nr in range(0, 500)]
    info.whiteList = word2vec.net.vocab.keys() + numbers + keywords
    info.stoplist = list(STOPWORDS) + utils.lowerList(names.words())
    # Note: the file-based stopword list below replaces the gensim/NLTK stoplist assigned above.
    info.stoplist = [x.strip() for x in open('stopwords/english.txt')]
    info.removeNames = 1
    # Dictionary #
    info.analyseDictionary = 0
    info.lowerFilter = 8     # minimum document frequency (number of documents)
    info.upperFilter = 0.25  # maximum document frequency (fraction of the collection)

    # LDA #
    info.modelType = 'LDA'  # 'LDA' 'LSI'
    info.numberTopics = 20
    info.tfidf = 0
    info.passes = 20
    info.iterations = 70
    info.online = 0
    info.chunksize = 4100
    info.multicore = 0

    info.setup()
    #### EVALUATION ####
    evaluationFile = 'Documents/PACI.csv'
    dataFeatures = pd.read_csv(evaluationFile)
    filenames = dataFeatures['Filename'].tolist()
    filenames = [name.replace('.txt', '') for name in filenames]
    dataFeatures['Filename'] = filenames
    dataFeatures = dataFeatures.rename(columns={'Unnamed: 0': 'id'})

    #### MODEL ####
    collection = Collection()
    html = Viewer(info)

    #pdb.set_trace()
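    # Preprocess the collection only if no cached version exists or preprocessing
    # is explicitly requested; otherwise reuse the saved collection.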
    if not os.path.exists(info.collectionName) or info.preprocess:
        print 'Load and preprocess Document Collection'
        collection.load(info.path, info.fileType, info.startDoc, info.numberDoc)
        collection.setDocNumber()
        for doc in collection.documents:
            doc.title = doc.title.replace('.rtf.txt', '')
            features = dataFeatures[dataFeatures['Filename'] == doc.title]
            doc.id = df.getValue(features, 'id')
            doc.SA = df.getValue(features, 'Sexual.Assault.Manual')
            doc.DV = df.getValue(features, 'Domestic.Violence.Manual')
            doc.extractYear()
            doc.extractCourt()
        collection.prepareDocumentCollection(lemmatize=True, includeEntities=info.includeEntities, stopwords=info.stoplist, removeShortTokens=True, threshold=2, specialChars=info.specialChars, whiteList=info.whiteList, bigrams=info.bigrams)
        collection.saveDocumentCollection(info.collectionName)
    else:
        print 'Load Processed Document Collection'
        collection.loadPreprocessedCollection(info.collectionName)

    #pdb.set_trace()
    collection.documents = collection.documents[20:1000]
    print 'Create Dictionary'
    dictionary = Dictionary(info.stoplist)
    dictionary.addCollection(collection.documents)

    if info.analyseDictionary:
        print 'Analyse Word Frequency'
        collectionLength = collection.number
        dictionary.analyseWordFrequencies(info, html, collectionLength)

    print 'Filter extremes'
    dictionary.filter_extremes(info.lowerFilter, info.upperFilter, keywords)

    if info.analyseDictionary:
        dictionary.plotWordDistribution(info)
    print 'Create Corpus'
    corpus = collection.createCorpus(dictionary)

    print 'TF_IDF Model'
    tfidf = TfidfModel(corpus, normalize=True)
    if info.tfidf:  # apply tf-idf weighting only when enabled in the parameters
        corpus = tfidf[corpus]

    print 'Topic Modeling - LDA'
    lda = Model(info)
    lda.createModel(corpus, dictionary.ids, info)
    lda.createTopics(info)

    print 'Topic Coverage'
    topicCoverage = lda.model[corpus]

    print 'Get Documents related to Topics'
    lda.getTopicRelatedDocuments(topicCoverage, info)
    print 'Similarity Analysis'
    lda.computeSimilarityMatrix(corpus, numFeatures=info.numberTopics, num_best=7)

    maxTopicCoverage = []
    for document in collection.documents:
        docTopicCoverage = topicCoverage[document.nr]
        document.setTopicCoverage(docTopicCoverage, lda.name)
        lda.computeSimilarity(document)
        collection.computeRelevantWords(tfidf, dictionary, document)
        maxTopicCoverage.append(document.LDACoverage[0][1])
        document.createTokenCounter()
        for category in keywords_df.columns.tolist():
            wordsInCategory = df.getColumn(keywords_df, category)
            keywordFrequency = document.countOccurance(wordsInCategory)
            document.entities.addEntities(category, utils.sortTupleList(keywordFrequency))
        document.mostFrequentEntities = document.entities.getMostFrequent(5)

    #pdb.set_trace()
    ImagePlotter.plotHistogram(maxTopicCoverage, 'Maximal Topic Coverage', 'html/' + info.data + '_' + info.identifier + '/Images/maxTopicCoverage.jpg', 'Maximal LDA Coverage', 'Number of Docs', log=1)
    print 'Create HTML Files'
    html.htmlDictionary(dictionary)
    html.printTopics(lda)
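    # Topic indices for each category are entered interactively after inspecting
    # the printed topics; they drive the SA/DV case prediction below.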
    info.SATopics = input('Sexual Assault Topics:')
    info.DVTopics = input('Domestic Violence Topics:')
    info.otherTopics = input('Other Topics: ')
    selectedTopics = info.SATopics + info.DVTopics + info.otherTopics

    info.SAthreshold = 0.2
    info.DVthreshold = 0.2

    for doc in collection.documents:
        doc.predictCases('SA', info, info.SAthreshold)
        doc.tagPrediction('SA')
        doc.predictCases('DV', info, info.DVthreshold)
        doc.tagPrediction('DV')

    SAevaluation = collection.evaluate('SA')
    collection.getConfusionDocuments('SA')
    html.results(SAevaluation, collection, info)

    DVevaluation = collection.evaluate('DV')
    collection.getConfusionDocuments('DV')
    html.results(DVevaluation, collection, info)

    html.printDocuments(collection.documents, lda)
    html.printDocsRelatedTopics(lda, collection.documents, openHtml=False)
    html.documentOverview(collection.documents)

    print 'Write Feature File'
    #collection.writeDocumentFeatureFile(info, selectedTopics, keywords)
    info.saveToFile()
if __name__ == "__main__":
TopicModeling_ICAAD()