-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathclassification_ICAAD.py
More file actions
92 lines (67 loc) · 2.9 KB
/
classification_ICAAD.py
File metadata and controls
92 lines (67 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from lda import ClassificationModel, Viewer, Info
import pandas as pd
import logging
def classification_ICAAD():
    """Train, evaluate and render one classifier per manually tagged target.

    For each target column in ``features`` the function loads the LDA
    document-feature table of the configured ICAAD model, normalises the
    category columns, merges in the target column from the evaluation file,
    balances and splits the data, trains the configured classifier and hands
    the evaluated model to the HTML ``Viewer``.

    All inputs are read from hard-coded relative paths
    (``Documents/PACI.csv``, ``Documents/ICAAD/CategoryLists.csv`` and
    ``html/<data>_<identifier>/DocumentFeatures.csv``), so the function must
    be run from the project root.  It takes no arguments and returns nothing.
    """
    ##### PARAMETERS #####
    # Silence pandas' chained-assignment warning for the whole process.
    pd.set_option('chained_assignment', None)
    logging.basicConfig()

    evaluationFile = 'Documents/PACI.csv'
    dataFeatures = pd.read_csv(evaluationFile)
    dataFeatures = dataFeatures.rename(columns={'Unnamed: 0': 'id'})

    # Target columns to classify on; one model is trained per entry.
    features = ['Domestic.Violence.Manual', 'Sexual.Assault.Manual']

    info = Info()
    info.data = 'ICAAD'
    info.topicNr = 55
    info.identifier = 'LDA_T%dP12I100_tfidf_word2vec' % info.topicNr
    # 'NeuralNet' is the other classifier type this script distinguishes;
    # feature importance is skipped for it further down.
    info.classifierType = 'DecisionTree'

    # Setting this to None selects all topics.
    selectedTopics = [0, 2, 3, 6, 7, 8, 9, 12, 19, 24, 25, 34, 35, 36, 38, 42, 47]

    # The category names do not depend on the target feature, so the file is
    # read and de-duplicated once instead of once per loop iteration.
    categoryFile = 'Documents/ICAAD/CategoryLists.csv'
    categories = pd.unique(pd.read_csv(categoryFile).astype(str).values.ravel())

    for feature in features:
        ### LOAD DATA ###
        path = 'html/%s/DocumentFeatures.csv' % (info.data + '_' + info.identifier)
        model = ClassificationModel(path)
        model.droplist = ['DV', 'SA', 'File', 'id']
        model.getSelectedTopics(info.topicNr, selectedTopics)
        similarDocList = model.getSimilarDocs()
        relevantWords = model.getRelevantWords()
        model.droplist.extend(similarDocList + relevantWords)
        model.targetFeature = feature

        ### PREPROCESSING ###
        # Scale every category column present in the data by its column sum.
        for elem in categories:
            try:
                model.data[elem] = model.data[elem].divide(sum(model.data[elem]))
            except KeyError:
                # Not every listed category exists as a column; skip those.
                pass

        column = dataFeatures[['id', model.targetFeature]]
        model.mergeDataset(column)
        model.data = model.data.set_index('Unnamed: 0')
        model.dropNANRows()
        model.createNumericFeature('court')

        ### SELECT TEST AND TRAINING DATA ###
        model.factorFalseCases = 2
        model.balanceDataset(model.factorFalseCases)
        model.createTarget()
        model.dropFeatures()
        # Floor division keeps the count an integer on Python 3 as well;
        # plain "/" would yield a float there.
        model.numberTrainingDocs = len(model.data) // 3
        model.splitDataset(model.numberTrainingDocs)

        ### CLASSIFICATION ###
        model.buildClassifier(info.classifierType)
        model.trainClassifier()
        model.predict()

        ### EVALUATION ###
        model.evaluate()
        model.evaluation.confusionMatrix()
        if info.classifierType != 'NeuralNet':
            model.computeFeatureImportance()
        model.getTaggedDocs()

        html = Viewer(info)
        html.classificationResults(model)
# Script entry point: run the ICAAD classification only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    classification_ICAAD()