-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
252 lines (218 loc) · 9.72 KB
/
data.py
File metadata and controls
252 lines (218 loc) · 9.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#The goal of this script is to clean the data and create a csv file with all the information
#All the data is stored in the same folder as the script
#It is all stored in CSV format
#These are the headers for the output csv file:
#Sentence, Annotation, Implicit/Explicit, Intent, Keyword1, Keyword2, Keyword3, Keyword4, Keyword5
#The data is stored in the following format:
#Website,Title,Date,Author,Sentence,Code
#Where Code Number, Implicit/Explicit, Intent , Keywords are stored in the same cell under code
#Code number needs to be translated to the annotation using this dictionary:
#codes = {
# 1: "Economy/Money/Finance/Trade/GDP",
# 2: "Trust/Stability/Security/Peace",
# 3: "Violence/Conflict/Protest",
# 4: "Unity/Cooperation/Alliance",
# 5: "Change/ Reform/Revolution",
#}
#This is what the data looks like:
#knowledge.wharton.upenn.edu,"To Stave Off Arab Spring Revolts, Saudi Arabia and Fellow Gulf Countries Spend $150 Billion",,,"b'Possessing 20% of the world\xe2\x80\x99s proven petroleum reserves, Saudi Arabia has long played the expected role of benefactor among its fellow Arab and Muslim countries'"," 4, Explicit, Positive, Keywords: Petroleum Reserves, Benefactor"
import csv
import os
import json
import re
import pandas as pd
import chardet
def get_data():
    """Collect the raw annotation rows from every CSV file in the working directory.

    For each ``*.csv`` file, the encoding is guessed with ``chardet`` from the
    file's raw bytes, and the file is then parsed with the ``csv`` module.
    Only rows with exactly six columns
    (Website, Title, Date, Author, Sentence, Code) are kept. Header rows are
    NOT filtered out here.

    Returns:
        list[list[str]]: one ``[website, title, date, author, sentence, code]``
        list per accepted row, in the order ``os.listdir`` reports the files.
    """
    data = []
    for file in os.listdir(os.getcwd()):
        if not file.endswith(".csv"):
            continue
        # Sniff the encoding from the raw bytes before decoding the file.
        with open(file, "rb") as f:
            encoding = chardet.detect(f.read())["encoding"]
        # NOTE(review): chardet may report None when it cannot guess; open()
        # then falls back to the platform default encoding.
        # newline="" is required by the csv module so that line breaks embedded
        # in quoted fields are parsed correctly instead of being translated.
        with open(file, "r", encoding=encoding, newline="") as csv_file:
            for row in csv.reader(csv_file, delimiter=","):
                # Anything that is not a complete 6-column record is ignored.
                if len(row) != 6:
                    continue
                data.append(list(row))
    return data
# Maps the numeric code at the start of the Code cell to its annotation label.
_CODES = {
    1: "Economy/Money/Finance/Trade/GDP",
    2: "Trust/Stability/Security/Peace",
    3: "Violence/Conflict/Protest",
    4: "Unity/Cooperation/Alliance",
    5: "Change/ Reform/Revolution",
}
# Placeholder written to the output whenever a value is absent.
_MISSING = "None"
_MAX_KEYWORDS = 5
_KEYWORDS_PREFIX = "Keywords:"
# One backslash escape as it appears in the textual repr of a bytes object.
_ESCAPE_RE = re.compile(r"""\\(x[0-9a-fA-F]{2}|[nrt'"\\])""")


def _unescape(match):
    """Return the replacement text for one escape matched by ``_ESCAPE_RE``."""
    token = match.group(1)
    if token[0] == "x":
        # Raw (non-ASCII) byte such as \xe2: dropped entirely.
        return ""
    if token in "nrt":
        # Escaped whitespace becomes a plain space.
        return " "
    # Escaped quote or backslash: keep the character itself.
    return token


def _clean_sentence(raw):
    """Strip the ``b'...'`` wrapper and byte escapes from a sentence cell."""
    if not isinstance(raw, str):
        return _MISSING
    text = raw.strip()
    if text[:2] in ("b'", 'b"'):
        quote = text[1]
        text = text[2:]
        if text.endswith(quote):
            text = text[:-1]
        text = _ESCAPE_RE.sub(_unescape, text)
    # Drop any remaining non-ASCII characters.
    return text.encode("ascii", "ignore").decode("ascii")


def _part(parts, index):
    """Return ``parts[index]``, or the placeholder if it is absent or blank."""
    if index < len(parts) and parts[index]:
        return parts[index]
    return _MISSING


def _keywords(parts):
    """Return exactly ``_MAX_KEYWORDS`` keywords taken from ``parts[3:]``."""
    found = list(parts[3:])
    if found and found[0].startswith(_KEYWORDS_PREFIX):
        found[0] = found[0][len(_KEYWORDS_PREFIX):].strip()
    found = [word for word in found if word][:_MAX_KEYWORDS]
    return found + [_MISSING] * (_MAX_KEYWORDS - len(found))


def remove_unnecessary(data):
    """Turn raw 6-column rows into nine parallel, cleaned output columns.

    Each row is ``[website, title, date, author, sentence, code]``, where the
    code cell looks like
    ``"4, Explicit, Positive, Keywords: Petroleum Reserves, Benefactor"``.

    Rows are skipped when they contain the header cell ``"Website"``, have
    fewer than six columns, or have a code number that is ``0`` or is not an
    integer (e.g. ``"error"`` or an empty cell).

    For every kept row:
      * the sentence loses its ``b'...'`` wrapper, its ``\\xNN`` byte escapes
        and any non-ASCII characters; apostrophes inside the text are kept;
      * the code number is translated to its annotation label, or ``"None"``
        when the number is not a known code;
      * Implicit/Explicit, Intent and up to five keywords are extracted with
        surrounding whitespace removed; absent values become ``"None"``.

    Args:
        data: iterable of row lists as produced by ``get_data``.

    Returns:
        tuple: nine lists of equal length, in the order sentences,
        annotations, implicit/explicit, intent, keyword1 .. keyword5.
    """
    sentences = []
    annotations = []
    explicitness = []
    intents = []
    keyword_columns = [[] for _ in range(_MAX_KEYWORDS)]

    for row in data:
        # Header rows carry the literal column name "Website".
        if "Website" in row:
            continue
        if len(row) < 6:
            continue

        parts = [part.strip() for part in row[5].split(",")]
        try:
            code = int(parts[0])
        except ValueError:
            # "error", blank or otherwise non-numeric code: not annotated.
            continue
        if code == 0:
            continue

        sentences.append(_clean_sentence(row[4]))
        annotations.append(_CODES.get(code, _MISSING))
        explicitness.append(_part(parts, 1))
        intents.append(_part(parts, 2))
        for column, keyword in zip(keyword_columns, _keywords(parts)):
            column.append(keyword)

    return (sentences, annotations, explicitness, intents, *keyword_columns)
def main():
    """Run the pipeline: load the raw rows, clean them, write CleanedData.csv.

    The nine cleaned columns are cut down to the length of the shortest one
    before the DataFrame is built, so every column has the same number of
    entries. The Implicit/Explicit column, the list of column lengths and the
    shortest length are printed along the way.
    """
    raw_rows = get_data()
    (col_sentence, col_annotation, col_ie, col_intent,
     col_k1, col_k2, col_k3, col_k4, col_k5) = remove_unnecessary(raw_rows)
    print(col_ie)

    columns = [col_sentence, col_annotation, col_ie, col_intent,
               col_k1, col_k2, col_k3, col_k4, col_k5]
    headers = ["Sentence", "Annotation", "Implicit/Explicit", "Intent",
               "Keyword1", "Keyword2", "Keyword3", "Keyword4", "Keyword5"]

    # Report the size of each column and the common length they are cut to.
    sizes = [len(column) for column in columns]
    print(sizes)
    shortest = min(sizes)
    print(shortest)

    # Dict insertion order fixes the column order of the output file.
    table = {}
    for header, column in zip(headers, columns):
        table[header] = column[:shortest]

    frame = pd.DataFrame(table)
    frame.to_csv("CleanedData.csv", index=False)
# Run the pipeline only when executed as a script, so that importing this
# module does not read and write files as a side effect.
if __name__ == "__main__":
    main()