forked from CrazyBurrito/DataMiningProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleanComments.py
More file actions
executable file
·69 lines (62 loc) · 2.21 KB
/
cleanComments.py
File metadata and controls
executable file
·69 lines (62 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# cleanComments.py
# Author: Dimitrios Economou
#
# Reads comment objects (one JSON object per line) and removes unnecessary properties.
# Writes the result to a new file.
# Usage: python cleanComments.py <filename>
import codecs
import csv
import json
import os
import sys
def cleanComments(filename):
    """Filter a file of comment objects and strip the properties we do not use.

    The input file holds one JSON comment object per line. A comment is
    dropped when its "subreddit" is not listed in data/subreddits.csv, or
    when its "author" or "body" equals "[deleted]". Every kept comment loses
    the properties named in propsToRemove and gains a "sub_type" field taken
    from the CSV. Kept comments are written, one JSON object per line and
    UTF-8 encoded, to a file next to the input whose name is the input's
    name prefixed with "_".

    filename -- path of the input file; may include a directory part.

    Returns None. Raises KeyError if a comment lacks "subreddit", "author"
    or "body", and whatever open()/json.loads() raise on unreadable input.
    """
    # Path is relative to the current working directory.
    subreddits = "data/subreddits.csv"
    # {"subreddit": type}, used both to filter comments by subreddit and to
    # add the subreddit type field to the json data.
    subTypes = _loadSubredditTypes(subreddits)

    propsToRemove = (
        'author_flair_text',
        'archived',
        'controversiality',
        'author_flair_css_class',
        'retrieved_on',
        'edited',
        'id',
        'score_hidden',
        'gilded',
        'distinguished',
        'ups',
        'downs',
    )

    # Prefix only the file name, not the whole path, so that an input such
    # as "dumps/RC_2014-09" is written to "dumps/_RC_2014-09".
    directory, baseName = os.path.split(filename)
    outName = os.path.join(directory, '_{}'.format(baseName))

    # Read and write with the same explicit encoding; newline='\n' keeps the
    # line terminator untranslated on every platform.
    with open(filename, 'r', encoding='utf-8') as infile, \
            open(outName, 'w', encoding='utf-8', newline='\n') as outfile:
        for line in infile:
            # A blank line (e.g. a trailing newline) is not a comment.
            if not line.strip():
                continue
            comment = json.loads(line)
            subreddit = comment["subreddit"]
            if subreddit not in subTypes:
                continue
            if comment["author"] == "[deleted]" or comment["body"] == "[deleted]":
                continue
            for prop in propsToRemove:
                # pop with a default so a missing property is not an error
                comment.pop(prop, None)
            comment["sub_type"] = subTypes[subreddit]
            # Note:
            # sort_keys=True, indent=4 gives a nice looking output.
            # However, we want a comment per line.
            outfile.write(json.dumps(comment, ensure_ascii=False))
            outfile.write('\n')


def _loadSubredditTypes(path):
    """Return a {subreddit name: subreddit type} dict read from the CSV at path.

    The name is taken from the second column and the type from the third.
    """
    subTypes = {}
    # The csv module needs a text-mode file opened with newline=''.
    with open(path, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Blank or truncated rows carry no name/type pair; skip them
            # rather than failing with IndexError.
            if len(row) < 3:
                continue
            subTypes[row[1]] = row[2]
    return subTypes
def main(argv):
    """Run cleanComments on the file named by the first command-line argument.

    argv -- list of command-line arguments without the program name; only
            argv[0] is used.

    Exits with a usage message and a non-zero status when argv is empty.
    """
    if not argv:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("Usage: python cleanComments.py <filename>")
    cleanComments(argv[0])
#fnames = ["RC_2014-09", "RC_2014-10", "RC_2014-11", "RC_2014-12", "RC_2015-01"]
#for fname in fnames:
# cleanComments(fname)
if __name__ == '__main__':
    # Executed as a script: pass main() the command-line arguments with the
    # program name (sys.argv[0]) dropped.
    main(sys.argv[1:])