forked from CrazyBurrito/DataMiningProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetUserSubM.py
More file actions
executable file
·62 lines (56 loc) · 2.06 KB
/
getUserSubM.py
File metadata and controls
executable file
·62 lines (56 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import sys
import json
import shelve
import csv
def _asciiOnly(text):
    """Return *text* with every non-ASCII character dropped."""
    # The encode/decode round trip yields a text string on both Python 2 and
    # Python 3; a bare .encode() leaves bytes on Python 3, which csv would
    # write out as "b'...'".
    return text.encode('ascii', 'ignore').decode('ascii')


def _countComments(lines):
    """Tally comments per user and subreddit.

    lines -- iterable of JSON documents, one comment per item, each an object
             with at least the keys 'author' and 'subreddit'.

    Returns a dictionary of dictionaries {user: {sub: count}}.
    """
    users = {}
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline
        decodedComment = json.loads(line)
        author = _asciiOnly(decodedComment['author'])
        subreddit = _asciiOnly(decodedComment['subreddit'])
        perSub = users.setdefault(author, {})
        perSub[subreddit] = perSub.get(subreddit, 0) + 1
    return users


def _applyThresholds(users, userSubT, userCommentsT, nSubT):
    """Drop sparse users, then drop subreddits the remaining users rarely use.

    Returns (keptUsers, keptSubreddits): keptUsers has the same
    {user: {sub: count}} shape as *users*; keptSubreddits is a sorted list.
    """
    # Every subreddit starts at zero so that one used only by dropped users
    # is still compared against nSubT.
    subCount = dict.fromkeys(
        (subreddit for perSub in users.values() for subreddit in perSub), 0)

    # Build a new dict instead of popping from `users` while iterating it:
    # on Python 3 that raises "dictionary changed size during iteration".
    keptUsers = {}
    for user, perSub in users.items():
        if len(perSub) < userSubT:
            continue
        if sum(perSub.values()) < userCommentsT:
            continue
        keptUsers[user] = perSub
        for subreddit, count in perSub.items():
            subCount[subreddit] += count

    # Test each subreddit's own total (the previous code tested one stale loop
    # variable, keeping either everything or nothing). Sorting makes the
    # column order reproducible from run to run.
    keptSubreddits = sorted(s for s in subCount if subCount[s] >= nSubT)
    return keptUsers, keptSubreddits


def main(fname="data/2014-10-cleaned", outname="userSub.csv",
         userSubT=2, userCommentsT=5, nSubT=10):
    """Build a user x subreddit comment-count matrix and write it as CSV.

    fname         -- input file with one JSON comment per line
    outname       -- CSV file to write
    userSubT      -- min number of subreddits user must comment to
    userCommentsT -- min number of comments user must have made
    nSubT         -- min number times sub commented in (counted over the
                     users that survive the two user thresholds)

    The first CSV row lists the kept subreddits; every following row holds one
    kept user's comment counts in the same column order (0 where the user
    never commented). Rows carry no user name column.
    """
    # Binary mode keeps the platform's locale encoding out of the picture;
    # json.loads decodes the bytes itself.
    with open(fname, 'rb') as infile:
        users = _countComments(infile)

    users, subreddits = _applyThresholds(users, userSubT, userCommentsT, nSubT)

    # The csv module wants newline translation disabled: newline='' on
    # Python 3, binary mode on Python 2 (whose open() has no newline argument).
    if sys.version_info[0] >= 3:
        outfile = open(outname, 'w', newline='')
    else:
        outfile = open(outname, 'wb')
    with outfile:
        w = csv.writer(outfile)
        # write top row of subreddits
        w.writerow(subreddits)
        for perSub in users.values():
            w.writerow([perSub.get(subreddit, 0) for subreddit in subreddits])
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()