-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfrequency.py
More file actions
28 lines (21 loc) · 1.07 KB
/
frequency.py
File metadata and controls
28 lines (21 loc) · 1.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from __future__ import division
import sys
import json
# Calculate the term-frequency histogram of the live-streamed Twitter data (collected over five minutes in September 2017).
# For each term, the result is equal to the number of occurrences of the term in all tweets divided by the
# total number of occurrences of all of the terms in all of the tweets.
def count_terms(lines):
    """Count whitespace-separated terms in the "text" field of each tweet.

    Args:
        lines: Iterable of strings, each holding one JSON document (one line
            of the raw Twitter stream file). Blank lines are skipped, as are
            documents that are not JSON objects or that have no "text" key.

    Returns:
        A ``(term_counts, total_terms)`` tuple: ``term_counts`` maps each term
        to its number of occurrences, and ``total_terms`` is the number of
        occurrences of all terms combined.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    term_counts = {}
    total_terms = 0
    for line in lines:
        # A blank line is not a JSON document; json.loads would raise on it.
        if not line.strip():
            continue
        tweet = json.loads(line)  # parse once per line
        # Only JSON objects can carry a "text" key; on a bare JSON string the
        # `in` test would be a substring match rather than a key lookup.
        if not isinstance(tweet, dict) or "text" not in tweet:
            continue
        for word in tweet["text"].split():
            # A single update per occurrence: the first sighting counts as 1,
            # not 2 (the previous pair of independent `if`s double-counted it).
            term_counts[word] = term_counts.get(word, 0) + 1
            total_terms += 1
    return term_counts, total_terms


def term_frequencies(lines):
    """Return a dict mapping each term to its relative frequency.

    The frequency of a term is its number of occurrences divided by the number
    of occurrences of all terms, so the values sum to 1 (up to floating-point
    rounding). The result is empty when no terms were found.

    Args:
        lines: Iterable of JSON strings, as accepted by ``count_terms``.
    """
    term_counts, total_terms = count_terms(lines)
    # True division: relies on the file-level `from __future__ import division`
    # under Python 2. No division happens when term_counts is empty, so a
    # zero total can never be a divisor.
    return {word: count / total_terms for word, count in term_counts.items()}


def main(argv):
    """Print "<term> <frequency>" for every term in the file named by argv[1].

    Args:
        argv: Command-line argument list; ``argv[1]`` is the path to the raw
            Twitter stream file (one JSON document per line).
    """
    if len(argv) < 2:
        sys.exit("usage: frequency.py <tweet_file>")
    # The context manager closes the file even if parsing raises.
    with open(argv[1]) as twitter_file:
        frequencies = term_frequencies(twitter_file)
    for word in frequencies:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (word, frequencies[word]))


if __name__ == "__main__":
    main(sys.argv)