-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathanalyzer.py
More file actions
69 lines (58 loc) · 2.26 KB
/
analyzer.py
File metadata and controls
69 lines (58 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import re
import pandas as pd
from tweetdataframe import TweetDataframe
class Analyzer(TweetDataframe):
    """Summary statistics over a collection of tweets.

    Every method reads ``self.df``, which is presumably a pandas DataFrame
    populated by ``TweetDataframe.__init__`` (not visible here -- confirm).
    Only the ``created_at`` and ``source`` columns are used.
    """

    def __init__(self, path):
        """Delegate construction to ``TweetDataframe`` with *path*."""
        TweetDataframe.__init__(self, path)

    def top_n_retweet(self, n):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass

    def location(self):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass

    def time_trend(self, freq='H', how='sum'):
        """Return tweet counts resampled to a regular time frequency.

        Args:
            freq: pandas offset alias giving the bucket width.
            how: aggregation applied to the per-timestamp counts that fall
                into each bucket.

        Returns:
            The per-timestamp tweet counts aggregated into ``freq`` buckets.
        """
        created_at = self.df.created_at
        time_count = created_at.groupby(created_at).count()
        try:
            # Legacy pandas aggregates directly inside resample().
            return time_count.resample(freq, how=how)
        except TypeError:
            # Newer pandas dropped the ``how`` argument; resample() returns
            # a lazy resampler that has to be aggregated explicitly.
            return time_count.resample(freq).agg(how)

    def weekday_trend(self):
        """Print and return tweet counts per ISO weekday and ISO week.

        Side effect: adds ``weeknumber`` and ``weekday`` columns (taken from
        ``created_at.isocalendar()``) to ``self.df``.

        Returns:
            A table with one row per ISO weekday and one column per ISO
            week number; each cell is the number of tweets in that slot.
            The same table is printed as a record array.
        """
        df = self.df
        df['weeknumber'] = df['created_at'].map(lambda x: x.isocalendar()[1])
        df['weekday'] = df['created_at'].map(lambda x: x.isocalendar()[2])
        week_df = pd.DataFrame(df.weekday)
        week_df['weeknumber'] = df.weeknumber
        # size() counts rows per group without depending on whether the
        # grouping columns survive the aggregation (count() drops them in
        # newer pandas, which made the former ``.weeknumber`` lookup fail).
        weekday_count = week_df.groupby(['weekday', 'weeknumber']).size()
        weekday_count = weekday_count.unstack('weeknumber')
        # Parenthesised single-argument print works on Python 2 and 3.
        print(weekday_count.to_records())
        return weekday_count

    def top_n_influencer(self, n):
        """Placeholder: not implemented yet, currently returns ``None``."""
        pass

    def top_n_source(self, n=None):
        """Return the *n* most frequent tweet sources with their counts.

        Args:
            n: number of sources to keep; ``None`` keeps all of them.

        Returns:
            A DataFrame with the columns ``source`` (anchor markup removed)
            and ``count``, in ``value_counts`` order.
        """
        source_counts = self.df.source.value_counts()[:n]
        n_source = self.text_only_source(source_counts)
        return pd.DataFrame(n_source, columns=['source', 'count'])

    def top_official_twitter_apps(self):
        """Return the sources whose name contains ``'Twitter for'``.

        Returns:
            A DataFrame with the columns ``source``, ``count`` and
            ``percent``, where ``percent`` is each source's share of the
            combined count of the matching sources (not of all tweets).
        """
        sources = self.top_n_source()
        # Copy so the new column is added to an independent frame rather
        # than to a slice of ``sources``.
        platforms = sources[sources.source.str.contains('Twitter for')].copy()
        total_count = platforms['count'].sum()
        # 100.0 forces true division; integer operands would truncate the
        # percentages under Python 2.
        platforms['percent'] = platforms['count'] * 100.0 / total_count
        return platforms

    def text_only_source(self, series):
        """Split a source-count series into parallel, cleaned-up lists.

        Args:
            series: pandas Series mapping a raw source value (the index) to
                its count (the value).

        Returns:
            A dict with the keys ``'source'`` (each index entry passed
            through ``text_from_href``) and ``'count'`` (the matching
            values), both in the order of *series*.
        """
        refined_source = []
        counts = []
        # Pairing index and values by hand avoids Series.iteritems(), which
        # newer pandas no longer provides.
        for source, count in zip(series.index, series):
            refined_source.append(self.text_from_href(source))
            counts.append(count)
        return {'source': refined_source, 'count': counts}

    def text_from_href(self, source):
        """Return the text enclosed by an HTML anchor in *source*.

        Args:
            source: string that may contain ``<a ...>text</a>`` markup.

        Returns:
            The text between ``>`` and ``</a>``, or *source* unchanged when
            it contains no such markup.
        """
        match = re.search(r'>(.*)</a>', source)
        if match is None:
            return source
        return match.group(1)
if __name__ == "__main__":
    # Manual smoke run: print the official Twitter app breakdown for the
    # bundled JSON sample.
    SAMPLE_CSV_PATH = "sample/csv/moto_x.csv"
    SAMPLE_JSON_PATH = "sample/json/moto_x.json"
    analyzer = Analyzer(SAMPLE_JSON_PATH)
    # Parenthesised single-argument print is valid on both Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(analyzer.top_official_twitter_apps())