-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtweet_scraper.py
More file actions
100 lines (72 loc) · 2.84 KB
/
tweet_scraper.py
File metadata and controls
100 lines (72 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import GetOldTweets3 as got
import pandas as pd
import regex as re
import string
def pull_tweets(username, start_date, end_date):
    """
    Pull historical tweets for a single user.

    username (string) = Twitter username
    start_date (string) = YYYY-MM-DD
    end_date (string) = YYYY-MM-DD

    Returns a DataFrame with 'date' and 'text' columns. Columns are named
    so downstream code that accesses the text column by name works.
    """
    # Creation of query object
    tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(start_date).setUntil(end_date)
    # Creation of list that contains all tweets
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    # Create df for tweets; name the columns (previously left as 0/1,
    # which broke column access by name, e.g. df['text'])
    user_tweets = pd.DataFrame([[tweet.date, tweet.text] for tweet in tweets],
                               columns=['date', 'text'])
    return user_tweets
def query_tweets(query, start_date, end_date):
    """
    Pull historical tweets matching a search keyword.

    query (string) = Search keyword
    start_date (string) = YYYY-MM-DD
    end_date (string) = YYYY-MM-DD

    Returns a DataFrame with 'date', 'text' and 'username' columns.
    Columns are named so downstream code that accesses the text column
    by name works.
    """
    # Creation of query object
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query).setSince(start_date).setUntil(end_date)
    # Creation of list that contains all tweets
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    # Create df for tweets; name the columns (previously left as 0/1/2,
    # which broke column access by name, e.g. df['text'])
    user_tweets = pd.DataFrame([[tweet.date, tweet.text, tweet.username] for tweet in tweets],
                               columns=['date', 'text', 'username'])
    return user_tweets
def clean_tweets(df, tweets):
    """
    Clean a tweet-text column in place and return the DataFrame.

    df = dataframe containing tweets
    tweets (string) = column name containing tweet text

    Lowercases the text, strips URLs / picture links / hashtags /
    ampersands / single quotes, collapses whitespace, and drops rows
    whose cleaned text duplicates an earlier row.
    """
    # lowercase text
    df[tweets] = df[tweets].str.lower()
    # remove URLs (raw strings so backslashes reach the regex engine intact)
    df[tweets] = df[tweets].map(lambda x: re.sub(r'http[s]?://[^\s]*', ' ', x))
    # remove URL cutoffs: text glued onto a literal backslash.  The old
    # non-raw pattern '\\[^\s]*' parsed as literal '[' + a mid-string '^'
    # anchor and could never match anything.
    df[tweets] = df[tweets].map(lambda x: re.sub(r'\\[^\s]*', ' ', x))
    # replace newlines with spaces
    df[tweets] = df[tweets].map(lambda x: re.sub(r'\n', ' ', x))
    # remove picture URLs (dots escaped so only literal dots match)
    df[tweets] = df[tweets].map(lambda x: re.sub(r'pic\.twitter\.com/[^\s]*', ' ', x))
    # remove blog/map info links
    df[tweets] = df[tweets].map(lambda x: re.sub(r'blog/maps/info/[^\s]*', ' ', x))
    # remove hashtags
    df[tweets] = df[tweets].map(lambda x: re.sub(r'#\w*', '', x))
    # remove ampersands
    df[tweets] = df[tweets].map(lambda x: re.sub(r'&', '', x))
    # remove single quotations (the original ran this identical sub twice)
    df[tweets] = df[tweets].map(lambda x: x.replace("'", ""))
    # collapse short runs of whitespace into a single space
    df[tweets] = df[tweets].map(lambda x: re.sub(r'\s{2,6}', ' ', x))
    # drop duplicate rows -- was hard-coded to subset='text', which raised
    # KeyError for any other column name; use the caller-supplied column
    df.drop_duplicates(subset=tweets, keep='first', inplace=True)
    # delete any remaining long whitespace runs
    df[tweets] = df[tweets].map(lambda x: re.sub(r'\s{3,20}', '', x))
    return df