# Twitter Scraping
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from datetime import datetime
import json
import pandas as pd
import csv
import re
from textblob import TextBlob
import string
import preprocessor as p
import os
import time

# Twitter API credentials - use your own keys, and never commit real keys to a repo
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
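
# Optional sanity check (a small sketch, not in the original script):
# verify_credentials() fails fast if the keys above are wrong, so the
# problem surfaces here instead of partway through a scrape.
try:
    api.verify_credentials()
    print('Authentication OK')
except tweepy.TweepError as e:
    print('Authentication failed:', e)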
def scraptweets(search_words, date_since, numTweets, numRuns):
    # The API will not return a large number of tweets in a single call,
    # so collect them in numRuns batches at regular intervals.
    # Define a pandas DataFrame to store the data:
    db_tweets = pd.DataFrame(columns=['username', 'acctdesc', 'location', 'following',
                                      'followers', 'totaltweets', 'usercreatedts',
                                      'tweetcreatedts', 'retweetcount', 'text', 'hashtags'])
    program_start = time.time()
    for i in range(numRuns):
        # Time how long it takes to scrape tweets for each run:
        start_run = time.time()
        # Collect tweets using the Cursor object.
        # .Cursor() returns an object you can iterate over to access the data;
        # each item exposes attributes describing the tweet and its author.
        tweets = tweepy.Cursor(api.search, q=search_words, lang="en",
                               since=date_since, tweet_mode='extended').items(numTweets)
        # Store these tweets in a Python list
        tweet_list = [tweet for tweet in tweets]
        # Attributes extracted from each tweet:
        # user.screen_name - Twitter handle
        # user.description - account description
        # user.location - user-supplied location
        # user.friends_count - no. of accounts the user follows (following)
        # user.followers_count - no. of accounts following the user (followers)
        # user.statuses_count - total tweets by the user
        # user.created_at - when the user account was created
        # created_at - when the tweet was created
        # retweet_count - no. of retweets
        # (unused) user.favourites_count - total no. of tweets the user has liked
        # retweeted_status.full_text - full text of the original tweet, for retweets
        # entities['hashtags'] - hashtags in the tweet
        # Scrape the tweets individually:
        noTweets = 0
        for tweet in tweet_list:
            # Pull the values
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # not a retweet
                text = tweet.full_text
            # Gather the 11 variables for this tweet - ith_tweet:
            ith_tweet = [username, acctdesc, location, following, followers,
                         totaltweets, usercreatedts, tweetcreatedts,
                         retweetcount, text, hashtags]
            # Append to the DataFrame - db_tweets
            db_tweets.loc[len(db_tweets)] = ith_tweet
            # Increase the counter - noTweets
            noTweets += 1
        # Run ended:
        end_run = time.time()
        duration_run = round((end_run - start_run) / 60, 2)
        print('No. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        print('Time taken for run {} to complete is {} mins'.format(i + 1, duration_run))
        time.sleep(920)  # sleep just over 15 minutes to respect the rate-limit window
    # Once all runs have completed, save the tweets to a single CSV file.
    # Obtain a timestamp in a readable format:
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')
    # Define the working path and filename
    path = os.getcwd()
    filename = path + '/data/' + to_csv_timestamp + '_sahkprotests_tweets.csv'
    # Store the DataFrame in a CSV with a creation-date timestamp
    db_tweets.to_csv(filename, index=False)
    program_end = time.time()
    print('Scraping has completed!')
    print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
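
# A minimal usage sketch for scraptweets(). The hashtag, start date, and
# batch sizes below are illustrative assumptions, not values from the
# original script; note each run sleeps ~15 minutes, so uncomment only
# when you intend to run the full scrape.
# search_words = "#protests -filter:retweets"
# date_since = "2020-06-01"
# scraptweets(search_words, date_since, numTweets=500, numRuns=3)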
# Scrape a single user's timeline:
username = 'jack'
count = 150
try:
    # Build the query with the Cursor object
    tweets = tweepy.Cursor(api.user_timeline, id=username).items(count)
    # Pull information from the tweets iterable
    tweets_list = [[tweet.created_at, tweet.id, tweet.text] for tweet in tweets]
    # Create a DataFrame from the tweets list.
    # Add or remove columns to match the tweet fields you pull.
    tweets_df = pd.DataFrame(tweets_list)
except BaseException as e:
    print('Failed to fetch timeline tweets:', str(e))
    time.sleep(3)
# Scrape tweets matching a text query:
text_query = '2020 US Election'
count = 150
try:
    # Build the query with the Cursor object
    tweets = tweepy.Cursor(api.search, q=text_query).items(count)
    # Pull information from the tweets iterable
    tweets_list = [[tweet.created_at, tweet.id, tweet.text] for tweet in tweets]
    # Create a DataFrame from the tweets list.
    # Add or remove columns to match the tweet fields you pull.
    tweets_df = pd.DataFrame(tweets_list)
except BaseException as e:
    print('Failed to fetch search tweets:', str(e))
    time.sleep(3)
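
# TextBlob and the tweet-preprocessor module (imported above as p) are never
# used in this script; the sketch below shows one plausible way to apply them
# to the search results. It assumes tweets_df still has the unnamed positional
# columns created above, where column 2 holds the tweet text.
def add_sentiment(df):
    # Strip URLs, mentions, hashtags and emojis, then score polarity in [-1, 1]
    df['clean_text'] = df[2].astype(str).apply(p.clean)
    df['polarity'] = df['clean_text'].apply(lambda t: TextBlob(t).sentiment.polarity)
    return df

# tweets_df = add_sentiment(tweets_df)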