# main.py — Reddit media/text downloader (214 lines, 8.4 KB)
import praw
import requests
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from praw.exceptions import RedditAPIException, ClientException, PRAWException
def read_api_credentials():
    """Load Reddit API credentials from ``App.txt`` in the working directory.

    App.txt must contain at least three lines, in order: client id,
    client secret, user agent.

    Returns:
        dict: keys ``client_id``, ``client_secret``, ``user_agent``.

    Raises:
        SystemExit: if App.txt is missing or has fewer than three lines.
    """
    try:
        with open('App.txt', 'r') as file:
            lines = file.readlines()
        # validate after the file is closed; no need to hold the handle open
        if len(lines) < 3:
            raise ValueError("App.txt should contain at least 3 lines")
        return {
            'client_id': lines[0].strip(),
            'client_secret': lines[1].strip(),
            'user_agent': lines[2].strip()
        }
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {str(e)}")
        # exit() is a site.py interactive convenience; SystemExit is the
        # reliable way to terminate from script code
        raise SystemExit(1)
def get_reddit_instance():
    """Build an authenticated PRAW client from the App.txt credentials.

    Terminates the program if the Reddit instance cannot be created.
    """
    creds = read_api_credentials()
    try:
        reddit = praw.Reddit(
            client_id=creds['client_id'],
            client_secret=creds['client_secret'],
            user_agent=creds['user_agent']
        )
    except (ClientException, PRAWException) as e:
        print(f"Error creating Reddit instance: {str(e)}")
        exit(1)
    return reddit
def get_downloads_folder(content_type):
    """Return (creating it if needed) the download folder for a content type.

    Folders live under ``Reddit_downloads/<ContentType>/`` next to this script.

    Args:
        content_type: e.g. ``'media'`` or ``'text'``; capitalized for the
            on-disk folder name.

    Returns:
        str: absolute path of the content folder.

    Raises:
        SystemExit: if the directories cannot be created.
    """
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        base_folder = os.path.join(script_dir, 'Reddit_downloads')
        content_folder = os.path.join(base_folder, content_type.capitalize())
        os.makedirs(content_folder, exist_ok=True)
        return content_folder
    # narrowed from bare Exception: only filesystem errors are expected here,
    # and a broad catch would hide programming errors
    except OSError as e:
        print(f"Error creating download folders: {str(e)}")
        raise SystemExit(1)
def download_file(url, filename, download_folder):
    """Stream the file at *url* into *download_folder* as *filename*.

    Args:
        url: direct link to the media file.
        filename: basename to save under.
        download_folder: existing directory to write into.

    Returns:
        bool: True on success, False on any HTTP/network error.
    """
    try:
        # timeout prevents a stalled server from hanging a worker thread
        # forever; the with-block guarantees the connection is released
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            file_path = os.path.join(download_folder, filename)
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Downloaded {filename} to {file_path}")
        return True
    except requests.RequestException as e:
        print(f"Error downloading file {filename}: {str(e)}")
        return False
def save_text(selftext, filename, download_folder):
    """Write a post's selftext to *download_folder*/*filename* as UTF-8.

    Args:
        selftext: body text of the submission.
        filename: basename to save under.
        download_folder: existing directory to write into.

    Returns:
        bool: True on success, False if the file cannot be written.
    """
    try:
        file_path = os.path.join(download_folder, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(selftext)
        print(f"Saved text post to {file_path}")
        return True
    except IOError as e:
        # name the file in the message (was garbled to "(unknown)")
        print(f"Error saving text file {filename}: {str(e)}")
        return False
def get_available_flairs(subreddit):
    """Collect the distinct link flairs seen on the subreddit's hot posts.

    Scans up to 100 hot submissions; posts without a flair are skipped.

    Returns:
        list: unique flair strings (empty list on API error).
    """
    try:
        found = {
            post.link_flair_text
            for post in subreddit.hot(limit=100)
            if post.link_flair_text
        }
        return list(found)
    except RedditAPIException as e:
        print(f"Error fetching flairs: {str(e)}")
        return []
def countdown_timer(seconds):
    """Block for *seconds* seconds, updating a one-line countdown display."""
    remaining = seconds
    while remaining > 0:
        print(f"\rCooldown: {remaining} seconds remaining", end='', flush=True)
        time.sleep(1)
        remaining -= 1
    print("\rCooldown complete. Resuming downloads... ")
def scrape_reddit(subreddit_name, count, num_threads, download_type, flair=None):
    """Fetch up to *count* hot posts from a subreddit and download their content.

    Args:
        subreddit_name: display name of the subreddit to scrape.
        count: maximum number of hot posts to fetch.
        num_threads: worker-thread count for concurrent downloads.
        download_type: 'media', 'text', or 'both'.
        flair: if given, only posts whose link flair matches exactly are kept.
    """
    reddit = get_reddit_instance()
    try:
        subreddit = reddit.subreddit(subreddit_name)
        posts = list(subreddit.hot(limit=count))
        # drop posts that don't carry the requested flair (flair=None keeps all)
        posts = [submission for submission in posts if (not flair or submission.link_flair_text == flair)]
    except RedditAPIException as e:
        print(f"Error accessing subreddit {subreddit_name}: {str(e)}")
        return
    total_downloads = 0
    total_processed = 0
    # process in batches of 100 with a cooldown between full batches
    batch_size = 100
    for i in range(0, len(posts), batch_size):
        batch = posts[i:i+batch_size]
        batch_downloads = 0
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = []
            for submission in batch:
                # media: only direct links ending in a known image/video extension
                if download_type in ['media', 'both'] and hasattr(submission, 'url') and submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif', '.mp4')):
                    media_name = os.path.basename(submission.url)
                    media_folder = get_downloads_folder('Media')
                    futures.append(executor.submit(download_file, submission.url, media_name, media_folder))
                # text: save the self-post body, keyed by submission id
                if download_type in ['text', 'both'] and submission.selftext:
                    text_filename = f"{submission.id}_text.txt"
                    text_folder = get_downloads_folder('Text')
                    futures.append(executor.submit(save_text, submission.selftext, text_filename, text_folder))
            for future in as_completed(futures):
                try:
                    result = future.result()
                    # download_file/save_text return True on success
                    if result:
                        batch_downloads += 1
                except Exception as e:
                    print(f"Error in future task: {str(e)}")
        total_downloads += batch_downloads
        total_processed += len(batch)
        print(f"\nProcessed {len(batch)} posts in this batch.")
        print(f"Completed {batch_downloads} downloads in this batch. Total downloads: {total_downloads}")
        print(f"Total posts processed: {total_processed}")
        # cool down only after a full-size batch, and only if more remain
        if total_processed < count and total_processed % batch_size == 0:
            print(f"Starting cooldown...")
            countdown_timer(60)
    print(f"\nAll posts processed. Total posts: {total_processed}")
    print(f"Total successful downloads: {total_downloads}")
def print_title():
    """Print the ASCII-art program banner."""
    title = r"""
 ____ _ _ _ _ ____ _ _
| _ \ ___ __| | __| (_) |_ | _ \ _____ ___ __ | | ___ __ _ __| | ___ _ __
| |_) / _ \ / _` |/ _` | | __| | | | |/ _ \ \ /\ / / '_ \| |/ _ \ / _` |/ _` |/ _ \ '__|
| _ < (_) | (_| | (_| | | |_ | |_| | (_) \ V V /| | | | | (_) | (_| | (_| | __/ |
|_| \_\___/ \__,_|\__,_|_|\__| |____/ \___/ \_/\_/ |_| |_|_|\___/ \__,_|\__,_|\___|_|
- made by Drew
"""
    print(title)
def search_subreddits(query):
    """Search Reddit for subreddits matching *query*.

    Returns:
        list: up to 10 matching Subreddit objects (empty list on API error).
    """
    reddit = get_reddit_instance()
    try:
        results = reddit.subreddits.search(query, limit=10)
        return list(results)
    except RedditAPIException as e:
        print(f"Error searching subreddits: {str(e)}")
        return []
def main():
    """Interactive entry point: pick a subreddit, choose filters, download."""
    print_title()
    search_query = input("Enter a search query for subreddits: ")
    matching_subreddits = search_subreddits(search_query)
    if not matching_subreddits:
        print("No matching subreddits found.")
        return
    print("Top matching subreddits:")
    for i, subreddit in enumerate(matching_subreddits, 1):
        print(f"{i}. r/{subreddit.display_name} - {subreddit.title}")
    # int(input(...)) used to crash with an uncaught ValueError on
    # non-numeric input; treat that as an invalid choice instead
    try:
        choice = int(input("Enter the number of the subreddit you want to download from: "))
    except ValueError:
        print("Invalid choice.")
        return
    if choice < 1 or choice > len(matching_subreddits):
        print("Invalid choice.")
        return
    selected_subreddit = matching_subreddits[choice - 1]
    subreddit_name = selected_subreddit.display_name
    download_type = input("What do you want to download? ([M]edia [T]ext [B]oth): ").lower()
    if download_type not in ['m', 't', 'b']:
        print("Invalid choice for download type.")
        return
    try:
        count = int(input("Enter the number of posts to process: "))
        num_threads = int(input("Enter the number of concurrent downloads: "))
    except ValueError:
        print("The number of posts and concurrent downloads must be positive integers.")
        return
    if count <= 0 or num_threads <= 0:
        print("The number of posts and concurrent downloads must be positive integers.")
        return
    # expand the single-letter choice into the name scrape_reddit() expects
    download_type = {'m': 'media', 't': 'text', 'b': 'both'}[download_type]
    available_flairs = get_available_flairs(selected_subreddit)
    flair = None
    if available_flairs:
        print("Available flairs:")
        for i, flair_text in enumerate(available_flairs, 1):
            print(f"{i}. {flair_text}")
        flair_choice = input("Enter the number of the flair you want to filter from (Leave blank for all): ")
        # blank or out-of-range input means "no flair filter"
        if flair_choice.isdigit() and 1 <= int(flair_choice) <= len(available_flairs):
            flair = available_flairs[int(flair_choice) - 1]
    scrape_reddit(subreddit_name, count, num_threads, download_type, flair)
# Run the interactive downloader only when executed as a script.
if __name__ == "__main__":
    main()