-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
194 lines (166 loc) · 7.69 KB
/
config.py
File metadata and controls
194 lines (166 loc) · 7.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/bin/python3
import os
import argparse
import re
# ---- Default configuration (every value below may be overridden by CLI args) ----

# API endpoint and credentials
api_url = 'https://api.curseforge.com/v1'
api_key = ''
api_key_file = 'api_key.txt'

# Output location and safety switch
output_dir = './curseforge'
dry_run = False

# Database / file-bucket connections (':memory:' until resolved from output_dir)
db_filename = 'curseforge.db'
db_filepath = ':memory:'
bucket_filename = 'bucket.db'
bucket_filepath = ':memory:'
bucket_module = 'file_bucket'

# Scrape filters and request pacing (432 = Minecraft on Curseforge)
category_filter = []
game_filter = [432]
wait_ms = 1000
retry_limit = 4
threshold = 1

# Request storage/cache modes (filled in from argparse choices)
store_option = None
cache_option = None
full = False

# Feature toggles for optional scrape/download passes
scrape_descriptions = False
scrape_changelogs = False
scrape_game_versions = False
scrape_all = False
download_media = False
download_files = False
download_all = False

# Single-instance enforcement via pid file
singleton = True
pid_file = 'curse.pid'
pid_arg = None

# Named scrape phases that may be skipped via --skip
skip = []
skippable = ['game_retrieve', 'game_iterate', 'category_retrieve', 'category_iterate', 'mod_iterate']
# ---- Command-line interface ----
# Each option's dest mirrors its short flag; parsed values are copied into the
# module-level configuration globals after parsing.
parser = argparse.ArgumentParser(
    prog="Python Curseforge Scraper",
    description="Scrape Curseforge API",
    epilog="https://github.com/MrDoritos/PythonCurseforgeScraper",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('-o', '--output-dir', default=output_dir, dest='o', help='directory for scraper output')
parser.add_argument('-k', '--api-key', dest='k', help='api key, ignores api_key_file')
parser.add_argument('--api-key-file', default=api_key_file, dest='kf', help='file with api key')
parser.add_argument('-u', '--api-url', default=api_url, dest='u', help='api endpoint base url')
parser.add_argument('-n', '--dry-run', action='store_true', dest='n', help='do not write to disk')
parser.add_argument('-df', '--database-filename', default=db_filename, dest='dbf', help='database connection filename')
parser.add_argument('-dp', '--database-filepath', dest='dbfp', help='full path to database connection, ignores database filename option')
parser.add_argument('-bf', '--bucket-filename', default=bucket_filename, dest='bf', help='filebucket connection filename')
parser.add_argument('-bp', '--bucket-filepath', dest='bfp', help='full path to filebucket connection, ignores bucket filename option')
parser.add_argument('-bm', '--bucket-module', default=bucket_module, dest='bm', help='filebucket python module override')
# default=None (not the module default) so "option absent" is detectable and
# the built-in filter lists are kept in that case.
parser.add_argument('-cf', '--category-filter', type=int, default=None, action='extend', nargs='*', dest='cf', help='category ids to collect')
parser.add_argument('-gf', '--game-filter', type=int, default=None, action='extend', nargs='*', dest='gf', help='game ids to collect')
parser.add_argument('-w', '--wait-ms', type=float, default=wait_ms, dest='w', help='wait time between requests in milliseconds')
parser.add_argument('-r', '--retry-limit', type=int, default=retry_limit, dest='r', help='number of retries for a failed request')
parser.add_argument('-t', '--stale-threshold', type=int, default=threshold, dest='threshold', help='Number of consecutive pages of stale data before leaving the current loop')
parser.add_argument('-s', '--store-option', default='default', choices=['none', 'default', 'all', 'last'], dest='store', help='request storage usage')
parser.add_argument('-c', '--cache-option', default='default', choices=['none', 'default', 'all', 'only'], dest='cache', help='request cache usage')
parser.add_argument('-f', '--full', action='store_true', dest='f', help='Enable when the final numbers show any discrepancies')
parser.add_argument('--scrape-descriptions', action='store_true', dest='sd', help='Scrape descriptions for each mod')
parser.add_argument('--scrape-changelogs', action='store_true', dest='sc', help='Scrape changelogs for each file')
parser.add_argument('--scrape-game-versions', action='store_true', dest='sgv', help='Scrape versions for each game')
parser.add_argument('--scrape-all', action='store_true', dest='scrape_all', help='Shorthand for all scrape flags')
parser.add_argument('--download-media', action='store_true', dest='dm', help='Download media/logo/image files for everything')
parser.add_argument('--download-files', action='store_true', dest='df', help='Download files')
parser.add_argument('-a', '--download-all', action='store_true', dest='da', help='Download and scrape everything')
parser.add_argument('-C', '--curl', dest='curl', help='wrapper to use api key with curl')
parser.add_argument('-pf', '--pid-file', default=pid_file, dest='pid_file', help='Read/write program pid at this file')
parser.add_argument('-p', '--pid', dest='pid', help="Use this pid for the singleton check")
# Fixed: the original used action='store_true' with default=True, which made
# --singleton a no-op (it could never be turned off). BooleanOptionalAction
# keeps --singleton working and adds --no-singleton to disable the check.
parser.add_argument('--singleton', action=argparse.BooleanOptionalAction, default=True, dest='singleton', help='Force single instance using pid')
parser.add_argument('--skip', default=skip, action='extend', nargs='+', choices=skippable, dest='skip', help='Skip individual scrape processes')
args = parser.parse_args()
# Echo the parsed namespace so runs are reproducible from the log.
print(args)

# Copy parsed arguments into the module-level configuration globals.
output_dir, api_url, api_key_file, api_key = args.o, args.u, args.kf, args.k
dry_run = args.n

# Filter options default to None, which means "keep the built-in filters".
if args.cf is not None:
    category_filter = args.cf
if args.gf is not None:
    game_filter = args.gf

db_filename, bucket_filename, bucket_module = args.dbf, args.bf, args.bm
wait_ms, retry_limit, threshold = args.w, args.r, args.threshold
store_option, cache_option = args.store, args.cache
full = args.f

scrape_descriptions, scrape_changelogs = args.sd, args.sc
scrape_game_versions, scrape_all = args.sgv, args.scrape_all
download_media, download_files, download_all = args.dm, args.df, args.da

pid_file, pid_arg, singleton = args.pid_file, args.pid, args.singleton
skip = args.skip
# Stateful
def pid_check(pid):
    """Return True if a process with *pid* appears to be running.

    Sends signal 0 as an existence probe. Fixed: PermissionError (a subclass
    of OSError) means the process exists but belongs to another user, so it
    now counts as running; the original reported it as "not running".
    """
    try:
        os.kill(int(pid), 0)
    except ProcessLookupError:
        # No such process.
        return False
    except PermissionError:
        # Process exists; we merely lack permission to signal it.
        return True
    except OSError:
        # Any other platform-specific failure: conservatively report not
        # running, matching the original catch-all behaviour.
        return False
    return True
class CleanPid():
    """Guard object that removes the pid-file lock when it is destroyed.

    A module-level instance is created after the pid file is written, so the
    lock file is cleared when the interpreter shuts down.
    """

    def __init__(self):
        pass

    def __del__(self):
        # __del__ may run during interpreter shutdown, when module globals
        # (os, pid_file) can already be torn down. Swallow any failure rather
        # than emit an unraisable-exception warning at exit.
        try:
            if pid_file and len(pid_file) and os.path.exists(pid_file):
                print("Clearing pid file lock")
                os.remove(pid_file)
        except Exception:
            pass
# ---- Single-instance (singleton) enforcement ----
__cleanpid = None
if singleton:
    # Nothing to check against: warn, but continue.
    if not (pid_arg and len(pid_arg)) and not (pid_file and len(pid_file)):
        print('No pid supplied, unsafe circumstances or remove the singleton flag')
    # An explicitly supplied pid (-p) takes precedence over the pid file.
    if (pid_arg and len(pid_arg) and pid_check(pid_arg)):
        print(f'Program is already running. Pid {pid_arg} from pid arg')
        exit(1)
    if pid_file and len(pid_file):
        if os.path.exists(pid_file):
            # Fixed: close the pid file after reading (the original leaked
            # the handle from a bare open()).
            with open(pid_file) as f:
                pid = f.read().strip()
            if pid_check(pid):
                print(f'Program is already running. Pid {pid} from pid_file arg')
                exit(1)
        # write current pid so later instances refuse to start
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))
        # Keep the guard alive for the life of the process; its __del__
        # removes the pid file on shutdown.
        __cleanpid = CleanPid()
# ---- API key resolution: -k wins; otherwise read the key from api_key_file ----
if not (api_key and len(api_key)):
    if not (api_key_file and len(api_key_file)):
        print('No api_key or api_key_file')
        exit(1)
    if not os.path.isfile(api_key_file):
        print('Bad api_key_file path')
        exit(1)
    # Fixed: close the key file after reading (the original leaked the
    # handle from a bare open()).
    with open(api_key_file) as f:
        api_key = f.read().strip()
# ---- Derived paths and shorthand-flag fan-out ----
if not dry_run:
    os.makedirs(output_dir, exist_ok=True)

# Resolve database/bucket paths inside output_dir unless an explicit full
# path (-dp / -bp) was given. Fixed: the original never assigned the explicit
# path, so supplying -dp/-bp left the filepath at ':memory:' despite the help
# text saying the full path is used. os.path.join replaces '+' concatenation.
if args.dbfp:
    db_filepath = args.dbfp
else:
    db_filepath = os.path.join(output_dir, db_filename)
if args.bfp:
    bucket_filepath = args.bfp
else:
    bucket_filepath = os.path.join(output_dir, bucket_filename)

# --scrape-all implies every individual scrape flag.
if scrape_all:
    scrape_descriptions = True
    scrape_changelogs = True
    scrape_game_versions = True

# --download-all implies all scrape flags plus both download flags.
if download_all:
    scrape_descriptions = True
    scrape_changelogs = True
    scrape_game_versions = True
    download_media = True
    download_files = True
# ---- Optional curl wrapper: build and exec a curl command with auth headers ----
if args.curl:
    # Escape '$' so the shell run by os.system does not expand it inside the
    # double-quoted header. Fixed: hoisted out of the f-string — reusing the
    # outer quote character and using a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12.
    escaped_key = api_key.replace('$', '\\$')
    curl_opts = 'curl '
    curl_opts += '--http1.1 -v '
    curl_opts += f'-H "x-api-key: {escaped_key}" '
    curl_opts += '-H "Accept: application/json" '
    curl_opts += '-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 OverwolfClient/0.204.0.1" '
    curl_opts += '-H "Authorization: OAuth" '
    curl_opts += '-H "X-Twitch-Id:" '
    curl_opts += '-H "Accept-Encoding: utf-8" '
    curl_opts += args.curl
    # NOTE(review): os.system builds a shell string from operator-supplied
    # values (args.curl, api key); quoting is fragile — consider
    # subprocess.run with an argument list if this ever takes untrusted input.
    exit(os.system(curl_opts))