#!/usr/bin/env python3
"""
TorCrawl.py is a Python script to crawl and extract (regular or onion)
webpages through the Tor network.

usage: python torcrawl.py [options]
       python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion
       python torcrawl.py -v -w -u http://www.github.com -o github.htm
       python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
       python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub

General:
-h, --help           : Help
-v, --verbose        : Show more information about the progress
-u, --url *.onion    : URL of webpage to crawl or extract
-w, --without        : Connect directly, without using the Tor network
-rua, --random-ua    : Enable random user-agent rotation for requests
-rpr, --random-proxy : Enable random proxy rotation from res/proxies.txt
-px, --proxy         : IP address of the SOCKS5 proxy
-pr, --proxyport     : Port of the SOCKS5 proxy
-V, --version        : Show version and exit

Extract:
-e, --extract           : Extract page's code to terminal or file
                          (Default: terminal)
-i, --input filename    : Input file with URL(s) (one per line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara              : Yara keyword search for page categorisation,
                          read in from the /res folder.
                          'h' searches the whole HTML object,
                          't' searches only the text.

Crawl:
-c, --crawl           : Crawl website (Default output: /links.txt)
-d, --depth           : Set depth of the crawl (Default: 1)
-p, --pause           : Seconds the crawler pauses between requests (Default: 0)
-f, --folder          : The directory that will contain the generated files
-j, --json            : Export crawl findings to JSON in addition to txt outputs
-x, --xml             : Export crawl findings to XML in addition to txt outputs
-DB, --database       : Export crawl findings and link graph to an SQLite database
-vis, --visualization : Generate HTML visualization (requires -DB)
-l, --log             : Log visited URLs and their response codes

GitHub: github.com/MikeMeliz/TorCrawl.py
License: GNU General Public License v3.0
"""
import argparse
import datetime
import os
import socket
import sys

import socks  # noqa - pysocks

# TorCrawl modules
from modules.checker import check_ip
from modules.checker import check_tor
from modules.checker import extract_domain
from modules.checker import folder
from modules.checker import url_canon
from modules.crawler import Crawler
from modules.export import export_json, export_xml, export_database
from modules.extractor import extractor
from modules.visualization import export_visualization

__version__ = "1.35"


# Set up the socket so connections are routed through the Tor network.
def connect_tor(proxy_url, proxy_port):
    """ Connect to Tor and perform DNS resolution through the socket.

    :return: None. Prints an error message if the connection fails.
    """
    try:
        # Set the SOCKS5 proxy and wrap the socket module
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, proxy_url, proxy_port)
        socket.socket = socks.socksocket

        # Resolve hostnames through the SOCKS socket to avoid DNS leaks
        def getaddrinfo(*args):  # noqa
            return [(socket.AF_INET, socket.SOCK_STREAM, 6, '',
                     (args[0], args[1]))]

        socket.getaddrinfo = getaddrinfo  # noqa
    except socks.HTTPError as err:
        print("## Error: Cannot establish connection with Tor\n"
              f"HTTPError: {err}")


def main():
    """ Main method of the TorCrawl application. Collects and parses the
    arguments and instructs the rest of the application on how to run.

    :return: None
    """
    # Get arguments with argparse.
    parser = argparse.ArgumentParser(
        description="TorCrawl.py is a Python script to crawl and extract "
                    "(regular or onion) webpages through the Tor network.")
    # General
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Show more information about the progress'
    )
    parser.add_argument(
        '-u',
        '--url',
        help='URL of webpage to crawl or extract'
    )
    parser.add_argument(
        '-w',
        '--without',
        action='store_true',
        help='Connect directly, without using the Tor network'
    )
    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f"%(prog)s {__version__}"
    )
    # Extract
    parser.add_argument(
        '-e',
        '--extract',
        action='store_true',
        help='Extract page\'s code to terminal or file'
    )
    parser.add_argument(
        '-i',
        '--input',
        help='Input file with URL(s) (one per line)'
    )
    parser.add_argument(
        '-o',
        '--output',
        help='Output page(s) to file(s) (for one page)'
    )
    # Crawl
    parser.add_argument(
        '-c',
        '--crawl',
        action='store_true',
        help='Crawl website (Default output: /links.txt)'
    )
    parser.add_argument(
        '-d',
        '--depth',
        help='Set depth of the crawl (Default: 1)'
    )
    parser.add_argument(
        '-p',
        '--pause',
        help='Seconds the crawler pauses between requests (Default: 0)'
    )
    parser.add_argument(
        '-l',
        '--log',
        action='store_true',
        help='Keep a log of which URLs were visited and their '
             'response codes'
    )
    parser.add_argument(
        '-f',
        '--folder',
        help='The root directory that will contain the generated files'
    )
    parser.add_argument(
        '-j',
        '--json',
        dest='json_export',
        action='store_true',
        help='Export crawl findings to JSON in addition to txt outputs'
    )
    parser.add_argument(
        '-x',
        '--xml',
        dest='xml_export',
        action='store_true',
        help='Export crawl findings to XML in addition to txt outputs'
    )
    parser.add_argument(
        '-DB',
        '--database',
        dest='database_export',
        action='store_true',
        help='Export crawl findings and link graph to an SQLite database'
    )
    parser.add_argument(
        '-vis',
        '--visualization',
        dest='visualization',
        action='store_true',
        help='Generate HTML visualization from the SQLite database '
             '(requires -DB)'
    )
    parser.add_argument(
        '-y',
        '--yara',
        help='Check for keywords and only scrape documents that contain a '
             'match: \'h\' searches the whole HTML object, \'t\' searches '
             'only the text.'
    )
    parser.add_argument(
        '-rua',
        '--random-ua',
        action='store_true',
        help='Enable random user-agent rotation for requests'
    )
    parser.add_argument(
        '-rpr',
        '--random-proxy',
        action='store_true',
        help='Enable random proxy rotation from res/proxies.txt'
    )
    parser.add_argument(
        '-pr',
        '--proxyport',
        type=int,
        default=9050,
        help='Port of the SOCKS5 proxy (Default: 9050)'
    )
    parser.add_argument(
        '-px',
        '--proxy',
        default='127.0.0.1',
        help='IP address of the SOCKS5 proxy (Default: 127.0.0.1)'
    )
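
    # Note: argparse returns non-flag option values as strings unless type=
    # is given, so --depth and --pause are cast to numbers below before
    # being handed to the crawler.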
    args = parser.parse_args()

    now = datetime.datetime.now().strftime("%y%m%d")
    results_prefix = f"{now}_results"

    # Canonicalize the web URL and create a path for the output.
    website = ''
    output_folder = ''
    url_arg = args.url.strip() if args.url else ''
    if args.input:
        # URLs come from the input file; nothing to canonicalize here.
        pass
    elif len(url_arg) > 0:
        website = url_canon(url_arg, args.verbose)
        if args.folder is not None:
            output_folder = folder(args.folder, args.verbose)
        else:
            output_folder = folder(extract_domain(website), args.verbose)
    else:
        print("## ERROR: URL is required unless --input is provided.")
        sys.exit(2)

    # Parse arguments to variables, else initialize defaults.
    input_file = args.input if args.input else ''
    output_file = args.output if args.output else ''
    depth = int(args.depth) if args.depth else 1
    pause = float(args.pause) if args.pause else 0
    selection_yara = args.yara if args.yara else None
    random_ua = args.random_ua
    random_proxy = args.random_proxy

    # Visualization requires database export.
    if args.visualization and not args.database_export:
        print("## Visualization requires --database (-DB) to generate "
              "the SQLite file.")
        sys.exit(2)
    # Random proxy rotation only works when Tor is disabled.
    if random_proxy and args.without is False:
        print("## Warning: Random proxy rotation requires the --without "
              "(-w) flag to disable Tor.")
        print("## Random proxy rotation disabled. Using Tor instead.")
        random_proxy = False

    # Connect to Tor, unless proxies are rotated or Tor is disabled.
    if random_proxy:
        # Random proxy rotation enabled; a proxy is picked per request.
        if args.verbose:
            print("## Random proxy rotation enabled (Tor disabled)")
    elif args.without is False:
        check_tor(args.verbose)
        connect_tor(args.proxy, args.proxyport)

    if args.verbose:
        check_ip()
        if args.url:
            print(f"## URL: {args.url}")
    if args.crawl:
        crawler = Crawler(website, depth, pause, output_folder, args.log,
                          args.verbose, random_ua, random_proxy)
        lst = crawler.crawl()

        if args.input is None:
            input_file = output_folder + '/' + now + '_links.txt'
            with open(input_file, 'w+', encoding='UTF-8') as file:
                for item in lst:
                    file.write(f"{item}\n")
            print(f"## File created on {os.getcwd()}/{input_file}")

        if args.extract:
            extractor(website, args.crawl, output_file, input_file,
                      output_folder, selection_yara, random_ua, random_proxy)
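
        # The payload returned by crawler.export_payload() feeds the
        # exporters below; judging from the keys used at these call sites
        # (not a documented contract), it carries the crawl results
        # ("data"), the link graph ("edges"), page titles, discovered
        # resources, and the start URL.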
        payload = crawler.export_payload()
        if args.json_export:
            export_json(output_folder, results_prefix, payload["data"],
                        verbose=args.verbose)
        if args.xml_export:
            export_xml(output_folder, results_prefix, payload["data"],
                       verbose=args.verbose)
        if args.database_export:
            export_database(output_folder, results_prefix, payload["data"],
                            payload["edges"], payload["titles"],
                            payload["resources"], verbose=args.verbose)
        if args.visualization:
            export_visualization(output_folder, results_prefix,
                                 payload["start_url"], verbose=args.verbose)
    else:
        extractor(website, args.crawl, output_file, input_file,
                  output_folder, selection_yara, random_ua, random_proxy)


# Entry point when executed as a script.
if __name__ == "__main__":
    main()