-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
124 lines (108 loc) · 3.71 KB
/
main.py
File metadata and controls
124 lines (108 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from selenium import webdriver
import selenium.common.exceptions
import tldextract
import os
import re
try:
import Config
except ImportError:
from js_spider import Config
if Config.browser == 'Firefox':
from selenium.webdriver.firefox.options import Options
def get_domain(url:str):
    """Split *url* into its parts using ``tldextract.extract``.

    Returns:
        The object produced by ``tldextract.extract(url)``, or ``None`` when
        the extraction raises any ``Exception``.
    """
    try:
        return tldextract.extract(url)
    except Exception:
        # Any extraction failure is reported to the caller as "no domain".
        return None
# Directory in which the captured pages are stored.
result_dir = Config.save_dir
# URL the crawl starts from.
target = Config.target_url

domain = get_domain(target)
# get_domain() only returns None when the extraction itself raised.
# NOTE(review): tldextract is understood to return a result with an empty
# ``domain`` part (instead of failing) for input it cannot parse, so an empty
# domain is rejected here as well - confirm against the tldextract docs.
if domain is None or not domain.domain:
    print('{url} is not a valid URL'.format(url=target))
    # ``exit()`` is a helper injected by the ``site`` module and is not
    # guaranteed to exist; raising SystemExit has the same effect.
    raise SystemExit(2)

# exist_ok=True makes a separate "is it already a directory?" check redundant.
os.makedirs(result_dir, mode=0o777, exist_ok=True)
def make_driver():
    """Create the Selenium WebDriver selected by ``Config.browser``.

    When ``Config.headless`` is truthy the ``--headless`` argument is added
    to the browser options before the driver is started.

    Returns:
        A new ``webdriver.Chrome`` or ``webdriver.Firefox`` instance.

    Raises:
        ValueError: If ``Config.browser`` is neither ``'Chrome'`` nor
            ``'Firefox'`` (previously this surfaced as an UnboundLocalError
            on the ``return`` statement).
    """
    if Config.browser == 'Chrome':
        options = webdriver.ChromeOptions()
        driver_factory = webdriver.Chrome
    elif Config.browser == 'Firefox':
        # ``Options`` is the Firefox options class imported at module level
        # when Config.browser == 'Firefox'.
        options = Options()
        driver_factory = webdriver.Firefox
    else:
        raise ValueError(
            "Unsupported browser {browser!r}; expected 'Chrome' or 'Firefox'".format(
                browser=Config.browser
            )
        )

    if Config.headless:
        options.add_argument('--headless')

    # ``options=`` is the supported keyword; ``chrome_options=`` and
    # ``firefox_options=`` are deprecated spellings of the same argument.
    return driver_factory(options=options)
# Compile the configured ignore pattern once at start-up.
if Config.ignore_pattern:
    ignore_pattern = re.compile(Config.ignore_pattern)
else:
    # No pattern configured: use a nonsense pattern that should never occur
    # in a URL. Raw string so that ``\.`` is not an invalid string escape.
    ignore_pattern = re.compile(r'^\.\.\.\.\.\.')
# Crawl queue: ``urls`` is processed in order and grows as new same-domain
# links are discovered; ``index`` is the position of the next URL to visit.
urls = [target]
seen_urls = {target}  # mirrors ``urls`` for O(1) "already queued?" checks
index = 0
error_count = 0
error_files = []  # never populated; retained as a module-level name
errors = {}  # url -> exception or message explaining why it was not archived

# Matches a trailing ".something" after the last "/" of a path.
_HAS_EXTENSION = re.compile(r'\.[^/]+$')
# Locator strategy string for tag-name lookups (the value of selenium's
# ``By.TAG_NAME``); ``find_elements(by, value)`` replaces the
# ``find_elements_by_tag_name`` shortcut that newer Selenium releases dropped.
_BY_TAG_NAME = 'tag name'


def _archive_path(url):
    """Return the path under ``result_dir`` where the page at *url* is saved."""
    # NOTE(review): paths are built by plain concatenation, so
    # Config.save_dir presumably ends with a path separator - confirm.
    if url == target:
        return result_dir + domain.domain + ".html"
    url_base = url.replace("https://", "").replace("http://", "")
    file_path = result_dir + url_base
    if file_path.endswith("/"):
        return file_path[:-1] + ".html"
    if not _HAS_EXTENSION.search(file_path):
        return file_path + ".html"
    return file_path


def _save_page(file_path, page_source):
    """Write *page_source* to *file_path* without overwriting existing files.

    Returns ``None`` on success, otherwise the exception (or message string)
    describing why the page was not written.
    """
    directory = os.path.dirname(file_path)
    if directory:
        try:
            os.makedirs(directory, exist_ok=True)
        except (FileExistsError, NotADirectoryError) as e:
            # A component of the directory path already exists as a file.
            return e
    if os.path.isfile(file_path):
        message = "File already exists: " + str(file_path)
        print(message)
        return message
    try:
        with open(file_path, 'w+', encoding="utf-8") as f:
            f.write(page_source)
    except (NotADirectoryError, IsADirectoryError) as e:
        return e
    return None


def _restart_driver(old_driver):
    """Close *old_driver* on a best-effort basis and return a fresh driver."""
    try:
        old_driver.quit()
    except Exception as quit_error:
        # The browser may already be gone; report it and carry on.
        print("Could not close the previous browser: " + str(quit_error))
    return make_driver()


driver = make_driver()
try:
    while index < len(urls):
        url = urls[index]
        print("Starting to process " + str(url))
        try:
            driver.get(url)
        except selenium.common.exceptions.WebDriverException as e:
            error_count += 1
            errors[url] = e
            driver = _restart_driver(driver)
            # Move on to the next URL; without this the same failing URL
            # would be retried forever.
            index += 1
            continue

        page_source = driver.page_source
        file_path = _archive_path(url)
        save_error = _save_page(file_path, page_source)
        if save_error is not None:
            error_count += 1
            errors[url] = save_error
        else:
            # Links are only followed for pages that were written just now.
            new_urls = 0
            for link in driver.find_elements(_BY_TAG_NAME, 'a'):
                href = link.get_attribute('href')
                if not isinstance(href, str) or not href.startswith('http'):
                    continue
                # Use the compiled pattern, which has a never-matching
                # fallback when Config.ignore_pattern is not set.
                if ignore_pattern.search(href) or href in seen_urls:
                    continue
                link_domain = get_domain(href)
                if link_domain is None or link_domain.domain != domain.domain:
                    continue
                urls.append(href)
                seen_urls.add(href)
                new_urls += 1
            print("Found " + str(new_urls) + " new urls in " + str(url))

        index += 1
        pages_to_go = len(urls) - index
        print(str(index) + " pages archived")
        print(str(pages_to_go) + " pages left to process")
finally:
    # Always shut the browser down, even if the crawl is interrupted.
    driver.quit()

print("Archiving complete with " + str(error_count) + " errors")
print("Errors:")
print(errors)