-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathwebpagedownloader.py
More file actions
89 lines (79 loc) · 4.54 KB
/
webpagedownloader.py
File metadata and controls
89 lines (79 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import shutil
import sys
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, quote, urljoin
from urllib.request import urlopen, urlretrieve, Request

from bs4 import BeautifulSoup
class WebPageDownloader():
    """Download a web page together with its script, stylesheet and image assets.

    The page at ``url`` is fetched and parsed when the object is constructed.
    ``save_all_assets()`` then downloads every matching asset into
    ``<data_dir>/js/``, ``<data_dir>/css/`` and ``<data_dir>/img/``, rewrites
    the references in the parsed document so they point at the local copies,
    and writes the resulting document to ``<data_dir>/index.html``.
    """

    # Browser-like User-Agent sent with every request.
    _USER_AGENT = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    )

    # For each asset type: the tag to look for, the attribute holding the
    # reference, and the extensions that qualify a reference for download.
    # The dict key doubles as the sub-directory name under ``data_dir``.
    _ASSET_TYPES = {
        'js': {'tag': 'script', 'attr': 'src', 'ext': ['.js']},
        'css': {'tag': 'link', 'attr': 'href', 'ext': ['.css']},
        'img': {'tag': 'img', 'attr': 'src', 'ext': ['.png', '.jpg', '.gif']},
    }

    # Local file names keep at most this many characters of the original stem.
    _MAX_STEM_LENGTH = 15

    def __init__(self, url, data_dir):
        """Fetch and parse ``url``; remember ``data_dir`` as the output directory.

        Args:
            url: Address of the page to download.
            data_dir: Directory that will receive ``index.html`` and the
                asset sub-directories.  An empty string means the current
                directory.
        """
        self.url = url
        # get_content() returns None when the request fails; that value is
        # handed to BeautifulSoup unchanged, exactly as before.
        self.html_source = self.get_content(self.url)
        self.soup = BeautifulSoup(self.html_source, 'html.parser')
        # Output paths are built by plain concatenation, so data_dir must
        # always end with a slash.  ``endswith`` (instead of indexing [-1])
        # also copes with an empty string.
        data_dir = data_dir or '.'
        self.data_dir = data_dir if data_dir.endswith('/') else data_dir + '/'

    def get_content(self, url):
        """Return the raw response body of ``url`` as bytes.

        HTTP and URL errors are reported on stdout and result in ``None``
        being returned instead of an exception being raised.
        """
        req = Request(url, headers={'User-Agent': self._USER_AGENT})
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(req) as res:
                return res.read()
        except HTTPError as err:
            print("HTTP error occured!\n\tError code : %s\n\tReason : %s" % (err.code , err.reason))
        except URLError as err:
            print("ERROR : %s" % (err.reason))
        return None

    def make_dir(self, path):
        """Create ``path`` (including missing parents) with an open umask.

        Returns:
            ``True`` if the directory was created, ``False`` if it already
            existed or could not be created (the error is printed).
        """
        if os.path.exists(path):
            return False
        # Directories are created with a zero umask, as before, but the
        # process-wide umask is restored afterwards so that it does not leak
        # into unrelated file creation.
        previous_umask = os.umask(0)
        try:
            os.makedirs(path)
            return True
        except OSError as e:
            # Python 3 exceptions have no ``.message``; format the exception.
            print("ERROR : %s" % e)
            return False
        finally:
            os.umask(previous_umask)

    def _local_file_name(self, href, extension):
        """Return the local file name for ``href``: truncated stem + extension.

        Only the path component of the URL is used, so query strings and
        fragments never end up in the file name.  The stem is the part of the
        last path segment before its first dot.
        """
        base_name = urlparse(href).path.split('/')[-1]
        stem = base_name.split('.')[0][:self._MAX_STEM_LENGTH]
        return stem + extension

    def _store_asset(self, asset_type, ref, extension):
        """Download the asset referenced by ``ref`` unless it is already on disk.

        Returns the local file name, or ``None`` when the download failed (in
        which case no file is created).
        """
        # urljoin resolves absolute, protocol-relative, root-relative and
        # document-relative references against the page URL.
        href = urljoin(self.url, ref)
        target_dir = self.data_dir + asset_type + '/'
        self.make_dir(target_dir)
        file_name = self._local_file_name(href, extension)
        asset_path = target_dir + file_name
        if not os.path.isfile(asset_path):
            content = self.get_content(href)
            if content is None:
                # Check before opening the file: an empty stub would make
                # every later run believe the asset had been downloaded.
                return None
            # Every asset is written as raw bytes so that the payload is
            # stored unchanged regardless of its text encoding.
            with open(asset_path, 'wb') as f:
                f.write(content)
        return file_name

    def save_all_assets(self):
        """Download all JS/CSS/image assets and write the localized ``index.html``.

        A reference qualifies when it contains one of the extensions
        configured for its tag.  Assets that cannot be downloaded are
        reported on stdout and keep their original reference.  Afterwards
        ``self.html_source`` holds the rewritten document, which is also
        written to ``<data_dir>/index.html`` as UTF-8.
        """
        for asset_type, spec in self._ASSET_TYPES.items():
            attr = spec['attr']
            for asset in self.soup.find_all(spec['tag']):
                ref = asset.get(attr)
                if not ref:
                    # e.g. an inline <script> without a src attribute.
                    continue
                extension = next((ext for ext in spec['ext'] if ext in ref), None)
                if extension is None:
                    continue
                try:
                    file_name = self._store_asset(asset_type, ref, extension)
                except Exception:
                    # Best effort per asset: one failure must not abort the
                    # rest of the page (but Ctrl-C still propagates).
                    file_name = None
                if file_name is None:
                    print("Can't download the asset : %s" % asset)
                    continue
                # Modify the asset path in the source html
                asset[attr] = './' + asset_type + '/' + file_name
        self.html_source = str(self.soup)
        # The output directory may not exist yet when no asset was stored.
        self.make_dir(self.data_dir)
        with open(self.data_dir + 'index.html', 'w', encoding='utf-8') as html_source_file:
            html_source_file.write(self.html_source)