-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathch5ex1.py
More file actions
49 lines (40 loc) · 1.43 KB
/
ch5ex1.py
File metadata and controls
49 lines (40 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import re
from urllib.parse import urlsplit
from urllib.request import urlopen
from urllib.request import urlretrieve

from bs4 import BeautifulSoup
# Local directory prefix that getDownloadPath prepends to every saved file path.
downloadDirectory = 'downloaded'
# Root URL of the site being scraped (no trailing slash); passed as baseUrl to
# getAbsoluteURL and getDownloadPath.
baseUrl = 'http://pythonscraping.com'
def getAbsoluteURL(baseUrl, source):
    """Turn a scraped ``src`` value into an absolute URL on the target site.

    Args:
        baseUrl: Root URL of the site being scraped, without a trailing
            slash, e.g. ``'http://pythonscraping.com'``.
        source: Raw ``src`` attribute value. It may be absolute
            (``http://`` / ``https://``), protocol-relative (``//host/x``),
            a bare ``www.`` address, or a path relative to the site root.

    Returns:
        * For a relative ``source``: ``baseUrl`` joined to it with exactly
          one ``/``.
        * For an absolute ``source`` on the base host (or one of its
          subdomains): ``'http://'`` followed by the host and path, with a
          leading ``www.`` removed. The scheme is always normalised to
          ``http`` so the result shares its prefix with ``baseUrl``.
        * ``None`` when ``source`` points at a different host or cannot be
          parsed.
    """
    # A bare "www.example.com/x" carries no scheme; treat it as
    # protocol-relative so it takes the same path as other absolute links.
    if source.startswith('www.'):
        source = '//' + source

    if not re.match(r'(https?:)?//', source):
        # Relative link: join onto baseUrl with exactly one "/" in between.
        separator = '' if source.startswith('/') else '/'
        return '{}{}{}'.format(baseUrl, separator, source)

    # Everything after the scheme, minus a leading "www.", so the result
    # lines up with a baseUrl that is written without "www.".
    location = source.split('//', 1)[1]
    if location.startswith('www.'):
        location = location[4:]

    # Parse hosts properly instead of slicing at fixed offsets, which broke
    # for "https://" and matched the site name anywhere in the URL.
    baseParts = urlsplit(baseUrl if '//' in baseUrl else '//' + baseUrl)
    baseHost = baseParts.hostname or ''
    if baseHost.startswith('www.'):
        baseHost = baseHost[4:]
    try:
        host = urlsplit('//' + location).hostname or ''
    except ValueError:
        # Malformed authority (e.g. an unterminated IPv6 literal).
        return None

    # Internal means the base host itself or any of its subdomains.
    if host != baseHost and not host.endswith('.' + baseHost):
        return None
    return 'http://{}'.format(location)
def _stripLeadingWww(url):
    """Return *url* with a ``www.`` that directly follows the scheme removed."""
    scheme, separator, rest = url.partition('://')
    if not separator:
        # No scheme present: the whole string is the host/path part.
        scheme, rest = '', url
    if rest.startswith('www.'):
        rest = rest[4:]
    return scheme + separator + rest


def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map a file URL to a local path and make sure its directory exists.

    Args:
        baseUrl: Root URL of the scraped site, without a trailing slash.
        absoluteUrl: Absolute URL of the file to be saved.
        downloadDirectory: Local directory prefix for saved files.

    Returns:
        ``downloadDirectory`` followed by the part of ``absoluteUrl`` that
        comes after the site root. If ``absoluteUrl`` does not start with
        the site root, the whole URL is appended unchanged.

    Side effects:
        Creates the parent directory of the returned path if it is missing.
    """
    # Strip "www." only where it prefixes the host, and the site root only
    # where it prefixes the URL, so file names that merely contain those
    # substrings (e.g. "awww.logo.jpg") are left intact.
    sitePrefix = _stripLeadingWww(baseUrl)
    path = _stripLeadingWww(absoluteUrl)
    if path.startswith(sitePrefix):
        path = path[len(sitePrefix):]

    # NOTE(review): ".." segments in the URL are not sanitised here, so a
    # hostile page could point the path outside downloadDirectory.
    path = downloadDirectory + path

    directory = os.path.dirname(path)
    # exist_ok avoids the check-then-create race; an empty dirname means the
    # file lands in the current directory and there is nothing to create.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path
# Fetch the landing page. The context manager closes the HTTP response as
# soon as BeautifulSoup has read it, instead of leaving the socket open.
with urlopen('http://www.pythonscraping.com') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Every tag that carries a src attribute is a download candidate.
downloadList = bs.find_all(src=True)

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    # getAbsoluteURL returns None for links that should not be downloaded.
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))