# Source: WebScrapingProjects/ch5ex1.py (jhchang/WebScrapingProjects, master branch)
import os
import re
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Directory prefix handed to getDownloadPath; it is concatenated directly
# in front of the site-relative path of each downloaded file.
downloadDirectory = 'downloaded'
# Root URL of the site being scraped; passed to getAbsoluteURL and
# getDownloadPath as the base that src values are resolved against.
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    """Resolve a ``src`` attribute value against the site being scraped.

    Absolute references (``http://``, ``https://``, protocol-relative ``//``
    and bare ``www.`` forms) are normalised to
    ``http://<host-without-www>/...``. Anything else is treated as a path
    relative to ``baseUrl``.

    Args:
        baseUrl: Root URL of the site, e.g. ``'http://pythonscraping.com'``.
        source: Raw value of a ``src`` attribute.

    Returns:
        The normalised absolute URL, or ``None`` when ``source`` is an
        absolute reference that does not mention the base site's host.
    """
    # Bare host of the base site: scheme, leading "www." and any trailing
    # "/" removed. Derived with a regex instead of a fixed slice so that it
    # is correct for both "http://" (7 chars) and "https://" (8 chars).
    baseHost = re.sub(r'^(?:https?:)?//', '', baseUrl)
    baseHost = re.sub(r'^www\.', '', baseHost).rstrip('/')

    # An absolute reference starts with an optional scheme plus "//"
    # (optionally followed by "www."), or with a bare "www.".
    prefix = re.match(r'(?:https?:)?//(?:www\.)?|www\.', source)
    if prefix:
        # Resources hosted elsewhere are not downloaded.
        if baseHost not in source:
            return None
        # Cut exactly the matched prefix, whatever its length, so that
        # "https://www." and "//" no longer yield malformed URLs.
        return 'http://{}'.format(source[prefix.end():])

    # Relative reference: join with exactly one "/" between base and path.
    if source.startswith('/'):
        return '{}{}'.format(baseUrl, source)
    return '{}/{}'.format(baseUrl, source)

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path and create its directory.

    The site root (``baseUrl``) is removed from ``absoluteUrl`` and the
    remainder is appended to ``downloadDirectory``. The directory part of
    the resulting path is created if it does not exist yet.

    Args:
        baseUrl: Root URL of the site, e.g. ``'http://pythonscraping.com'``.
        absoluteUrl: Absolute URL of the resource to be saved.
        downloadDirectory: Prefix under which the file is stored.

    Returns:
        The local path the resource should be written to.
    """
    # Strip only a leading host "www." (after an optional scheme). A blanket
    # str.replace would also mangle file names such as "/img/www.logo.png".
    leadingWww = r'^((?:https?:)?//|)www\.'
    path = re.sub(leadingWww, r'\1', absoluteUrl)
    # Normalise the base the same way so it still matches when it has "www.".
    path = path.replace(re.sub(leadingWww, r'\1', baseUrl), '')
    path = downloadDirectory + path
    directory = os.path.dirname(path)

    # exist_ok avoids the check-then-create race. An empty dirname means the
    # file lands in the current directory, so there is nothing to create
    # (and os.makedirs('') would raise).
    if directory:
        os.makedirs(directory, exist_ok=True)

    return path

# Fetch the landing page and collect every tag that carries a src attribute.
# The response is closed as soon as BeautifulSoup has consumed it.
with urlopen('http://www.pythonscraping.com') as html:
    bs = BeautifulSoup(html, 'html.parser')
downloadList = bs.find_all(src=True)

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    # getAbsoluteURL returns None for resources hosted on other sites.
    if fileUrl is not None:
        print(fileUrl)
        # Download inside the loop so every internal resource is saved,
        # not just the last one, and so a None URL is never passed on.
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))