-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapper.py
More file actions
31 lines (24 loc) · 784 Bytes
/
scrapper.py
File metadata and controls
31 lines (24 loc) · 784 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Tag attributes that can carry a link target; shared by
# get_url_from_html (extraction) and filter_tags (find_all predicate).
links_attr = ['href', 'src']
def get_url_from_html(html, url):
    """Extract all link targets (href/src) from *html*.

    Each attribute value is resolved against *url* with urljoin, so
    relative links come back absolute. Returns a list of URL strings in
    document order (and attribute order within a tag).
    """
    parsed = BeautifulSoup(html, 'html.parser')
    found = []
    # filter_tags keeps only tags that carry at least one link attribute.
    for element in parsed.find_all(filter_tags):
        found.extend(
            urljoin(url, element[attr])
            for attr in links_attr
            if element.has_attr(attr)
        )
    return found
def get_html_from_url(url, retries=3, timeout=10):
    """Download *url* and return a ``(html_text, error_flag)`` pair.

    On success the decoded response body and ``0`` are returned; after
    all attempts fail, ``('', 1)`` is returned instead.

    Args:
        url: address to fetch.
        retries: number of attempts before giving up (default 3,
            matching the original hard-coded behavior).
        timeout: per-request timeout in seconds — without one,
            requests.get can block indefinitely on a stalled server.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide programming errors.
        except requests.RequestException:
            if attempt == retries - 1:
                print(f'Failed to download {url}')
            continue
        # errors='ignore' drops undecodable bytes rather than raising.
        return response.content.decode(errors='ignore'), 0
    return '', 1
def filter_tags(tag):
    """Predicate for BeautifulSoup.find_all: True if *tag* has any
    of the link-carrying attributes listed in links_attr."""
    for attr in links_attr:
        if tag.has_attr(attr):
            return True
    return False