-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapper.py
More file actions
31 lines (24 loc) · 784 Bytes
/
scrapper.py
File metadata and controls
31 lines (24 loc) · 784 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Tag attributes that can carry a link target; shared by
# get_url_from_html (extraction) and filter_tags (find_all predicate).
links_attr = ['href', 'src']
def get_url_from_html(html, url):
    """Extract all link targets (href/src) from *html*.

    Each attribute value is resolved against *url* with urljoin, so
    relative links come back absolute. Returns a list of URL strings in
    document order (and attribute order within a tag).
    """
    parsed = BeautifulSoup(html, 'html.parser')
    found = []
    # filter_tags keeps only tags that carry at least one link attribute.
    for element in parsed.find_all(filter_tags):
        found.extend(
            urljoin(url, element[attr])
            for attr in links_attr
            if element.has_attr(attr)
        )
    return found
def get_html_from_url(url, retries=3, timeout=10):
    """Download *url* and return a ``(html_text, error_flag)`` pair.

    On success the decoded response body and ``0`` are returned; after
    all attempts fail, ``('', 1)`` is returned instead.

    Args:
        url: address to fetch.
        retries: number of attempts before giving up (default 3,
            matching the original hard-coded behavior).
        timeout: per-request timeout in seconds — without one,
            requests.get can block indefinitely on a stalled server.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide programming errors.
        except requests.RequestException:
            if attempt == retries - 1:
                print(f'Failed to download {url}')
            continue
        # errors='ignore' drops undecodable bytes rather than raising.
        return response.content.decode(errors='ignore'), 0
    return '', 1
def filter_tags(tag):
    """Predicate for BeautifulSoup.find_all: True if *tag* has any
    of the link-carrying attributes listed in links_attr."""
    for attr in links_attr:
        if tag.has_attr(attr):
            return True
    return False