-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdomain.py
More file actions
28 lines (22 loc) · 784 Bytes
/
domain.py
File metadata and controls
28 lines (22 loc) · 784 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# This module extracts the domain from a URL so that the crawler can skip irrelevant domains.
# Note: only the last two dot-separated labels of the host are kept (e.g., google.com).
from urllib.parse import urlparse
# get domain last two names (e.g., google.com)
def get_domain_name(url):
    """Return the last two dot-separated labels of the URL's host.

    Example: ``'https://mail.google.com/mail'`` -> ``'google.com'``.

    The host is taken from ``urlparse(url).hostname`` rather than from the
    raw network location, so a port (``example.com:8080``) or userinfo
    (``user:pw@example.com``) never ends up in the result, and the host is
    lower-cased.

    Args:
        url: The URL to inspect. It must include a scheme or a leading
            ``//``; otherwise ``urlparse`` finds no host.

    Returns:
        A string such as ``'nyit.edu'``, or ``''`` when the URL cannot be
        parsed, has no host, or the host has fewer than two labels
        (e.g. ``localhost``).

    Note:
        This is a purely textual rule. Hosts with a multi-part public
        suffix such as ``mail.h-u-i.co.za`` yield ``'co.za'``, and an IPv4
        host yields its last two octets.
    """
    try:
        host = urlparse(url).hostname
    except (AttributeError, TypeError, ValueError):
        # urlparse rejects malformed input (e.g. unbalanced IPv6 brackets)
        # and non-string arguments; keep the best-effort contract.
        return ''

    # hostname is None when there is no network location, and bytes when the
    # caller passed bytes; neither can be split on a str separator.
    if not isinstance(host, str):
        return ''

    labels = host.split('.')
    if len(labels) < 2:
        return ''
    return labels[-2] + '.' + labels[-1]
# get sub domain name (e.g., mail.h-u-i.co.za)
def get_sub_domain_name(url):
    """Return the network-location part of *url* (e.g. ``mail.h-u-i.co.za``).

    The value is ``urlparse(url).netloc`` unchanged, so it still contains
    any userinfo or port present in the URL (``user@host:8080``).

    Args:
        url: The URL to parse. Without a scheme or a leading ``//`` the
            network location is empty.

    Returns:
        The network location, or ``''`` when ``urlparse`` cannot handle the
        input.
    """
    try:
        return urlparse(url).netloc
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures are swallowed (malformed URL, non-string
        # argument); KeyboardInterrupt/SystemExit are no longer caught.
        return ''
# test:
# print(get_domain_name('https://nyit.edu/index.php'))
# expected output: nyit.edu