-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_script.py
More file actions
102 lines (91 loc) · 4.46 KB
/
sample_script.py
File metadata and controls
102 lines (91 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from urllib.parse import urlparse
from urllib import error, request
import signal
import ssl
# Output mode for parser(): "domain", "url", or "both". Edit to taste.
Return_option = "domain"

# Known link-shortener services. A url containing any of these hosts is
# followed through its redirects before the domain is parsed.
link_shortener_domains = [
    "tiny.cc", "youtu.be", "ht.ly", "clk.gop", "eepurl.com", "dld.bz",
    "t.co", "bit.ly", "goo.gl", "ow.ly", "tinyurl.com", "bitly.com",
    "ln.is", "linkis.com", "tr.im", "smarturl.it", "spr.ly", "shar.es",
    "com.me", "trib.al", "natl.re", "fw.to", "j.mp",
]
def handler(signum, frame):
    """SIGALRM handler: abort the in-progress request by raising.

    Registered by parser() via signal.signal; the RuntimeError it raises
    is caught there to treat a slow url-follow (>10s) as a timeout.
    """
    raise RuntimeError("Time's up")
def parser(link):
    """
    Parse the domain from a url given as a string.

    If the url appears to use a link-shortener service (see
    link_shortener_domains), the url is requested so it can redirect to its
    final destination, and the domain is parsed from that destination.
    Otherwise the domain is parsed from the url as written.

    The return value depends on the module-level Return_option:
      - "domain": the domain string
      - "url":    the expanded url if one was obtained, else the input url
      - "both":   a (url, domain) tuple
    If the request fails, an error code/message is returned instead.
    """
    signal.signal(signal.SIGALRM, handler)
    # Bug fix: these were previously left unbound on several paths
    # (the original also referenced a never-assigned name `expanded_url`,
    # which raised NameError for Return_option "url"/"both").
    expanded_link = None  # final url after redirects, when one was followed
    error_code = None     # set when a request fails; returned instead of a result
    domain = ""
    # Shortened urls trigger this branch, which attempts to follow them to
    # their final destination, then parses the domain from that destination.
    if any(l in link for l in link_shortener_domains) and not 't.com' in link:
        if not link.startswith("http"):
            link = "http://" + link
        # Alarm so the script doesn't get stuck on a problematic url;
        # cleared below after the request completes or fails.
        signal.alarm(10)
        try:
            try:
                # Send a request so the url redirects as it would for a real
                # user, then take the domain of wherever it ended up.
                response = request.urlopen(link)
                expanded_link = response.geturl()
                domain = urlparse(expanded_link).netloc
            except error.HTTPError as e:
                if e.code == 403:
                    # Permission denied: spoof the user-agent to look like a
                    # browser and retry once past user-agent restrictions.
                    try:
                        req = request.Request(link)
                        req.add_header('User-agent', 'Mozilla/5.0')
                        response = request.urlopen(req)
                        expanded_link = response.geturl()
                        domain = urlparse(expanded_link).netloc
                    except Exception:
                        # Spoof failed: parse the domain from the original url.
                        domain = urlparse(link).netloc
                else:
                    # Any other HTTP error: record it, but still parse the
                    # domain from the url as it appears in the text.
                    error_code = str(e.code)
                    domain = urlparse(link).netloc
            except error.URLError as e:
                error_code = str(e)
            except ssl.CertificateError as e:
                error_code = str(e)
        except RuntimeError as e:
            # Raised by handler() when following the url took > 10 seconds.
            error_code = e.args
        signal.alarm(0)
    else:
        # The link doesn't appear to be shortened: parse the domain from the
        # link as it appears.
        if not link.startswith('http'):
            link = 'http://' + link
        domain = urlparse(link).netloc
    if error_code is not None:
        return error_code
    elif Return_option == "domain":
        return domain
    elif Return_option == "url":
        return expanded_link if expanded_link else link
    elif Return_option == "both":
        return (expanded_link, domain) if expanded_link else (link, domain)
"""
Everything before this is copied from main.py. Change below here for your data.
"""
# Sample URLs to parse
urls_to_parse = ["http://www.bbc.com/news/world-us-canada-42576978", "https://www.maxwell.syr.edu/research/",
"https://www.nytimes.com/interactive/2017/12/27/business/drug-addiction-ads.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region®ion=top-news&WT.nav=top-news",
"http://bit.ly/2CJAgEn", "https://goo.gl/QqUkYz"]
results = {}
for u in urls_to_parse:
result = parser(u)
results[u] = result