-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_script.py
More file actions
102 lines (91 loc) · 4.46 KB
/
sample_script.py
File metadata and controls
102 lines (91 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from urllib.parse import urlparse
from urllib import error, request
import signal
import ssl
# Output mode for parser(): "domain", "url", or "both". Edit to taste.
Return_option = "domain"

# Known link-shortener services. A url containing any of these hosts is
# followed through its redirects before the domain is parsed.
link_shortener_domains = [
    "tiny.cc", "youtu.be", "ht.ly", "clk.gop", "eepurl.com", "dld.bz",
    "t.co", "bit.ly", "goo.gl", "ow.ly", "tinyurl.com", "bitly.com",
    "ln.is", "linkis.com", "tr.im", "smarturl.it", "spr.ly", "shar.es",
    "com.me", "trib.al", "natl.re", "fw.to", "j.mp",
]
def handler(signum, frame):
    """SIGALRM handler: abort the in-progress request by raising.

    Registered by parser() via signal.signal; the RuntimeError it raises
    is caught there to treat a slow url-follow (>10s) as a timeout.
    """
    raise RuntimeError("Time's up")
def parser(link):
    """
    Parse the domain from a url given as a string.

    If the url appears to use a link-shortener service (see
    link_shortener_domains), the url is requested so it can redirect to its
    final destination, and the domain is parsed from that destination.
    Otherwise the domain is parsed from the url as written.

    The return value depends on the module-level Return_option:
      - "domain": the domain string
      - "url":    the expanded url if one was obtained, else the input url
      - "both":   a (url, domain) tuple
    If the request fails, an error code/message is returned instead.
    """
    signal.signal(signal.SIGALRM, handler)
    # Bug fix: these were previously left unbound on several paths
    # (the original also referenced a never-assigned name `expanded_url`,
    # which raised NameError for Return_option "url"/"both").
    expanded_link = None  # final url after redirects, when one was followed
    error_code = None     # set when a request fails; returned instead of a result
    domain = ""
    # Shortened urls trigger this branch, which attempts to follow them to
    # their final destination, then parses the domain from that destination.
    if any(l in link for l in link_shortener_domains) and not 't.com' in link:
        if not link.startswith("http"):
            link = "http://" + link
        # Alarm so the script doesn't get stuck on a problematic url;
        # cleared below after the request completes or fails.
        signal.alarm(10)
        try:
            try:
                # Send a request so the url redirects as it would for a real
                # user, then take the domain of wherever it ended up.
                response = request.urlopen(link)
                expanded_link = response.geturl()
                domain = urlparse(expanded_link).netloc
            except error.HTTPError as e:
                if e.code == 403:
                    # Permission denied: spoof the user-agent to look like a
                    # browser and retry once past user-agent restrictions.
                    try:
                        req = request.Request(link)
                        req.add_header('User-agent', 'Mozilla/5.0')
                        response = request.urlopen(req)
                        expanded_link = response.geturl()
                        domain = urlparse(expanded_link).netloc
                    except Exception:
                        # Spoof failed: parse the domain from the original url.
                        domain = urlparse(link).netloc
                else:
                    # Any other HTTP error: record it, but still parse the
                    # domain from the url as it appears in the text.
                    error_code = str(e.code)
                    domain = urlparse(link).netloc
            except error.URLError as e:
                error_code = str(e)
            except ssl.CertificateError as e:
                error_code = str(e)
        except RuntimeError as e:
            # Raised by handler() when following the url took > 10 seconds.
            error_code = e.args
        signal.alarm(0)
    else:
        # The link doesn't appear to be shortened: parse the domain from the
        # link as it appears.
        if not link.startswith('http'):
            link = 'http://' + link
        domain = urlparse(link).netloc
    if error_code is not None:
        return error_code
    elif Return_option == "domain":
        return domain
    elif Return_option == "url":
        return expanded_link if expanded_link else link
    elif Return_option == "both":
        return (expanded_link, domain) if expanded_link else (link, domain)
"""
Everything before this is copied from main.py. Change below here for your data.
"""
# Sample URLs to parse
urls_to_parse = ["http://www.bbc.com/news/world-us-canada-42576978", "https://www.maxwell.syr.edu/research/",
"https://www.nytimes.com/interactive/2017/12/27/business/drug-addiction-ads.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region®ion=top-news&WT.nav=top-news",
"http://bit.ly/2CJAgEn", "https://goo.gl/QqUkYz"]
results = {}
for u in urls_to_parse:
result = parser(u)
results[u] = result