-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebscraper.py
More file actions
102 lines (83 loc) · 2.87 KB
/
Webscraper.py
File metadata and controls
102 lines (83 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Code to Scrape a website using Python : Made use of Beautiful soup and Requests Library
Data Scraped and Info Gathered :
External files referenced or Links
Images
Css elements
Html elements
JS elements
"""
import time
from urllib.parse import urlparse

import bs4
import requests
def links(soup_obj=None):
    """Collect every non-empty ``href`` from the page's anchor tags.

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section, so existing
            zero-argument callers keep working.

    Returns:
        list of href strings found in ``<a>`` tags.
    """
    doc = soup if soup_obj is None else soup_obj
    link_list = [a.get("href") for a in doc.find_all('a') if a.get("href")]
    print("Number of links are ", len(link_list))
    return link_list
def images(soup_obj=None):
    """Collect every non-empty ``src`` from the page's image tags.

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        list of src strings found in ``<img>`` tags.
    """
    doc = soup if soup_obj is None else soup_obj
    img_list = [img.get('src') for img in doc.find_all('img') if img.get('src')]
    print("Number of images are ", len(img_list))
    return img_list
def css_elements(soup_obj=None):
    """Collect the CSS class lists attached to the page's ``<div>`` tags.

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        list of class-attribute values (one entry per div that has classes).
    """
    doc = soup if soup_obj is None else soup_obj
    css_elements_list = [d.get("class") for d in doc.find_all('div') if d.get("class")]
    print("Number of css elements are ", len(css_elements_list))
    return css_elements_list
def js_file(soup_obj=None):
    """Collect external JavaScript files (``<script>`` tags with a ``src``).

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        list of script src strings.  The original printed the count but
        returned None, unlike every sibling collector; returning the list
        makes the API consistent and is backward compatible.
    """
    doc = soup if soup_obj is None else soup_obj
    js_file_lst = [s.get('src') for s in doc.find_all('script') if s.get('src')]
    print("Number of java script files are", len(js_file_lst))
    return js_file_lst
def html_elements():
    """Return the fixed list of HTML tag names this scraper counts."""
    return [
        'html', 'div', 'p', 'span', 'script',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'img', 'hr', 'a', 'ul', 'ol', 'li', 'label',
    ]
def Countreturn(*args, soup_obj=None):
    """Count how many of the given elements appear in the parsed page.

    Args:
        *args: tag names (or lists of tag names) handed to ``find_all``.
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        total number of matching elements.  Despite its name, the original
        only printed the count; returning it fixes that inconsistency and
        is backward compatible.
    """
    doc = soup if soup_obj is None else soup_obj
    # Iterate the varargs directly instead of indexing range(len(args)).
    elements = sum(len(doc.find_all(tag)) for tag in args)
    print("Number of Html elements are", elements)
    return elements
def file_writer(list1, listdes, out=None):
    """Append a captioned section listing each item of ``list1``.

    Args:
        list1: items to write, one per line (``str()`` is applied to each).
        listdes: caption text appended after "BELOW ARE THE LIST OF ".
        out: writable text-file object; defaults to the module-level
            ``myfile`` opened in the ``__main__`` section, so existing
            two-argument callers keep working.
    """
    dest = myfile if out is None else out
    dest.write("\n\n\n\nBELOW ARE THE LIST OF " + listdes + '\n')
    # Batch the per-item writes instead of many tiny write() calls.
    dest.writelines('\n' + str(item) for item in list1)
if __name__ == '__main__':
    strurl = input("Please enter URL of Desired Site :")
    # Time only the fetch + parse, not the user prompt.
    start = time.time()
    res = requests.get(strurl)
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    end = time.time()

    lnklist = links()
    imglist = images()
    cslist = css_elements()
    js_file()
    # find_all accepts a list of tag names, so the whole list is one query.
    Countreturn(html_elements())
    print("time taken to Parse ", end - start)

    # Build a per-site report name from the URL's host (minus any leading
    # "www.") so different sites do not overwrite each other's output.
    # The original strurl[12:18] slice only worked for "https://www." URLs.
    host = urlparse(strurl).netloc
    if host.startswith('www.'):
        host = host[4:]
    filename = host[:6] + '.txt'
    print('\n **** Please check ' + filename + ' for list of links,images and CSS elements ****')

    # File handling: the context manager guarantees the report is closed
    # even if a write fails (original left the file open on error).
    with open(filename, 'a+') as myfile:
        myfile.write("THIS IS THE OUTPUT AFTER PARSING %s" % strurl)
        myfile.write('\nTIME TAKEN TO PARSE ' + str(end - start))
        file_writer(lnklist, ": LINKS")
        file_writer(imglist, " IMAGES")
        file_writer(cslist, ": CSS ELEMENTS")