-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebscraper.py
More file actions
102 lines (83 loc) · 2.87 KB
/
Webscraper.py
File metadata and controls
102 lines (83 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Code to Scrape a website using Python : Made use of Beautiful soup and Requests Library
Data Scraped and Info Gathered :
External files referenced or Links
Images
Css elements
Html elements
JS elements
"""
import time
from urllib.parse import urlparse

import bs4
import requests
def links(soup_obj=None):
    """Collect every non-empty ``href`` from the page's anchor tags.

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section, so existing
            zero-argument callers keep working.

    Returns:
        list of href strings found in ``<a>`` tags.
    """
    doc = soup if soup_obj is None else soup_obj
    link_list = [a.get("href") for a in doc.find_all('a') if a.get("href")]
    print("Number of links are ", len(link_list))
    return link_list
def images(soup_obj=None):
    """Collect every non-empty ``src`` from the page's image tags.

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        list of src strings found in ``<img>`` tags.
    """
    doc = soup if soup_obj is None else soup_obj
    img_list = [img.get('src') for img in doc.find_all('img') if img.get('src')]
    print("Number of images are ", len(img_list))
    return img_list
def css_elements(soup_obj=None):
    """Collect the CSS class lists attached to the page's ``<div>`` tags.

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        list of class-attribute values (one entry per div that has classes).
    """
    doc = soup if soup_obj is None else soup_obj
    css_elements_list = [d.get("class") for d in doc.find_all('div') if d.get("class")]
    print("Number of css elements are ", len(css_elements_list))
    return css_elements_list
def js_file(soup_obj=None):
    """Collect external JavaScript files (``<script>`` tags with a ``src``).

    Args:
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        list of script src strings.  The original printed the count but
        returned None, unlike every sibling collector; returning the list
        makes the API consistent and is backward compatible.
    """
    doc = soup if soup_obj is None else soup_obj
    js_file_lst = [s.get('src') for s in doc.find_all('script') if s.get('src')]
    print("Number of java script files are", len(js_file_lst))
    return js_file_lst
def html_elements():
    """Return the fixed list of HTML tag names this scraper counts."""
    return [
        'html', 'div', 'p', 'span', 'script',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'img', 'hr', 'a', 'ul', 'ol', 'li', 'label',
    ]
def Countreturn(*args, soup_obj=None):
    """Count how many of the given elements appear in the parsed page.

    Args:
        *args: tag names (or lists of tag names) handed to ``find_all``.
        soup_obj: parsed document to search; defaults to the module-level
            ``soup`` built in the ``__main__`` section (backward compatible).

    Returns:
        total number of matching elements.  Despite its name, the original
        only printed the count; returning it fixes that inconsistency and
        is backward compatible.
    """
    doc = soup if soup_obj is None else soup_obj
    # Iterate the varargs directly instead of indexing range(len(args)).
    elements = sum(len(doc.find_all(tag)) for tag in args)
    print("Number of Html elements are", elements)
    return elements
def file_writer(list1, listdes, out=None):
    """Append a captioned section listing each item of ``list1``.

    Args:
        list1: items to write, one per line (``str()`` is applied to each).
        listdes: caption text appended after "BELOW ARE THE LIST OF ".
        out: writable text-file object; defaults to the module-level
            ``myfile`` opened in the ``__main__`` section, so existing
            two-argument callers keep working.
    """
    dest = myfile if out is None else out
    dest.write("\n\n\n\nBELOW ARE THE LIST OF " + listdes + '\n')
    # Batch the per-item writes instead of many tiny write() calls.
    dest.writelines('\n' + str(item) for item in list1)
if __name__ == '__main__':
    strurl = input("Please enter URL of Desired Site :")
    # Time only the fetch + parse, not the user prompt.
    start = time.time()
    res = requests.get(strurl)
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    end = time.time()

    lnklist = links()
    imglist = images()
    cslist = css_elements()
    js_file()
    # find_all accepts a list of tag names, so the whole list is one query.
    Countreturn(html_elements())
    print("time taken to Parse ", end - start)

    # Build a per-site report name from the URL's host (minus any leading
    # "www.") so different sites do not overwrite each other's output.
    # The original strurl[12:18] slice only worked for "https://www." URLs.
    host = urlparse(strurl).netloc
    if host.startswith('www.'):
        host = host[4:]
    filename = host[:6] + '.txt'
    print('\n **** Please check ' + filename + ' for list of links,images and CSS elements ****')

    # File handling: the context manager guarantees the report is closed
    # even if a write fails (original left the file open on error).
    with open(filename, 'a+') as myfile:
        myfile.write("THIS IS THE OUTPUT AFTER PARSING %s" % strurl)
        myfile.write('\nTIME TAKEN TO PARSE ' + str(end - start))
        file_writer(lnklist, ": LINKS")
        file_writer(imglist, " IMAGES")
        file_writer(cslist, ": CSS ELEMENTS")