# crawler.py (forked from sachingupta006/github-crawler)
import sys
import re
import urllib.parse
import urllib.request
from collections import deque

from bs4 import BeautifulSoup

# Maximum crawl depth for the input url.
search_depth = 1

# This regex is not complete, so BeautifulSoup is used instead to extract
# links from the web page (see GetLinks.get below).
linkregex = re.compile(r'<a.*?href=[\'"]?(.*?)[\'"]?\s*>', re.IGNORECASE)
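# For instance, on markup like <a href="/about" title="About">, the lazy
# capture group in linkregex over-matches and returns '/about" title="About'
# (an illustrative example of the incompleteness noted above).
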
class Crawler(object):
    def __init__(self, root, depth):
        self.root = root
        self.depth = depth
        self.host = urllib.parse.urlparse(self.root).netloc
        self.crawled = []        # internal urls already fetched
        self.links = 0           # counts internal links, including the root url
        self.externalLinks = []  # urls pointing outside the root host
        self.uncrawled = []      # urls found but left beyond the depth limit

    def crawl(self):
        parentQ = deque()  # urls at the current depth level
        childQ = deque()   # urls discovered for the next level
        parentQ.append(self.root)
        level = 0
        while True:
            try:
                url = parentQ.popleft()
            except IndexError:
                # The current level is exhausted; move one level deeper.
                level += 1
                print()
                if level == self.depth:
                    break
                # Transfer all urls from the child queue to the parent queue.
                while childQ:
                    parentQ.append(childQ.popleft())
                # Stop if there is nothing left to crawl.
                if not parentQ:
                    print("No more links found")
                    print("Finishing....")
                    break
                continue
            if url in self.crawled:
                continue
            try:
                # Extract the host out of the new url.
                host = urllib.parse.urlparse(url).netloc
                # Crawl only the root host and its subdomains.
                if host == self.host or host.endswith("." + self.host):
                    print("crawling: " + url)
                    self.links += 1
                    self.crawled.append(url)
                    page = GetLinks(url)
                    page.get()
                    for new_url in page.urls:
                        if new_url not in self.crawled:
                            childQ.append(new_url)
                else:
                    self.externalLinks.append(url)
            except Exception as e:
                print("ERROR: Can't process url '%s' (%s)" % (url, e))
        # Whatever remains in the child queue was found but never crawled.
        while childQ:
            self.uncrawled.append(childQ.popleft())
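
# Note: an equivalent single-queue variant of crawl() (a sketch, not part of
# the original design) would track the depth alongside each url instead of
# swapping two queues once per level:
#
#     queue = deque([(self.root, 0)])
#     while queue:
#         url, level = queue.popleft()
#         if level >= self.depth:
#             continue
#         for new_url in links_of(url):  # links_of is a hypothetical helper
#             queue.append((new_url, level + 1))
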
class GetLinks(object):
    def __init__(self, url):
        self.url = url
        self.urls = []

    def get(self):
        # Fetch the page contents.
        url = urllib.parse.urlparse(self.url)
        request = urllib.request.Request(self.url)
        response = urllib.request.urlopen(request)
        page = response.read()
        # Extract urls from the page. The regex approach had problems here,
        # so BeautifulSoup is used instead (see linkregex above).
        soup = BeautifulSoup(page, "html.parser")
        for tag in soup.find_all('a'):
            link = tag.get("href")
            # Skip anchors without an href and bare '#' links.
            if link is None or link == '#':
                continue
            if link.startswith('/'):
                # Root-relative link: prepend scheme and host.
                link = url.scheme + '://' + url.netloc + link
            elif link.startswith('#'):
                # Fragment link: points back to the current page.
                link = url.scheme + '://' + url.netloc + url.path
            elif not link.startswith('http'):
                # Relative link: resolve against the current host.
                link = 'http://' + url.netloc + '/' + link
            # Filter specific to mycareerstack.com; remove for general use.
            if "accounts" not in link:
                self.urls.append(link)
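
# Note: the three normalization branches in GetLinks.get could be collapsed
# into a single call to urllib.parse.urljoin (a sketch, not the original
# approach), which resolves relative, root-relative and fragment hrefs
# against the page url:
#
#     link = urllib.parse.urljoin(self.url, tag.get("href"))
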
def main():
    if len(sys.argv) < 2:
        print('No start url was given')
        sys.exit()
    url = sys.argv[1]
    print("Crawling %s (Max Depth: %d)" % (url, search_depth))
    crawler = Crawler(url, search_depth)
    crawler.crawl()
    print("Total internal links found " + str(crawler.links))
    print("Total links crawled " + str(len(crawler.crawled)))
    print("\nUncrawled links:")
    print("\n".join(crawler.uncrawled))
    print("\nExternal links:")
    print("\n".join(crawler.externalLinks))


if __name__ == "__main__":
    main()
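
# Example usage (a sketch, assuming this file is saved as crawler.py):
#
#     $ python crawler.py http://example.com
#
# or from another module:
#
#     from crawler import Crawler
#     crawler = Crawler("http://example.com", depth=2)
#     crawler.crawl()
#     print(crawler.crawled)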