-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path9.py
More file actions
27 lines (23 loc) · 884 Bytes
/
Copy path9.py
File metadata and controls
27 lines (23 loc) · 884 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Module-level record of every "/wiki/..." href discovered so far; getLinks()
# checks and extends it so that each link is followed at most once.
pages = set()
def _fetch_wiki_links(url):
    """Fetch one Wikipedia page, print a summary of it, and return its links.

    Args:
        url: Path appended to "http://en.wikipedia.org" (e.g. "/wiki/Python").

    Returns:
        A list of the ``href`` values of every ``<a>`` tag on the page whose
        href starts with "/wiki/", in document order.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # Close the HTTP response as soon as the document has been parsed instead
    # of keeping one connection open per page for the whole crawl.
    with urlopen("http://en.wikipedia.org" + url) as response:
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response, "html.parser")

    try:
        print(soup.h1.get_text())
        # "mw-content-text" is the id of the MediaWiki article body.
        print(soup.find(id="mw-content-text").find_all("p")[0])
        print(soup.find(id="ca-edit").find("span").find("a").attrs['href'])
    except (AttributeError, IndexError, KeyError):
        # Best effort: a missing heading, paragraph or edit link must not
        # abort the crawl.
        print("页面缺少一些属性!不过不用担心!")

    return [
        link.attrs['href']
        for link in soup.find_all("a", href=re.compile("^(/wiki/)"))
        if 'href' in link.attrs
    ]


def getLinks(url):
    """Crawl Wikipedia depth-first starting at ``url``.

    For each visited page this prints the title, the first paragraph of the
    article body and the edit link (or a notice when any of them is missing),
    then follows every not-yet-seen "/wiki/" link. Newly discovered hrefs are
    added to the module-level ``pages`` set.

    Args:
        url: Path appended to "http://en.wikipedia.org"; "" is the site root.

    Returns:
        None. Results are printed and recorded in ``pages``.

    Raises:
        urllib.error.URLError: If any page cannot be retrieved.
    """
    # An explicit stack of link iterators visits pages in the same order as
    # the straightforward recursive formulation, without consuming one
    # interpreter stack frame per page (which would end in RecursionError on
    # a long chain of links).
    pending = [iter(_fetch_wiki_links(url))]
    while pending:
        for href in pending[-1]:
            if href not in pages:
                # We have encountered a new page.
                print("----------------------\n" + href)
                pages.add(href)
                # Descend into the new page; the current iterator resumes
                # where it left off once the descent is finished.
                pending.append(iter(_fetch_wiki_links(href)))
                break
        else:
            # Every link on the current page has been handled.
            pending.pop()
# Start the crawl with an empty path, i.e. at http://en.wikipedia.org itself.
getLinks("")