-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaidu_spider.py
More file actions
68 lines (48 loc) · 1.73 KB
/
baidu_spider.py
File metadata and controls
68 lines (48 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# -*- coding:utf-8 -*-
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the Request, urlopen and URLError
    # names this file reaches through the ``urllib2`` module name.
    import urllib.request as urllib2
"""
爬取百度贴吧
"""
class BDTB:
    """Fetch pages of a Baidu Tieba thread and pull fields out of the HTML.

    Attributes:
        baseUrl: The thread address exactly as passed to the constructor.
        seeLZ: The ``'?see_lz=<value>'`` query fragment built from the
            constructor argument.
    """

    # Captures the ``title`` attribute that follows the core_title_txt <h3>.
    _TITLE_PATTERN = re.compile(
        '<h3 class="core_title_txt.*?title="(.*?)".*?class="core_title_btns pull-right">',
        re.S)
    # Captures the text of the second <span> inside the l_reply_num <li>,
    # which this scraper treats as the thread's total page count.
    _PAGE_NUM_PATTERN = re.compile(
        '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)

    def __init__(self, baseUrl, seeLZ):
        """Store the thread address and build the ``see_lz`` query fragment.

        Args:
            baseUrl: Address of the thread. A trailing query string is
                tolerated; it is ignored when page URLs are built.
            seeLZ: Value for the ``see_lz`` query parameter (presumably 1
                restricts the thread to the original poster's posts --
                confirm against the site).
        """
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)

    def _buildUrl(self, pageNum):
        """Return the URL of page ``pageNum`` of the configured thread."""
        # Drop any query string left on baseUrl so the parameters appended
        # below are the only ones in the final URL.
        threadUrl = self.baseUrl.split('?', 1)[0]
        return threadUrl + self.seeLZ + '&pn=' + str(pageNum)

    def getPage(self, pageNum):
        """Download one page of the thread.

        Args:
            pageNum: Page number, sent as the ``pn`` query parameter.

        Returns:
            The response body decoded as UTF-8, or None when the request
            fails with ``URLError`` (the reason is printed in that case).
        """
        try:
            response = urllib2.urlopen(urllib2.Request(self._buildUrl(pageNum)))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                # Single-argument print() behaves the same on Python 2 and 3.
                print(u'%s %s' % (u'连接百度贴吧失败,错误原因', e.reason))
            return None

    def getTitle(self):
        """Return the stripped thread title from page 1.

        Returns:
            The title string, or None when the page could not be downloaded
            or the title markup was not found.
        """
        page = self.getPage(1)
        if page is None:
            # The download failed; there is nothing to search.
            return None
        match = self._TITLE_PATTERN.search(page)
        if match is None:
            return None
        return match.group(1).strip()

    def getPageNum(self):
        """Print and return the thread's page count as found on page 1.

        Returns:
            The captured text (a string, not an int), or None when the page
            could not be downloaded or the markup was not found.
        """
        page = self.getPage(1)
        if page is None:
            # The download failed; there is nothing to search.
            return None
        match = self._PAGE_NUM_PATTERN.search(page)
        if match is None:
            return None
        pageCount = match.group(1)
        print(pageCount)
        return pageCount
if __name__ == '__main__':
    # The constructor builds the '?see_lz=' fragment itself, so pass the
    # thread address without a query string.
    baseUrl = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseUrl, 1)
    # getPageNum() prints the page count as well as returning it.
    bdtb.getPageNum()