-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
56 lines (45 loc) · 2.08 KB
/
main.py
File metadata and controls
56 lines (45 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import requests
import time
import dic
import get_comment
import get_info
import emo
from bs4 import BeautifulSoup
from random import randint
import csv
import random
# HTTP request headers shared by every scraping call in this script (they are
# passed to get_info.getfilminfo and get_comment.get_comments). The User-Agent
# value identifies the client as a desktop Chrome browser on Windows.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
    ),
}
if __name__ == '__main__':
    # Script entry point. For every film id in dic.film_id_name:
    #   1. fetch the film's basic information,
    #   2. request 10 comment pages while passing along a shared counter list,
    #   3. append one row (film name + counters 1..35) to the Excel workbook.
    for key in dic.film_id_name:
        film_name = dic.film_id_name[key]
        print("要开始获取"+film_name+"的信息了喵!好激动!")

        # Holds the per-location comment statistics for this film. It is
        # handed to get_comment.get_comments on every page request
        # (presumably updated in place there -- confirm in get_comment).
        # Index 0 is never read by this script.
        region_counts = [0] * 36

        # Fetch the film's basic information from its subject page.
        get_info.getfilminfo(
            "https://movie.douban.com/subject/{id}".format(id=key),
            headers,
        )
        print(film_name+"的基础信息获取完成了喵!接下来要开始获取评论和属地了喵!")

        for n in range(10):
            # 100 comments per page; the `start` offset moves forward by 100
            # after each crawl to turn the page.
            url2 = "https://movie.douban.com/subject/{id}/comments?start={page}&limit=100&status=P&sort=time".format(id=key,page=n*100)

            # Fetch the comments. The return value is only needed by the CSV
            # export below, which is currently disabled:
            #   if comments:
            #       get_comment.save_comments(comments, f'./information/{dic.film_id_name[key]}_comments.csv')
            comments = get_comment.get_comments(url2, headers, region_counts)
            print(f"正在获取第{n+1}页的数据")

            # Pause for a random 2-4 seconds before the next request.
            time.sleep(random.randint(2, 4))
        print(film_name+"的评论获取完成了喵!接下来要开始获取属地了喵!")

        # Build the comment-location row: film name first, then counters
        # 1..35 in index order, and hand it to get_comment.get_excel.
        row = [film_name]
        row.extend(region_counts[i] for i in range(1, 36))
        get_comment.get_excel('./information/comment_loc.xlsx', row)
        print(film_name+"的获取完成了喵!接下来要开始获取下一部电影了喵!")
    print("全部获取完成了喵!主人可以用数据来分析了喵!")