-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataGet.py
More file actions
67 lines (49 loc) · 1.98 KB
/
dataGet.py
File metadata and controls
67 lines (49 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import requests
from lxml import etree
import time
import os
def get_html_content(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` header. The
    text is decoded with the encoding that ``requests`` detects from the
    body (``apparent_encoding``) rather than the one declared in the
    response headers.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded page text, or ``None`` if the request failed (connection
        error, timeout, or a non-2xx status). The failure is printed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP error statuses into exceptions so they share the
        # failure path below.
        response.raise_for_status()
        # Decode using the charset sniffed from the content itself.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"网络请求异常:{e}")
        return None
def parse_html(html_content):
    """Extract image URLs from a listing page.

    Parses *html_content* with ``lxml`` and collects the ``data-src``
    attribute of every ``<img>`` matched by the fixed XPath below (images
    inside the ``imageContent`` container of the ``view_content`` section).

    Args:
        html_content: HTML text of the page.

    Returns:
        A list of URL strings, one per matched image, each built as
        ``'https:' + data-src + '?x-oss-process=image/format,webp'``.
        Presumably the ``data-src`` values are protocol-relative
        (``//host/...``) -- verify against the live page.
    """
    scheme = 'https:'
    webp_query = '?x-oss-process=image/format,webp'
    document = etree.HTML(html_content)
    image_sources = document.xpath("/html/body/div/section/div/div[@class='view_content']/section/div[@id='imageContent']/section/div/figure/a/img/@data-src")
    return [scheme + src + webp_query for src in image_sources]
def store_data(result_list, save_path, *, timeout=10, delay=5):
    """Download every URL in *result_list* into the *save_path* directory.

    Each file is named after the last path segment of its URL, with the
    query string removed. Failed downloads are reported with ``print`` and
    do not stop the remaining downloads.

    Args:
        result_list: Iterable of URL strings to download.
        save_path: Target directory; created (with parents) if missing.
        timeout: Seconds to wait for each request.
        delay: Seconds to sleep after each download attempt.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_path, exist_ok=True)
    for url in result_list:
        # Strip the query string BEFORE splitting on '/': the query may
        # itself contain slashes (e.g. "?x-oss-process=image/format,webp"),
        # which would otherwise shift which segment is the file name.
        base_name = url.partition('?')[0].rsplit('/', 1)[-1]
        if not base_name:
            # Nothing usable to name the file; opening the directory
            # itself for writing would fail.
            print(f"fail download: {url} : no file name in URL")
            continue
        file_name = os.path.join(save_path, base_name)
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(response.content)
            print(f"success download: {file_name}")
        except requests.exceptions.RequestException as e:
            print(f"fail download: {url} : {e}")
        # Throttle consecutive requests to the image host.
        time.sleep(delay)
if __name__ == '__main__':
    # Walk listing pages 1 through 9; for each page that downloads
    # successfully, extract its image URLs and save the images.
    for page in range(1, 10):
        page_html = get_html_content(
            f"https://www.vcg.com/creative-image/maogou/?page={page}"
        )
        if not page_html:
            # Fetch failed (or returned an empty body): report and move on.
            print("fail access web")
            continue
        store_data(parse_html(page_html), 'your save path')
        print("Everyone down")