-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
121 lines (104 loc) · 3.52 KB
/
main.py
File metadata and controls
121 lines (104 loc) · 3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import time
import shutil
import requests
from requests import exceptions as r_exc
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
def load_images(links: list, number_of_image: int):
    """Download the full-size image behind each search-result link.

    Starts a separate Edge session, opens the ``href`` of every element in
    *links*, waits for the element with class ``MMImage-Origin`` and saves
    the resource its ``src`` points to as ``NNNN.jpg`` (zero-padded counter)
    in the current working directory. Links that fail at any step are
    skipped. The loop stops as soon as the counter exceeds 10.

    Args:
        links: Objects exposing ``get_attribute('href')``; the caller in this
            file passes Selenium elements found by class ``serp-item__link``.
        number_of_image: Counter value used for the next file name.

    Returns:
        The updated counter: *number_of_image* plus the number of files
        written by this call.
    """
    if not links:
        # Nothing to visit: do not start a browser just to close it again.
        return number_of_image

    driver = webdriver.Edge()
    try:
        # The implicit wait is a session-wide setting, so set it once
        # instead of on every iteration.
        driver.implicitly_wait(3.5)
        for link in links:
            href = link.get_attribute('href')
            if not href:
                # No target to open for this element.
                continue
            driver.get(href)

            # Best effort: tick the captcha checkbox if the page shows one.
            try:
                driver.find_element(
                    By.CLASS_NAME, "CheckboxCaptcha-Anchor").click()
            except exceptions.NoSuchElementException:
                pass

            try:
                img_origin = WebDriverWait(driver, 7).until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, "MMImage-Origin"))
                )
            except exceptions.TimeoutException:
                print("No such element")
                time.sleep(2)
                continue

            img_link = img_origin.get_attribute('src')
            if not img_link:
                print("No image source")
                continue

            try:
                response = requests.get(img_link, timeout=10)
                # Do not store an HTTP error page under a .jpg name.
                response.raise_for_status()
            except r_exc.ReadTimeout:
                print("The read operation timed out")
                time.sleep(3)
                continue
            except r_exc.RequestException as err:
                # Connection errors, bad schemas, HTTP errors: skip this
                # image rather than aborting the whole crawl.
                print(f"Download failed: {err}")
                continue

            with open(f"{number_of_image:04d}.jpg", 'wb') as f:
                f.write(response.content)
            print('Success')
            number_of_image += 1
            if number_of_image > 10:
                break
    finally:
        # Always close the browser, even when an unexpected error escapes.
        driver.quit()
    return number_of_image
def _click_captcha_if_present(driver) -> bool:
    """Click the ``CheckboxCaptcha-Anchor`` element if the page has one.

    Returns:
        ``True`` when the element was found and clicked, ``False`` when it
        is not on the page.
    """
    try:
        driver.find_element(
            By.CLASS_NAME, "CheckboxCaptcha-Anchor").click()
    except exceptions.NoSuchElementException:
        return False
    return True


def get_images(name: str):
    """Crawl Yandex image search for *name* and save images into ``./name``.

    Changes into the directory *name* (it must already exist), opens the
    search page in Edge, scrolls it with PAGE_DOWN key presses, collects the
    elements with class ``serp-item__link`` and hands them to
    ``load_images``. This repeats until the image counter exceeds 10. The
    previous working directory is restored and the browser is closed on
    exit, including when an error occurs.

    NOTE(review): the loop ends only when the counter exceeds 10; if pages
    keep yielding no downloadable images it keeps reloading indefinitely.

    Args:
        name: Search text; also the name of the directory files are saved in.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    # Plain concatenation plus URL-encoding. os.path.join would insert a
    # path separator into the query string ("...?text=/rose").
    full_url = "https://yandex.ru/images/search?text=" + quote_plus(name)
    number_of_image = 0

    previous_dir = os.getcwd()
    os.chdir(name)
    try:
        driver = webdriver.Edge()
        try:
            while True:
                driver.get(full_url)
                driver.implicitly_wait(0.5)
                if not _click_captcha_if_present(driver):
                    print('No Captcha')

                body = driver.find_element(By.CSS_SELECTOR, 'body')
                for i in range(40):
                    body.send_keys(Keys.PAGE_DOWN)
                    _click_captcha_if_present(driver)
                    if i >= 25:
                        # Late in the scroll, look for the "button2" element
                        # and use its href as the next page to load
                        # (presumably the "more results" link - confirm).
                        try:
                            next_href = driver.find_element(
                                By.CLASS_NAME, "button2").get_attribute('href')
                        except exceptions.NoSuchElementException:
                            next_href = None
                        if next_href:
                            # Ignore elements without an href so the next
                            # driver.get() never receives None.
                            full_url = next_href
                            break
                    time.sleep(0.4)

                driver.implicitly_wait(5)
                img_links = driver.find_elements(
                    By.CLASS_NAME, 'serp-item__link')
                if img_links:
                    print(len(img_links))
                    number_of_image = load_images(img_links, number_of_image)
                print("Page is done!")
                time.sleep(10)
                if number_of_image > 10:
                    break
        finally:
            driver.quit()
    finally:
        # Return to where we started even if the crawl failed.
        os.chdir(previous_dir)
def make_folders(names: list):
    """Create a fresh, empty ``dataset/<name>`` directory for each name.

    Creates ``dataset`` in the current working directory when it is missing
    and changes into it; the process stays inside ``dataset`` on return.
    Every entry of *names* then gets an empty directory, and any existing
    directory of the same name is removed first.

    Args:
        names: Directory names to create inside ``dataset``.
    """
    os.makedirs('dataset', exist_ok=True)
    os.chdir('dataset')
    for name in names:
        # Handle each directory on its own: if only one of them is left over
        # from an earlier run, it must still be cleared before mkdir.
        if os.path.isdir(name):
            shutil.rmtree(name)
        os.mkdir(name)
def main():
    """Prepare the dataset folders and download images for both classes.

    Calls ``make_folders`` with the pair ("rose", "tulip"), then runs
    ``get_images`` for each class, sleeping 30 seconds between the two runs.
    """
    flower_classes = ("rose", "tulip")
    make_folders(flower_classes)

    first_class, second_class = flower_classes
    get_images(first_class)
    time.sleep(30)
    get_images(second_class)
# Run the crawl only when executed as a script, so importing this module
# does not start a browser session.
if __name__ == "__main__":
    main()