forked from artemsteshenko/parser_maps
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink_parser.py
More file actions
107 lines (90 loc) · 3.78 KB
/
link_parser.py
File metadata and controls
107 lines (90 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import argparse
import json
import os
import random
from time import sleep

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

from utils.constants import districts, ACCEPT_BUTTON, type_org_mapping
class LinksCollector:
    """Collect organization links from a maps search-results list.

    The collector opens ``link``, types a search request into the search
    form, then repeatedly drags the results scrollbar and gathers the
    ``href`` of every result snippet found on the page. The gathered links
    are written to ``links/<type_org>/<request>.json``.
    """

    # How many times the cookie-consent button is tried before giving up.
    _ACCEPT_ATTEMPTS = 6
    # Base vertical offset handed to ``move_by_offset`` when dragging the
    # results scrollbar; it is divided by the current error count so the
    # drags get shorter once errors start to accumulate.
    _SCROLL_STEP = 100

    def __init__(self,
                 driver,
                 link='https://yandex.ru/maps',
                 max_errors=5,
                 accept_button=ACCEPT_BUTTON,
                 accept=False):
        """Store the driver and the scraping settings.

        Args:
            driver: Selenium WebDriver instance used for all page access.
                It is quit at the end of :meth:`run`.
            link: URL opened before the search request is typed.
            max_errors: Scrolling stops once this many errors have been
                counted. Both a failed scroll and a periodic check that
                found no new links count as one error.
            accept_button: XPath of the cookie-consent button.
            accept: When true, the cookie-consent button is clicked after
                the search has been submitted.
        """
        self.driver = driver
        self.slider = None
        self.max_errors = max_errors
        self.link = link
        self.accept_button = accept_button
        self.accept = accept

    def _init_driver(self):
        """Maximize the browser window."""
        self.driver.maximize_window()

    def _open_page(self, request):
        """Open the start page, submit ``request`` and locate the scrollbar.

        The short random pauses give the page time to react between steps.

        Args:
            request: Text typed into the search form.
        """
        self.driver.get(self.link)
        sleep(random.uniform(1, 2))
        search_input = self.driver.find_element(By.CLASS_NAME, 'search-form-view__input')
        search_input.send_keys(request)
        sleep(random.uniform(0.4, 0.7))
        # Click the search button.
        self.driver.find_element(By.CLASS_NAME, 'small-search-form-view__button').click()
        sleep(random.uniform(1.4, 2))
        self.slider = self.driver.find_element(By.CLASS_NAME, 'scroll__scrollbar-thumb')
        if self.accept:
            self._accept_cookies()

    def _accept_cookies(self):
        """Click the cookie-consent button, retrying a bounded number of times.

        Raises:
            RuntimeError: If the button could not be clicked after
                ``_ACCEPT_ATTEMPTS`` tries. The driver is injected by the
                caller, so this class cannot restart the browser itself;
                failing loudly replaces the previous attempt to keep using
                a driver that had already been quit.
        """
        last_error = None
        for _ in range(self._ACCEPT_ATTEMPTS):
            sleep(3)
            try:
                self.driver.find_element(By.XPATH, self.accept_button).click()
            except Exception as error:
                # Deliberately broad: the button may be missing or not yet
                # clickable. Unlike a bare ``except`` this still lets
                # KeyboardInterrupt and SystemExit through.
                last_error = error
            else:
                return
        raise RuntimeError(
            f'cookie consent button was not clickable after {self._ACCEPT_ATTEMPTS} attempts'
        ) from last_error

    def _collect_links(self):
        """Scroll the results list and return the unique snippet links.

        Scrolling stops once ``max_errors`` errors have been counted.

        Returns:
            A list of the distinct ``href`` values seen, in no particular
            order.
        """
        links = set()
        errors = 0
        scrolls = 0
        # Number of links known at the previous periodic check.
        last_checked_total = 0
        while errors < self.max_errors:
            try:
                # ``max(errors, 1)`` avoids dividing by zero before the first
                # error while keeping the same offsets for errors >= 1.
                offset = int(self._SCROLL_STEP / max(errors, 1))
                (ActionChains(self.driver)
                 .click_and_hold(self.slider)
                 .move_by_offset(0, offset)
                 .release()
                 .perform())
                snippets = self.driver.find_elements(By.CLASS_NAME, 'search-snippet-view__link-overlay')
                hrefs = [snippet.get_attribute("href") for snippet in snippets]
                links.update(hrefs)
                scrolls += 1
                if scrolls % 3 == 0:
                    # Every third scroll: no growth since the last check
                    # counts as an error, so a stalled list ends the loop.
                    if len(links) == last_checked_total:
                        errors += 1
                    print(len(links))
                    last_checked_total = len(links)
                sleep(random.uniform(0.05, 0.1))
            except Exception:
                # Best effort: any failure while scrolling or reading the
                # snippets is counted and the loop tries again.
                errors += 1
                print('errors', errors)
                sleep(random.uniform(0.3, 0.4))
        return list(links)

    def run(self, city, district, type_org_ru, type_org):
        """Search for organizations and save the links that were found.

        The search request is ``"<city> <district> <type_org_ru>"``. The
        result is written as ``{"1": [links...]}`` to
        ``links/<type_org>/<request>.json``. The driver is always quit, even
        when opening the page or scrolling fails.

        Args:
            city: City name used in the search request.
            district: District name used in the search request.
            type_org_ru: Organization type as typed into the search form.
            type_org: Organization type used as the output directory name.
        """
        request = city + ' ' + district + ' ' + type_org_ru
        try:
            self._init_driver()
            self._open_page(request)
            organizations_hrefs = self._collect_links()
        finally:
            # Do not leave a browser running if anything above raised.
            self.driver.quit()

        directory = f'links/{type_org}'
        os.makedirs(directory, exist_ok=True)
        with open(f'{directory}/{request}.json', 'w', encoding='utf-8') as file:
            json.dump({'1': organizations_hrefs}, file)
if __name__ == "__main__":
    # Organization types scraped when none are given on the command line.
    default_types = ['translator', 'accountant', 'massage']

    parser = argparse.ArgumentParser()
    # The positional argument used to be required and was then overwritten
    # by a hard-coded loop, so its value never had any effect. It now
    # selects the types to scrape and falls back to the previous list.
    parser.add_argument(
        "type_org",
        nargs="*",
        default=default_types,
        help="organization type(s); defaults to " + ", ".join(default_types),
    )
    args = parser.parse_args()

    for type_org in args.type_org:
        # Resolve the search term first so an unknown type fails before a
        # browser is launched.
        type_org_ru = type_org_mapping[type_org]
        for district in districts:
            sleep(1)
            # A fresh browser per district: LinksCollector.run() quits the
            # driver it was given.
            driver = webdriver.Safari()
            grabber = LinksCollector(driver)
            grabber.run(city="Москва", district=district, type_org_ru=type_org_ru, type_org=type_org)