forked from artemsteshenko/parser_maps
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinfo_parser.py
More file actions
96 lines (80 loc) · 3.43 KB
/
info_parser.py
File metadata and controls
96 lines (80 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import json
import random
import argparse
from time import sleep
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from soup_parser import SoupContentParser
from utils import json_pattern
class Parser:
    """Scrape organization cards from Yandex Maps through a Selenium driver.

    Every organization URL is opened in a secondary browser tab, the rendered
    HTML is handed to ``SoupContentParser`` and the extracted fields are
    packed with ``json_pattern.into_json``.
    """

    START_URL = 'https://yandex.ru/maps'
    OUTPUT_DIR = 'result_output'
    # Number of collected records between two CSV checkpoints / driver restarts.
    CHECKPOINT_EVERY = 100

    def __init__(self, driver):
        """Remember the driver and create the HTML field extractor.

        Args:
            driver: Selenium WebDriver instance used for the first batch of
                pages. It is replaced by a fresh ``webdriver.Safari()``
                whenever the browser is restarted.
        """
        self.driver = driver
        self.soup_parser = SoupContentParser()

    def parse_data(self, hrefs, type_org):
        """Visit every URL in *hrefs* and store the extracted records as CSV.

        The collected records are written to
        ``result_output/{type_org}_outputs.csv`` after every
        ``CHECKPOINT_EVERY`` records and once more when the loop finishes, so
        a trailing partial batch is not lost. A failure on a single URL is
        reported, the browser is restarted and the loop continues with the
        next URL.

        Args:
            hrefs: Iterable of organization page URLs.
            type_org: Organization type; used only to build the CSV file name.
        """
        parent_handle = self._open_start_page()
        org_id = 0
        outputs = []

        for organization_url in hrefs:
            try:
                # The URL is passed as a script argument instead of being
                # spliced into the JavaScript source, so quotes or
                # backslashes inside it cannot break the script.
                self.driver.execute_script(
                    'window.open(arguments[0], "org_tab");', organization_url
                )
                child_handle = [
                    handle for handle in self.driver.window_handles
                    if handle != parent_handle
                ][0]
                self.driver.switch_to.window(child_handle)
                # Fixed pause before reading the page source — presumably to
                # let the page render; TODO confirm the required delay.
                sleep(0.7)
                soup = BeautifulSoup(self.driver.page_source, "lxml")
                org_id += 1

                name = self.soup_parser.get_name(soup)
                address = self.soup_parser.get_address(soup)
                website = self.soup_parser.get_website(soup)
                opening_hours = self.soup_parser.get_opening_hours(soup)
                ypage = self.driver.current_url
                rating = self.soup_parser.get_rating(soup)
                social = self.soup_parser.get_social(soup)
                phone = self.soup_parser.get_phone(soup)
                # Goods and reviews are not collected here; they are passed
                # as None.
                goods, reviews = None, None

                output = json_pattern.into_json(
                    org_id, name, address, website, opening_hours, ypage,
                    goods, rating, reviews, phone, social,
                )
                outputs.append(output)

                if len(outputs) % self.CHECKPOINT_EVERY == 0:
                    self._save_outputs(outputs, type_org)
                    parent_handle = self._restart_driver()

                print(f'Данные добавлены, id - {org_id}')

                self.driver.switch_to.window(parent_handle)
                sleep(random.uniform(0.2, 0.4))
            except Exception as error:
                # ``Exception`` rather than a bare ``except`` so that Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop the run.
                print(f'except: {organization_url}: {error!r}')
                parent_handle = self._restart_driver()

        # Flush whatever was collected since the last checkpoint.
        if outputs:
            self._save_outputs(outputs, type_org)
        print('Данные сохранены')
        self.driver.quit()

    def _open_start_page(self):
        """Maximize the window, load the start page and return its handle."""
        self.driver.maximize_window()
        self.driver.get(self.START_URL)
        return self.driver.window_handles[0]

    def _restart_driver(self):
        """Replace the current driver with a fresh Safari session.

        Returns:
            The window handle of the start page in the new session.
        """
        try:
            # Close the old session first so browser sessions do not pile up
            # (a still-open Safari session may also block a new one — TODO
            # confirm).
            self.driver.quit()
        except Exception:
            # The old session may already be dead; that must not prevent
            # starting a new one.
            pass
        sleep(random.uniform(2.2, 2.4))
        self.driver = webdriver.Safari()
        return self._open_start_page()

    def _save_outputs(self, outputs, type_org):
        """Write all collected records to the CSV file for *type_org*."""
        # Create the target directory on demand so the write cannot fail
        # merely because it does not exist yet.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        df = pd.DataFrame()
        df['outputs'] = outputs
        df.to_csv(f'{self.OUTPUT_DIR}/{type_org}_outputs.csv')
def load_hrefs(type_org, links_dir='links'):
    """Return the unique organization URLs stored for *type_org*.

    Every regular, non-hidden file in ``{links_dir}/{type_org}`` is read as
    JSON and the list stored under its ``'1'`` key is collected.

    Args:
        type_org: Name of the sub-directory holding the link files.
        links_dir: Directory that contains the per-type sub-directories.

    Returns:
        A list of URLs without duplicates. Files are visited in sorted name
        order and the first occurrence of each URL keeps its position, so the
        result is the same on every run.
    """
    folder = os.path.join(links_dir, type_org)
    collected = []
    for file_name in sorted(os.listdir(folder)):
        path = os.path.join(folder, file_name)
        # Skip hidden files (e.g. .DS_Store) and sub-directories: they are
        # not link files and would make open()/json.load() fail.
        if file_name.startswith('.') or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            collected += json.load(f)['1']
    # dict.fromkeys de-duplicates while keeping insertion order, unlike
    # set(), whose iteration order for strings changes between runs.
    return list(dict.fromkeys(collected))


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("type_org", help="organization type")
    args = arg_parser.parse_args()
    type_org = args.type_org

    all_hrefs = load_hrefs(type_org)
    print('all_hrefs', len(all_hrefs))

    driver = webdriver.Safari()
    parser = Parser(driver)
    parser.parse_data(all_hrefs, type_org)