-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunction.py
More file actions
100 lines (78 loc) · 3.33 KB
/
function.py
File metadata and controls
100 lines (78 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
def createDriver() -> webdriver.Chrome:
    """Create a headless Chrome WebDriver with a Korean locale.

    Returns:
        A configured ``webdriver.Chrome``. The caller owns the driver and
        should call ``driver.quit()`` when finished.
    """
    options = webdriver.ChromeOptions()
    # Chrome command-line switches require the '--' prefix; the bare
    # 'headless' / 'lang=ko_KR' forms are not reliably recognized.
    options.add_argument('--headless')
    options.add_argument('--lang=ko_KR')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    return driver
def search(place):
    """Search Kakao Map for *place* and crawl the first result page.

    :param place: search keyword typed into the map's search box
    """
    global driver
    # Type the keyword and submit the search with Enter.
    search_box = driver.find_element(By.XPATH, '//*[@id="search.keyword.query"]')
    search_box.send_keys(place)
    submit = driver.find_element(By.XPATH, '//*[@id="search.keyword.submit"]')
    submit.send_keys(Keys.ENTER)
    sleep(1)  # let the result page render

    # Parse the first page of results; selectors follow Kakao Map's markup.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    place_lists = soup.select('.placelist > .PlaceItem')  # listed places

    try:
        # Crawl only the first result page (distance-sorted listing).
        crawling(place, place_lists)
        search_box.clear()
    except ElementNotInteractableException:
        print('not found')
def crawling(place, place_lists):
    """Open each listed place's detail tab and extract its reviews.

    Appends each place name to the module-level ``name_list`` and delegates
    review extraction to ``extract_review``.

    :param place: search keyword this result page belongs to (kept for
        interface compatibility; not read inside the loop)
    :param place_lists: BeautifulSoup tags for the listed places
    """
    # Fix: the original loop rebound the *place* parameter as its loop
    # variable; it also carried unused locals (while_flag, place_address).
    for idx, item in enumerate(place_lists):
        place_name = item.select('.head_item > .tit_name > .link_name')[0].text
        # NOTE(review): the li[] index may need adjusting when ad items are
        # interleaved in the result list (per the original Korean comment).
        detail_page_xpath = (
            f'//*[@id="info.search.place.list"]/li[{idx + 1}]/div[5]/div[4]/a[1]'
        )
        driver.find_element(By.XPATH, detail_page_xpath).send_keys(Keys.ENTER)
        driver.switch_to.window(driver.window_handles[-1])  # detail tab
        sleep(1)  # let the detail page render
        name_list.append(place_name)
        # Only the first page of reviews is stored for now.
        extract_review(place_name)
        driver.close()
        driver.switch_to.window(driver.window_handles[0])  # back to search tab
def extract_review(place_name):
    """Scrape the rating and first-page reviews from the currently open
    detail tab.

    Appends the numeric rating (with the trailing '점' stripped) to the
    module-level ``score_list`` and the review texts to ``all_review``.

    :param place_name: name of the place (context only; not used in parsing)
    :return: True if at least one review was found, else False
    """
    global driver
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    review_lists = soup.select('.list_evaluation > li')  # first-page reviews

    # Fix: guard the rating lookup — pages without a '.grade_star > em'
    # element used to crash with IndexError on num_rate[0].
    num_rate = soup.select('.grade_star > em')
    if num_rate:
        rating = num_rate[0].text
        score_list.append(rating.replace('점', ''))

    if review_lists:
        reviews = []
        for review in review_lists:
            comment = review.select('.txt_comment > span')
            val = ''
            if comment:
                val = comment[0].text + '/0'
            reviews.append(val)
        all_review.append(reviews)
        return True

    print('no review in extract')
    return False