diff --git a/README.md b/README.md index 065c07a..37f4728 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # TaobaoProduct -Selenium Demo of Taobao Product +Selenium Demo of Taobao Product +test 2020/2/10 更新,关于需要登录的问题,见 Issue:https://github.com/Python3WebSpider/TaobaoProduct/issues/15 diff --git a/bili.png b/bili.png new file mode 100644 index 0000000..6e7e9d6 Binary files /dev/null and b/bili.png differ diff --git a/config.py b/config.py index 3fc5406..5a60d33 100644 --- a/config.py +++ b/config.py @@ -1,7 +1,6 @@ MONGO_URL = 'localhost' MONGO_DB = 'taobao' MONGO_COLLECTION = 'products' - KEYWORD = 'ipad' MAX_PAGE = 100 diff --git a/spider.py b/spider.py deleted file mode 100644 index 4e2c264..0000000 --- a/spider.py +++ /dev/null @@ -1,90 +0,0 @@ -import pymongo -from selenium import webdriver -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.wait import WebDriverWait -from pyquery import PyQuery as pq -from config import * -from urllib.parse import quote - -# browser = webdriver.Chrome() -# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) - -chrome_options = webdriver.ChromeOptions() -chrome_options.add_argument('--headless') -browser = webdriver.Chrome(chrome_options=chrome_options) - -wait = WebDriverWait(browser, 10) -client = pymongo.MongoClient(MONGO_URL) -db = client[MONGO_DB] - - -def index_page(page): - """ - 抓取索引页 - :param page: 页码 - """ - print('正在爬取第', page, '页') - try: - url = 'https://s.taobao.com/search?q=' + quote(KEYWORD) - browser.get(url) - if page > 1: - input = wait.until( - EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input'))) - submit = wait.until( - EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit'))) - input.clear() - input.send_keys(page) - submit.click() - wait.until( - EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page))) - wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item'))) - get_products() - except TimeoutException: - index_page(page) - - -def get_products(): - """ - 提取商品数据 - """ - html = browser.page_source - doc = pq(html) - items = doc('#mainsrp-itemlist .items .item').items() - for item in items: - product = { - 'image': item.find('.pic .img').attr('data-src'), - 'price': item.find('.price').text(), - 'deal': item.find('.deal-cnt').text(), - 'title': item.find('.title').text(), - 'shop': item.find('.shop').text(), - 'location': item.find('.location').text() - } - print(product) - save_to_mongo(product) - - -def save_to_mongo(result): - """ - 保存至MongoDB - :param result: 结果 - """ - try: - if db[MONGO_COLLECTION].insert(result): - print('存储到MongoDB成功') - except Exception: - print('存储到MongoDB失败') - - -def main(): - """ - 遍历每一页 - """ - for i in range(1, MAX_PAGE + 1): - index_page(i) - browser.close() - - -if __name__ == '__main__': - main()