forked from tlovestatus/instapy001
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
84 lines (75 loc) · 3.17 KB
/
script.py
File metadata and controls
84 lines (75 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import csv
import datetime
import os
from urllib.parse import urljoin

import requests
from parsel import Selector
# Run timestamp embedded in the output CSV name and the image folder name.
date_part = datetime.datetime.now().strftime('%d %b %Y %H_%M_%S')
output_csv_name = f'Output_{date_part}.csv'
input_file_name = 'scraper_input.csv'
img_folder = f'images_{date_part}'

# exist_ok=True tolerates a folder that is already there while still raising
# on real failures (e.g. missing permissions) instead of hiding them.
os.makedirs(img_folder, exist_ok=True)

# Column headers of the output CSV, written before any product rows.
row = ['Product SKU', 'Product Name', 'Product Brand',
       'Product Price', 'Image URLs seperated by Semicolon']
with open(output_csv_name, 'a', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(row)
def download_image(img_url, path_to_store, timeout=30):
    """Download ``img_url`` and write the response body to ``path_to_store``.

    The download is best-effort: a network error, a timeout or a non-success
    HTTP status is reported on stdout and the image is skipped, so one bad
    image does not abort the surrounding scrape.

    Args:
        img_url: Address of the image. A falsy value (``None`` or ``''``)
            makes the call a no-op.
        path_to_store: Path of the file the image bytes are written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    if not img_url:
        return

    try:
        response = requests.get(img_url, timeout=timeout)
        # Without this check an HTML error page would be saved as the image.
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Could not download image {img_url}: {err}")
        return

    with open(path_to_store, 'wb') as img:
        img.write(response.content)
def scrape_product(product_url, timeout=30):
    """Scrape one product page and download its gallery images.

    Args:
        product_url: Address of the product page to fetch.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A list ``[sku, name, brand, price, image_urls]`` where ``image_urls``
        is the gallery image URLs joined with ``;``. Any of the first four
        entries is ``None`` when the matching node is absent from the page.
    """
    print(f"Scraping {product_url}")
    response = requests.get(product_url, timeout=timeout)
    sel = Selector(response.text)

    product_sku = sel.xpath(
        '//div[@class="productSKU"]/dd[@class="productView-info-value"]/text()'
    ).extract_first()
    product_name = sel.xpath(
        '//div[@data-event-type="product"]/@data-name').extract_first()
    product_brand = sel.xpath(
        '//div[@data-event-type="product"]/@data-product-brand').extract_first()
    product_price = sel.xpath(
        '//div[@data-event-type="product"]/@data-product-price').extract_first()
    product_images = sel.xpath(
        '//li[contains(@class,"productView-thumbnail")]/a/@data-image-gallery-new-image-url'
    ).extract()

    # The SKU is free text taken from the page. Strip surrounding whitespace
    # and neutralise path separators so the image file always lands directly
    # inside img_folder. The SKU returned in the row is left untouched.
    file_stem = str(product_sku).strip().replace('/', '_').replace('\\', '_')
    for n, product_image in enumerate(product_images, start=1):
        image_path = os.path.join(img_folder, f'{file_stem}-{n}.jpg')
        download_image(product_image, image_path)

    product_images_str = ';'.join(product_images)
    return [product_sku, product_name, product_brand, product_price, product_images_str]
def get_all_product_url(search_url, timeout=30):
    """Collect product page URLs from a search page and its following pages.

    Starting at ``search_url``, the function gathers the product links on the
    page and then follows the pagination "next" link until there is none.

    Args:
        search_url: Address of the first search/listing page.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list of product URLs in the order they were found. Links are
        resolved against the page they came from, so relative hrefs come back
        absolute; absolute hrefs are returned unchanged.
    """
    all_products_url = []
    visited_pages = set()
    page_url = search_url

    while True:
        visited_pages.add(page_url)
        response = requests.get(page_url, timeout=timeout)
        sel = Selector(response.text)

        product_hrefs = sel.xpath(
            '//li[@class="product"]/article/div/div/h4/a/@href').extract()
        # urljoin leaves absolute links alone and completes relative ones.
        all_products_url += [urljoin(response.url, href) for href in product_hrefs]

        next_href = sel.xpath(
            '//li[contains(@class,"pagination-item--next")]/a/@href').extract_first()
        next_page = urljoin(response.url, next_href) if next_href else None

        # Stop on the last page, or when the "next" link leads back to a page
        # already fetched (which would otherwise loop forever).
        if not next_page or next_page in visited_pages:
            break

        print(
            f"Products URLs Found:{len(all_products_url)} and moving to next page...")
        page_url = next_page

    print(f"Total Products Found:{len(all_products_url)}")
    return all_products_url
# Read the first column of every input row. csv.reader returns an empty list
# for a blank line, so map those to '' rather than indexing them; indexing
# raised IndexError before the empty-line check below could ever run.
with open(input_file_name, 'r', newline='') as infile:
    urls_to_scrape = [line[0] if line else '' for line in csv.reader(infile)]

# The first row is skipped -- presumably a header line; confirm against the
# expected layout of the input file.
for search_url in urls_to_scrape[1:]:
    if not search_url:
        print('Might be a empty line skipping it...')
        continue

    all_products_url = get_all_product_url(search_url)
    print(f'Search URL: {search_url}')

    for product_url in all_products_url:
        row = scrape_product(product_url)
        # Append each row as soon as it is scraped so rows already written
        # are kept if a later product fails.
        with open(output_csv_name, 'a', newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(row)
        print("Scraped and added to CSV...")

    # Record the search URL once all of its products have been processed.
    with open('url_scraped.csv', 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([search_url])