# scraper/scraper.py (bugemarvin/scraper, main branch)

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import os
import time
from webdriver_manager.chrome import ChromeDriverManager

# Configure WebDriver
def init_driver():
    """Create and return a Chrome WebDriver that runs in headless mode.

    The ChromeDriver location is obtained from
    ``ChromeDriverManager().install()`` and handed to a ``Service`` object.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument("--headless")  # no visible browser window
    # NOTE(review): presumably downloads/caches the driver binary and returns
    # its path -- confirm against the webdriver_manager documentation.
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=options)

# Convert RGB to HEX
def rgb_to_hex(rgb):
    """Convert a CSS ``rgb(...)`` / ``rgba(...)`` string to ``#rrggbb``.

    Args:
        rgb: Colour function string such as ``"rgb(255, 0, 0)"`` or
            ``"rgba(255, 0, 0, 0.5)"``. Surrounding whitespace is ignored
            and any component after the third (e.g. alpha) is dropped.

    Returns:
        The colour as a lowercase hex string, e.g. ``"#ff0000"``.

    Raises:
        ValueError: If the string has no parenthesised component list, has
            fewer than three components, or a colour component is not an
            integer.
    """
    text = rgb.strip()
    # Locate the component list by its parentheses instead of assuming a
    # fixed 4-character "rgb(" prefix, so "rgba(" is handled as well.
    open_idx = text.find("(")
    close_idx = text.rfind(")")
    if open_idx == -1 or close_idx <= open_idx:
        raise ValueError(f"Not an rgb()/rgba() colour string: {rgb!r}")

    components = text[open_idx + 1:close_idx].split(",")
    if len(components) < 3:
        raise ValueError(f"Expected at least three colour components: {rgb!r}")

    # int() tolerates the whitespace that follows each comma.
    red, green, blue = (int(part) for part in components[:3])
    return f"#{red:02x}{green:02x}{blue:02x}"

# Collect palette data from multiple pages
def collect_palette_data(driver, base_url, num_pages):
    """Scrape palette IDs, names and colours from consecutive listing pages.

    Pages ``{base_url}/?page=1`` through ``{base_url}/?page={num_pages}`` are
    loaded one after another, with a one-second pause after each page. The
    class names and attribute layout used below are tied to the structure of
    the scraped website and may need adjusting if that structure changes.

    Args:
        driver: Selenium WebDriver used to load pages and locate elements.
        base_url: Listing URL without the ``/?page=N`` suffix.
        num_pages: Number of pages to visit, starting from page 1.

    Returns:
        A list of dicts with the keys ``"ID"``, ``"Name"``, ``"HEX"`` and
        ``"RGB"``; the last two are lists of colour strings in page order.
        Palettes whose markup cannot be parsed are reported and skipped.
    """
    data = []
    for page in range(1, num_pages + 1):
        try:
            driver.get(f"{base_url}/?page={page}")
            palette_elements = driver.find_elements(By.CLASS_NAME, "palettecontainerlist")
            print(f"Found {len(palette_elements)} palettes on page {page}.")
            # enumerate() supplies the 1-based position directly instead of
            # searching the list again for every palette.
            for position, palette in enumerate(palette_elements, start=1):
                print(f"Extracting palette data {position}...")
                try:
                    anchor = palette.find_element(By.TAG_NAME, "a")
                    # The last path segment of the link is used as the ID.
                    palette_id = anchor.get_attribute("href").split("/")[-1]
                    # Drop the first two words of the title (presumably a
                    # fixed prefix such as "Color Palette:" -- verify against
                    # the live markup).
                    title_words = anchor.get_attribute("title").split(" ")
                    palette_name = " ".join(title_words[2:])

                    color_elements = palette.find_elements(By.CLASS_NAME, "palettecolordiv")
                    colors_rgb = []
                    colors_hex = []

                    for color_elem in color_elements:
                        style = color_elem.get_attribute("style")
                        if style and "background-color:" in style:
                            # Split on the same token the guard checks, then
                            # strip, so a style written without a space after
                            # the colon is parsed rather than rejected.
                            rgb = style.split("background-color:")[1].split(";")[0].strip()
                            colors_rgb.append(rgb)
                            colors_hex.append(rgb_to_hex(rgb))

                    print(f"Palette Name: {palette_name}" + "\n" + f"Colors: {colors_rgb}" + "\n" + f"HEX: {colors_hex}")
                    data.append({
                        "ID": palette_id,
                        "Name": palette_name,
                        "HEX": colors_hex,
                        "RGB": colors_rgb
                    })
                except NoSuchElementException as e:
                    print(f"Palette elements not found within the container. Error: {e}")
                except (IndexError, ValueError) as e:
                    # ValueError covers an unparsable colour value; skipping
                    # this palette keeps one bad entry from ending the run.
                    print(f"Error extracting background color. Error: {e}")
        except NoSuchElementException:
            print(f"No palettes found on page {page}.")
        except TimeoutException:
            print(f"Timeout occurred on page {page}.")
        time.sleep(1)  # Delay to prevent being blocked
    return data

# Save data to a file
def save_data(data, filename):
    """Write the scraped records to ``filename`` as JSON or CSV.

    The directory that will contain ``filename`` is created first if it does
    not exist yet, so any output location works, not only ``data/``.

    Args:
        data: Records to store; passed directly to ``pd.DataFrame``.
        filename: Output path. A name ending in ``.json`` is written as a
            JSON array of records; anything else is written as CSV without
            the index column.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has no directory part; os.makedirs("") would raise.
    if parent_dir:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(data)
    if filename.endswith('.json'):
        df.to_json(filename, orient='records')
    else:
        df.to_csv(filename, index=False)

# Main function
def main(url="https://www.color-hex.com/color-palettes", num_pages=1764,
         output_file='data/palette_data.csv'):
    """Scrape colour palettes and save them to a file.

    Called without arguments this behaves as before: 1764 pages of the
    default URL are scraped and written to ``data/palette_data.csv``.

    Args:
        url: Listing URL to scrape. The site and its structure may change,
            in which case the scraping code needs to be adjusted.
        num_pages: Number of listing pages to visit.
        output_file: Path handed to ``save_data``; a ``.json`` suffix selects
            JSON output, anything else CSV.
    """
    driver = init_driver()
    try:
        print("Collecting palette data from the specified URL...")
        palette_data = collect_palette_data(driver, url, num_pages)
        print(f"Total palettes extracted: {len(palette_data)}")
        save_data(palette_data, output_file)
    finally:
        # Always shut the browser down, even if scraping or saving failed.
        driver.quit()

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()