-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
120 lines (102 loc) · 4.28 KB
/
config.py
File metadata and controls
120 lines (102 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Configuration settings for WebDataScraper.
All scraper settings can be customized here.
"""
import os
from dotenv import load_dotenv
load_dotenv()
# Spellings (compared case-insensitively, surrounding whitespace ignored)
# that switch a boolean environment flag on.
_TRUE_STRINGS = frozenset(('1', 'true', 'yes', 'on'))


def _env_number(name, default, cast):
    """Return environment variable ``name`` converted with ``cast``.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.
        cast: Conversion callable (``int`` or ``float``).

    Returns:
        The converted value, or ``default`` when nothing usable is set.

    Raises:
        ValueError: If the variable holds text that ``cast`` rejects. The
            message names the offending variable.
    """
    raw = os.getenv(name)
    # A blank entry (e.g. ``SCRAPER_DELAY=`` in a .env file) counts as unset
    # instead of failing the import with an anonymous conversion error.
    if raw is None or not raw.strip():
        return default
    try:
        return cast(raw)
    except ValueError as err:
        raise ValueError(
            'Invalid value {!r} for environment variable {}'.format(raw, name)
        ) from err


def _env_flag(name, default):
    """Return environment variable ``name`` interpreted as a boolean.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` when unset; otherwise True only for ``1``, ``true``,
        ``yes`` or ``on`` (any case, surrounding whitespace ignored). Any
        other text, including a blank value, is False.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUE_STRINGS


class ScraperConfig:
    """Configuration for web scraping behavior.

    Every value is computed once, when this class body is executed. Values
    backed by an environment variable fall back to the default shown here
    when the variable is unset; numeric ones also fall back when it is blank.
    """

    # Rate limiting
    DEFAULT_DELAY = _env_number('SCRAPER_DELAY', 2.0, float)  # Seconds between requests
    MIN_DELAY = _env_number('SCRAPER_MIN_DELAY', 1.0, float)  # Minimum delay
    MAX_DELAY = _env_number('SCRAPER_MAX_DELAY', 10.0, float)  # Maximum delay

    # Retry settings
    MAX_RETRIES = _env_number('SCRAPER_MAX_RETRIES', 3, int)
    RETRY_BACKOFF_FACTOR = _env_number('SCRAPER_RETRY_BACKOFF', 2.0, float)  # Exponential backoff multiplier
    RETRY_STATUSES = [429, 500, 502, 503, 504]  # HTTP status codes to retry

    # Request settings
    TIMEOUT = _env_number('SCRAPER_TIMEOUT', 15, int)  # Request timeout in seconds
    MAX_CONNECTIONS = _env_number('SCRAPER_MAX_CONNECTIONS', 10, int)  # Max concurrent connections

    # User agent
    USER_AGENT = os.getenv(
        'SCRAPER_USER_AGENT',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    # Headers
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-CA,en;q=0.9,fr-CA;q=0.8',
        # NOTE(review): 'br' advertises Brotli support - confirm the HTTP
        # client in use can actually decode Brotli responses.
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    # Data quality
    MIN_CONFIDENCE_THRESHOLD = _env_number('MIN_CONFIDENCE_THRESHOLD', 0.3, float)

    # Logging
    LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
    LOG_TO_FILE = _env_flag('LOG_TO_FILE', True)
    LOG_DIR = os.getenv('LOG_DIR', 'logs')
class DatabaseConfig:
    """Supabase connection settings taken from the process environment.

    Both attributes are read once, when the class body runs, and are None
    when the corresponding variable is not set.
    """

    SUPABASE_URL = os.environ.get('SUPABASE_URL')
    SUPABASE_KEY = os.environ.get('SUPABASE_KEY')
def load_config():
    """Return selected database and scraper settings as a plain dict.

    Returns:
        A new dictionary with the keys ``supabase_url``, ``supabase_key``,
        ``scraper_delay``, ``scraper_timeout`` and ``log_level``, populated
        from the current attributes of DatabaseConfig and ScraperConfig.
    """
    database = DatabaseConfig
    scraper = ScraperConfig
    settings = dict(
        supabase_url=database.SUPABASE_URL,
        supabase_key=database.SUPABASE_KEY,
        scraper_delay=scraper.DEFAULT_DELAY,
        scraper_timeout=scraper.TIMEOUT,
        log_level=scraper.LOG_LEVEL,
    )
    return settings
# Scraping sources configuration.
#
# Maps a source name to {'enabled': <bool>, 'urls': <list>}. Each row of the
# table below is (source name, enabled flag, urls). The 'creditcardgenius'
# entries are (url, category) tuples; every other source lists plain URL
# strings.
SCRAPING_SOURCES = {
    source_name: {'enabled': is_enabled, 'urls': source_urls}
    for source_name, is_enabled, source_urls in (
        ('creditcardgenius', True, [
            ("https://creditcardgenius.ca/best-credit-cards/cash-back", "cashback"),
            ("https://creditcardgenius.ca/best-credit-cards/travel", "travel"),
            ("https://creditcardgenius.ca/best-credit-cards/rewards", "rewards"),
            ("https://creditcardgenius.ca/best-credit-cards/no-fee", "no-fee"),
            ("https://creditcardgenius.ca/best-credit-cards/groceries", "groceries"),
        ]),
        ('ratehub', True, [
            "https://www.ratehub.ca/credit-cards/cash-back",
            "https://www.ratehub.ca/credit-cards/travel",
            "https://www.ratehub.ca/credit-cards/rewards",
            "https://www.ratehub.ca/credit-cards/no-fee",
        ]),
        ('moneysense', True, [
            "https://www.moneysense.ca/spend/credit-cards/best-credit-cards-in-canada/",
            "https://www.moneysense.ca/spend/credit-cards/best-cash-back-credit-cards-in-canada/",
            "https://www.moneysense.ca/spend/credit-cards/best-travel-credit-cards-in-canada/",
        ]),
        ('nerdwallet', True, [
            "https://www.nerdwallet.com/ca/credit-cards/best-cash-back-credit-cards",
            "https://www.nerdwallet.com/ca/credit-cards/best-travel-credit-cards",
            "https://www.nerdwallet.com/ca/credit-cards/best-rewards-credit-cards",
            "https://www.nerdwallet.com/ca/credit-cards/best-no-fee-credit-cards",
        ]),
        ('greedyrates', True, [
            "https://www.greedyrates.ca/blog/best-cash-back-credit-cards-canada/",
            "https://www.greedyrates.ca/blog/best-travel-credit-cards-canada/",
            "https://www.greedyrates.ca/blog/best-rewards-credit-cards-canada/",
            "https://www.greedyrates.ca/blog/best-no-fee-credit-cards-canada/",
        ]),
    )
}