Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/hooks/pre-commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Pre-commit hook: keep requirements.txt in sync with the imports actually
# used in the working tree. Requires pipreqs (pip install pipreqs).
echo "🔍 Checking for new imports..."

# Generate temp requirements from imports. If pipreqs is missing or fails,
# skip the sync instead of blocking the commit on tooling problems.
if ! pipreqs . --force --ignore data,tests --savepath temp_requirements.txt; then
    echo "⚠️ pipreqs failed or is not installed; skipping requirements sync"
    exit 0
fi

# Merge new packages into requirements.txt (append only; never remove).
if [ -f temp_requirements.txt ]; then
    # IFS= and -r keep the line verbatim (no trimming, no backslash escapes).
    while IFS= read -r pkg; do
        [ -n "$pkg" ] || continue  # skip blank lines
        grep -qxF "$pkg" requirements.txt || echo "$pkg" >> requirements.txt
    done < temp_requirements.txt
    rm temp_requirements.txt
    # Stage the updated file so the sync is included in this very commit.
    git add requirements.txt
    echo "✅ requirements.txt updated"
fi
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,26 @@ Thumbs.db
*.bak
*.zip
*.tar.gz

# Logs
logs/
*.log

# Raw data
data/raw/
*.csv

# Python artifacts
__pycache__/
*.pyc

# Environment files
.env
.env.*

# VS Code settings
.vscode/

# OS-specific
.DS_Store
Thumbs.db
48 changes: 48 additions & 0 deletions Back-End/etl/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Install with: pip install kagglehub[pandas-datasets] python-dotenv
import os
import pandas as pd
from dotenv import load_dotenv
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load environment variables from a local .env file, if present.
load_dotenv()

# Propagate Kaggle credentials from .env into the process environment.
# os.environ[...] = None raises an opaque TypeError, so only set the
# variables that are actually defined instead of crashing at import time.
for _cred in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    _value = os.getenv(_cred)
    if _value is not None:
        os.environ[_cred] = _value

# Dataset and file config
DATASET = "thedevastator/unlock-profits-with-e-commerce-sales-data"
FILE_NAME = "ecommerce_sales_dataset.csv"  # Adjust if needed
RAW_DIR = "data/raw"
RAW_PATH = os.path.join(RAW_DIR, FILE_NAME)

def load_kaggle_dataset():
    """Download and cache the Kaggle dataset if not already saved locally.

    Returns:
        pandas.DataFrame: the dataset, read from the local CSV cache when
        it exists, otherwise downloaded via kagglehub and cached to
        RAW_PATH for future runs.
    """
    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs(RAW_DIR, exist_ok=True)

    if os.path.exists(RAW_PATH):
        print(f"📦 Found cached file: {RAW_PATH}")
        df = pd.read_csv(RAW_PATH)
    else:
        print("⬇️ Downloading dataset from Kaggle...")
        # Network call: fetches the file via kagglehub using the credentials
        # exported above, loading it directly into a pandas DataFrame.
        df = kagglehub.load_dataset(
            KaggleDatasetAdapter.PANDAS,
            DATASET,
            FILE_NAME,
        )
        df.to_csv(RAW_PATH, index=False)
        print(f"✅ Saved raw CSV to {RAW_PATH}")

    print("🔍 First 5 records:")
    print(df.head())
    return df


if __name__ == "__main__":
    load_kaggle_dataset()
Empty file added BackEnd/etl/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions BackEnd/etl/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
import logging

RAW_DIR = 'data/raw/'
LOG_DIR = 'logs/'
LOG_FILE = os.path.join(LOG_DIR, 'extract.log')

# Create the log directory up front so basicConfig can open the file handler.
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Module-level logger (convention) instead of the root logger.
logger = logging.getLogger(__name__)


def ensure_raw_csv_exists():
    """Check that at least one CSV file is present in RAW_DIR.

    Creates RAW_DIR if it does not exist. Returns True when at least one
    ``.csv`` file is found, False when none are present or an error occurs
    while checking (the error is logged, never raised).
    """
    try:
        os.makedirs(RAW_DIR, exist_ok=True)

        csv_files = [f for f in os.listdir(RAW_DIR) if f.endswith('.csv')]

        if not csv_files:
            # Lazy %-style args: the message is only formatted if emitted.
            logger.warning("No CSV file found in data/raw/")
            kaggle_url = "https://www.kaggle.com/datasets/thedevastator/unlock-profits-with-e-commerce-sales-data"
            print("⚠️ No CSV file found in 'data/raw/'.")
            print(f"Please download the dataset from {kaggle_url} and place it in the 'data/raw/' folder.")
            return False

        logger.info("CSV file(s) found: %s", csv_files)
        print(f"✅ Found CSV file(s): {csv_files}")
        return True

    except Exception as e:
        # Boundary handler kept broad on purpose (best-effort check);
        # logger.exception records the full traceback for debugging.
        logger.exception("Error during CSV check: %s", e)
        print(f"❌ Error checking CSV files: {e}")
        return False


if __name__ == "__main__":
    ensure_raw_csv_exists()
Empty file added BackEnd/etl/load.py
Empty file.
Empty file added BackEnd/etl/transform.py
Empty file.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ seaborn
# Optional utilities (recommended for production or automation)
requests
schedule
kagglehub[pandas-datasets]
Loading