Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/hooks/pre-commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Pre-commit hook: keep requirements.txt in sync with the imports actually
# used in the working tree. Requires pipreqs (pip install pipreqs).
echo "🔍 Checking for new imports..."

# Generate temp requirements from imports. If pipreqs is missing or fails,
# skip the sync instead of blocking the commit on tooling problems.
if ! pipreqs . --force --ignore data,tests --savepath temp_requirements.txt; then
    echo "⚠️ pipreqs failed or is not installed; skipping requirements sync"
    exit 0
fi

# Merge new packages into requirements.txt (append only; never remove).
if [ -f temp_requirements.txt ]; then
    # IFS= and -r keep the line verbatim (no trimming, no backslash escapes).
    while IFS= read -r pkg; do
        [ -n "$pkg" ] || continue  # skip blank lines
        grep -qxF "$pkg" requirements.txt || echo "$pkg" >> requirements.txt
    done < temp_requirements.txt
    rm temp_requirements.txt
    # Stage the updated file so the sync is included in this very commit.
    git add requirements.txt
    echo "✅ requirements.txt updated"
fi
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,26 @@ Thumbs.db
*.bak
*.zip
*.tar.gz

# Logs
logs/
*.log

# Raw data
data/raw/
*.csv

# Python artifacts
__pycache__/
*.pyc

# Environment files
.env
.env.*

# VS Code settings
.vscode/

# OS-specific
.DS_Store
Thumbs.db
48 changes: 48 additions & 0 deletions Back-End/etl/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Install with: pip install kagglehub[pandas-datasets] python-dotenv
import os
import pandas as pd
from dotenv import load_dotenv
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load environment variables from a local .env file, if present.
load_dotenv()

# Propagate Kaggle credentials from .env into the process environment.
# os.environ[...] = None raises an opaque TypeError, so only set the
# variables that are actually defined instead of crashing at import time.
for _cred in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    _value = os.getenv(_cred)
    if _value is not None:
        os.environ[_cred] = _value

# Dataset and file config
DATASET = "thedevastator/unlock-profits-with-e-commerce-sales-data"
FILE_NAME = "ecommerce_sales_dataset.csv"  # Adjust if needed
RAW_DIR = "data/raw"
RAW_PATH = os.path.join(RAW_DIR, FILE_NAME)

def load_kaggle_dataset():
    """Download and cache the Kaggle dataset if not already saved locally.

    Returns:
        pandas.DataFrame: the dataset, read from the local CSV cache when
        it exists, otherwise downloaded via kagglehub and cached to
        RAW_PATH for future runs.
    """
    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs(RAW_DIR, exist_ok=True)

    if os.path.exists(RAW_PATH):
        print(f"📦 Found cached file: {RAW_PATH}")
        df = pd.read_csv(RAW_PATH)
    else:
        print("⬇️ Downloading dataset from Kaggle...")
        # Network call: fetches the file via kagglehub using the credentials
        # exported above, loading it directly into a pandas DataFrame.
        df = kagglehub.load_dataset(
            KaggleDatasetAdapter.PANDAS,
            DATASET,
            FILE_NAME,
        )
        df.to_csv(RAW_PATH, index=False)
        print(f"✅ Saved raw CSV to {RAW_PATH}")

    print("🔍 First 5 records:")
    print(df.head())
    return df


if __name__ == "__main__":
    load_kaggle_dataset()
Empty file added BackEnd/etl/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions BackEnd/etl/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
import logging

RAW_DIR = 'data/raw/'
LOG_DIR = 'logs/'
LOG_FILE = os.path.join(LOG_DIR, 'extract.log')

# Create the log directory up front so basicConfig can open the file handler.
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Module-level logger (convention) instead of the root logger.
logger = logging.getLogger(__name__)


def ensure_raw_csv_exists():
    """Check that at least one CSV file is present in RAW_DIR.

    Creates RAW_DIR if it does not exist. Returns True when at least one
    ``.csv`` file is found, False when none are present or an error occurs
    while checking (the error is logged, never raised).
    """
    try:
        os.makedirs(RAW_DIR, exist_ok=True)

        csv_files = [f for f in os.listdir(RAW_DIR) if f.endswith('.csv')]

        if not csv_files:
            # Lazy %-style args: the message is only formatted if emitted.
            logger.warning("No CSV file found in data/raw/")
            kaggle_url = "https://www.kaggle.com/datasets/thedevastator/unlock-profits-with-e-commerce-sales-data"
            print("⚠️ No CSV file found in 'data/raw/'.")
            print(f"Please download the dataset from {kaggle_url} and place it in the 'data/raw/' folder.")
            return False

        logger.info("CSV file(s) found: %s", csv_files)
        print(f"✅ Found CSV file(s): {csv_files}")
        return True

    except Exception as e:
        # Boundary handler kept broad on purpose (best-effort check);
        # logger.exception records the full traceback for debugging.
        logger.exception("Error during CSV check: %s", e)
        print(f"❌ Error checking CSV files: {e}")
        return False


if __name__ == "__main__":
    ensure_raw_csv_exists()
Empty file added BackEnd/etl/load.py
Empty file.
Empty file added BackEnd/etl/transform.py
Empty file.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ seaborn
# Optional utilities (recommended for production or automation)
requests
schedule
kagglehub[pandas-datasets]
Loading