52 changes: 52 additions & 0 deletions Back-End/etl/test_clean_dataframe.py
@@ -0,0 +1,52 @@
import os
import pandas as pd
import importlib

etl_module = importlib.import_module("Back-End.etl.transform")
config_module = importlib.import_module("Back-End.etl.transform_config")

clean_dataframe = getattr(etl_module, "clean_dataframe")
TRANSFORM_CONFIG = getattr(config_module, "TRANSFORM_CONFIG")

RAW_DIR = "data/raw"

for filename in os.listdir(RAW_DIR):
    if filename.endswith(".csv") and filename in TRANSFORM_CONFIG:
        print(f"\n🔍 Testing: {filename}")
        config = TRANSFORM_CONFIG[filename]
        try:
            df = pd.read_csv(os.path.join(RAW_DIR, filename))
            df_cleaned = clean_dataframe(df, config["required_columns"], config["dedup_keys"])
            print(f"✅ {filename}: {len(df_cleaned)} rows after cleaning")
        except Exception as e:
            print(f"❌ {filename}: Error during cleaning — {e}")
import os
import pandas as pd
import importlib

etl_module = importlib.import_module("Back-End.etl.transform")
config_module = importlib.import_module("Back-End.etl.transform_config")

transform_file = getattr(etl_module, "transform_file")
TRANSFORM_CONFIG = getattr(config_module, "TRANSFORM_CONFIG")

RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

for filename in os.listdir(RAW_DIR):
    if filename.endswith(".csv") and filename in TRANSFORM_CONFIG:
        print(f"\n🔄 Transforming: {filename}")
        config = TRANSFORM_CONFIG[filename]
        try:
            df_transformed = transform_file(
                filepath=os.path.join(RAW_DIR, filename),
                required_columns=config["required_columns"],
                dedup_keys=config["dedup_keys"],
                column_types=config["column_types"]
            )
            output_path = os.path.join(PROCESSED_DIR, filename)
            df_transformed.to_csv(output_path, index=False)
            print(f"✅ Saved cleaned file to: {output_path}")
        except Exception as e:
            print(f"❌ {filename}: Transform failed — {e}")
73 changes: 73 additions & 0 deletions Back-End/etl/transform.py
@@ -0,0 +1,73 @@
import os
import pandas as pd
import logging

# Setup logging
LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=os.path.join(LOG_DIR, 'transform.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def clean_dataframe(df: pd.DataFrame, required_columns: list, dedup_keys: list) -> pd.DataFrame:
    """
    Cleans a DataFrame by dropping rows with missing critical fields and deduplicating.
    """
    initial_rows = len(df)
    df_cleaned = df.dropna(subset=required_columns)
    df_cleaned = df_cleaned.drop_duplicates(subset=dedup_keys)
    final_rows = len(df_cleaned)

    logging.info(f"Cleaned DataFrame: {initial_rows - final_rows} rows removed")
    print(f"✅ Cleaned DataFrame: {initial_rows - final_rows} rows removed")

    return df_cleaned


def validate_types(df: pd.DataFrame, column_types: dict) -> pd.DataFrame:
    """
    Validates and converts column types based on expected schema.
    """
    for col, expected_type in column_types.items():
        try:
            df[col] = df[col].astype(expected_type)
            logging.info(f"Converted column '{col}' to {expected_type}")
        except Exception as e:
            logging.warning(f"Type conversion failed for column '{col}': {e}")
            print(f"⚠️ Type conversion failed for '{col}': {e}")
    return df


def transform_file(filepath: str, required_columns: list, dedup_keys: list, column_types: dict) -> pd.DataFrame:
    """
    Full transform pipeline for a single CSV file.
    """
    try:
        df = pd.read_csv(filepath)
        logging.info(f"Loaded file: {filepath}")
        df = clean_dataframe(df, required_columns, dedup_keys)
        df = validate_types(df, column_types)
        return df
    except Exception as e:
        logging.error(f"Error transforming file {filepath}: {e}")
        print(f"❌ Error transforming file {filepath}: {e}")
        return pd.DataFrame()


if __name__ == "__main__":
# Example usage
FILEPATH = 'data/raw/Sale Report.csv'
REQUIRED = ['ProductID', 'SaleAmount']
DEDUP_KEYS = ['ProductID', 'Date', 'StoreID']
COLUMN_TYPES = {
'ProductID': str,
'SaleAmount': float,
'Date': 'datetime64[ns]',
'StoreID': str
}

df_transformed = transform_file(FILEPATH, REQUIRED, DEDUP_KEYS, COLUMN_TYPES)
print(df_transformed.head())
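Because transform_file swallows exceptions and returns an empty DataFrame, a caller that writes the result straight to disk can silently produce empty CSVs. A minimal guard in the calling script, sketched against the variable names used in the test script above, would be:

if df_transformed.empty:
    # Nothing survived the transform (or it failed), so skip the write instead of saving an empty file.
    print(f"⚠️ {filename}: transform returned no rows, skipping write to {output_path}")
else:
    df_transformed.to_csv(output_path, index=False)
    print(f"✅ Saved cleaned file to: {output_path}")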
70 changes: 70 additions & 0 deletions Back-End/etl/transform_config.py
@@ -0,0 +1,70 @@
TRANSFORM_CONFIG = {
    "Amazon Sale Report.csv": {
        "required_columns": ["Order ID", "Date", "Amount", "SKU"],
        "dedup_keys": ["Order ID", "Date", "SKU"],
        "column_types": {
            "Order ID": str,
            "Date": "datetime64[ns]",
            "Amount": float,
            "SKU": str
        }
    },
    "Cloud Warehouse Compersion Chart.csv": {
        "required_columns": ["Shiprocket", "INCREFF"],
        "dedup_keys": ["Shiprocket", "INCREFF"],
        "column_types": {
            "Shiprocket": float,
            "INCREFF": float
        }
    },
    "Expense IIGF.csv": {
        "required_columns": ["Recived Amount", "Expance"],
        "dedup_keys": ["Recived Amount", "Expance"],
        "column_types": {
            "Recived Amount": float,
            "Expance": str
        }
    },
    "International sale Report.csv": {
        "required_columns": ["DATE", "CUSTOMER", "PCS", "RATE"],
        "dedup_keys": ["DATE", "CUSTOMER", "SKU"],
        "column_types": {
            "DATE": "datetime64[ns]",
            "CUSTOMER": str,
            "PCS": int,
            "RATE": float,
            "GROSS AMT": float
        }
    },
    "May-2022.csv": {
        "required_columns": ["Sku", "Style Id", "Category", "TP", "Ajio MRP"],
        "dedup_keys": ["Sku", "Style Id", "Category"],
        "column_types": {
            "Sku": str,
            "Style Id": str,
            "Category": str,
            "TP": float,
            "Ajio MRP": float
        }
    },
    "P L March 2021.csv": {
        "required_columns": ["Sku", "Style Id", "Category", "TP 1", "Ajio MRP"],
        "dedup_keys": ["Sku", "Style Id", "Category"],
        "column_types": {
            "Sku": str,
            "Style Id": str,
            "Category": str,
            "TP 1": float,
            "Ajio MRP": float
        }
    },
    "Sale Report.csv": {
        "required_columns": ["SKU Code", "Design No.", "Stock"],
        "dedup_keys": ["SKU Code", "Design No."],
        "column_types": {
            "SKU Code": str,
            "Design No.": str,
            "Stock": float
        }
    }
}
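Each entry maps a raw filename to the three keyword arguments that transform_file expects. Note that a dedup key is not required to appear in required_columns (for example "SKU" under "International sale Report.csv"), so a small consistency check can catch misspelled keys before the pipeline runs. A minimal sketch, assuming only the structure shown above:

REQUIRED_KEYS = {"required_columns", "dedup_keys", "column_types"}

for filename, config in TRANSFORM_CONFIG.items():
    # Every entry must declare all three keyword arguments used by transform_file.
    missing = REQUIRED_KEYS - set(config)
    if missing:
        raise ValueError(f"{filename}: config entry is missing {sorted(missing)}")
    # Flag required columns that have no declared type, which usually indicates a typo.
    untyped = [col for col in config["required_columns"] if col not in config["column_types"]]
    if untyped:
        print(f"⚠️ {filename}: required columns without a declared type: {untyped}")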
Empty file removed BackEnd/etl/__init__.py
Empty file.
40 changes: 0 additions & 40 deletions BackEnd/etl/extract.py

This file was deleted.

Empty file removed BackEnd/etl/load.py
Empty file.
Empty file removed BackEnd/etl/transform.py
Empty file.