52 changes: 52 additions & 0 deletions Back-End/etl/test_clean_dataframe.py
@@ -0,0 +1,52 @@
import os
import pandas as pd
import importlib

etl_module = importlib.import_module("Back-End.etl.transform")
config_module = importlib.import_module("Back-End.etl.transform_config")

clean_dataframe = getattr(etl_module, "clean_dataframe")
TRANSFORM_CONFIG = getattr(config_module, "TRANSFORM_CONFIG")

RAW_DIR = "data/raw"

for filename in os.listdir(RAW_DIR):
    if filename.endswith(".csv") and filename in TRANSFORM_CONFIG:
        print(f"\n🔍 Testing: {filename}")
        config = TRANSFORM_CONFIG[filename]
        try:
            df = pd.read_csv(os.path.join(RAW_DIR, filename))
            df_cleaned = clean_dataframe(df, config["required_columns"], config["dedup_keys"])
            print(f"✅ {filename}: {len(df_cleaned)} rows after cleaning")
        except Exception as e:
            print(f"❌ {filename}: Error during cleaning — {e}")
import os
import pandas as pd
import importlib

etl_module = importlib.import_module("Back-End.etl.transform")
config_module = importlib.import_module("Back-End.etl.transform_config")

transform_file = getattr(etl_module, "transform_file")
TRANSFORM_CONFIG = getattr(config_module, "TRANSFORM_CONFIG")

RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

for filename in os.listdir(RAW_DIR):
    if filename.endswith(".csv") and filename in TRANSFORM_CONFIG:
        print(f"\n🔄 Transforming: {filename}")
        config = TRANSFORM_CONFIG[filename]
        try:
            df_transformed = transform_file(
                filepath=os.path.join(RAW_DIR, filename),
                required_columns=config["required_columns"],
                dedup_keys=config["dedup_keys"],
                column_types=config["column_types"]
            )
            output_path = os.path.join(PROCESSED_DIR, filename)
            df_transformed.to_csv(output_path, index=False)
            print(f"✅ Saved cleaned file to: {output_path}")
        except Exception as e:
            print(f"❌ {filename}: Transform failed — {e}")
73 changes: 73 additions & 0 deletions Back-End/etl/transform.py
@@ -0,0 +1,73 @@
import os
import pandas as pd
import logging

# Setup logging
LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=os.path.join(LOG_DIR, 'transform.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def clean_dataframe(df: pd.DataFrame, required_columns: list, dedup_keys: list) -> pd.DataFrame:
    """
    Cleans a DataFrame by dropping rows with missing critical fields and deduplicating.
    """
    initial_rows = len(df)
    df_cleaned = df.dropna(subset=required_columns)
    df_cleaned = df_cleaned.drop_duplicates(subset=dedup_keys)
    final_rows = len(df_cleaned)

    logging.info(f"Cleaned DataFrame: {initial_rows - final_rows} rows removed")
    print(f"✅ Cleaned DataFrame: {initial_rows - final_rows} rows removed")

    return df_cleaned


def validate_types(df: pd.DataFrame, column_types: dict) -> pd.DataFrame:
    """
    Validates and converts column types based on expected schema.
    """
    for col, expected_type in column_types.items():
        try:
            df[col] = df[col].astype(expected_type)
            logging.info(f"Converted column '{col}' to {expected_type}")
        except Exception as e:
            logging.warning(f"Type conversion failed for column '{col}': {e}")
            print(f"⚠️ Type conversion failed for '{col}': {e}")
    return df


def transform_file(filepath: str, required_columns: list, dedup_keys: list, column_types: dict) -> pd.DataFrame:
    """
    Full transform pipeline for a single CSV file.
    """
    try:
        df = pd.read_csv(filepath)
        logging.info(f"Loaded file: {filepath}")
        df = clean_dataframe(df, required_columns, dedup_keys)
        df = validate_types(df, column_types)
        return df
    except Exception as e:
        logging.error(f"Error transforming file {filepath}: {e}")
        print(f"❌ Error transforming file {filepath}: {e}")
        return pd.DataFrame()


if __name__ == "__main__":
# Example usage
FILEPATH = 'data/raw/Sale Report.csv'
REQUIRED = ['ProductID', 'SaleAmount']
DEDUP_KEYS = ['ProductID', 'Date', 'StoreID']
COLUMN_TYPES = {
'ProductID': str,
'SaleAmount': float,
'Date': 'datetime64[ns]',
'StoreID': str
}

df_transformed = transform_file(FILEPATH, REQUIRED, DEDUP_KEYS, COLUMN_TYPES)
print(df_transformed.head())
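Because transform_file swallows exceptions and returns an empty DataFrame, a caller that writes the result straight to disk can silently produce empty CSVs. A minimal guard in the calling script, sketched against the variable names used in the test script above, would be:

if df_transformed.empty:
    # Nothing survived the transform (or it failed), so skip the write instead of saving an empty file.
    print(f"⚠️ {filename}: transform returned no rows, skipping write to {output_path}")
else:
    df_transformed.to_csv(output_path, index=False)
    print(f"✅ Saved cleaned file to: {output_path}")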
70 changes: 70 additions & 0 deletions Back-End/etl/transform_config.py
@@ -0,0 +1,70 @@
TRANSFORM_CONFIG = {
    "Amazon Sale Report.csv": {
        "required_columns": ["Order ID", "Date", "Amount", "SKU"],
        "dedup_keys": ["Order ID", "Date", "SKU"],
        "column_types": {
            "Order ID": str,
            "Date": "datetime64[ns]",
            "Amount": float,
            "SKU": str
        }
    },
    "Cloud Warehouse Compersion Chart.csv": {
        "required_columns": ["Shiprocket", "INCREFF"],
        "dedup_keys": ["Shiprocket", "INCREFF"],
        "column_types": {
            "Shiprocket": float,
            "INCREFF": float
        }
    },
    "Expense IIGF.csv": {
        "required_columns": ["Recived Amount", "Expance"],
        "dedup_keys": ["Recived Amount", "Expance"],
        "column_types": {
            "Recived Amount": float,
            "Expance": str
        }
    },
    "International sale Report.csv": {
        "required_columns": ["DATE", "CUSTOMER", "PCS", "RATE"],
        "dedup_keys": ["DATE", "CUSTOMER", "SKU"],
        "column_types": {
            "DATE": "datetime64[ns]",
            "CUSTOMER": str,
            "PCS": int,
            "RATE": float,
            "GROSS AMT": float
        }
    },
    "May-2022.csv": {
        "required_columns": ["Sku", "Style Id", "Category", "TP", "Ajio MRP"],
        "dedup_keys": ["Sku", "Style Id", "Category"],
        "column_types": {
            "Sku": str,
            "Style Id": str,
            "Category": str,
            "TP": float,
            "Ajio MRP": float
        }
    },
    "P L March 2021.csv": {
        "required_columns": ["Sku", "Style Id", "Category", "TP 1", "Ajio MRP"],
        "dedup_keys": ["Sku", "Style Id", "Category"],
        "column_types": {
            "Sku": str,
            "Style Id": str,
            "Category": str,
            "TP 1": float,
            "Ajio MRP": float
        }
    },
    "Sale Report.csv": {
        "required_columns": ["SKU Code", "Design No.", "Stock"],
        "dedup_keys": ["SKU Code", "Design No."],
        "column_types": {
            "SKU Code": str,
            "Design No.": str,
            "Stock": float
        }
    }
}
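Each entry maps a raw filename to the three keyword arguments that transform_file expects. Note that a dedup key is not required to appear in required_columns (for example "SKU" under "International sale Report.csv"), so a small consistency check can catch misspelled keys before the pipeline runs. A minimal sketch, assuming only the structure shown above:

REQUIRED_KEYS = {"required_columns", "dedup_keys", "column_types"}

for filename, config in TRANSFORM_CONFIG.items():
    # Every entry must declare all three keyword arguments used by transform_file.
    missing = REQUIRED_KEYS - set(config)
    if missing:
        raise ValueError(f"{filename}: config entry is missing {sorted(missing)}")
    # Flag required columns that have no declared type, which usually indicates a typo.
    untyped = [col for col in config["required_columns"] if col not in config["column_types"]]
    if untyped:
        print(f"⚠️ {filename}: required columns without a declared type: {untyped}")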
Empty file removed BackEnd/etl/__init__.py
Empty file.
40 changes: 0 additions & 40 deletions BackEnd/etl/extract.py

This file was deleted.

Empty file removed BackEnd/etl/load.py
Empty file.
Empty file removed BackEnd/etl/transform.py
Empty file.