diff --git a/.env.example b/.env.example index 9a0354b..c0ea240 100644 --- a/.env.example +++ b/.env.example @@ -2,7 +2,7 @@ # Never commit real secrets # Groq API key used by chatbot features -GROQ_API_KEY=your_groq_api_key_here +GROQ_API_KEY="" # Optional: override dataset path for local runs # NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv diff --git a/ml_pipeline.py b/ml_pipeline.py index 93181ec..ebbb8ac 100644 --- a/ml_pipeline.py +++ b/ml_pipeline.py @@ -38,7 +38,8 @@ # SECTION 1 │ LOAD DATASET FROM CSV # ═════════════════════════════════════════════════════════════════════════════ -DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "broken-ai_deadcode_dataset.csv") +# DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "broken-ai_deadcode_dataset.csv") +DATASET_PATH = "data/broken-ai_deadcode_dataset.csv" def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: @@ -190,7 +191,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for col in numeric_cols: if col in df.columns: - df[col] = pd.to_numeric(df[col], errors="ignore") + df[col] = pd.to_numeric(df[col], errors="coerce") + df.replace(["MISSING", "missing", "unknown", "#REF!"], np.nan, inplace=True) + # df[col] = pd.to_numeric(df[col], errors="ignore") # Replace ±inf with 0 so they slip past null checks df.replace([np.inf, -np.inf], 0, inplace=True) @@ -247,7 +250,8 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Drop rows with excessive nulls threshold = 0.5 -rows_to_drop = df[df.isnull().mean() > threshold].index +rows_to_drop = df[df.isnull().mean(axis=1) > threshold].index +# rows_to_drop = df[df.isnull().mean() > threshold].index df = df.drop(index=rows_to_drop) df_clean = df.copy() @@ -426,7 +430,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for name, model in models.items(): scores = cross_val_score( model, - X_scalled, + X_scaled, y, scoring="accuracy", cv=kf, diff --git a/plots/eda_categorical.png b/plots/eda_categorical.png new file mode 100644 index 0000000..4e97f8c Binary files /dev/null and b/plots/eda_categorical.png differ diff --git a/plots/eda_histograms.png b/plots/eda_histograms.png new file mode 100644 index 0000000..2bf369b Binary files /dev/null and b/plots/eda_histograms.png differ