ayanokojix21 · Eshita65 · Apr 4, 2026
diff --git a/.env.example b/.env.example
@@ -2,7 +2,7 @@
 # Never commit real secrets
 
 # Groq API key used by chatbot features
-GROQ_API_KEY=your_groq_api_key_here
+GROQ_API_KEY=""
 
 # Optional: override dataset path for local runs
 # NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv
diff --git a/ml_pipeline.py b/ml_pipeline.py
@@ -38,7 +38,8 @@
 # SECTION 1 │ LOAD DATASET FROM CSV
 # ═════════════════════════════════════════════════════════════════════════════
 
-DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "broken-ai_deadcode_dataset.csv")
+# NEXALEARN_DATASET_PATH (see .env.example) overrides the default CSV location.
+DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "data/broken-ai_deadcode_dataset.csv")
 
 
 def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
@@ -190,7 +191,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 for col in numeric_cols:
     if col in df.columns:
-        df[col] = pd.to_numeric(df[col], errors="ignore")              
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+# Placeholder tokens left in the other columns also count as missing; one pass after the loop is enough.
+df.replace(["MISSING", "missing", "unknown", "#REF!"], np.nan, inplace=True)
 
 # Replace ±inf with 0 so they slip past null checks
 df.replace([np.inf, -np.inf], 0, inplace=True)                         
@@ -247,7 +250,8 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # Drop rows with excessive nulls
 threshold    = 0.5
-rows_to_drop = df[df.isnull().mean() > threshold].index                
+# axis=1 gives each row's null fraction; the default axis=0 would give per-column fractions instead.
+rows_to_drop = df[df.isnull().mean(axis=1) > threshold].index
 df           = df.drop(index=rows_to_drop)
 
 df_clean = df.copy()
@@ -426,7 +430,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 for name, model in models.items():
     scores = cross_val_score(
         model,
-        X_scalled,                                                     
+        X_scaled,                                                     
         y,
         scoring="accuracy",                                            
         cv=kf,

diff --git a/plots/eda_categorical.png b/plots/eda_categorical.png
diff --git a/plots/eda_histograms.png b/plots/eda_histograms.png