ayanokojix21 · anshbhoraniya2006-ship-it · Apr 4, 2026
diff --git a/.env.example b/.env.example
@@ -2,7 +2,7 @@
 # Never commit real secrets
 
 # Groq API key used by chatbot features
-GROQ_API_KEY=your_groq_api_key_here
+# GROQ_API_KEY=your_groq_api_key_here
 
 # Optional: override dataset path for local runs
-# NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv
+# NEXALEARN_DATASET_PATH=data/broken-ai_deadcode_dataset.csv
diff --git a/config.py b/config.py
@@ -11,28 +11,28 @@
 
 # ── Server ────────────────────────────────────────────────────────────────────
 API_HOST = "0.0.0.0"
-API_PORT = 8001                        
+API_PORT = 8000                    
 
 # ── Saved model paths ─────────────────────────────────────────────────────────
 MODEL_PATH  = "models/best_model.pkl"  
 SCALER_PATH = "models/scaler.pkl"      
 
 # ── Groq LLM ──────────────────────────────────────────────────────────────────
-GROQ_MODEL   = "llama3-8b-8192x"       
-MAX_TOKENS   = 10                      
-TEMPERATURE  = 2.0                     
-GROQ_ENV_VAR = "GROQ_KEY"             
+GROQ_MODEL   = "Qwen/Qwen2.5-7B-Instruct"       
+MAX_TOKENS   = 1024                      
+TEMPERATURE  = 0.5                     
+GROQ_ENV_VAR = "GROQ_API_KEY"             
 
 # ── LangChain / Embeddings ────────────────────────────────────────────────────
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 CHUNK_SIZE      = 512
-CHUNK_OVERLAP   = 0                    
+CHUNK_OVERLAP   = 55                    
 TOP_K_CHUNKS    = 5
 
 # ── Security ──────────────────────────────────────────────────────────────────
 JWT_SECRET    = ""                     
 JWT_ALGORITHM = "HS256"
-ACCESS_TOKEN_EXPIRE_MINUTES = 30
+ACCESS_TOKEN_EXPIRE_MINUTES = 300
 
 # ── Database ──────────────────────────────────────────────────────────────────
 DATABASE_URL = "sqlite:///./nexalearn.db"

diff --git a/ml_pipeline.py b/ml_pipeline.py
@@ -38,8 +38,8 @@
 # SECTION 1 │ LOAD DATASET FROM CSV
 # ═════════════════════════════════════════════════════════════════════════════
 
-DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "broken-ai_deadcode_dataset.csv")
 
+DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "Broken-AI/data/broken-ai_deadcode_dataset.csv")
 
 def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
     """Load CSV and align column schema with the pipeline feature expectations."""
@@ -190,7 +190,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 for col in numeric_cols:
     if col in df.columns:
-        df[col] = pd.to_numeric(df[col], errors="ignore")              
+        df[col] = pd.to_numeric(df[col], errors="coerce")              
 
 # Replace ±inf with 0 so they slip past null checks
 df.replace([np.inf, -np.inf], 0, inplace=True)                         
@@ -247,7 +247,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # Drop rows with excessive nulls
 threshold    = 0.5
-rows_to_drop = df[df.isnull().mean() > threshold].index                
+rows_to_drop = df[df.isnull().mean(axis=1) > threshold].index                
 df           = df.drop(index=rows_to_drop)
 
 df_clean = df.copy()
@@ -277,8 +277,8 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # 4-b  Numeric histograms
 num_plot_cols = ["study_hours_per_day","sleep_hours_per_day","attendance_percentage",
-                 "mental_health_rating","extracurricular_hours","exam_score"]
-fig, axes = plt.subplots(2, 3, figsize=(16, 9))
+                 "mental_health_rating","extracurricular_hours","exam_score","gender"]
+fig, axes = plt.subplots(2, 4, figsize=(16, 9))
 axes = axes.flatten()
 for i, col in enumerate(num_plot_cols):
     axes[i].hist(df_clean[col].dropna(), bins=2,                       
@@ -383,8 +383,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 X = df_clean[feature_cols]                                            
 
 # Target variable
-y = df_fe["study_hours_per_day"]                                       
+y = df_clean[TARGET]
 
+X = pd.get_dummies(X)
 # Drop target from X if accidentally present
 if TARGET in X.columns:
     X = X.drop(columns=[TARGET])
@@ -393,7 +394,6 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 scaler   = StandardScaler()
 X_scaled = scaler.fit_transform(X)                                     
 X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
-
 # Train / test split
 X_train, X_test, y_train, y_test = train_test_split(
     X_scaled, y,
@@ -410,7 +410,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 print("\n── SECTION 7 : Cross-Validation " + "─" * 33)
 
-# KFold without shuffle — fold order biased by row order
+# Shuffled KFold with a fixed seed so folds are not biased by row order
-kf = KFold(n_splits=5, random_state=42)                               
+kf = KFold(n_splits=5, shuffle=True, random_state=42)
 
 models = {
     "LinearRegression"  : LinearRegression(),
@@ -426,9 +426,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 for name, model in models.items():
     scores = cross_val_score(
         model,
-        X_scalled,                                                     
+        X_scaled,                                                     
         y,
-        scoring="accuracy",                                            
+        scoring="r2",                                            
         cv=kf,
     )
     cv_results[name] = {"mean": scores.mean(), "std": scores.std()}