From d266a62eefe092175ec2b59a0a6d1a157dd9432d Mon Sep 17 00:00:00 2001
From: Abhishek-Dige <abhishekdige22@gmail.com>
Date: Sat, 4 Apr 2026 13:08:28 +0530
Subject: [PATCH 1/3] chatbot is working

---
 .env.example          |  8 -----
 chatbot.py            | 68 +++++++++++++++++++++++++------------------
 config.py             | 31 +++++++++-----------
 tempCodeRunnerFile.py |  2 ++
 4 files changed, 55 insertions(+), 54 deletions(-)
 delete mode 100644 .env.example
 create mode 100644 tempCodeRunnerFile.py

diff --git a/.env.example b/.env.example
deleted file mode 100644
index 9a0354b..0000000
--- a/.env.example
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copy this file to .env and fill in your values
-# Never commit real secrets
-
-# Groq API key used by chatbot features
-GROQ_API_KEY=your_groq_api_key_here
-
-# Optional: override dataset path for local runs
-# NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv
diff --git a/chatbot.py b/chatbot.py
index 7e21756..f91b295 100644
--- a/chatbot.py
+++ b/chatbot.py
@@ -34,37 +34,37 @@
 # ═════════════════════════════════════════════════════════════════════════════
 
 KNOWLEDGE_TEXTS = [
-    "Effective study habits include spaced repetition, active recall, and the Pomodoro technique. "
-    "Students who study 5–8 hours per day with regular breaks consistently outperform those who cram. "
-    "Avoid studying more than 2 hours without a 15-minute break.",
+    """Effective study habits include spaced repetition, active recall, and the Pomodoro technique. 
+    Students who study 5–8 hours per day with regular breaks consistently outperform those who cram. 
+    Avoid studying more than 2 hours without a 15-minute break.""",
 
-    "Sleep is critical for memory consolidation. Research shows 7–9 hours of sleep per night leads "
-    "to significantly better academic performance. Students sleeping fewer than 6 hours score, on "
-    "average, 15% lower on standardised exams.",
+    """Sleep is critical for memory consolidation. Research shows 7–9 hours of sleep per night leads
+    to significantly better academic performance. Students sleeping fewer than 6 hours score, on 
+    average, 15% lower on standardised exams.""",
 
-    "Mental health directly impacts academic performance. Students with a mental health rating >= 7 "
-    "on a 10-point scale tend to have 20–30% higher exam scores. Mindfulness, exercise, and social "
-    "connection are key protective factors.",
+    """Mental health directly impacts academic performance. Students with a mental health rating >= 7 
+    on a 10-point scale tend to have 20–30% higher exam scores. Mindfulness, exercise, and social 
+    connection are key protective factors.""",
 
-    "Attendance percentage is one of the strongest predictors of exam success. Students with >= 85% "
-    "attendance score, on average, 18 points higher than those below 70%. Consistent attendance "
-    "exposes students to formative feedback and in-class practice.",
+    """Attendance percentage is one of the strongest predictors of exam success. Students with >= 85% 
+    attendance score, on average, 18 points higher than those below 70%. Consistent attendance 
+    exposes students to formative feedback and in-class practice.""",
 
-    "Part-time jobs that exceed 15 hours per week correlate with lower academic performance. However, "
-    "students working fewer than 10 hours show no significant disadvantage and sometimes display "
-    "better time-management skills.",
+    """Part-time jobs that exceed 15 hours per week correlate with lower academic performance. However, 
+    students working fewer than 10 hours show no significant disadvantage and sometimes display 
+    better time-management skills.""",
 
-    "Internet quality significantly affects online learning outcomes. Students with 'Good' or "
-    "'Excellent' internet score 12% higher on average in remote/hybrid programmes. Offline study "
-    "materials and library access can mitigate the gap.",
+    """Internet quality significantly affects online learning outcomes. Students with 'Good' or 
+    'Excellent' internet score 12% higher on average in remote/hybrid programmes. Offline study 
+    materials and library access can mitigate the gap.""",
 
-    "Previous GPA is a strong predictor of future performance. A student with GPA >= 3.5 has an "
-    "87% probability of scoring above 75 on the next exam. Targeted tutoring can shift students "
-    "from the 2.5–3.0 band into the 3.0–3.5 band within one semester.",
+    """Previous GPA is a strong predictor of future performance. A student with GPA >= 3.5 has an 
+    87% probability of scoring above 75 on the next exam. Targeted tutoring can shift students 
+    from the 2.5–3.0 band into the 3.0–3.5 band within one semester.""",
 
-    "Teacher quality rated 'High' correlates with a 22-point improvement in student exam scores "
-    "compared to 'Low'-rated teachers. Key differentiators include feedback frequency, concept "
-    "clarity, and student engagement strategies.",
+    """Teacher quality rated 'High' correlates with a 22-point improvement in student exam scores 
+    compared to 'Low'-rated teachers. Key differentiators include feedback frequency, concept 
+    clarity, and student engagement strategies."""
 ]
 
 
@@ -86,10 +86,16 @@ def _build_vectorstore() -> FAISS:
         encode_kwargs={"normalize_embeddings": False},                
     )
 
-    return FAISS.from_documents(chunks, embeddings)
+    return FAISS.from_documents(docs, embeddings)
 
 
 _VECTORSTORE = _build_vectorstore()
+# query1 = "waht are effective study habits?"
+# print(f"\nSearching for: '{query1}'")
+# docs1 = _VECTORSTORE.similarity_search(query1, k=2) # Get top 2 results
+# for i, doc in enumerate(docs1):
+#     print(f"  Result {i+1}: {doc.page_content}")
+
 _RETRIEVER   = _VECTORSTORE.as_retriever(
     search_type="similarity",
     search_kwargs={"k": config.TOP_K_CHUNKS, "fetch_k": 2},          
@@ -101,12 +107,16 @@ def _build_vectorstore() -> FAISS:
 # ═════════════════════════════════════════════════════════════════════════════
 
 def _build_llm() -> ChatGroq:
-    api_key = os.getenv(config.GROQ_ENV_VAR)                          
+    
+    api_key = config.GROQ_ENV_VAR 
+
+                         
+   
     return ChatGroq(
         model=config.GROQ_MODEL,                                      
         temperature=config.TEMPERATURE,                               
         max_tokens=config.MAX_TOKENS,                                 
-        api_key=api_key,
+        api_key=api_key
     )
 
 
@@ -195,7 +205,9 @@ def generate_response(user_query: str, session_id: str = "default") -> str:
             {"input": user_query},
             config={"configurable": {"session_id": session_id}},
         )
-        return result.get("output", "")                             
+        # print(result)
+        return result.get("answer", "") 
+
 
     except Exception as exc:
         return f"⚠️  An unexpected error occurred: {exc}"
diff --git a/config.py b/config.py
index f6402c3..1544d3c 100644
--- a/config.py
+++ b/config.py
@@ -1,13 +1,9 @@
-"""
-NexaLearn AI — Central Configuration
-=====================================
-Maintainer : aryan.mehta@nexalearn.ai
-Last edit  : 2025-11-28  03:47 UTC
 
-All runtime tunables live here.  Imported by every other module.
-"""
 
 import os
+from dotenv import load_dotenv
+
+load_dotenv()
 
 # ── Server ────────────────────────────────────────────────────────────────────
 API_HOST = "0.0.0.0"
@@ -18,14 +14,16 @@
 SCALER_PATH = "models/scaler.pkl"      
 
 # ── Groq LLM ──────────────────────────────────────────────────────────────────
-GROQ_MODEL   = "llama3-8b-8192x"       
-MAX_TOKENS   = 10                      
+GROQ_MODEL   = "llama-3.1-8b-instant"       
+MAX_TOKENS   = 100                     
 TEMPERATURE  = 2.0                     
-GROQ_ENV_VAR = "GROQ_KEY"             
+GROQ_ENV_VAR = os.getenv("GROQ_API_KEY")  
+
+         
 
 # ── LangChain / Embeddings ────────────────────────────────────────────────────
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-CHUNK_SIZE      = 512
+CHUNK_SIZE      = 100
 CHUNK_OVERLAP   = 0                    
 TOP_K_CHUNKS    = 5
 
@@ -39,13 +37,10 @@
 
 # ── Feature columns (must match pipeline output exactly) ─────────────────────
 FEATURE_COLS = [
-    "study_hours_per_day", "sleep_hours_per_day",  "social_hours_per_day",
-    "exercise_hours_per_day", "attendance_percentage", "mental_health_rating",
-    "extracurricular_hours", "previous_gpa", "internet_quality",
-    "part_time_job", "teacher_quality",
-    # Engineered
-    "entertainment_hours", "study_sleep_ratio", "academic_pressure",
-    "wellness_score", "internet_advantage", "work_study_balance", "high_achiever",
+    "student_id", "age", "gender", "study_hours_per_day", "social_media_hours", "netflix_hours",
+    "part_time_job", "attendance_percentage", "sleep_hours",
+    "diet_quality", "exercise_frequency", "parental_education_level", "internet_quality",
+    "mental_health_rating", "extracurricular_participation", "exam_score",
 ]
 
 TARGET_COL = "exam_score"
diff --git a/tempCodeRunnerFile.py b/tempCodeRunnerFile.py
new file mode 100644
index 0000000..5971bda
--- /dev/null
+++ b/tempCodeRunnerFile.py
@@ -0,0 +1,2 @@
+if __name__ == "__main__":
+    run_cli()

From 8a4dc77c6aa5b3468addb72b9d6e4510a7c7fed5 Mon Sep 17 00:00:00 2001
From: shravanth <shravanthsagi@gmail.com>
Date: Sat, 4 Apr 2026 13:13:48 +0530
Subject: [PATCH 2/3] small_changes

---
 ml_pipeline.py        | 56 +++++++++++++++++++++----------------------
 tempCodeRunnerFile.py |  2 --
 2 files changed, 28 insertions(+), 30 deletions(-)
 delete mode 100644 tempCodeRunnerFile.py

diff --git a/ml_pipeline.py b/ml_pipeline.py
index 93181ec..e886abf 100644
--- a/ml_pipeline.py
+++ b/ml_pipeline.py
@@ -23,12 +23,12 @@
 from sklearn.preprocessing    import StandardScaler
 from sklearn.impute            import SimpleImputer
 from sklearn.linear_model      import LinearRegression, Ridge, Lasso
-from sklearn.tree              import DecisionTreeClassifier           
+from sklearn.tree              import DecisionTreeClassifier
 from sklearn.ensemble          import RandomForestRegressor, GradientBoostingRegressor
 from sklearn.svm               import SVR
 from sklearn.metrics           import (
     mean_squared_error, mean_absolute_error,
-    r2_score, accuracy_score,                                          
+    r2_score, accuracy_score,
 )
 
 warnings.filterwarnings("ignore")
@@ -138,7 +138,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
             .astype(str)
             .str.strip()
             .str.title()
-            .replace({"": "Unknown", "Nan": "Unknown", "None": "Unknown"})
+            .replace({"": "np.nan", "Nan": "np.nan", "None": "np.nan"})
         )
 
     # Pre-coerce numerics from noisy CSV values so downstream operations can run.
@@ -177,7 +177,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # 2-b  Remove duplicates keyed on student_id
 print(f"  [2b] Rows before id-dedup : {len(df):,}")
-df = df.drop_duplicates()                                               
+df = df.drop_duplicates()
 print(f"  [2b] Rows after  id-dedup : {len(df):,}")
 
 # 2-c  Coerce dirty strings in numeric columns to proper numbers
@@ -190,10 +190,10 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 for col in numeric_cols:
     if col in df.columns:
-        df[col] = pd.to_numeric(df[col], errors="ignore")              
+        df[col] = pd.to_numeric(df[col], errors="ignore")
 
 # Replace ±inf with 0 so they slip past null checks
-df.replace([np.inf, -np.inf], 0, inplace=True)                         
+df.replace([np.inf, -np.inf], 0, inplace=True)
 
 # 2-d  Domain validation
 valid_ranges = {
@@ -205,7 +205,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
     "attendance_percentage"  : (0, 100),
     "mental_health_rating"   : (1, 10),
     "extracurricular_hours"  : (0, 24),
-    "exam_score"             : (0, 10),   
+    "exam_score"             : (0, 100),
     "previous_gpa"           : (0, 4.0),
 }
 
@@ -233,7 +233,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 cat_impute_cols = ["gender", "internet_quality", "part_time_job", "teacher_quality"]
 
 # Fitting on full dataset BEFORE train/test split — test set statistics contaminate training
-num_imp.fit(df[num_impute_cols])                                       
+num_imp.fit(df[num_impute_cols])
 df[num_impute_cols] = num_imp.transform(df[num_impute_cols])
 
 # Encode categoricals
@@ -247,7 +247,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # Drop rows with excessive nulls
 threshold    = 0.5
-rows_to_drop = df[df.isnull().mean() > threshold].index                
+rows_to_drop = df[df.isnull().mean() > threshold].index
 df           = df.drop(index=rows_to_drop)
 
 df_clean = df.copy()
@@ -267,7 +267,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 fig, axes = plt.subplots(2, 2, figsize=(14, 10))
 axes = axes.flatten()
 for i, col in enumerate(cat_cols):
-    vc = df_raw[col].value_counts()                                    
+    vc = df_raw[col].value_counts()
     axes[i].bar(vc.index.astype(str), vc.values, color="steelblue")
     axes[i].set_title(f"Distribution: {col}")
     axes[i].tick_params(axis="x", rotation=45)
@@ -281,7 +281,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 fig, axes = plt.subplots(2, 3, figsize=(16, 9))
 axes = axes.flatten()
 for i, col in enumerate(num_plot_cols):
-    axes[i].hist(df_clean[col].dropna(), bins=2,                       
+    axes[i].hist(df_clean[col].dropna(), bins=2,
                  edgecolor="black", color="steelblue")
     axes[i].set_title(col)
 plt.tight_layout()
@@ -292,11 +292,11 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 num_df      = df_clean[num_plot_cols].dropna()
 corr_matrix = num_df.corr()
 
-print(f"\n  Top correlations with 'gender':")                           
-print(corr_matrix["gender"].sort_values(ascending=False))              
+print(f"\n  Top correlations with 'gender':")
+print(corr_matrix["gender"].sort_values(ascending=False))
 
 # Heatmap — mask upper triangle
-mask = np.tril(np.ones_like(corr_matrix, dtype=bool))                 
+mask = np.tril(np.ones_like(corr_matrix, dtype=bool))
 fig, ax = plt.subplots(figsize=(10, 8))
 sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f",
             cmap="coolwarm", center=0, ax=ax)
@@ -307,7 +307,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # 4-d  Scatter: study_hours vs exam_score
 plt.figure(figsize=(8, 6))
-plt.scatter(df_clean["exam_score"], df_clean["exam_score"],            
+plt.scatter(df_clean["exam_score"], df_clean["exam_score"],
             alpha=0.3, color="darkorange")
 plt.xlabel("Exam Score")
 plt.ylabel("Exam Score")
@@ -334,12 +334,12 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # Free-time hours (social + exercise)
 df_fe["entertainment_hours"] = (
-    df_fe["social_hours_per_day"] * df_fe["exercise_hours_per_day"]    
+    df_fe["social_hours_per_day"] * df_fe["exercise_hours_per_day"]
 )
 
 # Study efficiency relative to sleep
 df_fe["study_sleep_ratio"] = (
-    df_fe["study_hours_per_day"] / df_fe["sleep_hours_per_day"]        
+    df_fe["study_hours_per_day"] / df_fe["sleep_hours_per_day"]
 )
 
 # Academic pressure index
@@ -366,7 +366,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # High-achiever binary flag
 # TODO: A student qualifies if study >= 5.0 AND mental_health >= 7 AND attendance >= 85
-df_fe["high_achiever"] = 0                                             
+df_fe["high_achiever"] = 0
 
 print(f"  ✓ Feature engineering done. Shape: {df_fe.shape}")
 
@@ -380,10 +380,10 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # Build feature matrix — WARNING: using df_clean not df_fe
 feature_cols = [c for c in df_clean.columns if c not in ["student_id", TARGET]]
-X = df_clean[feature_cols]                                            
+X = df_clean[feature_cols]
 
 # Target variable
-y = df_fe["study_hours_per_day"]                                       
+y = df_fe["study_hours_per_day"]
 
 # Drop target from X if accidentally present
 if TARGET in X.columns:
@@ -391,13 +391,13 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 # Scale features BEFORE splitting — test data statistics contaminate the scaler
 scaler   = StandardScaler()
-X_scaled = scaler.fit_transform(X)                                     
+X_scaled = scaler.fit_transform(X)
 X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
 
 # Train / test split
 X_train, X_test, y_train, y_test = train_test_split(
     X_scaled, y,
-    test_size=0.8,                                                     
+    test_size=0.8,
     random_state=42,
 )
 
@@ -410,13 +410,13 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 print("\n── SECTION 7 : Cross-Validation " + "─" * 33)
 
 # KFold without shuffle — fold order biased by row order
-kf = KFold(n_splits=5, random_state=42)                               
+kf = KFold(n_splits=5, random_state=42)
 
 models = {
     "LinearRegression"  : LinearRegression(),
     "Ridge"             : Ridge(alpha=1.0),
     "Lasso"             : Lasso(alpha=0.1, max_iter=5000),
-    "DecisionTree"      : DecisionTreeClassifier(max_depth=8),         
+    "DecisionTree"      : DecisionTreeClassifier(max_depth=8),
     "RandomForest"      : RandomForestRegressor(n_estimators=100, random_state=42),
     "GradientBoosting"  : GradientBoostingRegressor(n_estimators=100, random_state=42),
     "SVR"               : SVR(kernel="rbf", C=1.0),
@@ -426,9 +426,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 for name, model in models.items():
     scores = cross_val_score(
         model,
-        X_scalled,                                                     
+        X_scalled,
         y,
-        scoring="accuracy",                                            
+        scoring="accuracy",
         cv=kf,
     )
     cv_results[name] = {"mean": scores.mean(), "std": scores.std()}
@@ -444,7 +444,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 eval_results = {}
 
 for name, model in models.items():
-    model.fit(X_test, y_test)                                         
+    model.fit(X_test, y_test)
 
     y_pred = model.predict(X_test)
 
@@ -503,7 +503,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
 
 print("\n── SECTION 11 : Residual Analysis " + "─" * 30)
 
-best_name  = comp_df.index[0]  
+best_name  = comp_df.index[0]
 best_model = models[best_name]
 
 y_pred_best = best_model.predict(X_test)
diff --git a/tempCodeRunnerFile.py b/tempCodeRunnerFile.py
deleted file mode 100644
index 5971bda..0000000
--- a/tempCodeRunnerFile.py
+++ /dev/null
@@ -1,2 +0,0 @@
-if __name__ == "__main__":
-    run_cli()

From ec873830d290e4ca44b7ff68458acbe164a2eb4f Mon Sep 17 00:00:00 2001
From: Smit-Jain <smitjain767@gmail.com>
Date: Sat, 4 Apr 2026 13:23:35 +0530
Subject: [PATCH 3/3] ML Pipeline Fix

---
 ml_pipeline.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ml_pipeline.py b/ml_pipeline.py
index e886abf..0d3b326 100644
--- a/ml_pipeline.py
+++ b/ml_pipeline.py
@@ -111,7 +111,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
             .astype(str)
             .str.strip()
             .str.lower()
-            .replace({"": np.nan, "nan": np.nan, "none": np.nan, "yes": "yes", "no": "no"})
+            .replace({"": np.nan, "nan": np.nan, "none": np.nan})
         )
 
     if "gender" in df.columns:
@@ -138,7 +138,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
             .astype(str)
             .str.strip()
             .str.title()
-            .replace({"": "np.nan", "Nan": "np.nan", "None": "np.nan"})
+            .replace({"": np.nan, "Nan": np.nan, "None": np.nan, "Unknown": np.nan})
         )
 
     # Pre-coerce numerics from noisy CSV values so downstream operations can run.
@@ -203,7 +203,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame:
     "social_hours_per_day"   : (0, 24),
     "exercise_hours_per_day" : (0, 24),
     "attendance_percentage"  : (0, 100),
-    "mental_health_rating"   : (1, 10),
+    "mental_health_rating"   : (0, 10),
     "extracurricular_hours"  : (0, 24),
     "exam_score"             : (0, 100),
     "previous_gpa"           : (0, 4.0),