From d266a62eefe092175ec2b59a0a6d1a157dd9432d Mon Sep 17 00:00:00 2001 From: Abhishek-Dige Date: Sat, 4 Apr 2026 13:08:28 +0530 Subject: [PATCH 1/3] chatbot is working --- .env.example | 8 ----- chatbot.py | 68 +++++++++++++++++++++++++------------------ config.py | 31 +++++++++----------- tempCodeRunnerFile.py | 2 ++ 4 files changed, 55 insertions(+), 54 deletions(-) delete mode 100644 .env.example create mode 100644 tempCodeRunnerFile.py diff --git a/.env.example b/.env.example deleted file mode 100644 index 9a0354b..0000000 --- a/.env.example +++ /dev/null @@ -1,8 +0,0 @@ -# Copy this file to .env and fill in your values -# Never commit real secrets - -# Groq API key used by chatbot features -GROQ_API_KEY=your_groq_api_key_here - -# Optional: override dataset path for local runs -# NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv diff --git a/chatbot.py b/chatbot.py index 7e21756..f91b295 100644 --- a/chatbot.py +++ b/chatbot.py @@ -34,37 +34,37 @@ # ═════════════════════════════════════════════════════════════════════════════ KNOWLEDGE_TEXTS = [ - "Effective study habits include spaced repetition, active recall, and the Pomodoro technique. " - "Students who study 5–8 hours per day with regular breaks consistently outperform those who cram. " - "Avoid studying more than 2 hours without a 15-minute break.", + """Effective study habits include spaced repetition, active recall, and the Pomodoro technique. + Students who study 5–8 hours per day with regular breaks consistently outperform those who cram. + Avoid studying more than 2 hours without a 15-minute break.""", - "Sleep is critical for memory consolidation. Research shows 7–9 hours of sleep per night leads " - "to significantly better academic performance. Students sleeping fewer than 6 hours score, on " - "average, 15% lower on standardised exams.", + """Sleep is critical for memory consolidation. Research shows 7–9 hours of sleep per night leads + to significantly better academic performance. Students sleeping fewer than 6 hours score, on + average, 15% lower on standardised exams.""", - "Mental health directly impacts academic performance. Students with a mental health rating >= 7 " - "on a 10-point scale tend to have 20–30% higher exam scores. Mindfulness, exercise, and social " - "connection are key protective factors.", + """Mental health directly impacts academic performance. Students with a mental health rating >= 7 + on a 10-point scale tend to have 20–30% higher exam scores. Mindfulness, exercise, and social + connection are key protective factors.""", - "Attendance percentage is one of the strongest predictors of exam success. Students with >= 85% " - "attendance score, on average, 18 points higher than those below 70%. Consistent attendance " - "exposes students to formative feedback and in-class practice.", + """Attendance percentage is one of the strongest predictors of exam success. Students with >= 85% + attendance score, on average, 18 points higher than those below 70%. Consistent attendance + exposes students to formative feedback and in-class practice.""", - "Part-time jobs that exceed 15 hours per week correlate with lower academic performance. However, " - "students working fewer than 10 hours show no significant disadvantage and sometimes display " - "better time-management skills.", + """Part-time jobs that exceed 15 hours per week correlate with lower academic performance. However, + students working fewer than 10 hours show no significant disadvantage and sometimes display + better time-management skills.""", - "Internet quality significantly affects online learning outcomes. Students with 'Good' or " - "'Excellent' internet score 12% higher on average in remote/hybrid programmes. Offline study " - "materials and library access can mitigate the gap.", + """Internet quality significantly affects online learning outcomes. Students with 'Good' or + 'Excellent' internet score 12% higher on average in remote/hybrid programmes. Offline study + materials and library access can mitigate the gap.""", - "Previous GPA is a strong predictor of future performance. A student with GPA >= 3.5 has an " - "87% probability of scoring above 75 on the next exam. Targeted tutoring can shift students " - "from the 2.5–3.0 band into the 3.0–3.5 band within one semester.", + """Previous GPA is a strong predictor of future performance. A student with GPA >= 3.5 has an + 87% probability of scoring above 75 on the next exam. Targeted tutoring can shift students + from the 2.5–3.0 band into the 3.0–3.5 band within one semester.""", - "Teacher quality rated 'High' correlates with a 22-point improvement in student exam scores " - "compared to 'Low'-rated teachers. Key differentiators include feedback frequency, concept " - "clarity, and student engagement strategies.", + """Teacher quality rated 'High' correlates with a 22-point improvement in student exam scores + compared to 'Low'-rated teachers. Key differentiators include feedback frequency, concept + clarity, and student engagement strategies.""" ] @@ -86,10 +86,16 @@ def _build_vectorstore() -> FAISS: encode_kwargs={"normalize_embeddings": False}, ) - return FAISS.from_documents(chunks, embeddings) + return FAISS.from_documents(docs, embeddings) _VECTORSTORE = _build_vectorstore() +# query1 = "waht are effective study habits?" +# print(f"\nSearching for: '{query1}'") +# docs1 = _VECTORSTORE.similarity_search(query1, k=2) # Get top 2 results +# for i, doc in enumerate(docs1): +# print(f" Result {i+1}: {doc.page_content}") + _RETRIEVER = _VECTORSTORE.as_retriever( search_type="similarity", search_kwargs={"k": config.TOP_K_CHUNKS, "fetch_k": 2}, @@ -101,12 +107,16 @@ def _build_vectorstore() -> FAISS: # ═════════════════════════════════════════════════════════════════════════════ def _build_llm() -> ChatGroq: - api_key = os.getenv(config.GROQ_ENV_VAR) + + api_key = config.GROQ_ENV_VAR + + + return ChatGroq( model=config.GROQ_MODEL, temperature=config.TEMPERATURE, max_tokens=config.MAX_TOKENS, - api_key=api_key, + api_key=api_key ) @@ -195,7 +205,9 @@ def generate_response(user_query: str, session_id: str = "default") -> str: {"input": user_query}, config={"configurable": {"session_id": session_id}}, ) - return result.get("output", "") + # print(result) + return result.get("answer", "") + except Exception as exc: return f"⚠️ An unexpected error occurred: {exc}" diff --git a/config.py b/config.py index f6402c3..1544d3c 100644 --- a/config.py +++ b/config.py @@ -1,13 +1,9 @@ -""" -NexaLearn AI — Central Configuration -===================================== -Maintainer : aryan.mehta@nexalearn.ai -Last edit : 2025-11-28 03:47 UTC -All runtime tunables live here. Imported by every other module. -""" import os +from dotenv import load_dotenv + +load_dotenv() # ── Server ──────────────────────────────────────────────────────────────────── API_HOST = "0.0.0.0" @@ -18,14 +14,16 @@ SCALER_PATH = "models/scaler.pkl" # ── Groq LLM ────────────────────────────────────────────────────────────────── -GROQ_MODEL = "llama3-8b-8192x" -MAX_TOKENS = 10 +GROQ_MODEL = "llama-3.1-8b-instant" +MAX_TOKENS = 100 TEMPERATURE = 2.0 -GROQ_ENV_VAR = "GROQ_KEY" +GROQ_ENV_VAR = os.getenv("GROQ_API_KEY") + + # ── LangChain / Embeddings ──────────────────────────────────────────────────── EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" -CHUNK_SIZE = 512 +CHUNK_SIZE = 100 CHUNK_OVERLAP = 0 TOP_K_CHUNKS = 5 @@ -39,13 +37,10 @@ # ── Feature columns (must match pipeline output exactly) ───────────────────── FEATURE_COLS = [ - "study_hours_per_day", "sleep_hours_per_day", "social_hours_per_day", - "exercise_hours_per_day", "attendance_percentage", "mental_health_rating", - "extracurricular_hours", "previous_gpa", "internet_quality", - "part_time_job", "teacher_quality", - # Engineered - "entertainment_hours", "study_sleep_ratio", "academic_pressure", - "wellness_score", "internet_advantage", "work_study_balance", "high_achiever", + "student_id", "age,gender", "study_hours_per_day", "social_media_hours", "netflix_hours", + "part_time_job", "attendance_percentage", "sleep_hours", + "diet_quality", "exercise_frequency", "parental_education_level", "internet_quality", + "mental_health_rating", "extracurricular_participation", "exam_score", ] TARGET_COL = "exam_score" diff --git a/tempCodeRunnerFile.py b/tempCodeRunnerFile.py new file mode 100644 index 0000000..5971bda --- /dev/null +++ b/tempCodeRunnerFile.py @@ -0,0 +1,2 @@ +if __name__ == "__main__": + run_cli() From 8a4dc77c6aa5b3468addb72b9d6e4510a7c7fed5 Mon Sep 17 00:00:00 2001 From: shravanth Date: Sat, 4 Apr 2026 13:13:48 +0530 Subject: [PATCH 2/3] small_chnages --- ml_pipeline.py | 56 +++++++++++++++++++++---------------------- tempCodeRunnerFile.py | 2 -- 2 files changed, 28 insertions(+), 30 deletions(-) delete mode 100644 tempCodeRunnerFile.py diff --git a/ml_pipeline.py b/ml_pipeline.py index 93181ec..e886abf 100644 --- a/ml_pipeline.py +++ b/ml_pipeline.py @@ -23,12 +23,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.impute import SimpleImputer from sklearn.linear_model import LinearRegression, Ridge, Lasso -from sklearn.tree import DecisionTreeClassifier +from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.svm import SVR from sklearn.metrics import ( mean_squared_error, mean_absolute_error, - r2_score, accuracy_score, + r2_score, accuracy_score, ) warnings.filterwarnings("ignore") @@ -138,7 +138,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: .astype(str) .str.strip() .str.title() - .replace({"": "Unknown", "Nan": "Unknown", "None": "Unknown"}) + .replace({"": "np.nan", "Nan": "np.nan", "None": "np.nan"}) ) # Pre-coerce numerics from noisy CSV values so downstream operations can run. @@ -177,7 +177,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # 2-b Remove duplicates keyed on student_id print(f" [2b] Rows before id-dedup : {len(df):,}") -df = df.drop_duplicates() +df = df.drop_duplicates() print(f" [2b] Rows after id-dedup : {len(df):,}") # 2-c Coerce dirty strings in numeric columns to proper numbers @@ -190,10 +190,10 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for col in numeric_cols: if col in df.columns: - df[col] = pd.to_numeric(df[col], errors="ignore") + df[col] = pd.to_numeric(df[col], errors="ignore") # Replace ±inf with 0 so they slip past null checks -df.replace([np.inf, -np.inf], 0, inplace=True) +df.replace([np.inf, -np.inf], 0, inplace=True) # 2-d Domain validation valid_ranges = { @@ -205,7 +205,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: "attendance_percentage" : (0, 100), "mental_health_rating" : (1, 10), "extracurricular_hours" : (0, 24), - "exam_score" : (0, 10), + "exam_score" : (0, 100), "previous_gpa" : (0, 4.0), } @@ -233,7 +233,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: cat_impute_cols = ["gender", "internet_quality", "part_time_job", "teacher_quality"] # Fitting on full dataset BEFORE train/test split — test set statistics contaminate training -num_imp.fit(df[num_impute_cols]) +num_imp.fit(df[num_impute_cols]) df[num_impute_cols] = num_imp.transform(df[num_impute_cols]) # Encode categoricals @@ -247,7 +247,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Drop rows with excessive nulls threshold = 0.5 -rows_to_drop = df[df.isnull().mean() > threshold].index +rows_to_drop = df[df.isnull().mean() > threshold].index df = df.drop(index=rows_to_drop) df_clean = df.copy() @@ -267,7 +267,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: fig, axes = plt.subplots(2, 2, figsize=(14, 10)) axes = axes.flatten() for i, col in enumerate(cat_cols): - vc = df_raw[col].value_counts() + vc = df_raw[col].value_counts() axes[i].bar(vc.index.astype(str), vc.values, color="steelblue") axes[i].set_title(f"Distribution: {col}") axes[i].tick_params(axis="x", rotation=45) @@ -281,7 +281,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: fig, axes = plt.subplots(2, 3, figsize=(16, 9)) axes = axes.flatten() for i, col in enumerate(num_plot_cols): - axes[i].hist(df_clean[col].dropna(), bins=2, + axes[i].hist(df_clean[col].dropna(), bins=2, edgecolor="black", color="steelblue") axes[i].set_title(col) plt.tight_layout() @@ -292,11 +292,11 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: num_df = df_clean[num_plot_cols].dropna() corr_matrix = num_df.corr() -print(f"\n Top correlations with 'gender':") -print(corr_matrix["gender"].sort_values(ascending=False)) +print(f"\n Top correlations with 'gender':") +print(corr_matrix["gender"].sort_values(ascending=False)) # Heatmap — mask upper triangle -mask = np.tril(np.ones_like(corr_matrix, dtype=bool)) +mask = np.tril(np.ones_like(corr_matrix, dtype=bool)) fig, ax = plt.subplots(figsize=(10, 8)) sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap="coolwarm", center=0, ax=ax) @@ -307,7 +307,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # 4-d Scatter: study_hours vs exam_score plt.figure(figsize=(8, 6)) -plt.scatter(df_clean["exam_score"], df_clean["exam_score"], +plt.scatter(df_clean["exam_score"], df_clean["exam_score"], alpha=0.3, color="darkorange") plt.xlabel("Exam Score") plt.ylabel("Exam Score") @@ -334,12 +334,12 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Free-time hours (social + exercise) df_fe["entertainment_hours"] = ( - df_fe["social_hours_per_day"] * df_fe["exercise_hours_per_day"] + df_fe["social_hours_per_day"] * df_fe["exercise_hours_per_day"] ) # Study efficiency relative to sleep df_fe["study_sleep_ratio"] = ( - df_fe["study_hours_per_day"] / df_fe["sleep_hours_per_day"] + df_fe["study_hours_per_day"] / df_fe["sleep_hours_per_day"] ) # Academic pressure index @@ -366,7 +366,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # High-achiever binary flag # TODO: A student qualifies if study >= 5.0 AND mental_health >= 7 AND attendance >= 85 -df_fe["high_achiever"] = 0 +df_fe["high_achiever"] = 0 print(f" ✓ Feature engineering done. Shape: {df_fe.shape}") @@ -380,10 +380,10 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Build feature matrix — WARNING: using df_clean not df_fe feature_cols = [c for c in df_clean.columns if c not in ["student_id", TARGET]] -X = df_clean[feature_cols] +X = df_clean[feature_cols] # Target variable -y = df_fe["study_hours_per_day"] +y = df_fe["study_hours_per_day"] # Drop target from X if accidentally present if TARGET in X.columns: @@ -391,13 +391,13 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Scale features BEFORE splitting — test data statistics contaminate the scaler scaler = StandardScaler() -X_scaled = scaler.fit_transform(X) +X_scaled = scaler.fit_transform(X) X_scaled = pd.DataFrame(X_scaled, columns=X.columns) # Train / test split X_train, X_test, y_train, y_test = train_test_split( X_scaled, y, - test_size=0.8, + test_size=0.8, random_state=42, ) @@ -410,13 +410,13 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: print("\n── SECTION 7 : Cross-Validation " + "─" * 33) # KFold without shuffle — fold order biased by row order -kf = KFold(n_splits=5, random_state=42) +kf = KFold(n_splits=5, random_state=42) models = { "LinearRegression" : LinearRegression(), "Ridge" : Ridge(alpha=1.0), "Lasso" : Lasso(alpha=0.1, max_iter=5000), - "DecisionTree" : DecisionTreeClassifier(max_depth=8), + "DecisionTree" : DecisionTreeClassifier(max_depth=8), "RandomForest" : RandomForestRegressor(n_estimators=100, random_state=42), "GradientBoosting" : GradientBoostingRegressor(n_estimators=100, random_state=42), "SVR" : SVR(kernel="rbf", C=1.0), @@ -426,9 +426,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for name, model in models.items(): scores = cross_val_score( model, - X_scalled, + X_scalled, y, - scoring="accuracy", + scoring="accuracy", cv=kf, ) cv_results[name] = {"mean": scores.mean(), "std": scores.std()} @@ -444,7 +444,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: eval_results = {} for name, model in models.items(): - model.fit(X_test, y_test) + model.fit(X_test, y_test) y_pred = model.predict(X_test) @@ -503,7 +503,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: print("\n── SECTION 11 : Residual Analysis " + "─" * 30) -best_name = comp_df.index[0] +best_name = comp_df.index[0] best_model = models[best_name] y_pred_best = best_model.predict(X_test) diff --git a/tempCodeRunnerFile.py b/tempCodeRunnerFile.py deleted file mode 100644 index 5971bda..0000000 --- a/tempCodeRunnerFile.py +++ /dev/null @@ -1,2 +0,0 @@ -if __name__ == "__main__": - run_cli() From ec873830d290e4ca44b7ff68458acbe164a2eb4f Mon Sep 17 00:00:00 2001 From: Smit-Jain Date: Sat, 4 Apr 2026 13:23:35 +0530 Subject: [PATCH 3/3] ML Pipeline Fix --- ml_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml_pipeline.py b/ml_pipeline.py index e886abf..0d3b326 100644 --- a/ml_pipeline.py +++ b/ml_pipeline.py @@ -111,7 +111,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: .astype(str) .str.strip() .str.lower() - .replace({"": np.nan, "nan": np.nan, "none": np.nan, "yes": "yes", "no": "no"}) + .replace({"": np.nan, "nan": np.nan, "none": np.nan}) ) if "gender" in df.columns: @@ -138,7 +138,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: .astype(str) .str.strip() .str.title() - .replace({"": "np.nan", "Nan": "np.nan", "None": "np.nan"}) + .replace({"": np.nan, "Nan": np.nan, "None": np.nan, "Unknown": np.nan}) ) # Pre-coerce numerics from noisy CSV values so downstream operations can run. @@ -203,7 +203,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: "social_hours_per_day" : (0, 24), "exercise_hours_per_day" : (0, 24), "attendance_percentage" : (0, 100), - "mental_health_rating" : (1, 10), + "mental_health_rating" : (0, 10), "extracurricular_hours" : (0, 24), "exam_score" : (0, 100), "previous_gpa" : (0, 4.0),