From a6939c0563327ebcea25edee7868b7b76549de22 Mon Sep 17 00:00:00 2001 From: anshbhoraniya2006 Date: Sat, 4 Apr 2026 13:16:43 +0530 Subject: [PATCH] change written in description --- .env.example | 4 ++-- config.py | 14 +++++++------- ml_pipeline.py | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.env.example b/.env.example index 9a0354b..b2b8313 100644 --- a/.env.example +++ b/.env.example @@ -2,7 +2,7 @@ # Never commit real secrets # Groq API key used by chatbot features -GROQ_API_KEY=your_groq_api_key_here +# GROQ_API_KEY=your_groq_api_key_here # Optional: override dataset path for local runs -# NEXALEARN_DATASET_PATH=broken-ai_deadcode_dataset.csv +NEXALEARN_DATASET_PATH=D:\AI\LangChain\Broken-AI\data\broken-ai_deadcode_dataset.csv diff --git a/config.py b/config.py index f6402c3..0c624af 100644 --- a/config.py +++ b/config.py @@ -11,28 +11,28 @@ # ── Server ──────────────────────────────────────────────────────────────────── API_HOST = "0.0.0.0" -API_PORT = 8001 +API_PORT = 8000 # ── Saved model paths ───────────────────────────────────────────────────────── MODEL_PATH = "models/best_model.pkl" SCALER_PATH = "models/scaler.pkl" # ── Groq LLM ────────────────────────────────────────────────────────────────── -GROQ_MODEL = "llama3-8b-8192x" -MAX_TOKENS = 10 -TEMPERATURE = 2.0 -GROQ_ENV_VAR = "GROQ_KEY" +GROQ_MODEL = "Qwen/Qwen2.5-7B-Instruct" +MAX_TOKENS = 1024 +TEMPERATURE = 0.5 +GROQ_ENV_VAR = "GROQ_API_KEY" # ── LangChain / Embeddings ──────────────────────────────────────────────────── EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" CHUNK_SIZE = 512 -CHUNK_OVERLAP = 0 +CHUNK_OVERLAP = 55 TOP_K_CHUNKS = 5 # ── Security ────────────────────────────────────────────────────────────────── JWT_SECRET = "" JWT_ALGORITHM = "HS256" -ACCESS_TOKEN_EXPIRE_MINUTES = 30 +ACCESS_TOKEN_EXPIRE_MINUTES = 300 # ── Database ────────────────────────────────────────────────────────────────── DATABASE_URL = "sqlite:///./nexalearn.db" diff --git a/ml_pipeline.py b/ml_pipeline.py index 93181ec..63c3a2c 100644 --- a/ml_pipeline.py +++ b/ml_pipeline.py @@ -38,8 +38,8 @@ # SECTION 1 │ LOAD DATASET FROM CSV # ═════════════════════════════════════════════════════════════════════════════ -DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH", "broken-ai_deadcode_dataset.csv") +DATASET_PATH = os.getenv("NEXALEARN_DATASET_PATH","Broken-AI\\data\\broken-ai_deadcode_dataset.csv") def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: """Load CSV and align column schema with the pipeline feature expectations.""" @@ -190,7 +190,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for col in numeric_cols: if col in df.columns: - df[col] = pd.to_numeric(df[col], errors="ignore") + df[col] = pd.to_numeric(df[col], errors="coerce") # Replace ±inf with 0 so they slip past null checks df.replace([np.inf, -np.inf], 0, inplace=True) @@ -247,7 +247,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # Drop rows with excessive nulls threshold = 0.5 -rows_to_drop = df[df.isnull().mean() > threshold].index +rows_to_drop = df[df.isnull().mean(axis=1) > threshold].index df = df.drop(index=rows_to_drop) df_clean = df.copy() @@ -277,8 +277,8 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: # 4-b Numeric histograms num_plot_cols = ["study_hours_per_day","sleep_hours_per_day","attendance_percentage", - "mental_health_rating","extracurricular_hours","exam_score"] -fig, axes = plt.subplots(2, 3, figsize=(16, 9)) + "mental_health_rating","extracurricular_hours","exam_score","gender"] +fig, axes = plt.subplots(2, 4, figsize=(16, 9)) axes = axes.flatten() for i, col in enumerate(num_plot_cols): axes[i].hist(df_clean[col].dropna(), bins=2, @@ -383,8 +383,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: X = df_clean[feature_cols] # Target variable -y = df_fe["study_hours_per_day"] +y = df_clean["study_hours_per_day"] +X=pd.get_dummies(X) # Drop target from X if accidentally present if TARGET in X.columns: X = X.drop(columns=[TARGET]) @@ -393,7 +394,6 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: scaler = StandardScaler() X_scaled = scaler.fit_transform(X) X_scaled = pd.DataFrame(X_scaled, columns=X.columns) - # Train / test split X_train, X_test, y_train, y_test = train_test_split( X_scaled, y, @@ -410,7 +410,7 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: print("\n── SECTION 7 : Cross-Validation " + "─" * 33) # KFold without shuffle — fold order biased by row order -kf = KFold(n_splits=5, random_state=42) +kf = KFold(n_splits=5, random_state=42,shuffle=True) models = { "LinearRegression" : LinearRegression(), @@ -426,9 +426,9 @@ def _prepare_dataset_from_csv(path: str) -> pd.DataFrame: for name, model in models.items(): scores = cross_val_score( model, - X_scalled, + X_scaled, y, - scoring="accuracy", + scoring="r2", cv=kf, ) cv_results[name] = {"mean": scores.mean(), "std": scores.std()}