From 3f8ddcff0cdd4ddf855c49ad1983809ac1fe26be Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Tue, 9 Dec 2025 09:32:12 +0530 Subject: [PATCH 01/15] Refactor evaluation endpoint to use stored configuration and remove assistant_id handling --- backend/app/api/routes/evaluation.py | 93 +++++++--------------------- 1 file changed, 23 insertions(+), 70 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 058950d65..d6f9b5c29 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -3,12 +3,12 @@ import logging import re from pathlib import Path +from uuid import UUID from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile from app.api.deps import AuthContextDep, SessionDep from app.core.cloud import get_cloud_storage -from app.crud.assistants import get_assistant_by_id from app.crud.evaluations import ( create_evaluation_dataset, create_evaluation_run, @@ -20,7 +20,7 @@ upload_dataset_to_langfuse, ) from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud -from app.crud.evaluations.core import save_score +from app.crud.evaluations.core import resolve_config_from_stored, save_score from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.models.evaluation import ( @@ -33,6 +33,10 @@ get_openai_client, load_description, ) +from app.services.llm.jobs import resolve_config_blob +from app.models.llm.request import LLMCallConfig +from app.crud.config.version import ConfigVersionCrud + logger = logging.getLogger(__name__) @@ -430,20 +434,9 @@ def evaluate( experiment_name: str = Body( ..., description="Name for this evaluation experiment/run" ), - config: dict = Body(default_factory=dict, description="Evaluation configuration"), - assistant_id: str - | None = Body( - None, description="Optional assistant ID to fetch configuration from" - ), + config_id: UUID = Body(..., description="Stored config ID"), + config_version: int = Body(..., ge=1, description="Stored config version"), ) -> APIResponse[EvaluationRunPublic]: - logger.info( - f"[evaluate] Starting evaluation | experiment_name={experiment_name} | " - f"dataset_id={dataset_id} | " - f"org_id={auth_context.organization.id} | " - f"assistant_id={assistant_id} | " - f"config_keys={list(config.keys())}" - ) - # Step 1: Fetch dataset from database dataset = get_dataset_by_id( session=_session, @@ -459,12 +452,6 @@ def evaluate( f"organization/project", ) - logger.info( - f"[evaluate] Found dataset | id={dataset.id} | name={dataset.name} | " - f"object_store_url={'present' if dataset.object_store_url else 'None'} | " - f"langfuse_id={dataset.langfuse_dataset_id}" - ) - dataset_name = dataset.name # Get API clients @@ -487,63 +474,29 @@ def evaluate( "Please ensure Langfuse credentials were configured when the dataset was created.", ) - # Handle assistant_id if provided - if assistant_id: - # Fetch assistant details from database - assistant = get_assistant_by_id( - session=_session, - assistant_id=assistant_id, - project_id=auth_context.project.id, - ) - - if not assistant: - raise HTTPException( - status_code=404, detail=f"Assistant {assistant_id} not found" - ) + config_version_crud = ConfigVersionCrud( + session=_session, config_id=config_id, project_id=auth_context.project.id + ) - logger.info( - f"[evaluate] Found assistant in DB | id={assistant.id} 
| " - f"model={assistant.model} | instructions=" - f"{assistant.instructions[:50] if assistant.instructions else 'None'}..." + config, error = resolve_config_blob( + config_crud=config_version_crud, + config=LLMCallConfig(id=config_id, version=config_version), + ) + if error: + raise HTTPException( + status_code=400, + detail=f"Failed to resolve config from stored config: {error}", ) - # Build config from assistant (use provided config values to override - # if present) - config = { - "model": config.get("model", assistant.model), - "instructions": config.get("instructions", assistant.instructions), - "temperature": config.get("temperature", assistant.temperature), - } - - # Add tools if vector stores are available - vector_store_ids = config.get( - "vector_store_ids", assistant.vector_store_ids or [] - ) - if vector_store_ids and len(vector_store_ids) > 0: - config["tools"] = [ - { - "type": "file_search", - "vector_store_ids": vector_store_ids, - } - ] - - logger.info("[evaluate] Using config from assistant") - else: - logger.info("[evaluate] Using provided config directly") - # Validate that config has minimum required fields - if not config.get("model"): - raise HTTPException( - status_code=400, - detail="Config must include 'model' when assistant_id is not provided", - ) + logger.info("[evaluate] Successfully resolved config from config management") - # Create EvaluationRun record + # Create EvaluationRun record with resolved config and references eval_run = create_evaluation_run( session=_session, run_name=experiment_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config, + config=config.completion.params, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) @@ -555,7 +508,7 @@ def evaluate( openai_client=openai_client, session=_session, eval_run=eval_run, - config=config, + config=config.completion.params, ) logger.info( From 528062220616910856d8bf0f4c90ba40ea6029aa Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:43:06 +0530 Subject: [PATCH 02/15] Refactor evaluation run to use config ID and version instead of config dict --- .../041_add_config_in_evals_run_table.py | 60 +++++++++++++++++++ backend/app/api/routes/evaluation.py | 13 +++- backend/app/crud/evaluations/core.py | 15 +++-- backend/app/models/evaluation.py | 24 ++++---- 4 files changed, 95 insertions(+), 17 deletions(-) create mode 100644 backend/app/alembic/versions/041_add_config_in_evals_run_table.py diff --git a/backend/app/alembic/versions/041_add_config_in_evals_run_table.py b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py new file mode 100644 index 000000000..449768b38 --- /dev/null +++ b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py @@ -0,0 +1,60 @@ +"""add config in evals run table + +Revision ID: 041 +Revises: 040 +Create Date: 2025-12-15 14:03:22.082746 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "041" +down_revision = "040" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "evaluation_run", + sa.Column( + "config_id", + sa.Uuid(), + nullable=True, + comment="Reference to the stored config used", + ), + ) + op.add_column( + "evaluation_run", + sa.Column( + "config_version", + sa.Integer(), + nullable=True, + comment="Version of the config used", + ), + ) + op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) + op.drop_column("evaluation_run", "config") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "evaluation_run", + sa.Column( + "config", + postgresql.JSONB(astext_type=sa.Text()), + autoincrement=False, + nullable=False, + comment="Evaluation configuration (model, instructions, etc.)", + ), + ) + op.drop_constraint(None, "evaluation_run", type_="foreignkey") + op.drop_column("evaluation_run", "config_version") + op.drop_column("evaluation_run", "config_id") + # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index d6f9b5c29..f0208129d 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -20,7 +20,7 @@ upload_dataset_to_langfuse, ) from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud -from app.crud.evaluations.core import resolve_config_from_stored, save_score +from app.crud.evaluations.core import save_score from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.models.evaluation import ( @@ -34,6 +34,7 @@ load_description, ) from app.services.llm.jobs import resolve_config_blob +from app.services.llm.providers import LLMProvider from app.models.llm.request import LLMCallConfig from app.crud.config.version import ConfigVersionCrud @@ -487,16 +488,22 @@ def evaluate( status_code=400, detail=f"Failed to resolve config from stored config: {error}", ) + elif config.completion.provider != LLMProvider.OPENAI: + raise HTTPException( + status_code=400, + detail="Only 'openai' provider is supported for evaluation configs", + ) logger.info("[evaluate] Successfully resolved config from config management") - # Create EvaluationRun record with resolved config and references + # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, run_name=experiment_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config.completion.params, + config_id=config_id, + config_version=config_version, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index b2b118df1..466066a2f 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -1,5 +1,6 @@ import logging from typing import Any +from uuid import UUID from langfuse import Langfuse from sqlmodel import Session, select @@ -16,7 +17,8 @@ def create_evaluation_run( run_name: str, dataset_name: str, dataset_id: int, - config: dict, + config_id: UUID, + config_version: int, organization_id: int, project_id: int, ) -> EvaluationRun: @@ -28,7 +30,8 @@ def create_evaluation_run( run_name: Name of the evaluation run/experiment dataset_name: Name of the dataset being used dataset_id: ID of the dataset - config: Configuration dict for the evaluation + config_id: UUID of the stored config + config_version: Version number of the config 
organization_id: Organization ID project_id: Project ID @@ -39,7 +42,8 @@ def create_evaluation_run( run_name=run_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config, + config_id=config_id, + config_version=config_version, status="pending", organization_id=organization_id, project_id=project_id, @@ -56,7 +60,10 @@ def create_evaluation_run( logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True) raise - logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}") + logger.info( + f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, " + f"config_id={config_id}, config_version={config_version}" + ) return eval_run diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index f99fbb27e..e59a69725 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Optional +from uuid import UUID from pydantic import BaseModel, Field from sqlalchemy import Column, Index, Text, UniqueConstraint @@ -193,15 +194,17 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Name of the Langfuse dataset used"}, ) - # Config field - dict requires sa_column - config: dict[str, Any] = SQLField( - default_factory=dict, - sa_column=Column( - JSONB, - nullable=False, - comment="Evaluation configuration (model, instructions, etc.)", - ), - description="Evaluation configuration", + config_id: UUID = SQLField( + foreign_key="config.id", + nullable=True, + description="Reference to the stored config used for this evaluation", + sa_column_kwargs={"comment": "Reference to the stored config used"}, + ) + config_version: int = SQLField( + nullable=True, + ge=1, + description="Version of the config used for this evaluation", + sa_column_kwargs={"comment": "Version of the config used"}, ) # Dataset reference @@ -339,7 +342,8 @@ class EvaluationRunPublic(SQLModel): id: int run_name: str dataset_name: str - config: dict[str, Any] + config_id: UUID + config_version: int dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None From 13eb778141fa706eb6762b644d665a4307880a21 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:12:19 +0530 Subject: [PATCH 03/15] Add config_id, config_version, and model fields to evaluation run table --- ...add_config_id_and_version_in_evals_run_.py | 50 +++++++++++++++++++ backend/app/api/routes/evaluation.py | 4 ++ backend/app/crud/evaluations/core.py | 3 ++ backend/app/crud/evaluations/embeddings.py | 14 +----- backend/app/crud/evaluations/processing.py | 6 +-- backend/app/models/evaluation.py | 7 +++ 6 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py diff --git a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py new file mode 100644 index 000000000..a5744788f --- /dev/null +++ b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py @@ -0,0 +1,50 @@ +"""Add config_id and version in evals run table + +Revision ID: 7b48f23ebfdd +Revises: eed36ae3c79a +Create Date: 2025-12-09 14:19:57.620312 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision 
identifiers, used by Alembic. +revision = "7b48f23ebfdd" +down_revision = "eed36ae3c79a" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("evaluation_run", sa.Column("config_id", sa.Uuid(), nullable=True)) + op.add_column( + "evaluation_run", sa.Column("config_version", sa.Integer(), nullable=True) + ) + op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) + op.add_column( + "evaluation_run", + sa.Column("model", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.drop_column("evaluation_run", "config") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "evaluation_run", + sa.Column( + "config", + postgresql.JSONB(astext_type=sa.Text()), + autoincrement=False, + nullable=False, + ), + ) + op.drop_constraint(None, "evaluation_run", type_="foreignkey") + op.drop_column("evaluation_run", "config_version") + op.drop_column("evaluation_run", "model") + op.drop_column("evaluation_run", "config_id") + # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f0208129d..e723cdcec 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -496,6 +496,9 @@ def evaluate( logger.info("[evaluate] Successfully resolved config from config management") + # Extract model from config for storage + model = config.completion.params.get("model") + # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, @@ -504,6 +507,7 @@ def evaluate( dataset_id=dataset_id, config_id=config_id, config_version=config_version, + model=model, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 466066a2f..04cd34287 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -21,6 +21,7 @@ def create_evaluation_run( config_version: int, organization_id: int, project_id: int, + model: str | None = None, ) -> EvaluationRun: """ Create a new evaluation run record in the database. @@ -34,6 +35,7 @@ def create_evaluation_run( config_version: Version number of the config organization_id: Organization ID project_id: Project ID + model: LLM model name (snapshot at creation time) Returns: The created EvaluationRun instance @@ -44,6 +46,7 @@ def create_evaluation_run( dataset_id=dataset_id, config_id=config_id, config_version=config_version, + model=model, status="pending", organization_id=organization_id, project_id=project_id, diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py index 70e374211..22bd48522 100644 --- a/backend/app/crud/evaluations/embeddings.py +++ b/backend/app/crud/evaluations/embeddings.py @@ -364,19 +364,7 @@ def start_embedding_batch( logger.info(f"Starting embedding batch for evaluation run {eval_run.id}") # Get embedding model from config (default: text-embedding-3-large) - embedding_model = eval_run.config.get( - "embedding_model", "text-embedding-3-large" - ) - - # Validate and fallback to default if invalid - try: - validate_embedding_model(embedding_model) - except ValueError as e: - logger.warning( - f"Invalid embedding model '{embedding_model}' in config: {e}. 
" - f"Falling back to text-embedding-3-large" - ) - embedding_model = "text-embedding-3-large" + embedding_model = "text-embedding-3-large" # Step 1: Build embedding JSONL with trace_ids jsonl_data = build_embedding_jsonl( diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 12b89266e..03b20b203 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -253,16 +253,14 @@ async def process_completed_evaluation( if not results: raise ValueError("No valid results found in batch output") - # Extract model from config for cost tracking - model = eval_run.config.get("model") if eval_run.config else None - # Step 5: Create Langfuse dataset run with traces + # Use model stored at creation time for cost tracking trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, run_name=eval_run.run_name, results=results, - model=model, + model=eval_run.model, ) # Store object store URL in database diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index e59a69725..91ff9a011 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -207,6 +207,12 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Version of the config used"}, ) + # Model field (snapshot at creation time) + model: str | None = SQLField( + default=None, + description="LLM model name used for this evaluation (e.g., gpt-4o-mini)", + ) + # Dataset reference dataset_id: int = SQLField( foreign_key="evaluation_dataset.id", @@ -344,6 +350,7 @@ class EvaluationRunPublic(SQLModel): dataset_name: str config_id: UUID config_version: int + model: str | None dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None From 7bdd32291c04d48a0294a364a76d355ae14e193f Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Wed, 10 Dec 2025 10:04:34 +0530 Subject: [PATCH 04/15] Refactor batch evaluation tests to use config_id and config_version instead of config dict --- .../app/tests/api/routes/test_evaluation.py | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index c4eb3f0b6..b6933fb33 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1,11 +1,13 @@ import io from unittest.mock import Mock, patch +from uuid import uuid4 import pytest from sqlmodel import select from app.crud.evaluations.batch import build_evaluation_jsonl from app.models import EvaluationDataset, EvaluationRun +from app.tests.utils.test_data import create_test_config # Helper function to create CSV file-like object @@ -494,16 +496,20 @@ def sample_evaluation_config(self): } def test_start_batch_evaluation_invalid_dataset_id( - self, client, user_api_key_header, sample_evaluation_config + self, client, user_api_key_header, db, user_api_key ): """Test batch evaluation fails with invalid dataset_id.""" + # Create a valid config to use + config = create_test_config(db, project_id=user_api_key.project.id) + # Try to start evaluation with non-existent dataset_id response = client.post( "/api/v1/evaluations", json={ "experiment_name": "test_evaluation_run", "dataset_id": 99999, # Non-existent - "config": sample_evaluation_config, + "config_id": str(config.id), + "config_version": 1, }, 
headers=user_api_key_header, ) @@ -516,32 +522,27 @@ def test_start_batch_evaluation_invalid_dataset_id( assert "not found" in error_str.lower() or "not accessible" in error_str.lower() def test_start_batch_evaluation_missing_model(self, client, user_api_key_header): - """Test batch evaluation fails when model is missing from config.""" - # We don't need a real dataset for this test - the validation should happen - # before dataset lookup. Use any dataset_id and expect config validation error - invalid_config = { - "instructions": "You are a helpful assistant", - "temperature": 0.5, - } - + """Test batch evaluation fails with invalid config_id.""" + # Test with a non-existent config_id (random UUID) response = client.post( "/api/v1/evaluations", json={ - "experiment_name": "test_no_model", - "dataset_id": 1, # Dummy ID, error should come before this is checked - "config": invalid_config, + "experiment_name": "test_no_config", + "dataset_id": 1, # Dummy ID, config validation happens first + "config_id": str(uuid4()), # Non-existent config + "config_version": 1, }, headers=user_api_key_header, ) - # Should fail with either 400 (model missing) or 404 (dataset not found) + # Should fail with either 400 (config not found) or 404 (dataset/config not found) assert response.status_code in [400, 404] response_data = response.json() error_str = response_data.get( "detail", response_data.get("message", str(response_data)) ) - # Should fail with either "model" missing or "dataset not found" (both acceptable) - assert "model" in error_str.lower() or "not found" in error_str.lower() + # Should mention config or not found + assert "config" in error_str.lower() or "not found" in error_str.lower() def test_start_batch_evaluation_without_authentication( self, client, sample_evaluation_config @@ -728,11 +729,16 @@ def test_get_evaluation_run_trace_info_not_completed( self, client, user_api_key_header, db, user_api_key, create_test_dataset ): """Test requesting trace info for incomplete evaluation returns error.""" + # Create a config for the evaluation run + config = create_test_config(db, project_id=user_api_key.project.id) + eval_run = EvaluationRun( run_name="test_pending_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, + model="gpt-4o", status="pending", total_items=3, organization_id=user_api_key.organization_id, @@ -759,11 +765,16 @@ def test_get_evaluation_run_trace_info_completed( self, client, user_api_key_header, db, user_api_key, create_test_dataset ): """Test requesting trace info for completed evaluation returns cached scores.""" + # Create a config for the evaluation run + config = create_test_config(db, project_id=user_api_key.project.id) + eval_run = EvaluationRun( run_name="test_completed_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, + model="gpt-4o", status="completed", total_items=3, score={ From 8f9561c101af8b62d5a4990a4bdb24e79871e0b9 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Wed, 10 Dec 2025 10:22:21 +0530 Subject: [PATCH 05/15] Update EvaluationRunPublic model to allow nullable config_id and config_version fields --- backend/app/models/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 91ff9a011..6e041900f 100644 --- 
a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -348,8 +348,8 @@ class EvaluationRunPublic(SQLModel): id: int run_name: str dataset_name: str - config_id: UUID - config_version: int + config_id: UUID | None + config_version: int | None model: str | None dataset_id: int batch_job_id: int | None From f612da47e6f03a69c828c20c6194d3d022921129 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:42:54 +0530 Subject: [PATCH 06/15] Refactor evaluation run model handling: remove model field, add resolve_model_from_config function, and update processing logic to use config references --- ...add_config_id_and_version_in_evals_run_.py | 5 -- backend/app/api/routes/evaluation.py | 22 ++++---- backend/app/crud/evaluations/__init__.py | 2 + backend/app/crud/evaluations/core.py | 50 +++++++++++++++++-- backend/app/crud/evaluations/processing.py | 6 ++- backend/app/models/evaluation.py | 7 --- 6 files changed, 66 insertions(+), 26 deletions(-) diff --git a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py index a5744788f..1f33543ea 100644 --- a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py +++ b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py @@ -24,10 +24,6 @@ def upgrade(): "evaluation_run", sa.Column("config_version", sa.Integer(), nullable=True) ) op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) - op.add_column( - "evaluation_run", - sa.Column("model", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - ) op.drop_column("evaluation_run", "config") # ### end Alembic commands ### @@ -45,6 +41,5 @@ def downgrade(): ) op.drop_constraint(None, "evaluation_run", type_="foreignkey") op.drop_column("evaluation_run", "config_version") - op.drop_column("evaluation_run", "model") op.drop_column("evaluation_run", "config_id") # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index e723cdcec..ff142e402 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -5,10 +5,19 @@ from pathlib import Path from uuid import UUID -from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile +from fastapi import ( + APIRouter, + Body, + File, + Form, + HTTPException, + Query, + UploadFile, +) from app.api.deps import AuthContextDep, SessionDep from app.core.cloud import get_cloud_storage +from app.crud.config.version import ConfigVersionCrud from app.crud.evaluations import ( create_evaluation_dataset, create_evaluation_run, @@ -27,16 +36,15 @@ DatasetUploadResponse, EvaluationRunPublic, ) +from app.models.llm.request import LLMCallConfig +from app.services.llm.jobs import resolve_config_blob +from app.services.llm.providers import LLMProvider from app.utils import ( APIResponse, get_langfuse_client, get_openai_client, load_description, ) -from app.services.llm.jobs import resolve_config_blob -from app.services.llm.providers import LLMProvider -from app.models.llm.request import LLMCallConfig -from app.crud.config.version import ConfigVersionCrud logger = logging.getLogger(__name__) @@ -496,9 +504,6 @@ def evaluate( logger.info("[evaluate] Successfully resolved config from config management") - # Extract model from config for storage - model = 
config.completion.params.get("model") - # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, @@ -507,7 +512,6 @@ def evaluate( dataset_id=dataset_id, config_id=config_id, config_version=config_version, - model=model, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index 5ca0aacd6..8344c3e91 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -5,6 +5,7 @@ create_evaluation_run, get_evaluation_run_by_id, list_evaluation_runs, + resolve_model_from_config, ) from app.crud.evaluations.cron import ( process_all_pending_evaluations, @@ -39,6 +40,7 @@ "create_evaluation_run", "get_evaluation_run_by_id", "list_evaluation_runs", + "resolve_model_from_config", # Cron "process_all_pending_evaluations", "process_all_pending_evaluations_sync", diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 04cd34287..b64aa8208 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -6,8 +6,11 @@ from sqlmodel import Session, select from app.core.util import now +from app.crud.config.version import ConfigVersionCrud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.models import EvaluationRun +from app.models.llm.request import LLMCallConfig +from app.services.llm.jobs import resolve_config_blob logger = logging.getLogger(__name__) @@ -21,7 +24,6 @@ def create_evaluation_run( config_version: int, organization_id: int, project_id: int, - model: str | None = None, ) -> EvaluationRun: """ Create a new evaluation run record in the database. @@ -35,7 +37,6 @@ def create_evaluation_run( config_version: Version number of the config organization_id: Organization ID project_id: Project ID - model: LLM model name (snapshot at creation time) Returns: The created EvaluationRun instance @@ -46,7 +47,6 @@ def create_evaluation_run( dataset_id=dataset_id, config_id=config_id, config_version=config_version, - model=model, status="pending", organization_id=organization_id, project_id=project_id, @@ -303,3 +303,47 @@ def save_score( f"traces={len(score.get('traces', []))}" ) return eval_run + + +def resolve_model_from_config( + session: Session, + eval_run: EvaluationRun, +) -> str: + """ + Resolve the model name from the evaluation run's config. 
+ + Args: + session: Database session + eval_run: EvaluationRun instance + + Returns: + Model name from config + + Raises: + ValueError: If config is missing, invalid, or has no model + """ + if not eval_run.config_id or not eval_run.config_version: + raise ValueError( + f"Evaluation run {eval_run.id} has no config reference " + f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})" + ) + + config_version_crud = ConfigVersionCrud( + session=session, + config_id=eval_run.config_id, + project_id=eval_run.project_id, + ) + + config, error = resolve_config_blob( + config_crud=config_version_crud, + config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version), + ) + + if error or config is None: + raise ValueError( + f"Config resolution failed for evaluation {eval_run.id} " + f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}" + ) + + model = config.completion.params.get("model") + return model diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 03b20b203..653a2bafb 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -26,7 +26,7 @@ upload_batch_results_to_object_store, ) from app.crud.evaluations.batch import fetch_dataset_items -from app.crud.evaluations.core import update_evaluation_run +from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config from app.crud.evaluations.embeddings import ( calculate_average_similarity, parse_embedding_results, @@ -255,12 +255,14 @@ async def process_completed_evaluation( # Step 5: Create Langfuse dataset run with traces # Use model stored at creation time for cost tracking + model = resolve_model_from_config(session=session, eval_run=eval_run) + trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, + model=model, run_name=eval_run.run_name, results=results, - model=eval_run.model, ) # Store object store URL in database diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 6e041900f..6ae4542fb 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -207,12 +207,6 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Version of the config used"}, ) - # Model field (snapshot at creation time) - model: str | None = SQLField( - default=None, - description="LLM model name used for this evaluation (e.g., gpt-4o-mini)", - ) - # Dataset reference dataset_id: int = SQLField( foreign_key="evaluation_dataset.id", @@ -350,7 +344,6 @@ class EvaluationRunPublic(SQLModel): dataset_name: str config_id: UUID | None config_version: int | None - model: str | None dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None From 4f89f43995a9e58ecb546ae09aaee4b4fcfa5ad9 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:45:22 +0530 Subject: [PATCH 07/15] fix migration number --- ..._run_.py => 040_add_config_in_evals_run_table.py} | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) rename backend/app/alembic/versions/{7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py => 040_add_config_in_evals_run_table.py} (85%) diff --git a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py b/backend/app/alembic/versions/040_add_config_in_evals_run_table.py similarity index 85% rename from 
backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py
rename to backend/app/alembic/versions/040_add_config_in_evals_run_table.py
index 1f33543ea..f8606fbaa 100644
--- a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py
+++ b/backend/app/alembic/versions/040_add_config_in_evals_run_table.py
@@ -1,8 +1,8 @@
-"""Add config_id and version in evals run table
+"""add config in evals run table
 
-Revision ID: 7b48f23ebfdd
-Revises: eed36ae3c79a
-Create Date: 2025-12-09 14:19:57.620312
+Revision ID: 040
+Revises: 039
+Create Date: 2025-12-15 12:44:35.250572
 
 """
 from alembic import op
@@ -11,8 +11,8 @@
 from sqlalchemy.dialects import postgresql
 
 # revision identifiers, used by Alembic.
-revision = "7b48f23ebfdd"
-down_revision = "eed36ae3c79a"
+revision = "040"
+down_revision = "039"
 branch_labels = None
 depends_on = None
 

From 82bee43cd2660fc4a1585913dd6326f2e0f2d5ae Mon Sep 17 00:00:00 2001
From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com>
Date: Mon, 15 Dec 2025 12:49:27 +0530
Subject: [PATCH 08/15] fix test

---
 backend/app/tests/api/routes/test_evaluation.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
index b6933fb33..ca3d3dcb0 100644
--- a/backend/app/tests/api/routes/test_evaluation.py
+++ b/backend/app/tests/api/routes/test_evaluation.py
@@ -738,7 +738,6 @@ def test_get_evaluation_run_trace_info_not_completed(
             dataset_id=create_test_dataset.id,
             config_id=config.id,
             config_version=1,
-            model="gpt-4o",
             status="pending",
             total_items=3,
             organization_id=user_api_key.organization_id,
@@ -774,7 +773,6 @@ def test_get_evaluation_run_trace_info_completed(
             dataset_id=create_test_dataset.id,
             config_id=config.id,
             config_version=1,
-            model="gpt-4o",
             status="completed",
             total_items=3,
             score={

From a2c8a95bf7d35b860d1fe7833ac21d63db41e5e2 Mon Sep 17 00:00:00 2001
From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com>
Date: Mon, 15 Dec 2025 12:52:33 +0530
Subject: [PATCH 09/15] fix status code

---
 backend/app/api/routes/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py
index ff142e402..5d5741376 100644
--- a/backend/app/api/routes/evaluation.py
+++ b/backend/app/api/routes/evaluation.py
@@ -498,7 +498,7 @@ def evaluate(
         )
     elif config.completion.provider != LLMProvider.OPENAI:
         raise HTTPException(
-            status_code=400,
+            status_code=422,
             detail="Only 'openai' provider is supported for evaluation configs",
         )
 

From b9fd664daf6ea8a8f7b6235bce062749347f3b60 Mon Sep 17 00:00:00 2001
From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com>
Date: Mon, 15 Dec 2025 14:04:55 +0530
Subject: [PATCH 10/15] remove old migration

---
 .../040_add_config_in_evals_run_table.py | 45 -------------------
 1 file changed, 45 deletions(-)
 delete mode 100644 backend/app/alembic/versions/040_add_config_in_evals_run_table.py

diff --git a/backend/app/alembic/versions/040_add_config_in_evals_run_table.py b/backend/app/alembic/versions/040_add_config_in_evals_run_table.py
deleted file mode 100644
index f8606fbaa..000000000
--- a/backend/app/alembic/versions/040_add_config_in_evals_run_table.py
+++ /dev/null
@@ -1,45 +0,0 @@
-"""add config in evals run table
-
-Revision ID: 040
-Revises: 039
-Create Date: 2025-12-15 12:44:35.250572
-
-"""
-from alembic import op
-import sqlalchemy as sa
-import sqlmodel.sql.sqltypes
-from 
sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision = "040" -down_revision = "039" -branch_labels = None -depends_on = None - - -def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### - op.add_column("evaluation_run", sa.Column("config_id", sa.Uuid(), nullable=True)) - op.add_column( - "evaluation_run", sa.Column("config_version", sa.Integer(), nullable=True) - ) - op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) - op.drop_column("evaluation_run", "config") - # ### end Alembic commands ### - - -def downgrade(): - # ### commands auto generated by Alembic - please adjust! ### - op.add_column( - "evaluation_run", - sa.Column( - "config", - postgresql.JSONB(astext_type=sa.Text()), - autoincrement=False, - nullable=False, - ), - ) - op.drop_constraint(None, "evaluation_run", type_="foreignkey") - op.drop_column("evaluation_run", "config_version") - op.drop_column("evaluation_run", "config_id") - # ### end Alembic commands ### From 6b00e0ff1e56e472096c452e7ac106a44601ffa2 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 12 Jan 2026 12:57:30 +0530 Subject: [PATCH 11/15] added depends as import --- backend/app/api/routes/evaluation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index ad82f2069..276c566f2 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -8,6 +8,7 @@ from fastapi import ( APIRouter, Body, + Depends, File, Form, HTTPException, From ceb397039a01f2d834331a3dd8843d72b80fe5ee Mon Sep 17 00:00:00 2001 From: Prajna1999 Date: Tue, 13 Jan 2026 19:54:10 +0530 Subject: [PATCH 12/15] fix: spread config object while building batch eval jsonl --- backend/app/crud/evaluations/batch.py | 15 +- backend/app/crud/evaluations/core.py | 2 +- backend/app/services/audio/speech_to_text.py | 630 ++++++++++++++++++ .../app/services/llm/providers/registry.py | 2 + 4 files changed, 643 insertions(+), 6 deletions(-) create mode 100644 backend/app/services/audio/speech_to_text.py diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 7e8b69043..6f537c063 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -17,6 +17,7 @@ from app.core.batch.openai import OpenAIBatchProvider from app.crud.batch_operations import start_batch_job from app.models import EvaluationRun +from app.models.llm.request import KaapiLLMParams logger = logging.getLogger(__name__) @@ -60,7 +61,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, def build_evaluation_jsonl( - dataset_items: list[dict[str, Any]], config: dict[str, Any] + dataset_items: list[dict[str, Any]], config: KaapiLLMParams ) -> list[dict[str, Any]]: """ Build JSONL data for evaluation batch using OpenAI Responses API. 
@@ -89,7 +90,6 @@ def build_evaluation_jsonl( List of dictionaries (JSONL data) """ jsonl_data = [] - for item in dataset_items: # Extract question from input question = item["input"].get("question", "") @@ -106,7 +106,12 @@ def build_evaluation_jsonl( "method": "POST", "url": "/v1/responses", "body": { - **config, # Use config as-is + # Use config as-is + "model": config.model, + "instructions": config.instructions, + "temperature": config.temperature, + "reasoning": {"effort": config.reasoning} if config.reasoning else None, + "tools": config.knowledge_base_ids if config.knowledge_base_ids else [], "input": question, # Add input from dataset }, } @@ -120,7 +125,7 @@ def start_evaluation_batch( openai_client: OpenAI, session: Session, eval_run: EvaluationRun, - config: dict[str, Any], + config: KaapiLLMParams, ) -> EvaluationRun: """ Fetch data, build JSONL, and start evaluation batch. @@ -167,7 +172,7 @@ def start_evaluation_batch( "description": f"Evaluation: {eval_run.run_name}", "completion_window": "24h", # Store complete config for reference - "evaluation_config": config, + "evaluation_config": config.model_dump(exclude_none=True), } # Step 5: Start batch job using generic infrastructure diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index b64aa8208..d83b7447e 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -345,5 +345,5 @@ def resolve_model_from_config( f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}" ) - model = config.completion.params.get("model") + model = config.completion.params.model return model diff --git a/backend/app/services/audio/speech_to_text.py b/backend/app/services/audio/speech_to_text.py new file mode 100644 index 000000000..1c7006408 --- /dev/null +++ b/backend/app/services/audio/speech_to_text.py @@ -0,0 +1,630 @@ +"""Speech-to-Text service supporting OpenAI and Gemini providers.""" +import os +import logging +import csv +import base64 +import requests +from io import StringIO +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import BinaryIO, Literal +from pydantic import BaseModel +from dotenv import load_dotenv +from uuid import uuid4 +from typing import List, Literal, Dict +from google import genai +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech, RecognizeResponse +from google.api_core.client_options import ClientOptions +from openai import OpenAI + +from app.services.audio.utils.ogg_to_wav_converter_downloader import ( + download_audio_from_url, +) +from app.models.evaluation import ( + ProviderConfig, + TranscriptionRequest, + FileData, + WERResult, + WERComparisonResult, + WERBatchItem, + WERBatchResult, + WERSummaryStats, + WERBatchSummary, + WERModelStats, + WEROverallSummary, +) +from app.services.audio.utils.calculate_wer import tokenize, calculate_wer + +load_dotenv() +logger = logging.getLogger(__name__) + + +# Default prompt for transcription +DEFAULT_PROMPT = ( + "Generate a verbatim speech-to-text transcript of the given audio file " + "in the same language as of the audio in the same script too. 
" + "make sure the transcription as close possible to the audio provided" +) +PROJECT_ID = os.getenv("GOOGLE_PROJECT_ID") +STT_LOCATION = os.getenv("GOOGLE_PROJECT_LOCATION") + + +class SpeechToTextService: + def __init__(self, provider, api_key: str | None = None): + if api_key is None: + raise ValueError( + "Missing OpenAI API Key for Client STT Client initialization" + ) + self.provide = provider + self.openai_client = None + self.gemini_client = None + if provider == "openai": + self.openai_client = OpenAI(api_key=api_key) + elif provider == "gemini": + self.gemini_client = genai.Client(api_key=api_key) + else: + raise ValueError("This provider is not supported yet.") + + def transcribe_with_openai( + self, + audio_file: BinaryIO | str, + model: str = "gpt-4o-transcribe", + prompt: str | None = None, + ): + if self.openai_client is None: + raise ValueError("OpenAI client is not initialized.") + try: + # Handle file path vs file object + if isinstance(audio_file, str): + audio_file = open(audio_file, "rb") + + transcription = self.openai_client.audio.transcriptions.create( + model=model, + file=audio_file, + # language="hi", + response_format="text", + prompt=prompt or DEFAULT_PROMPT, + ) + logger.info(f"Successfully transcribed audio using OpenAI model: {model}") + return transcription + except Exception as e: + logger.error(f"OpenAI transcription failed: {str(e)}", exc_info=True) + raise + + def transcribe_with_gemini( + self, + audio_file_path: str, + model: str = "gemini-2.5-flash", + prompt: str | None = None, + ): + if self.gemini_client is None: + raise ValueError("Gemini client is not initialized") + try: + # Upload file to Geminic + gemini_file = self.gemini_client.files.upload(file=audio_file_path) + logger.info(f"Uploaded file to Gemini: {gemini_file.name}") + + # Generate transcription + response = self.gemini_client.models.generate_content( + model=model, + contents=[prompt or DEFAULT_PROMPT, gemini_file], + ) + + logger.info(f"Successfully transcribed audio using Gemini model: {model}") + return response.text or None + + except Exception as e: + logger.error(f"Gemini transcription failed: {str(e)}", exc_info=True) + raise + + def transcribe( + self, + audio_file: BinaryIO | str, + provider: Literal["openai", "gemini"] = "openai", + model: str | None = None, + prompt: str | None = None, + ): + transcription = None + if provider == "openai": + transcription = self.transcribe_with_openai( + audio_file=audio_file, + model=model or "gpt-4o-transcribe", + prompt=prompt, + ) + return transcription + elif provider == "gemini": + # Gemini requires file path, not file object + file_path = audio_file if isinstance(audio_file, str) else audio_file.name + transcription = self.transcribe_with_gemini( + audio_file_path=file_path, + model=model or "gemini-2.5-flash", + prompt=prompt, + ) + return transcription + else: + raise ValueError( + f"Unsupported provider: {provider}. Use 'openai' or 'gemini'." 
+ ) + + +def transcribe_audio_with_chirp_v3(audio_file_path: str): + with open(audio_file_path, "rb") as file: + audio_content = file.read() + + client = SpeechClient( + client_options=ClientOptions( + api_endpoint=f"{STT_LOCATION}-speech.googleapis.com" + ) + ) + + config = cloud_speech.RecognitionConfig( + auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(), + language_codes=["auto"], + model="chirp_3", + ) + request = cloud_speech.RecognizeRequest( + recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_", + config=config, + content=audio_content, + ) + response: RecognizeResponse = client.recognize(request=request) + + transcript = None + for result in response.results: + transcript = result.alternatives[0].transcript + return transcript + + +def transcribe_audio_with_indic_conformer(audio_file_path: str): + indic_conformer_api_url = str(os.getenv("AI4B_STT_URL")) + with open(audio_file_path, "rb") as file: + audio_content = file.read() + + response = requests.post( + url=indic_conformer_api_url, + data={"language_code": "hi", "decoding_strategy": "ctc"}, + files={"audio_file": audio_content}, + ) + logger.info(response.json()) + transcription = response.json()["transcription"] + return transcription + + +# util functions for direct usage +def transcribe_audio( + audio_file: str, + provider: Literal["openai", "gemini", "google-stt", "ai4b"] = "openai", + model: str | None = None, + api_key: str | None = None, + prompt: str | None = None, +): + if provider == "google-stt": + return transcribe_audio_with_chirp_v3(audio_file_path=audio_file) + if provider == "ai4b": + return transcribe_audio_with_indic_conformer(audio_file_path=audio_file) + stt_service = SpeechToTextService(provider=provider, api_key=api_key) + return stt_service.transcribe( + audio_file=audio_file, + provider=provider, + model=model, + prompt=prompt, + ) + + +# STT csv parser + + +def process_single_csv_row(row_data): + idx, audio_url, ground_truth = row_data + + try: + audio_bytes, content_type = download_audio_from_url(audio_url) + audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") + + return { + "status": "success", + "row": idx, + "audio_url": audio_url, + "ground_truth": ground_truth, + "audio_base64": audio_base64, + "media_type": content_type, + "file_size": len(audio_bytes), + } + except requests.Timeout: + return { + "status": "error", + "row": idx, + "audio_url": audio_url, + "error": "Download timeout", + } + except requests.RequestException as e: + return { + "status": "error", + "row": idx, + "audio_url": audio_url, + "error": f"Download failed: {str(e)}", + } + except Exception as e: + return { + "status": "error", + "row": idx, + "audio_url": audio_url, + "error": f"Unexpected error: {str(e)}", + } + + +async def process_batch_csv(csv_file): + csv_body = await csv_file.read() + csv_content = csv_body.decode("utf-8") + + csv_reader = csv.DictReader(StringIO(csv_content)) + required_headers = {"audio_url", "ground_truth"} + + if not required_headers.issubset(csv_reader.fieldnames): + raise ValueError(f"CSV must have headers: {required_headers}") + + rows_to_process = [] + for idx, row in enumerate(csv_reader, start=1): + audio_url = row["audio_url"].strip() + ground_truth = row["ground_truth"].strip() + + if not audio_url or not ground_truth: + continue # do not throw error. 
continue silently + rows_to_process.append((idx, audio_url, ground_truth)) + + results = [] + errors = [] + + with ThreadPoolExecutor(max_workers=4) as executor: + future_to_row = { + executor.submit(process_single_csv_row, row_data): row_data + for row_data in rows_to_process + } + for future in as_completed(future_to_row): + result = future.result() + if result["status"] == "success": + results.append(result) + else: + errors.append(result) + return { + "success": results, + "errors": errors, + "total_rows": len(rows_to_process), + "processed": len(results), + "failed": len(errors), + } + + +def _get_api_key(provider: str) -> str | None: + if provider == "openai": + return os.getenv("OPENAI_API_KEY") + elif provider == "gemini": + return os.getenv("GEMINI_API_KEY") + + return None + + +def _transcribe_single_file_provider( + file_data: FileData, provider_config: ProviderConfig +): + # for execution by threadpool + provider = provider_config.provider + + model = provider_config.model + + audio_bytes = base64.b64decode(file_data.audio_base64) or None + media_extention = ( + file_data.media_type.split("/")[-1] if file_data.media_type else ".ogg" + ) + temp_file = f"/tmp/{uuid4()}.{media_extention}" + + try: + with open(temp_file, "wb") as f: + f.write(audio_bytes) # type: ignore + + # extract the api key + api_key = _get_api_key(provider) # type: ignore + + transcript = transcribe_audio( + audio_file=temp_file, + provider=provider, # type: ignore + model=model, + api_key=api_key, + ) + + return { + "status": "success", + "file_id": file_data.file_id or None, + "ground_truth": file_data.ground_truth, + "provider": provider, + "model": model, + "transcript": transcript, + } + except Exception as e: + return { + "status": "error", + "file_id": file_data.file_id, + "provider": provider, + "model": model, + "error": str(e), + } + finally: + # clean the temp file + if os.path.exists(temp_file): + os.remove(temp_file) + + +# threadpool based integration of batch transcriptipn +def process_batch_transcription(files, providers, max_workers=4): + # create all tasks and push into a list + tasks = [ + (file_data, provider_config) + for file_data in files + for provider_config in providers + ] + + results = [] + errors = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + _transcribe_single_file_provider, file_data, provider_config + ): (file_data, provider_config) + for file_data, provider_config in tasks + } + for future in as_completed(futures): + result = future.result() + if result["status"] == "success": + results.append(result) + else: + errors.append(result) + + return { + "success": results, + "errors": errors, + "total_tasks": len(tasks), + "processed": len(results), + "failed": len(errors), + } + + +# STT evaluation route handler +async def evaluate_stt(): + pass + + +def calculate_wer_individual( + ground_truth: str, + hypothesis: str, + mode: Literal["strict", "lenient", "both"] = "both", +) -> WERComparisonResult | WERResult: + """ + Calculate WER for a single transcription against ground truth. 
+ + Args: + ground_truth: The reference/expected transcription + hypothesis: The transcribed text to evaluate + mode: "strict" for exact matching, "lenient" for phonetic/spelling tolerance, + "both" to return both calculations + + Returns: + WERComparisonResult if mode="both", otherwise WERResult for the specified mode + """ + ref_tokens = tokenize(ground_truth) + hyp_tokens = tokenize(hypothesis) + + ref_count = len(ref_tokens) + hyp_count = len(hyp_tokens) # hypothesis tokes + + if mode == "strict": + wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=False) + return WERResult( + wer=wer, + substitutions=subs, + deletions=dels, + insertions=ins, + semantic_errors=sem, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ) + + if mode == "lenient": + wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=True) + return WERResult( + wer=wer, + substitutions=subs, + deletions=dels, + insertions=ins, + semantic_errors=sem, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ) + + # mode == "both" + wer_strict, s_strict, d_strict, i_strict, sem_strict = calculate_wer( + ref_tokens, hyp_tokens, lenient=False + ) + wer_lenient, s_lenient, d_lenient, i_lenient, sem_lenient = calculate_wer( + ref_tokens, hyp_tokens, lenient=True + ) + + return WERComparisonResult( + ground_truth=ground_truth, + hypothesis=hypothesis, + strict=WERResult( + wer=wer_strict, + substitutions=s_strict, + deletions=d_strict, + insertions=i_strict, + semantic_errors=sem_strict, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ), + lenient=WERResult( + wer=wer_lenient, + substitutions=s_lenient, + deletions=d_lenient, + insertions=i_lenient, + semantic_errors=sem_lenient, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ), + ) + + +def _process_wer_item(item: WERBatchItem) -> WERBatchResult: + result = calculate_wer_individual( + ground_truth=item.ground_truth, hypothesis=item.hypothesis, mode="both" + ) + # result is WERComparisonResult with strict and lenient WERResult + return WERBatchResult( + id=item.id, + ground_truth=item.ground_truth, + hypothesis=item.hypothesis, + model=item.model, + strict=result.strict, + lenient=result.lenient, + ) + + +def calculate_wer_batch( + items: List[WERBatchItem], max_workers: int = 4 +) -> List[WERBatchResult]: + """ + Calculate WER for multiple transcriptions in batch using ThreadPoolExecutor. + + Args: + items: List of WERBatchItem containing id, ground_truth, and hypothesis + max_workers: Maximum number of concurrent workers for ThreadPoolExecutor + + Returns: + List of WERBatchResult with both strict and lenient WER for each item + """ + results: List[WERBatchResult] = [] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_process_wer_item, item): item for item in items} + for future in as_completed(futures): + try: + result = future.result() + results.append(result) + except Exception as e: + item = futures[future] + logger.error(f"WER calculation failed for item {item.id}: {e}") + + return results + + +def calculate_wer_summary_stats( + results: List[WERBatchResult], mode: str +) -> WERSummaryStats: + """ + Calculate summary statistics for a list of WER results. 
+ + Args: + results: List of WERBatchResult from batch WER calculation + mode: "strict" or "lenient" - which WER results to summarize + + Returns: + WERSummaryStats with aggregate statistics + """ + if not results: + return WERSummaryStats( + count=0, + avg_wer=0.0, + min_wer=0.0, + max_wer=0.0, + avg_substitutions=0.0, + avg_deletions=0.0, + avg_insertions=0.0, + avg_semantic_errors=0.0, + total_reference_words=0, + total_hypothesis_words=0, + ) + + # Extract WER results based on mode (strict or lenient) + wer_results = [getattr(r, mode) for r in results] + n = len(wer_results) + + wer_values = [w.wer for w in wer_results] + + return WERSummaryStats( + count=n, + avg_wer=sum(wer_values) / n, + min_wer=min(wer_values), + max_wer=max(wer_values), + avg_substitutions=sum(w.substitutions for w in wer_results) / n, + avg_deletions=sum(w.deletions for w in wer_results) / n, + avg_insertions=sum(w.insertions for w in wer_results) / n, + avg_semantic_errors=sum(w.semantic_errors for w in wer_results) / n, + total_reference_words=sum(w.reference_word_count for w in wer_results), + total_hypothesis_words=sum(w.hypothesis_word_count for w in wer_results), + ) + + +def calculate_wer_batch_with_summary( + items: List[WERBatchItem], max_workers: int = 4 +) -> tuple[List[WERBatchResult], WERBatchSummary]: + """ + Calculate WER for batch items and return results with summary statistics. + + Args: + items: List of WERBatchItem containing id, ground_truth, hypothesis, and optional model + max_workers: Maximum number of concurrent workers for ThreadPoolExecutor + + Returns: + Tuple of (results list, summary with overall and model-wise stats) + """ + results = calculate_wer_batch(items, max_workers) + + # Calculate overall statistics + overall = WEROverallSummary( + strict=calculate_wer_summary_stats(results, "strict"), + lenient=calculate_wer_summary_stats(results, "lenient"), + ) + + # Group results by model and calculate per-model statistics + model_groups: Dict[str, List[WERBatchResult]] = {} + for r in results: + if r.model: + if r.model not in model_groups: + model_groups[r.model] = [] + model_groups[r.model].append(r) + + by_model: List[WERModelStats] = [] + for model_name in sorted(model_groups.keys()): + model_results = model_groups[model_name] + by_model.append( + WERModelStats( + model=model_name, + strict=calculate_wer_summary_stats(model_results, "strict"), + lenient=calculate_wer_summary_stats(model_results, "lenient"), + ) + ) + + summary = WERBatchSummary(overall=overall, by_model=by_model) + + return results, summary + + +if __name__ == "__main__": + # oai_api_key = os.getenv("OPENAI_API_KEY") + # gemini_api_key=os.getenv("GEMINI_API_KEY") + ai4b_file_path = "/Users/prajna/Downloads/audio_hindi_2.ogg" + + audio_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/sample_data/ogg_files/1756121051765345.ogg" + ai4b_response = transcribe_audio_with_indic_conformer( + audio_file_path=ai4b_file_path + ) + print(ai4b_response) + # stt_eval_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/audio_sample_stt.csv" + + # transcript=transcribe_audio(audio_file=audio_file_path, provider="google-stt") + # transcript=transcribe_audio(audio_file=audio_file_path, provider="openai", api_key=oai_api_key) + # transcript=transcribe_audio_with_chirp_v3(audio_file_path) + # print(transcript) + + # with open(stt_eval_file_path, "rb") as file: + # processed = process_batch_csv(file) + + # print(processed) diff --git a/backend/app/services/llm/providers/registry.py 
diff --git a/backend/app/services/llm/providers/registry.py b/backend/app/services/llm/providers/registry.py
index a5cfb4bb8..7fb8d79f9 100644
--- a/backend/app/services/llm/providers/registry.py
+++ b/backend/app/services/llm/providers/registry.py
@@ -13,12 +13,14 @@ class LLMProvider:
 
     OPENAI_NATIVE = "openai-native"
+    OPENAI = "openai"
     # Future constants for native providers:
     # CLAUDE_NATIVE = "claude-native"
     # GEMINI_NATIVE = "gemini-native"
 
     _registry: dict[str, type[BaseProvider]] = {
         OPENAI_NATIVE: OpenAIProvider,
+        OPENAI: OpenAIProvider
         # Future native providers:
         # CLAUDE_NATIVE: ClaudeProvider,
         # GEMINI_NATIVE: GeminiProvider,
     }
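The hunk above registers "openai" as an alias that maps to the same OpenAIProvider class as "openai-native", so either spelling resolves to one implementation. A self-contained sketch of that lookup behaviour follows; LLMProvider's real accessor is not shown in this diff, so the get_provider_class helper and stub class below are stand-ins, not the project's API.

# Stand-in types for illustration; only the alias/lookup pattern mirrors the diff.
class _OpenAIProviderStub:
    """Placeholder for OpenAIProvider, just for this example."""

_registry: dict[str, type] = {
    "openai-native": _OpenAIProviderStub,
    "openai": _OpenAIProviderStub,  # alias added by the hunk above
}

def get_provider_class(name: str) -> type:
    # Both spellings resolve to the same provider class; unknown names fail fast.
    try:
        return _registry[name]
    except KeyError:
        raise ValueError(f"Unsupported provider: {name}")

assert get_provider_class("openai") is get_provider_class("openai-native")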
" - "make sure the transcription as close possible to the audio provided" -) -PROJECT_ID = os.getenv("GOOGLE_PROJECT_ID") -STT_LOCATION = os.getenv("GOOGLE_PROJECT_LOCATION") - - -class SpeechToTextService: - def __init__(self, provider, api_key: str | None = None): - if api_key is None: - raise ValueError( - "Missing OpenAI API Key for Client STT Client initialization" - ) - self.provide = provider - self.openai_client = None - self.gemini_client = None - if provider == "openai": - self.openai_client = OpenAI(api_key=api_key) - elif provider == "gemini": - self.gemini_client = genai.Client(api_key=api_key) - else: - raise ValueError("This provider is not supported yet.") - - def transcribe_with_openai( - self, - audio_file: BinaryIO | str, - model: str = "gpt-4o-transcribe", - prompt: str | None = None, - ): - if self.openai_client is None: - raise ValueError("OpenAI client is not initialized.") - try: - # Handle file path vs file object - if isinstance(audio_file, str): - audio_file = open(audio_file, "rb") - - transcription = self.openai_client.audio.transcriptions.create( - model=model, - file=audio_file, - # language="hi", - response_format="text", - prompt=prompt or DEFAULT_PROMPT, - ) - logger.info(f"Successfully transcribed audio using OpenAI model: {model}") - return transcription - except Exception as e: - logger.error(f"OpenAI transcription failed: {str(e)}", exc_info=True) - raise - - def transcribe_with_gemini( - self, - audio_file_path: str, - model: str = "gemini-2.5-flash", - prompt: str | None = None, - ): - if self.gemini_client is None: - raise ValueError("Gemini client is not initialized") - try: - # Upload file to Geminic - gemini_file = self.gemini_client.files.upload(file=audio_file_path) - logger.info(f"Uploaded file to Gemini: {gemini_file.name}") - - # Generate transcription - response = self.gemini_client.models.generate_content( - model=model, - contents=[prompt or DEFAULT_PROMPT, gemini_file], - ) - - logger.info(f"Successfully transcribed audio using Gemini model: {model}") - return response.text or None - - except Exception as e: - logger.error(f"Gemini transcription failed: {str(e)}", exc_info=True) - raise - - def transcribe( - self, - audio_file: BinaryIO | str, - provider: Literal["openai", "gemini"] = "openai", - model: str | None = None, - prompt: str | None = None, - ): - transcription = None - if provider == "openai": - transcription = self.transcribe_with_openai( - audio_file=audio_file, - model=model or "gpt-4o-transcribe", - prompt=prompt, - ) - return transcription - elif provider == "gemini": - # Gemini requires file path, not file object - file_path = audio_file if isinstance(audio_file, str) else audio_file.name - transcription = self.transcribe_with_gemini( - audio_file_path=file_path, - model=model or "gemini-2.5-flash", - prompt=prompt, - ) - return transcription - else: - raise ValueError( - f"Unsupported provider: {provider}. Use 'openai' or 'gemini'." 
- ) - - -def transcribe_audio_with_chirp_v3(audio_file_path: str): - with open(audio_file_path, "rb") as file: - audio_content = file.read() - - client = SpeechClient( - client_options=ClientOptions( - api_endpoint=f"{STT_LOCATION}-speech.googleapis.com" - ) - ) - - config = cloud_speech.RecognitionConfig( - auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(), - language_codes=["auto"], - model="chirp_3", - ) - request = cloud_speech.RecognizeRequest( - recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_", - config=config, - content=audio_content, - ) - response: RecognizeResponse = client.recognize(request=request) - - transcript = None - for result in response.results: - transcript = result.alternatives[0].transcript - return transcript - - -def transcribe_audio_with_indic_conformer(audio_file_path: str): - indic_conformer_api_url = str(os.getenv("AI4B_STT_URL")) - with open(audio_file_path, "rb") as file: - audio_content = file.read() - - response = requests.post( - url=indic_conformer_api_url, - data={"language_code": "hi", "decoding_strategy": "ctc"}, - files={"audio_file": audio_content}, - ) - logger.info(response.json()) - transcription = response.json()["transcription"] - return transcription - - -# util functions for direct usage -def transcribe_audio( - audio_file: str, - provider: Literal["openai", "gemini", "google-stt", "ai4b"] = "openai", - model: str | None = None, - api_key: str | None = None, - prompt: str | None = None, -): - if provider == "google-stt": - return transcribe_audio_with_chirp_v3(audio_file_path=audio_file) - if provider == "ai4b": - return transcribe_audio_with_indic_conformer(audio_file_path=audio_file) - stt_service = SpeechToTextService(provider=provider, api_key=api_key) - return stt_service.transcribe( - audio_file=audio_file, - provider=provider, - model=model, - prompt=prompt, - ) - - -# STT csv parser - - -def process_single_csv_row(row_data): - idx, audio_url, ground_truth = row_data - - try: - audio_bytes, content_type = download_audio_from_url(audio_url) - audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") - - return { - "status": "success", - "row": idx, - "audio_url": audio_url, - "ground_truth": ground_truth, - "audio_base64": audio_base64, - "media_type": content_type, - "file_size": len(audio_bytes), - } - except requests.Timeout: - return { - "status": "error", - "row": idx, - "audio_url": audio_url, - "error": "Download timeout", - } - except requests.RequestException as e: - return { - "status": "error", - "row": idx, - "audio_url": audio_url, - "error": f"Download failed: {str(e)}", - } - except Exception as e: - return { - "status": "error", - "row": idx, - "audio_url": audio_url, - "error": f"Unexpected error: {str(e)}", - } - - -async def process_batch_csv(csv_file): - csv_body = await csv_file.read() - csv_content = csv_body.decode("utf-8") - - csv_reader = csv.DictReader(StringIO(csv_content)) - required_headers = {"audio_url", "ground_truth"} - - if not required_headers.issubset(csv_reader.fieldnames): - raise ValueError(f"CSV must have headers: {required_headers}") - - rows_to_process = [] - for idx, row in enumerate(csv_reader, start=1): - audio_url = row["audio_url"].strip() - ground_truth = row["ground_truth"].strip() - - if not audio_url or not ground_truth: - continue # do not throw error. 
continue silently - rows_to_process.append((idx, audio_url, ground_truth)) - - results = [] - errors = [] - - with ThreadPoolExecutor(max_workers=4) as executor: - future_to_row = { - executor.submit(process_single_csv_row, row_data): row_data - for row_data in rows_to_process - } - for future in as_completed(future_to_row): - result = future.result() - if result["status"] == "success": - results.append(result) - else: - errors.append(result) - return { - "success": results, - "errors": errors, - "total_rows": len(rows_to_process), - "processed": len(results), - "failed": len(errors), - } - - -def _get_api_key(provider: str) -> str | None: - if provider == "openai": - return os.getenv("OPENAI_API_KEY") - elif provider == "gemini": - return os.getenv("GEMINI_API_KEY") - - return None - - -def _transcribe_single_file_provider( - file_data: FileData, provider_config: ProviderConfig -): - # for execution by threadpool - provider = provider_config.provider - - model = provider_config.model - - audio_bytes = base64.b64decode(file_data.audio_base64) or None - media_extention = ( - file_data.media_type.split("/")[-1] if file_data.media_type else ".ogg" - ) - temp_file = f"/tmp/{uuid4()}.{media_extention}" - - try: - with open(temp_file, "wb") as f: - f.write(audio_bytes) # type: ignore - - # extract the api key - api_key = _get_api_key(provider) # type: ignore - - transcript = transcribe_audio( - audio_file=temp_file, - provider=provider, # type: ignore - model=model, - api_key=api_key, - ) - - return { - "status": "success", - "file_id": file_data.file_id or None, - "ground_truth": file_data.ground_truth, - "provider": provider, - "model": model, - "transcript": transcript, - } - except Exception as e: - return { - "status": "error", - "file_id": file_data.file_id, - "provider": provider, - "model": model, - "error": str(e), - } - finally: - # clean the temp file - if os.path.exists(temp_file): - os.remove(temp_file) - - -# threadpool based integration of batch transcriptipn -def process_batch_transcription(files, providers, max_workers=4): - # create all tasks and push into a list - tasks = [ - (file_data, provider_config) - for file_data in files - for provider_config in providers - ] - - results = [] - errors = [] - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = { - executor.submit( - _transcribe_single_file_provider, file_data, provider_config - ): (file_data, provider_config) - for file_data, provider_config in tasks - } - for future in as_completed(futures): - result = future.result() - if result["status"] == "success": - results.append(result) - else: - errors.append(result) - - return { - "success": results, - "errors": errors, - "total_tasks": len(tasks), - "processed": len(results), - "failed": len(errors), - } - - -# STT evaluation route handler -async def evaluate_stt(): - pass - - -def calculate_wer_individual( - ground_truth: str, - hypothesis: str, - mode: Literal["strict", "lenient", "both"] = "both", -) -> WERComparisonResult | WERResult: - """ - Calculate WER for a single transcription against ground truth. 
- - Args: - ground_truth: The reference/expected transcription - hypothesis: The transcribed text to evaluate - mode: "strict" for exact matching, "lenient" for phonetic/spelling tolerance, - "both" to return both calculations - - Returns: - WERComparisonResult if mode="both", otherwise WERResult for the specified mode - """ - ref_tokens = tokenize(ground_truth) - hyp_tokens = tokenize(hypothesis) - - ref_count = len(ref_tokens) - hyp_count = len(hyp_tokens) # hypothesis tokes - - if mode == "strict": - wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=False) - return WERResult( - wer=wer, - substitutions=subs, - deletions=dels, - insertions=ins, - semantic_errors=sem, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ) - - if mode == "lenient": - wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=True) - return WERResult( - wer=wer, - substitutions=subs, - deletions=dels, - insertions=ins, - semantic_errors=sem, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ) - - # mode == "both" - wer_strict, s_strict, d_strict, i_strict, sem_strict = calculate_wer( - ref_tokens, hyp_tokens, lenient=False - ) - wer_lenient, s_lenient, d_lenient, i_lenient, sem_lenient = calculate_wer( - ref_tokens, hyp_tokens, lenient=True - ) - - return WERComparisonResult( - ground_truth=ground_truth, - hypothesis=hypothesis, - strict=WERResult( - wer=wer_strict, - substitutions=s_strict, - deletions=d_strict, - insertions=i_strict, - semantic_errors=sem_strict, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ), - lenient=WERResult( - wer=wer_lenient, - substitutions=s_lenient, - deletions=d_lenient, - insertions=i_lenient, - semantic_errors=sem_lenient, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ), - ) - - -def _process_wer_item(item: WERBatchItem) -> WERBatchResult: - result = calculate_wer_individual( - ground_truth=item.ground_truth, hypothesis=item.hypothesis, mode="both" - ) - # result is WERComparisonResult with strict and lenient WERResult - return WERBatchResult( - id=item.id, - ground_truth=item.ground_truth, - hypothesis=item.hypothesis, - model=item.model, - strict=result.strict, - lenient=result.lenient, - ) - - -def calculate_wer_batch( - items: List[WERBatchItem], max_workers: int = 4 -) -> List[WERBatchResult]: - """ - Calculate WER for multiple transcriptions in batch using ThreadPoolExecutor. - - Args: - items: List of WERBatchItem containing id, ground_truth, and hypothesis - max_workers: Maximum number of concurrent workers for ThreadPoolExecutor - - Returns: - List of WERBatchResult with both strict and lenient WER for each item - """ - results: List[WERBatchResult] = [] - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(_process_wer_item, item): item for item in items} - for future in as_completed(futures): - try: - result = future.result() - results.append(result) - except Exception as e: - item = futures[future] - logger.error(f"WER calculation failed for item {item.id}: {e}") - - return results - - -def calculate_wer_summary_stats( - results: List[WERBatchResult], mode: str -) -> WERSummaryStats: - """ - Calculate summary statistics for a list of WER results. 
- - Args: - results: List of WERBatchResult from batch WER calculation - mode: "strict" or "lenient" - which WER results to summarize - - Returns: - WERSummaryStats with aggregate statistics - """ - if not results: - return WERSummaryStats( - count=0, - avg_wer=0.0, - min_wer=0.0, - max_wer=0.0, - avg_substitutions=0.0, - avg_deletions=0.0, - avg_insertions=0.0, - avg_semantic_errors=0.0, - total_reference_words=0, - total_hypothesis_words=0, - ) - - # Extract WER results based on mode (strict or lenient) - wer_results = [getattr(r, mode) for r in results] - n = len(wer_results) - - wer_values = [w.wer for w in wer_results] - - return WERSummaryStats( - count=n, - avg_wer=sum(wer_values) / n, - min_wer=min(wer_values), - max_wer=max(wer_values), - avg_substitutions=sum(w.substitutions for w in wer_results) / n, - avg_deletions=sum(w.deletions for w in wer_results) / n, - avg_insertions=sum(w.insertions for w in wer_results) / n, - avg_semantic_errors=sum(w.semantic_errors for w in wer_results) / n, - total_reference_words=sum(w.reference_word_count for w in wer_results), - total_hypothesis_words=sum(w.hypothesis_word_count for w in wer_results), - ) - - -def calculate_wer_batch_with_summary( - items: List[WERBatchItem], max_workers: int = 4 -) -> tuple[List[WERBatchResult], WERBatchSummary]: - """ - Calculate WER for batch items and return results with summary statistics. - - Args: - items: List of WERBatchItem containing id, ground_truth, hypothesis, and optional model - max_workers: Maximum number of concurrent workers for ThreadPoolExecutor - - Returns: - Tuple of (results list, summary with overall and model-wise stats) - """ - results = calculate_wer_batch(items, max_workers) - - # Calculate overall statistics - overall = WEROverallSummary( - strict=calculate_wer_summary_stats(results, "strict"), - lenient=calculate_wer_summary_stats(results, "lenient"), - ) - - # Group results by model and calculate per-model statistics - model_groups: Dict[str, List[WERBatchResult]] = {} - for r in results: - if r.model: - if r.model not in model_groups: - model_groups[r.model] = [] - model_groups[r.model].append(r) - - by_model: List[WERModelStats] = [] - for model_name in sorted(model_groups.keys()): - model_results = model_groups[model_name] - by_model.append( - WERModelStats( - model=model_name, - strict=calculate_wer_summary_stats(model_results, "strict"), - lenient=calculate_wer_summary_stats(model_results, "lenient"), - ) - ) - - summary = WERBatchSummary(overall=overall, by_model=by_model) - - return results, summary - - -if __name__ == "__main__": - # oai_api_key = os.getenv("OPENAI_API_KEY") - # gemini_api_key=os.getenv("GEMINI_API_KEY") - ai4b_file_path = "/Users/prajna/Downloads/audio_hindi_2.ogg" - - audio_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/sample_data/ogg_files/1756121051765345.ogg" - ai4b_response = transcribe_audio_with_indic_conformer( - audio_file_path=ai4b_file_path - ) - print(ai4b_response) - # stt_eval_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/audio_sample_stt.csv" - - # transcript=transcribe_audio(audio_file=audio_file_path, provider="google-stt") - # transcript=transcribe_audio(audio_file=audio_file_path, provider="openai", api_key=oai_api_key) - # transcript=transcribe_audio_with_chirp_v3(audio_file_path) - # print(transcript) - - # with open(stt_eval_file_path, "rb") as file: - # processed = process_batch_csv(file) - - # print(processed) From ebdda817098f08676a22c94e0d369b2341121992 Mon Sep 17 
00:00:00 2001
From: Prajna1999
Date: Wed, 14 Jan 2026 10:58:35 +0530
Subject: [PATCH 14/15] fix: add comprehensive expansion of 'tools' key while building evaluation_jsonl

---
 backend/app/crud/evaluations/batch.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py
index 6f537c063..2aad3b298 100644
--- a/backend/app/crud/evaluations/batch.py
+++ b/backend/app/crud/evaluations/batch.py
@@ -111,7 +111,13 @@ def build_evaluation_jsonl(
             "instructions": config.instructions,
             "temperature": config.temperature,
             "reasoning": {"effort": config.reasoning} if config.reasoning else None,
-            "tools": config.knowledge_base_ids if config.knowledge_base_ids else [],
+            "tools": [
+                {
+                    "type": "file_search",
+                    "vector_store_ids": config.knowledge_base_ids,
+                    "max_num_results": config.max_num_results or 20,
+                }
+            ],
             "input": question,  # Add input from dataset
         },
     }

From f00e7e088bb4e0ec9bc63afea5b8e553b33c01f9 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Sat, 24 Jan 2026 17:29:08 +0530
Subject: [PATCH 15/15] fixing endpoints

---
 backend/app/api/routes/evaluations/dataset.py    | 4 ++--
 backend/app/api/routes/evaluations/evaluation.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py
index d66ff71ce..1ce42742a 100644
--- a/backend/app/api/routes/evaluations/dataset.py
+++ b/backend/app/api/routes/evaluations/dataset.py
@@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse:
 
 
 @router.post(
-    "/",
+    "",
     description=load_description("evaluation/upload_dataset.md"),
     response_model=APIResponse[DatasetUploadResponse],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -87,7 +87,7 @@ async def upload_dataset(
 
 
 @router.get(
-    "/",
+    "",
     description=load_description("evaluation/list_datasets.md"),
     response_model=APIResponse[list[DatasetUploadResponse]],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py
index b0f1a1cf3..fe09edcec 100644
--- a/backend/app/api/routes/evaluations/evaluation.py
+++ b/backend/app/api/routes/evaluations/evaluation.py
@@ -30,7 +30,7 @@
 
 
 @router.post(
-    "/",
+    "",
     description=load_description("evaluation/create_evaluation.md"),
     response_model=APIResponse[EvaluationRunPublic],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -66,7 +66,7 @@ def evaluate(
 
 
 @router.get(
-    "/",
+    "",
     description=load_description("evaluation/list_evaluations.md"),
    response_model=APIResponse[list[EvaluationRunPublic]],
    dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
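PATCH 14 stops passing knowledge_base_ids through as a bare list and instead wraps them in a file_search tool entry for every JSONL line. A sketch of the resulting shape; EvalConfig below is a stand-in for the stored evaluation config, and only the field names and the default of 20 results come from the hunk above.

import json
from dataclasses import dataclass

@dataclass
class EvalConfig:  # stand-in for the config object used by build_evaluation_jsonl
    knowledge_base_ids: list[str]
    max_num_results: int | None = None

def build_tools(config: EvalConfig) -> list[dict]:
    # Mirrors the "tools" block introduced in the hunk above.
    return [
        {
            "type": "file_search",
            "vector_store_ids": config.knowledge_base_ids,
            "max_num_results": config.max_num_results or 20,
        }
    ]

cfg = EvalConfig(knowledge_base_ids=["vs_abc123", "vs_def456"])
print(json.dumps(build_tools(cfg)))
# [{"type": "file_search", "vector_store_ids": ["vs_abc123", "vs_def456"], "max_num_results": 20}]

PATCH 15's switch from "/" to "" presumably lets the collection routes match the bare router prefix directly (e.g. POST /evaluations) instead of relying on FastAPI's trailing-slash redirect.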