From 3f8ddcff0cdd4ddf855c49ad1983809ac1fe26be Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Tue, 9 Dec 2025 09:32:12 +0530 Subject: [PATCH 01/15] Refactor evaluation endpoint to use stored configuration and remove assistant_id handling --- backend/app/api/routes/evaluation.py | 93 +++++++--------------------- 1 file changed, 23 insertions(+), 70 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 058950d65..d6f9b5c29 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -3,12 +3,12 @@ import logging import re from pathlib import Path +from uuid import UUID from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile from app.api.deps import AuthContextDep, SessionDep from app.core.cloud import get_cloud_storage -from app.crud.assistants import get_assistant_by_id from app.crud.evaluations import ( create_evaluation_dataset, create_evaluation_run, @@ -20,7 +20,7 @@ upload_dataset_to_langfuse, ) from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud -from app.crud.evaluations.core import save_score +from app.crud.evaluations.core import resolve_config_from_stored, save_score from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.models.evaluation import ( @@ -33,6 +33,10 @@ get_openai_client, load_description, ) +from app.services.llm.jobs import resolve_config_blob +from app.models.llm.request import LLMCallConfig +from app.crud.config.version import ConfigVersionCrud + logger = logging.getLogger(__name__) @@ -430,20 +434,9 @@ def evaluate( experiment_name: str = Body( ..., description="Name for this evaluation experiment/run" ), - config: dict = Body(default_factory=dict, description="Evaluation configuration"), - assistant_id: str - | None = Body( - None, description="Optional assistant ID to fetch configuration from" - ), + config_id: UUID = Body(..., description="Stored config ID"), + config_version: int = Body(..., ge=1, description="Stored config version"), ) -> APIResponse[EvaluationRunPublic]: - logger.info( - f"[evaluate] Starting evaluation | experiment_name={experiment_name} | " - f"dataset_id={dataset_id} | " - f"org_id={auth_context.organization.id} | " - f"assistant_id={assistant_id} | " - f"config_keys={list(config.keys())}" - ) - # Step 1: Fetch dataset from database dataset = get_dataset_by_id( session=_session, @@ -459,12 +452,6 @@ def evaluate( f"organization/project", ) - logger.info( - f"[evaluate] Found dataset | id={dataset.id} | name={dataset.name} | " - f"object_store_url={'present' if dataset.object_store_url else 'None'} | " - f"langfuse_id={dataset.langfuse_dataset_id}" - ) - dataset_name = dataset.name # Get API clients @@ -487,63 +474,29 @@ def evaluate( "Please ensure Langfuse credentials were configured when the dataset was created.", ) - # Handle assistant_id if provided - if assistant_id: - # Fetch assistant details from database - assistant = get_assistant_by_id( - session=_session, - assistant_id=assistant_id, - project_id=auth_context.project.id, - ) - - if not assistant: - raise HTTPException( - status_code=404, detail=f"Assistant {assistant_id} not found" - ) + config_version_crud = ConfigVersionCrud( + session=_session, config_id=config_id, project_id=auth_context.project.id + ) - logger.info( - f"[evaluate] Found assistant in DB | id={assistant.id} 
| " - f"model={assistant.model} | instructions=" - f"{assistant.instructions[:50] if assistant.instructions else 'None'}..." + config, error = resolve_config_blob( + config_crud=config_version_crud, + config=LLMCallConfig(id=config_id, version=config_version), + ) + if error: + raise HTTPException( + status_code=400, + detail=f"Failed to resolve config from stored config: {error}", ) - # Build config from assistant (use provided config values to override - # if present) - config = { - "model": config.get("model", assistant.model), - "instructions": config.get("instructions", assistant.instructions), - "temperature": config.get("temperature", assistant.temperature), - } - - # Add tools if vector stores are available - vector_store_ids = config.get( - "vector_store_ids", assistant.vector_store_ids or [] - ) - if vector_store_ids and len(vector_store_ids) > 0: - config["tools"] = [ - { - "type": "file_search", - "vector_store_ids": vector_store_ids, - } - ] - - logger.info("[evaluate] Using config from assistant") - else: - logger.info("[evaluate] Using provided config directly") - # Validate that config has minimum required fields - if not config.get("model"): - raise HTTPException( - status_code=400, - detail="Config must include 'model' when assistant_id is not provided", - ) + logger.info("[evaluate] Successfully resolved config from config management") - # Create EvaluationRun record + # Create EvaluationRun record with resolved config and references eval_run = create_evaluation_run( session=_session, run_name=experiment_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config, + config=config.completion.params, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) @@ -555,7 +508,7 @@ def evaluate( openai_client=openai_client, session=_session, eval_run=eval_run, - config=config, + config=config.completion.params, ) logger.info( From 528062220616910856d8bf0f4c90ba40ea6029aa Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:43:06 +0530 Subject: [PATCH 02/15] Refactor evaluation run to use config ID and version instead of config dict --- .../041_add_config_in_evals_run_table.py | 60 +++++++++++++++++++ backend/app/api/routes/evaluation.py | 13 +++- backend/app/crud/evaluations/core.py | 15 +++-- backend/app/models/evaluation.py | 24 ++++---- 4 files changed, 95 insertions(+), 17 deletions(-) create mode 100644 backend/app/alembic/versions/041_add_config_in_evals_run_table.py diff --git a/backend/app/alembic/versions/041_add_config_in_evals_run_table.py b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py new file mode 100644 index 000000000..449768b38 --- /dev/null +++ b/backend/app/alembic/versions/041_add_config_in_evals_run_table.py @@ -0,0 +1,60 @@ +"""add config in evals run table + +Revision ID: 041 +Revises: 040 +Create Date: 2025-12-15 14:03:22.082746 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "041" +down_revision = "040" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "evaluation_run", + sa.Column( + "config_id", + sa.Uuid(), + nullable=True, + comment="Reference to the stored config used", + ), + ) + op.add_column( + "evaluation_run", + sa.Column( + "config_version", + sa.Integer(), + nullable=True, + comment="Version of the config used", + ), + ) + op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) + op.drop_column("evaluation_run", "config") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "evaluation_run", + sa.Column( + "config", + postgresql.JSONB(astext_type=sa.Text()), + autoincrement=False, + nullable=False, + comment="Evaluation configuration (model, instructions, etc.)", + ), + ) + op.drop_constraint(None, "evaluation_run", type_="foreignkey") + op.drop_column("evaluation_run", "config_version") + op.drop_column("evaluation_run", "config_id") + # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index d6f9b5c29..f0208129d 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -20,7 +20,7 @@ upload_dataset_to_langfuse, ) from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud -from app.crud.evaluations.core import resolve_config_from_stored, save_score +from app.crud.evaluations.core import save_score from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.models.evaluation import ( @@ -34,6 +34,7 @@ load_description, ) from app.services.llm.jobs import resolve_config_blob +from app.services.llm.providers import LLMProvider from app.models.llm.request import LLMCallConfig from app.crud.config.version import ConfigVersionCrud @@ -487,16 +488,22 @@ def evaluate( status_code=400, detail=f"Failed to resolve config from stored config: {error}", ) + elif config.completion.provider != LLMProvider.OPENAI: + raise HTTPException( + status_code=400, + detail="Only 'openai' provider is supported for evaluation configs", + ) logger.info("[evaluate] Successfully resolved config from config management") - # Create EvaluationRun record with resolved config and references + # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, run_name=experiment_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config.completion.params, + config_id=config_id, + config_version=config_version, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index b2b118df1..466066a2f 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -1,5 +1,6 @@ import logging from typing import Any +from uuid import UUID from langfuse import Langfuse from sqlmodel import Session, select @@ -16,7 +17,8 @@ def create_evaluation_run( run_name: str, dataset_name: str, dataset_id: int, - config: dict, + config_id: UUID, + config_version: int, organization_id: int, project_id: int, ) -> EvaluationRun: @@ -28,7 +30,8 @@ def create_evaluation_run( run_name: Name of the evaluation run/experiment dataset_name: Name of the dataset being used dataset_id: ID of the dataset - config: Configuration dict for the evaluation + config_id: UUID of the stored config + config_version: Version number of the config 
organization_id: Organization ID project_id: Project ID @@ -39,7 +42,8 @@ def create_evaluation_run( run_name=run_name, dataset_name=dataset_name, dataset_id=dataset_id, - config=config, + config_id=config_id, + config_version=config_version, status="pending", organization_id=organization_id, project_id=project_id, @@ -56,7 +60,10 @@ def create_evaluation_run( logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True) raise - logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}") + logger.info( + f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}, " + f"config_id={config_id}, config_version={config_version}" + ) return eval_run diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index f99fbb27e..e59a69725 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Optional +from uuid import UUID from pydantic import BaseModel, Field from sqlalchemy import Column, Index, Text, UniqueConstraint @@ -193,15 +194,17 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Name of the Langfuse dataset used"}, ) - # Config field - dict requires sa_column - config: dict[str, Any] = SQLField( - default_factory=dict, - sa_column=Column( - JSONB, - nullable=False, - comment="Evaluation configuration (model, instructions, etc.)", - ), - description="Evaluation configuration", + config_id: UUID = SQLField( + foreign_key="config.id", + nullable=True, + description="Reference to the stored config used for this evaluation", + sa_column_kwargs={"comment": "Reference to the stored config used"}, + ) + config_version: int = SQLField( + nullable=True, + ge=1, + description="Version of the config used for this evaluation", + sa_column_kwargs={"comment": "Version of the config used"}, ) # Dataset reference @@ -339,7 +342,8 @@ class EvaluationRunPublic(SQLModel): id: int run_name: str dataset_name: str - config: dict[str, Any] + config_id: UUID + config_version: int dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None From 13eb778141fa706eb6762b644d665a4307880a21 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:12:19 +0530 Subject: [PATCH 03/15] Add config_id, config_version, and model fields to evaluation run table --- ...add_config_id_and_version_in_evals_run_.py | 50 +++++++++++++++++++ backend/app/api/routes/evaluation.py | 4 ++ backend/app/crud/evaluations/core.py | 3 ++ backend/app/crud/evaluations/embeddings.py | 14 +----- backend/app/crud/evaluations/processing.py | 6 +-- backend/app/models/evaluation.py | 7 +++ 6 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py diff --git a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py new file mode 100644 index 000000000..a5744788f --- /dev/null +++ b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py @@ -0,0 +1,50 @@ +"""Add config_id and version in evals run table + +Revision ID: 7b48f23ebfdd +Revises: eed36ae3c79a +Create Date: 2025-12-09 14:19:57.620312 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision 
identifiers, used by Alembic. +revision = "7b48f23ebfdd" +down_revision = "eed36ae3c79a" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("evaluation_run", sa.Column("config_id", sa.Uuid(), nullable=True)) + op.add_column( + "evaluation_run", sa.Column("config_version", sa.Integer(), nullable=True) + ) + op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) + op.add_column( + "evaluation_run", + sa.Column("model", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.drop_column("evaluation_run", "config") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "evaluation_run", + sa.Column( + "config", + postgresql.JSONB(astext_type=sa.Text()), + autoincrement=False, + nullable=False, + ), + ) + op.drop_constraint(None, "evaluation_run", type_="foreignkey") + op.drop_column("evaluation_run", "config_version") + op.drop_column("evaluation_run", "model") + op.drop_column("evaluation_run", "config_id") + # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f0208129d..e723cdcec 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -496,6 +496,9 @@ def evaluate( logger.info("[evaluate] Successfully resolved config from config management") + # Extract model from config for storage + model = config.completion.params.get("model") + # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, @@ -504,6 +507,7 @@ def evaluate( dataset_id=dataset_id, config_id=config_id, config_version=config_version, + model=model, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 466066a2f..04cd34287 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -21,6 +21,7 @@ def create_evaluation_run( config_version: int, organization_id: int, project_id: int, + model: str | None = None, ) -> EvaluationRun: """ Create a new evaluation run record in the database. @@ -34,6 +35,7 @@ def create_evaluation_run( config_version: Version number of the config organization_id: Organization ID project_id: Project ID + model: LLM model name (snapshot at creation time) Returns: The created EvaluationRun instance @@ -44,6 +46,7 @@ def create_evaluation_run( dataset_id=dataset_id, config_id=config_id, config_version=config_version, + model=model, status="pending", organization_id=organization_id, project_id=project_id, diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py index 70e374211..22bd48522 100644 --- a/backend/app/crud/evaluations/embeddings.py +++ b/backend/app/crud/evaluations/embeddings.py @@ -364,19 +364,7 @@ def start_embedding_batch( logger.info(f"Starting embedding batch for evaluation run {eval_run.id}") # Get embedding model from config (default: text-embedding-3-large) - embedding_model = eval_run.config.get( - "embedding_model", "text-embedding-3-large" - ) - - # Validate and fallback to default if invalid - try: - validate_embedding_model(embedding_model) - except ValueError as e: - logger.warning( - f"Invalid embedding model '{embedding_model}' in config: {e}. 
" - f"Falling back to text-embedding-3-large" - ) - embedding_model = "text-embedding-3-large" + embedding_model = "text-embedding-3-large" # Step 1: Build embedding JSONL with trace_ids jsonl_data = build_embedding_jsonl( diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 12b89266e..03b20b203 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -253,16 +253,14 @@ async def process_completed_evaluation( if not results: raise ValueError("No valid results found in batch output") - # Extract model from config for cost tracking - model = eval_run.config.get("model") if eval_run.config else None - # Step 5: Create Langfuse dataset run with traces + # Use model stored at creation time for cost tracking trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, run_name=eval_run.run_name, results=results, - model=model, + model=eval_run.model, ) # Store object store URL in database diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index e59a69725..91ff9a011 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -207,6 +207,12 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Version of the config used"}, ) + # Model field (snapshot at creation time) + model: str | None = SQLField( + default=None, + description="LLM model name used for this evaluation (e.g., gpt-4o-mini)", + ) + # Dataset reference dataset_id: int = SQLField( foreign_key="evaluation_dataset.id", @@ -344,6 +350,7 @@ class EvaluationRunPublic(SQLModel): dataset_name: str config_id: UUID config_version: int + model: str | None dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None From 7bdd32291c04d48a0294a364a76d355ae14e193f Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Wed, 10 Dec 2025 10:04:34 +0530 Subject: [PATCH 04/15] Refactor batch evaluation tests to use config_id and config_version instead of config dict --- .../app/tests/api/routes/test_evaluation.py | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index c4eb3f0b6..b6933fb33 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1,11 +1,13 @@ import io from unittest.mock import Mock, patch +from uuid import uuid4 import pytest from sqlmodel import select from app.crud.evaluations.batch import build_evaluation_jsonl from app.models import EvaluationDataset, EvaluationRun +from app.tests.utils.test_data import create_test_config # Helper function to create CSV file-like object @@ -494,16 +496,20 @@ def sample_evaluation_config(self): } def test_start_batch_evaluation_invalid_dataset_id( - self, client, user_api_key_header, sample_evaluation_config + self, client, user_api_key_header, db, user_api_key ): """Test batch evaluation fails with invalid dataset_id.""" + # Create a valid config to use + config = create_test_config(db, project_id=user_api_key.project.id) + # Try to start evaluation with non-existent dataset_id response = client.post( "/api/v1/evaluations", json={ "experiment_name": "test_evaluation_run", "dataset_id": 99999, # Non-existent - "config": sample_evaluation_config, + "config_id": str(config.id), + "config_version": 1, }, 
headers=user_api_key_header, ) @@ -516,32 +522,27 @@ def test_start_batch_evaluation_invalid_dataset_id( assert "not found" in error_str.lower() or "not accessible" in error_str.lower() def test_start_batch_evaluation_missing_model(self, client, user_api_key_header): - """Test batch evaluation fails when model is missing from config.""" - # We don't need a real dataset for this test - the validation should happen - # before dataset lookup. Use any dataset_id and expect config validation error - invalid_config = { - "instructions": "You are a helpful assistant", - "temperature": 0.5, - } - + """Test batch evaluation fails with invalid config_id.""" + # Test with a non-existent config_id (random UUID) response = client.post( "/api/v1/evaluations", json={ - "experiment_name": "test_no_model", - "dataset_id": 1, # Dummy ID, error should come before this is checked - "config": invalid_config, + "experiment_name": "test_no_config", + "dataset_id": 1, # Dummy ID, config validation happens first + "config_id": str(uuid4()), # Non-existent config + "config_version": 1, }, headers=user_api_key_header, ) - # Should fail with either 400 (model missing) or 404 (dataset not found) + # Should fail with either 400 (config not found) or 404 (dataset/config not found) assert response.status_code in [400, 404] response_data = response.json() error_str = response_data.get( "detail", response_data.get("message", str(response_data)) ) - # Should fail with either "model" missing or "dataset not found" (both acceptable) - assert "model" in error_str.lower() or "not found" in error_str.lower() + # Should mention config or not found + assert "config" in error_str.lower() or "not found" in error_str.lower() def test_start_batch_evaluation_without_authentication( self, client, sample_evaluation_config @@ -728,11 +729,16 @@ def test_get_evaluation_run_trace_info_not_completed( self, client, user_api_key_header, db, user_api_key, create_test_dataset ): """Test requesting trace info for incomplete evaluation returns error.""" + # Create a config for the evaluation run + config = create_test_config(db, project_id=user_api_key.project.id) + eval_run = EvaluationRun( run_name="test_pending_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, + model="gpt-4o", status="pending", total_items=3, organization_id=user_api_key.organization_id, @@ -759,11 +765,16 @@ def test_get_evaluation_run_trace_info_completed( self, client, user_api_key_header, db, user_api_key, create_test_dataset ): """Test requesting trace info for completed evaluation returns cached scores.""" + # Create a config for the evaluation run + config = create_test_config(db, project_id=user_api_key.project.id) + eval_run = EvaluationRun( run_name="test_completed_run", dataset_name=create_test_dataset.name, dataset_id=create_test_dataset.id, - config={"model": "gpt-4o"}, + config_id=config.id, + config_version=1, + model="gpt-4o", status="completed", total_items=3, score={ From 8f9561c101af8b62d5a4990a4bdb24e79871e0b9 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Wed, 10 Dec 2025 10:22:21 +0530 Subject: [PATCH 05/15] Update EvaluationRunPublic model to allow nullable config_id and config_version fields --- backend/app/models/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 91ff9a011..6e041900f 100644 --- 
a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -348,8 +348,8 @@ class EvaluationRunPublic(SQLModel): id: int run_name: str dataset_name: str - config_id: UUID - config_version: int + config_id: UUID | None + config_version: int | None model: str | None dataset_id: int batch_job_id: int | None From f612da47e6f03a69c828c20c6194d3d022921129 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:42:54 +0530 Subject: [PATCH 06/15] Refactor evaluation run model handling: remove model field, add resolve_model_from_config function, and update processing logic to use config references --- ...add_config_id_and_version_in_evals_run_.py | 5 -- backend/app/api/routes/evaluation.py | 22 ++++---- backend/app/crud/evaluations/__init__.py | 2 + backend/app/crud/evaluations/core.py | 50 +++++++++++++++++-- backend/app/crud/evaluations/processing.py | 6 ++- backend/app/models/evaluation.py | 7 --- 6 files changed, 66 insertions(+), 26 deletions(-) diff --git a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py index a5744788f..1f33543ea 100644 --- a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py +++ b/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py @@ -24,10 +24,6 @@ def upgrade(): "evaluation_run", sa.Column("config_version", sa.Integer(), nullable=True) ) op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) - op.add_column( - "evaluation_run", - sa.Column("model", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - ) op.drop_column("evaluation_run", "config") # ### end Alembic commands ### @@ -45,6 +41,5 @@ def downgrade(): ) op.drop_constraint(None, "evaluation_run", type_="foreignkey") op.drop_column("evaluation_run", "config_version") - op.drop_column("evaluation_run", "model") op.drop_column("evaluation_run", "config_id") # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index e723cdcec..ff142e402 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -5,10 +5,19 @@ from pathlib import Path from uuid import UUID -from fastapi import APIRouter, Body, File, Form, HTTPException, Query, UploadFile +from fastapi import ( + APIRouter, + Body, + File, + Form, + HTTPException, + Query, + UploadFile, +) from app.api.deps import AuthContextDep, SessionDep from app.core.cloud import get_cloud_storage +from app.crud.config.version import ConfigVersionCrud from app.crud.evaluations import ( create_evaluation_dataset, create_evaluation_run, @@ -27,16 +36,15 @@ DatasetUploadResponse, EvaluationRunPublic, ) +from app.models.llm.request import LLMCallConfig +from app.services.llm.jobs import resolve_config_blob +from app.services.llm.providers import LLMProvider from app.utils import ( APIResponse, get_langfuse_client, get_openai_client, load_description, ) -from app.services.llm.jobs import resolve_config_blob -from app.services.llm.providers import LLMProvider -from app.models.llm.request import LLMCallConfig -from app.crud.config.version import ConfigVersionCrud logger = logging.getLogger(__name__) @@ -496,9 +504,6 @@ def evaluate( logger.info("[evaluate] Successfully resolved config from config management") - # Extract model from config for storage - model = 
config.completion.params.get("model") - # Create EvaluationRun record with config references eval_run = create_evaluation_run( session=_session, @@ -507,7 +512,6 @@ def evaluate( dataset_id=dataset_id, config_id=config_id, config_version=config_version, - model=model, organization_id=auth_context.organization.id, project_id=auth_context.project.id, ) diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index 5ca0aacd6..8344c3e91 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -5,6 +5,7 @@ create_evaluation_run, get_evaluation_run_by_id, list_evaluation_runs, + resolve_model_from_config, ) from app.crud.evaluations.cron import ( process_all_pending_evaluations, @@ -39,6 +40,7 @@ "create_evaluation_run", "get_evaluation_run_by_id", "list_evaluation_runs", + "resolve_model_from_config", # Cron "process_all_pending_evaluations", "process_all_pending_evaluations_sync", diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 04cd34287..b64aa8208 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -6,8 +6,11 @@ from sqlmodel import Session, select from app.core.util import now +from app.crud.config.version import ConfigVersionCrud from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.models import EvaluationRun +from app.models.llm.request import LLMCallConfig +from app.services.llm.jobs import resolve_config_blob logger = logging.getLogger(__name__) @@ -21,7 +24,6 @@ def create_evaluation_run( config_version: int, organization_id: int, project_id: int, - model: str | None = None, ) -> EvaluationRun: """ Create a new evaluation run record in the database. @@ -35,7 +37,6 @@ def create_evaluation_run( config_version: Version number of the config organization_id: Organization ID project_id: Project ID - model: LLM model name (snapshot at creation time) Returns: The created EvaluationRun instance @@ -46,7 +47,6 @@ def create_evaluation_run( dataset_id=dataset_id, config_id=config_id, config_version=config_version, - model=model, status="pending", organization_id=organization_id, project_id=project_id, @@ -303,3 +303,47 @@ def save_score( f"traces={len(score.get('traces', []))}" ) return eval_run + + +def resolve_model_from_config( + session: Session, + eval_run: EvaluationRun, +) -> str: + """ + Resolve the model name from the evaluation run's config. 
+ + Args: + session: Database session + eval_run: EvaluationRun instance + + Returns: + Model name from config + + Raises: + ValueError: If config is missing, invalid, or has no model + """ + if not eval_run.config_id or not eval_run.config_version: + raise ValueError( + f"Evaluation run {eval_run.id} has no config reference " + f"(config_id={eval_run.config_id}, config_version={eval_run.config_version})" + ) + + config_version_crud = ConfigVersionCrud( + session=session, + config_id=eval_run.config_id, + project_id=eval_run.project_id, + ) + + config, error = resolve_config_blob( + config_crud=config_version_crud, + config=LLMCallConfig(id=eval_run.config_id, version=eval_run.config_version), + ) + + if error or config is None: + raise ValueError( + f"Config resolution failed for evaluation {eval_run.id} " + f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}" + ) + + model = config.completion.params.get("model") + return model diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 03b20b203..653a2bafb 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -26,7 +26,7 @@ upload_batch_results_to_object_store, ) from app.crud.evaluations.batch import fetch_dataset_items -from app.crud.evaluations.core import update_evaluation_run +from app.crud.evaluations.core import update_evaluation_run, resolve_model_from_config from app.crud.evaluations.embeddings import ( calculate_average_similarity, parse_embedding_results, @@ -255,12 +255,14 @@ async def process_completed_evaluation( # Step 5: Create Langfuse dataset run with traces # Use model stored at creation time for cost tracking + model = resolve_model_from_config(session=session, eval_run=eval_run) + trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, + model=model, run_name=eval_run.run_name, results=results, - model=eval_run.model, ) # Store object store URL in database diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 6e041900f..6ae4542fb 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -207,12 +207,6 @@ class EvaluationRun(SQLModel, table=True): sa_column_kwargs={"comment": "Version of the config used"}, ) - # Model field (snapshot at creation time) - model: str | None = SQLField( - default=None, - description="LLM model name used for this evaluation (e.g., gpt-4o-mini)", - ) - # Dataset reference dataset_id: int = SQLField( foreign_key="evaluation_dataset.id", @@ -350,7 +344,6 @@ class EvaluationRunPublic(SQLModel): dataset_name: str config_id: UUID | None config_version: int | None - model: str | None dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None From 4f89f43995a9e58ecb546ae09aaee4b4fcfa5ad9 Mon Sep 17 00:00:00 2001 From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:45:22 +0530 Subject: [PATCH 07/15] fix migration number --- ..._run_.py => 040_add_config_in_evals_run_table.py} | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) rename backend/app/alembic/versions/{7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py => 040_add_config_in_evals_run_table.py} (85%) diff --git a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py b/backend/app/alembic/versions/040_add_config_in_evals_run_table.py similarity index 85% rename from 
backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py
rename to backend/app/alembic/versions/040_add_config_in_evals_run_table.py
index 1f33543ea..f8606fbaa 100644
--- a/backend/app/alembic/versions/7b48f23ebfdd_add_config_id_and_version_in_evals_run_.py
+++ b/backend/app/alembic/versions/040_add_config_in_evals_run_table.py
@@ -1,8 +1,8 @@
-"""Add config_id and version in evals run table
+"""add config in evals run table
 
-Revision ID: 7b48f23ebfdd
-Revises: eed36ae3c79a
-Create Date: 2025-12-09 14:19:57.620312
+Revision ID: 040
+Revises: 039
+Create Date: 2025-12-15 12:44:35.250572
 
 """
 from alembic import op
@@ -11,8 +11,8 @@
 from sqlalchemy.dialects import postgresql
 
 # revision identifiers, used by Alembic.
-revision = "7b48f23ebfdd"
-down_revision = "eed36ae3c79a"
+revision = "040"
+down_revision = "039"
 branch_labels = None
 depends_on = None
 

From 82bee43cd2660fc4a1585913dd6326f2e0f2d5ae Mon Sep 17 00:00:00 2001
From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com>
Date: Mon, 15 Dec 2025 12:49:27 +0530
Subject: [PATCH 08/15] fix test

---
 backend/app/tests/api/routes/test_evaluation.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
index b6933fb33..ca3d3dcb0 100644
--- a/backend/app/tests/api/routes/test_evaluation.py
+++ b/backend/app/tests/api/routes/test_evaluation.py
@@ -738,7 +738,6 @@ def test_get_evaluation_run_trace_info_not_completed(
             dataset_id=create_test_dataset.id,
             config_id=config.id,
             config_version=1,
-            model="gpt-4o",
             status="pending",
             total_items=3,
             organization_id=user_api_key.organization_id,
@@ -774,7 +773,6 @@ def test_get_evaluation_run_trace_info_completed(
             dataset_id=create_test_dataset.id,
             config_id=config.id,
             config_version=1,
-            model="gpt-4o",
             status="completed",
             total_items=3,
             score={

From a2c8a95bf7d35b860d1fe7833ac21d63db41e5e2 Mon Sep 17 00:00:00 2001
From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com>
Date: Mon, 15 Dec 2025 12:52:33 +0530
Subject: [PATCH 09/15] fix status code

---
 backend/app/api/routes/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py
index ff142e402..5d5741376 100644
--- a/backend/app/api/routes/evaluation.py
+++ b/backend/app/api/routes/evaluation.py
@@ -498,7 +498,7 @@ def evaluate(
         )
     elif config.completion.provider != LLMProvider.OPENAI:
         raise HTTPException(
-            status_code=400,
+            status_code=422,
             detail="Only 'openai' provider is supported for evaluation configs",
         )
 

From b9fd664daf6ea8a8f7b6235bce062749347f3b60 Mon Sep 17 00:00:00 2001
From: Aviraj Gour <100823015+avirajsingh7@users.noreply.github.com>
Date: Mon, 15 Dec 2025 14:04:55 +0530
Subject: [PATCH 10/15] remove old migration

---
 .../040_add_config_in_evals_run_table.py | 45 -------------------
 1 file changed, 45 deletions(-)
 delete mode 100644 backend/app/alembic/versions/040_add_config_in_evals_run_table.py

diff --git a/backend/app/alembic/versions/040_add_config_in_evals_run_table.py b/backend/app/alembic/versions/040_add_config_in_evals_run_table.py
deleted file mode 100644
index f8606fbaa..000000000
--- a/backend/app/alembic/versions/040_add_config_in_evals_run_table.py
+++ /dev/null
@@ -1,45 +0,0 @@
-"""add config in evals run table
-
-Revision ID: 040
-Revises: 039
-Create Date: 2025-12-15 12:44:35.250572
-
-"""
-from alembic import op
-import sqlalchemy as sa
-import sqlmodel.sql.sqltypes
-from 
sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision = "040" -down_revision = "039" -branch_labels = None -depends_on = None - - -def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### - op.add_column("evaluation_run", sa.Column("config_id", sa.Uuid(), nullable=True)) - op.add_column( - "evaluation_run", sa.Column("config_version", sa.Integer(), nullable=True) - ) - op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"]) - op.drop_column("evaluation_run", "config") - # ### end Alembic commands ### - - -def downgrade(): - # ### commands auto generated by Alembic - please adjust! ### - op.add_column( - "evaluation_run", - sa.Column( - "config", - postgresql.JSONB(astext_type=sa.Text()), - autoincrement=False, - nullable=False, - ), - ) - op.drop_constraint(None, "evaluation_run", type_="foreignkey") - op.drop_column("evaluation_run", "config_version") - op.drop_column("evaluation_run", "config_id") - # ### end Alembic commands ### From 6b00e0ff1e56e472096c452e7ac106a44601ffa2 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 12 Jan 2026 12:57:30 +0530 Subject: [PATCH 11/15] added depends as import --- backend/app/api/routes/evaluation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index ad82f2069..276c566f2 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -8,6 +8,7 @@ from fastapi import ( APIRouter, Body, + Depends, File, Form, HTTPException, From ceb397039a01f2d834331a3dd8843d72b80fe5ee Mon Sep 17 00:00:00 2001 From: Prajna1999 Date: Tue, 13 Jan 2026 19:54:10 +0530 Subject: [PATCH 12/15] fix: spread config object while building batch eval jsonl --- backend/app/crud/evaluations/batch.py | 15 +- backend/app/crud/evaluations/core.py | 2 +- backend/app/services/audio/speech_to_text.py | 630 ++++++++++++++++++ .../app/services/llm/providers/registry.py | 2 + 4 files changed, 643 insertions(+), 6 deletions(-) create mode 100644 backend/app/services/audio/speech_to_text.py diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 7e8b69043..6f537c063 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -17,6 +17,7 @@ from app.core.batch.openai import OpenAIBatchProvider from app.crud.batch_operations import start_batch_job from app.models import EvaluationRun +from app.models.llm.request import KaapiLLMParams logger = logging.getLogger(__name__) @@ -60,7 +61,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, def build_evaluation_jsonl( - dataset_items: list[dict[str, Any]], config: dict[str, Any] + dataset_items: list[dict[str, Any]], config: KaapiLLMParams ) -> list[dict[str, Any]]: """ Build JSONL data for evaluation batch using OpenAI Responses API. 
@@ -89,7 +90,6 @@ def build_evaluation_jsonl( List of dictionaries (JSONL data) """ jsonl_data = [] - for item in dataset_items: # Extract question from input question = item["input"].get("question", "") @@ -106,7 +106,12 @@ def build_evaluation_jsonl( "method": "POST", "url": "/v1/responses", "body": { - **config, # Use config as-is + # Use config as-is + "model": config.model, + "instructions": config.instructions, + "temperature": config.temperature, + "reasoning": {"effort": config.reasoning} if config.reasoning else None, + "tools": config.knowledge_base_ids if config.knowledge_base_ids else [], "input": question, # Add input from dataset }, } @@ -120,7 +125,7 @@ def start_evaluation_batch( openai_client: OpenAI, session: Session, eval_run: EvaluationRun, - config: dict[str, Any], + config: KaapiLLMParams, ) -> EvaluationRun: """ Fetch data, build JSONL, and start evaluation batch. @@ -167,7 +172,7 @@ def start_evaluation_batch( "description": f"Evaluation: {eval_run.run_name}", "completion_window": "24h", # Store complete config for reference - "evaluation_config": config, + "evaluation_config": config.model_dump(exclude_none=True), } # Step 5: Start batch job using generic infrastructure diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index b64aa8208..d83b7447e 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -345,5 +345,5 @@ def resolve_model_from_config( f"(config_id={eval_run.config_id}, version={eval_run.config_version}): {error}" ) - model = config.completion.params.get("model") + model = config.completion.params.model return model diff --git a/backend/app/services/audio/speech_to_text.py b/backend/app/services/audio/speech_to_text.py new file mode 100644 index 000000000..1c7006408 --- /dev/null +++ b/backend/app/services/audio/speech_to_text.py @@ -0,0 +1,630 @@ +"""Speech-to-Text service supporting OpenAI and Gemini providers.""" +import os +import logging +import csv +import base64 +import requests +from io import StringIO +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import BinaryIO, Literal +from pydantic import BaseModel +from dotenv import load_dotenv +from uuid import uuid4 +from typing import List, Literal, Dict +from google import genai +from google.cloud.speech_v2 import SpeechClient +from google.cloud.speech_v2.types import cloud_speech, RecognizeResponse +from google.api_core.client_options import ClientOptions +from openai import OpenAI + +from app.services.audio.utils.ogg_to_wav_converter_downloader import ( + download_audio_from_url, +) +from app.models.evaluation import ( + ProviderConfig, + TranscriptionRequest, + FileData, + WERResult, + WERComparisonResult, + WERBatchItem, + WERBatchResult, + WERSummaryStats, + WERBatchSummary, + WERModelStats, + WEROverallSummary, +) +from app.services.audio.utils.calculate_wer import tokenize, calculate_wer + +load_dotenv() +logger = logging.getLogger(__name__) + + +# Default prompt for transcription +DEFAULT_PROMPT = ( + "Generate a verbatim speech-to-text transcript of the given audio file " + "in the same language as of the audio in the same script too. 
" + "make sure the transcription as close possible to the audio provided" +) +PROJECT_ID = os.getenv("GOOGLE_PROJECT_ID") +STT_LOCATION = os.getenv("GOOGLE_PROJECT_LOCATION") + + +class SpeechToTextService: + def __init__(self, provider, api_key: str | None = None): + if api_key is None: + raise ValueError( + "Missing OpenAI API Key for Client STT Client initialization" + ) + self.provide = provider + self.openai_client = None + self.gemini_client = None + if provider == "openai": + self.openai_client = OpenAI(api_key=api_key) + elif provider == "gemini": + self.gemini_client = genai.Client(api_key=api_key) + else: + raise ValueError("This provider is not supported yet.") + + def transcribe_with_openai( + self, + audio_file: BinaryIO | str, + model: str = "gpt-4o-transcribe", + prompt: str | None = None, + ): + if self.openai_client is None: + raise ValueError("OpenAI client is not initialized.") + try: + # Handle file path vs file object + if isinstance(audio_file, str): + audio_file = open(audio_file, "rb") + + transcription = self.openai_client.audio.transcriptions.create( + model=model, + file=audio_file, + # language="hi", + response_format="text", + prompt=prompt or DEFAULT_PROMPT, + ) + logger.info(f"Successfully transcribed audio using OpenAI model: {model}") + return transcription + except Exception as e: + logger.error(f"OpenAI transcription failed: {str(e)}", exc_info=True) + raise + + def transcribe_with_gemini( + self, + audio_file_path: str, + model: str = "gemini-2.5-flash", + prompt: str | None = None, + ): + if self.gemini_client is None: + raise ValueError("Gemini client is not initialized") + try: + # Upload file to Geminic + gemini_file = self.gemini_client.files.upload(file=audio_file_path) + logger.info(f"Uploaded file to Gemini: {gemini_file.name}") + + # Generate transcription + response = self.gemini_client.models.generate_content( + model=model, + contents=[prompt or DEFAULT_PROMPT, gemini_file], + ) + + logger.info(f"Successfully transcribed audio using Gemini model: {model}") + return response.text or None + + except Exception as e: + logger.error(f"Gemini transcription failed: {str(e)}", exc_info=True) + raise + + def transcribe( + self, + audio_file: BinaryIO | str, + provider: Literal["openai", "gemini"] = "openai", + model: str | None = None, + prompt: str | None = None, + ): + transcription = None + if provider == "openai": + transcription = self.transcribe_with_openai( + audio_file=audio_file, + model=model or "gpt-4o-transcribe", + prompt=prompt, + ) + return transcription + elif provider == "gemini": + # Gemini requires file path, not file object + file_path = audio_file if isinstance(audio_file, str) else audio_file.name + transcription = self.transcribe_with_gemini( + audio_file_path=file_path, + model=model or "gemini-2.5-flash", + prompt=prompt, + ) + return transcription + else: + raise ValueError( + f"Unsupported provider: {provider}. Use 'openai' or 'gemini'." 
+ ) + + +def transcribe_audio_with_chirp_v3(audio_file_path: str): + with open(audio_file_path, "rb") as file: + audio_content = file.read() + + client = SpeechClient( + client_options=ClientOptions( + api_endpoint=f"{STT_LOCATION}-speech.googleapis.com" + ) + ) + + config = cloud_speech.RecognitionConfig( + auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(), + language_codes=["auto"], + model="chirp_3", + ) + request = cloud_speech.RecognizeRequest( + recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_", + config=config, + content=audio_content, + ) + response: RecognizeResponse = client.recognize(request=request) + + transcript = None + for result in response.results: + transcript = result.alternatives[0].transcript + return transcript + + +def transcribe_audio_with_indic_conformer(audio_file_path: str): + indic_conformer_api_url = str(os.getenv("AI4B_STT_URL")) + with open(audio_file_path, "rb") as file: + audio_content = file.read() + + response = requests.post( + url=indic_conformer_api_url, + data={"language_code": "hi", "decoding_strategy": "ctc"}, + files={"audio_file": audio_content}, + ) + logger.info(response.json()) + transcription = response.json()["transcription"] + return transcription + + +# util functions for direct usage +def transcribe_audio( + audio_file: str, + provider: Literal["openai", "gemini", "google-stt", "ai4b"] = "openai", + model: str | None = None, + api_key: str | None = None, + prompt: str | None = None, +): + if provider == "google-stt": + return transcribe_audio_with_chirp_v3(audio_file_path=audio_file) + if provider == "ai4b": + return transcribe_audio_with_indic_conformer(audio_file_path=audio_file) + stt_service = SpeechToTextService(provider=provider, api_key=api_key) + return stt_service.transcribe( + audio_file=audio_file, + provider=provider, + model=model, + prompt=prompt, + ) + + +# STT csv parser + + +def process_single_csv_row(row_data): + idx, audio_url, ground_truth = row_data + + try: + audio_bytes, content_type = download_audio_from_url(audio_url) + audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") + + return { + "status": "success", + "row": idx, + "audio_url": audio_url, + "ground_truth": ground_truth, + "audio_base64": audio_base64, + "media_type": content_type, + "file_size": len(audio_bytes), + } + except requests.Timeout: + return { + "status": "error", + "row": idx, + "audio_url": audio_url, + "error": "Download timeout", + } + except requests.RequestException as e: + return { + "status": "error", + "row": idx, + "audio_url": audio_url, + "error": f"Download failed: {str(e)}", + } + except Exception as e: + return { + "status": "error", + "row": idx, + "audio_url": audio_url, + "error": f"Unexpected error: {str(e)}", + } + + +async def process_batch_csv(csv_file): + csv_body = await csv_file.read() + csv_content = csv_body.decode("utf-8") + + csv_reader = csv.DictReader(StringIO(csv_content)) + required_headers = {"audio_url", "ground_truth"} + + if not required_headers.issubset(csv_reader.fieldnames): + raise ValueError(f"CSV must have headers: {required_headers}") + + rows_to_process = [] + for idx, row in enumerate(csv_reader, start=1): + audio_url = row["audio_url"].strip() + ground_truth = row["ground_truth"].strip() + + if not audio_url or not ground_truth: + continue # do not throw error. 
continue silently + rows_to_process.append((idx, audio_url, ground_truth)) + + results = [] + errors = [] + + with ThreadPoolExecutor(max_workers=4) as executor: + future_to_row = { + executor.submit(process_single_csv_row, row_data): row_data + for row_data in rows_to_process + } + for future in as_completed(future_to_row): + result = future.result() + if result["status"] == "success": + results.append(result) + else: + errors.append(result) + return { + "success": results, + "errors": errors, + "total_rows": len(rows_to_process), + "processed": len(results), + "failed": len(errors), + } + + +def _get_api_key(provider: str) -> str | None: + if provider == "openai": + return os.getenv("OPENAI_API_KEY") + elif provider == "gemini": + return os.getenv("GEMINI_API_KEY") + + return None + + +def _transcribe_single_file_provider( + file_data: FileData, provider_config: ProviderConfig +): + # for execution by threadpool + provider = provider_config.provider + + model = provider_config.model + + audio_bytes = base64.b64decode(file_data.audio_base64) or None + media_extention = ( + file_data.media_type.split("/")[-1] if file_data.media_type else ".ogg" + ) + temp_file = f"/tmp/{uuid4()}.{media_extention}" + + try: + with open(temp_file, "wb") as f: + f.write(audio_bytes) # type: ignore + + # extract the api key + api_key = _get_api_key(provider) # type: ignore + + transcript = transcribe_audio( + audio_file=temp_file, + provider=provider, # type: ignore + model=model, + api_key=api_key, + ) + + return { + "status": "success", + "file_id": file_data.file_id or None, + "ground_truth": file_data.ground_truth, + "provider": provider, + "model": model, + "transcript": transcript, + } + except Exception as e: + return { + "status": "error", + "file_id": file_data.file_id, + "provider": provider, + "model": model, + "error": str(e), + } + finally: + # clean the temp file + if os.path.exists(temp_file): + os.remove(temp_file) + + +# threadpool based integration of batch transcriptipn +def process_batch_transcription(files, providers, max_workers=4): + # create all tasks and push into a list + tasks = [ + (file_data, provider_config) + for file_data in files + for provider_config in providers + ] + + results = [] + errors = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + _transcribe_single_file_provider, file_data, provider_config + ): (file_data, provider_config) + for file_data, provider_config in tasks + } + for future in as_completed(futures): + result = future.result() + if result["status"] == "success": + results.append(result) + else: + errors.append(result) + + return { + "success": results, + "errors": errors, + "total_tasks": len(tasks), + "processed": len(results), + "failed": len(errors), + } + + +# STT evaluation route handler +async def evaluate_stt(): + pass + + +def calculate_wer_individual( + ground_truth: str, + hypothesis: str, + mode: Literal["strict", "lenient", "both"] = "both", +) -> WERComparisonResult | WERResult: + """ + Calculate WER for a single transcription against ground truth. 
+ + Args: + ground_truth: The reference/expected transcription + hypothesis: The transcribed text to evaluate + mode: "strict" for exact matching, "lenient" for phonetic/spelling tolerance, + "both" to return both calculations + + Returns: + WERComparisonResult if mode="both", otherwise WERResult for the specified mode + """ + ref_tokens = tokenize(ground_truth) + hyp_tokens = tokenize(hypothesis) + + ref_count = len(ref_tokens) + hyp_count = len(hyp_tokens) # hypothesis tokes + + if mode == "strict": + wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=False) + return WERResult( + wer=wer, + substitutions=subs, + deletions=dels, + insertions=ins, + semantic_errors=sem, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ) + + if mode == "lenient": + wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=True) + return WERResult( + wer=wer, + substitutions=subs, + deletions=dels, + insertions=ins, + semantic_errors=sem, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ) + + # mode == "both" + wer_strict, s_strict, d_strict, i_strict, sem_strict = calculate_wer( + ref_tokens, hyp_tokens, lenient=False + ) + wer_lenient, s_lenient, d_lenient, i_lenient, sem_lenient = calculate_wer( + ref_tokens, hyp_tokens, lenient=True + ) + + return WERComparisonResult( + ground_truth=ground_truth, + hypothesis=hypothesis, + strict=WERResult( + wer=wer_strict, + substitutions=s_strict, + deletions=d_strict, + insertions=i_strict, + semantic_errors=sem_strict, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ), + lenient=WERResult( + wer=wer_lenient, + substitutions=s_lenient, + deletions=d_lenient, + insertions=i_lenient, + semantic_errors=sem_lenient, + reference_word_count=ref_count, + hypothesis_word_count=hyp_count, + ), + ) + + +def _process_wer_item(item: WERBatchItem) -> WERBatchResult: + result = calculate_wer_individual( + ground_truth=item.ground_truth, hypothesis=item.hypothesis, mode="both" + ) + # result is WERComparisonResult with strict and lenient WERResult + return WERBatchResult( + id=item.id, + ground_truth=item.ground_truth, + hypothesis=item.hypothesis, + model=item.model, + strict=result.strict, + lenient=result.lenient, + ) + + +def calculate_wer_batch( + items: List[WERBatchItem], max_workers: int = 4 +) -> List[WERBatchResult]: + """ + Calculate WER for multiple transcriptions in batch using ThreadPoolExecutor. + + Args: + items: List of WERBatchItem containing id, ground_truth, and hypothesis + max_workers: Maximum number of concurrent workers for ThreadPoolExecutor + + Returns: + List of WERBatchResult with both strict and lenient WER for each item + """ + results: List[WERBatchResult] = [] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(_process_wer_item, item): item for item in items} + for future in as_completed(futures): + try: + result = future.result() + results.append(result) + except Exception as e: + item = futures[future] + logger.error(f"WER calculation failed for item {item.id}: {e}") + + return results + + +def calculate_wer_summary_stats( + results: List[WERBatchResult], mode: str +) -> WERSummaryStats: + """ + Calculate summary statistics for a list of WER results. 
+ + Args: + results: List of WERBatchResult from batch WER calculation + mode: "strict" or "lenient" - which WER results to summarize + + Returns: + WERSummaryStats with aggregate statistics + """ + if not results: + return WERSummaryStats( + count=0, + avg_wer=0.0, + min_wer=0.0, + max_wer=0.0, + avg_substitutions=0.0, + avg_deletions=0.0, + avg_insertions=0.0, + avg_semantic_errors=0.0, + total_reference_words=0, + total_hypothesis_words=0, + ) + + # Extract WER results based on mode (strict or lenient) + wer_results = [getattr(r, mode) for r in results] + n = len(wer_results) + + wer_values = [w.wer for w in wer_results] + + return WERSummaryStats( + count=n, + avg_wer=sum(wer_values) / n, + min_wer=min(wer_values), + max_wer=max(wer_values), + avg_substitutions=sum(w.substitutions for w in wer_results) / n, + avg_deletions=sum(w.deletions for w in wer_results) / n, + avg_insertions=sum(w.insertions for w in wer_results) / n, + avg_semantic_errors=sum(w.semantic_errors for w in wer_results) / n, + total_reference_words=sum(w.reference_word_count for w in wer_results), + total_hypothesis_words=sum(w.hypothesis_word_count for w in wer_results), + ) + + +def calculate_wer_batch_with_summary( + items: List[WERBatchItem], max_workers: int = 4 +) -> tuple[List[WERBatchResult], WERBatchSummary]: + """ + Calculate WER for batch items and return results with summary statistics. + + Args: + items: List of WERBatchItem containing id, ground_truth, hypothesis, and optional model + max_workers: Maximum number of concurrent workers for ThreadPoolExecutor + + Returns: + Tuple of (results list, summary with overall and model-wise stats) + """ + results = calculate_wer_batch(items, max_workers) + + # Calculate overall statistics + overall = WEROverallSummary( + strict=calculate_wer_summary_stats(results, "strict"), + lenient=calculate_wer_summary_stats(results, "lenient"), + ) + + # Group results by model and calculate per-model statistics + model_groups: Dict[str, List[WERBatchResult]] = {} + for r in results: + if r.model: + if r.model not in model_groups: + model_groups[r.model] = [] + model_groups[r.model].append(r) + + by_model: List[WERModelStats] = [] + for model_name in sorted(model_groups.keys()): + model_results = model_groups[model_name] + by_model.append( + WERModelStats( + model=model_name, + strict=calculate_wer_summary_stats(model_results, "strict"), + lenient=calculate_wer_summary_stats(model_results, "lenient"), + ) + ) + + summary = WERBatchSummary(overall=overall, by_model=by_model) + + return results, summary + + +if __name__ == "__main__": + # oai_api_key = os.getenv("OPENAI_API_KEY") + # gemini_api_key=os.getenv("GEMINI_API_KEY") + ai4b_file_path = "/Users/prajna/Downloads/audio_hindi_2.ogg" + + audio_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/sample_data/ogg_files/1756121051765345.ogg" + ai4b_response = transcribe_audio_with_indic_conformer( + audio_file_path=ai4b_file_path + ) + print(ai4b_response) + # stt_eval_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/audio_sample_stt.csv" + + # transcript=transcribe_audio(audio_file=audio_file_path, provider="google-stt") + # transcript=transcribe_audio(audio_file=audio_file_path, provider="openai", api_key=oai_api_key) + # transcript=transcribe_audio_with_chirp_v3(audio_file_path) + # print(transcript) + + # with open(stt_eval_file_path, "rb") as file: + # processed = process_batch_csv(file) + + # print(processed) diff --git a/backend/app/services/llm/providers/registry.py 
diff --git a/backend/app/services/llm/providers/registry.py b/backend/app/services/llm/providers/registry.py
index a5cfb4bb8..7fb8d79f9 100644
--- a/backend/app/services/llm/providers/registry.py
+++ b/backend/app/services/llm/providers/registry.py
@@ -13,12 +13,14 @@ class LLMProvider:
 
     OPENAI_NATIVE = "openai-native"
+    OPENAI = "openai"
     # Future constants for native providers:
     # CLAUDE_NATIVE = "claude-native"
     # GEMINI_NATIVE = "gemini-native"
 
     _registry: dict[str, type[BaseProvider]] = {
         OPENAI_NATIVE: OpenAIProvider,
+        OPENAI: OpenAIProvider
         # Future native providers:
         # CLAUDE_NATIVE: ClaudeProvider,
         # GEMINI_NATIVE: GeminiProvider,
     }
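The hunk above registers "openai" as an alias that maps to the same OpenAIProvider class as "openai-native", so either spelling resolves to one implementation. A self-contained sketch of that lookup behaviour follows; LLMProvider's real accessor is not shown in this diff, so the get_provider_class helper and stub class below are stand-ins, not the project's API.

# Stand-in types for illustration; only the alias/lookup pattern mirrors the diff.
class _OpenAIProviderStub:
    """Placeholder for OpenAIProvider, just for this example."""

_registry: dict[str, type] = {
    "openai-native": _OpenAIProviderStub,
    "openai": _OpenAIProviderStub,  # alias added by the hunk above
}

def get_provider_class(name: str) -> type:
    # Both spellings resolve to the same provider class; unknown names fail fast.
    try:
        return _registry[name]
    except KeyError:
        raise ValueError(f"Unsupported provider: {name}")

assert get_provider_class("openai") is get_provider_class("openai-native")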
" - "make sure the transcription as close possible to the audio provided" -) -PROJECT_ID = os.getenv("GOOGLE_PROJECT_ID") -STT_LOCATION = os.getenv("GOOGLE_PROJECT_LOCATION") - - -class SpeechToTextService: - def __init__(self, provider, api_key: str | None = None): - if api_key is None: - raise ValueError( - "Missing OpenAI API Key for Client STT Client initialization" - ) - self.provide = provider - self.openai_client = None - self.gemini_client = None - if provider == "openai": - self.openai_client = OpenAI(api_key=api_key) - elif provider == "gemini": - self.gemini_client = genai.Client(api_key=api_key) - else: - raise ValueError("This provider is not supported yet.") - - def transcribe_with_openai( - self, - audio_file: BinaryIO | str, - model: str = "gpt-4o-transcribe", - prompt: str | None = None, - ): - if self.openai_client is None: - raise ValueError("OpenAI client is not initialized.") - try: - # Handle file path vs file object - if isinstance(audio_file, str): - audio_file = open(audio_file, "rb") - - transcription = self.openai_client.audio.transcriptions.create( - model=model, - file=audio_file, - # language="hi", - response_format="text", - prompt=prompt or DEFAULT_PROMPT, - ) - logger.info(f"Successfully transcribed audio using OpenAI model: {model}") - return transcription - except Exception as e: - logger.error(f"OpenAI transcription failed: {str(e)}", exc_info=True) - raise - - def transcribe_with_gemini( - self, - audio_file_path: str, - model: str = "gemini-2.5-flash", - prompt: str | None = None, - ): - if self.gemini_client is None: - raise ValueError("Gemini client is not initialized") - try: - # Upload file to Geminic - gemini_file = self.gemini_client.files.upload(file=audio_file_path) - logger.info(f"Uploaded file to Gemini: {gemini_file.name}") - - # Generate transcription - response = self.gemini_client.models.generate_content( - model=model, - contents=[prompt or DEFAULT_PROMPT, gemini_file], - ) - - logger.info(f"Successfully transcribed audio using Gemini model: {model}") - return response.text or None - - except Exception as e: - logger.error(f"Gemini transcription failed: {str(e)}", exc_info=True) - raise - - def transcribe( - self, - audio_file: BinaryIO | str, - provider: Literal["openai", "gemini"] = "openai", - model: str | None = None, - prompt: str | None = None, - ): - transcription = None - if provider == "openai": - transcription = self.transcribe_with_openai( - audio_file=audio_file, - model=model or "gpt-4o-transcribe", - prompt=prompt, - ) - return transcription - elif provider == "gemini": - # Gemini requires file path, not file object - file_path = audio_file if isinstance(audio_file, str) else audio_file.name - transcription = self.transcribe_with_gemini( - audio_file_path=file_path, - model=model or "gemini-2.5-flash", - prompt=prompt, - ) - return transcription - else: - raise ValueError( - f"Unsupported provider: {provider}. Use 'openai' or 'gemini'." 
- ) - - -def transcribe_audio_with_chirp_v3(audio_file_path: str): - with open(audio_file_path, "rb") as file: - audio_content = file.read() - - client = SpeechClient( - client_options=ClientOptions( - api_endpoint=f"{STT_LOCATION}-speech.googleapis.com" - ) - ) - - config = cloud_speech.RecognitionConfig( - auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(), - language_codes=["auto"], - model="chirp_3", - ) - request = cloud_speech.RecognizeRequest( - recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_", - config=config, - content=audio_content, - ) - response: RecognizeResponse = client.recognize(request=request) - - transcript = None - for result in response.results: - transcript = result.alternatives[0].transcript - return transcript - - -def transcribe_audio_with_indic_conformer(audio_file_path: str): - indic_conformer_api_url = str(os.getenv("AI4B_STT_URL")) - with open(audio_file_path, "rb") as file: - audio_content = file.read() - - response = requests.post( - url=indic_conformer_api_url, - data={"language_code": "hi", "decoding_strategy": "ctc"}, - files={"audio_file": audio_content}, - ) - logger.info(response.json()) - transcription = response.json()["transcription"] - return transcription - - -# util functions for direct usage -def transcribe_audio( - audio_file: str, - provider: Literal["openai", "gemini", "google-stt", "ai4b"] = "openai", - model: str | None = None, - api_key: str | None = None, - prompt: str | None = None, -): - if provider == "google-stt": - return transcribe_audio_with_chirp_v3(audio_file_path=audio_file) - if provider == "ai4b": - return transcribe_audio_with_indic_conformer(audio_file_path=audio_file) - stt_service = SpeechToTextService(provider=provider, api_key=api_key) - return stt_service.transcribe( - audio_file=audio_file, - provider=provider, - model=model, - prompt=prompt, - ) - - -# STT csv parser - - -def process_single_csv_row(row_data): - idx, audio_url, ground_truth = row_data - - try: - audio_bytes, content_type = download_audio_from_url(audio_url) - audio_base64 = base64.b64encode(audio_bytes).decode("utf-8") - - return { - "status": "success", - "row": idx, - "audio_url": audio_url, - "ground_truth": ground_truth, - "audio_base64": audio_base64, - "media_type": content_type, - "file_size": len(audio_bytes), - } - except requests.Timeout: - return { - "status": "error", - "row": idx, - "audio_url": audio_url, - "error": "Download timeout", - } - except requests.RequestException as e: - return { - "status": "error", - "row": idx, - "audio_url": audio_url, - "error": f"Download failed: {str(e)}", - } - except Exception as e: - return { - "status": "error", - "row": idx, - "audio_url": audio_url, - "error": f"Unexpected error: {str(e)}", - } - - -async def process_batch_csv(csv_file): - csv_body = await csv_file.read() - csv_content = csv_body.decode("utf-8") - - csv_reader = csv.DictReader(StringIO(csv_content)) - required_headers = {"audio_url", "ground_truth"} - - if not required_headers.issubset(csv_reader.fieldnames): - raise ValueError(f"CSV must have headers: {required_headers}") - - rows_to_process = [] - for idx, row in enumerate(csv_reader, start=1): - audio_url = row["audio_url"].strip() - ground_truth = row["ground_truth"].strip() - - if not audio_url or not ground_truth: - continue # do not throw error. 
continue silently - rows_to_process.append((idx, audio_url, ground_truth)) - - results = [] - errors = [] - - with ThreadPoolExecutor(max_workers=4) as executor: - future_to_row = { - executor.submit(process_single_csv_row, row_data): row_data - for row_data in rows_to_process - } - for future in as_completed(future_to_row): - result = future.result() - if result["status"] == "success": - results.append(result) - else: - errors.append(result) - return { - "success": results, - "errors": errors, - "total_rows": len(rows_to_process), - "processed": len(results), - "failed": len(errors), - } - - -def _get_api_key(provider: str) -> str | None: - if provider == "openai": - return os.getenv("OPENAI_API_KEY") - elif provider == "gemini": - return os.getenv("GEMINI_API_KEY") - - return None - - -def _transcribe_single_file_provider( - file_data: FileData, provider_config: ProviderConfig -): - # for execution by threadpool - provider = provider_config.provider - - model = provider_config.model - - audio_bytes = base64.b64decode(file_data.audio_base64) or None - media_extention = ( - file_data.media_type.split("/")[-1] if file_data.media_type else ".ogg" - ) - temp_file = f"/tmp/{uuid4()}.{media_extention}" - - try: - with open(temp_file, "wb") as f: - f.write(audio_bytes) # type: ignore - - # extract the api key - api_key = _get_api_key(provider) # type: ignore - - transcript = transcribe_audio( - audio_file=temp_file, - provider=provider, # type: ignore - model=model, - api_key=api_key, - ) - - return { - "status": "success", - "file_id": file_data.file_id or None, - "ground_truth": file_data.ground_truth, - "provider": provider, - "model": model, - "transcript": transcript, - } - except Exception as e: - return { - "status": "error", - "file_id": file_data.file_id, - "provider": provider, - "model": model, - "error": str(e), - } - finally: - # clean the temp file - if os.path.exists(temp_file): - os.remove(temp_file) - - -# threadpool based integration of batch transcriptipn -def process_batch_transcription(files, providers, max_workers=4): - # create all tasks and push into a list - tasks = [ - (file_data, provider_config) - for file_data in files - for provider_config in providers - ] - - results = [] - errors = [] - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = { - executor.submit( - _transcribe_single_file_provider, file_data, provider_config - ): (file_data, provider_config) - for file_data, provider_config in tasks - } - for future in as_completed(futures): - result = future.result() - if result["status"] == "success": - results.append(result) - else: - errors.append(result) - - return { - "success": results, - "errors": errors, - "total_tasks": len(tasks), - "processed": len(results), - "failed": len(errors), - } - - -# STT evaluation route handler -async def evaluate_stt(): - pass - - -def calculate_wer_individual( - ground_truth: str, - hypothesis: str, - mode: Literal["strict", "lenient", "both"] = "both", -) -> WERComparisonResult | WERResult: - """ - Calculate WER for a single transcription against ground truth. 
- - Args: - ground_truth: The reference/expected transcription - hypothesis: The transcribed text to evaluate - mode: "strict" for exact matching, "lenient" for phonetic/spelling tolerance, - "both" to return both calculations - - Returns: - WERComparisonResult if mode="both", otherwise WERResult for the specified mode - """ - ref_tokens = tokenize(ground_truth) - hyp_tokens = tokenize(hypothesis) - - ref_count = len(ref_tokens) - hyp_count = len(hyp_tokens) # hypothesis tokes - - if mode == "strict": - wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=False) - return WERResult( - wer=wer, - substitutions=subs, - deletions=dels, - insertions=ins, - semantic_errors=sem, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ) - - if mode == "lenient": - wer, subs, dels, ins, sem = calculate_wer(ref_tokens, hyp_tokens, lenient=True) - return WERResult( - wer=wer, - substitutions=subs, - deletions=dels, - insertions=ins, - semantic_errors=sem, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ) - - # mode == "both" - wer_strict, s_strict, d_strict, i_strict, sem_strict = calculate_wer( - ref_tokens, hyp_tokens, lenient=False - ) - wer_lenient, s_lenient, d_lenient, i_lenient, sem_lenient = calculate_wer( - ref_tokens, hyp_tokens, lenient=True - ) - - return WERComparisonResult( - ground_truth=ground_truth, - hypothesis=hypothesis, - strict=WERResult( - wer=wer_strict, - substitutions=s_strict, - deletions=d_strict, - insertions=i_strict, - semantic_errors=sem_strict, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ), - lenient=WERResult( - wer=wer_lenient, - substitutions=s_lenient, - deletions=d_lenient, - insertions=i_lenient, - semantic_errors=sem_lenient, - reference_word_count=ref_count, - hypothesis_word_count=hyp_count, - ), - ) - - -def _process_wer_item(item: WERBatchItem) -> WERBatchResult: - result = calculate_wer_individual( - ground_truth=item.ground_truth, hypothesis=item.hypothesis, mode="both" - ) - # result is WERComparisonResult with strict and lenient WERResult - return WERBatchResult( - id=item.id, - ground_truth=item.ground_truth, - hypothesis=item.hypothesis, - model=item.model, - strict=result.strict, - lenient=result.lenient, - ) - - -def calculate_wer_batch( - items: List[WERBatchItem], max_workers: int = 4 -) -> List[WERBatchResult]: - """ - Calculate WER for multiple transcriptions in batch using ThreadPoolExecutor. - - Args: - items: List of WERBatchItem containing id, ground_truth, and hypothesis - max_workers: Maximum number of concurrent workers for ThreadPoolExecutor - - Returns: - List of WERBatchResult with both strict and lenient WER for each item - """ - results: List[WERBatchResult] = [] - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(_process_wer_item, item): item for item in items} - for future in as_completed(futures): - try: - result = future.result() - results.append(result) - except Exception as e: - item = futures[future] - logger.error(f"WER calculation failed for item {item.id}: {e}") - - return results - - -def calculate_wer_summary_stats( - results: List[WERBatchResult], mode: str -) -> WERSummaryStats: - """ - Calculate summary statistics for a list of WER results. 
- - Args: - results: List of WERBatchResult from batch WER calculation - mode: "strict" or "lenient" - which WER results to summarize - - Returns: - WERSummaryStats with aggregate statistics - """ - if not results: - return WERSummaryStats( - count=0, - avg_wer=0.0, - min_wer=0.0, - max_wer=0.0, - avg_substitutions=0.0, - avg_deletions=0.0, - avg_insertions=0.0, - avg_semantic_errors=0.0, - total_reference_words=0, - total_hypothesis_words=0, - ) - - # Extract WER results based on mode (strict or lenient) - wer_results = [getattr(r, mode) for r in results] - n = len(wer_results) - - wer_values = [w.wer for w in wer_results] - - return WERSummaryStats( - count=n, - avg_wer=sum(wer_values) / n, - min_wer=min(wer_values), - max_wer=max(wer_values), - avg_substitutions=sum(w.substitutions for w in wer_results) / n, - avg_deletions=sum(w.deletions for w in wer_results) / n, - avg_insertions=sum(w.insertions for w in wer_results) / n, - avg_semantic_errors=sum(w.semantic_errors for w in wer_results) / n, - total_reference_words=sum(w.reference_word_count for w in wer_results), - total_hypothesis_words=sum(w.hypothesis_word_count for w in wer_results), - ) - - -def calculate_wer_batch_with_summary( - items: List[WERBatchItem], max_workers: int = 4 -) -> tuple[List[WERBatchResult], WERBatchSummary]: - """ - Calculate WER for batch items and return results with summary statistics. - - Args: - items: List of WERBatchItem containing id, ground_truth, hypothesis, and optional model - max_workers: Maximum number of concurrent workers for ThreadPoolExecutor - - Returns: - Tuple of (results list, summary with overall and model-wise stats) - """ - results = calculate_wer_batch(items, max_workers) - - # Calculate overall statistics - overall = WEROverallSummary( - strict=calculate_wer_summary_stats(results, "strict"), - lenient=calculate_wer_summary_stats(results, "lenient"), - ) - - # Group results by model and calculate per-model statistics - model_groups: Dict[str, List[WERBatchResult]] = {} - for r in results: - if r.model: - if r.model not in model_groups: - model_groups[r.model] = [] - model_groups[r.model].append(r) - - by_model: List[WERModelStats] = [] - for model_name in sorted(model_groups.keys()): - model_results = model_groups[model_name] - by_model.append( - WERModelStats( - model=model_name, - strict=calculate_wer_summary_stats(model_results, "strict"), - lenient=calculate_wer_summary_stats(model_results, "lenient"), - ) - ) - - summary = WERBatchSummary(overall=overall, by_model=by_model) - - return results, summary - - -if __name__ == "__main__": - # oai_api_key = os.getenv("OPENAI_API_KEY") - # gemini_api_key=os.getenv("GEMINI_API_KEY") - ai4b_file_path = "/Users/prajna/Downloads/audio_hindi_2.ogg" - - audio_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/sample_data/ogg_files/1756121051765345.ogg" - ai4b_response = transcribe_audio_with_indic_conformer( - audio_file_path=ai4b_file_path - ) - print(ai4b_response) - # stt_eval_file_path = "/Users/prajna/Desktop/t4d/ai-platform/backend/app/services/audio/audio_sample_stt.csv" - - # transcript=transcribe_audio(audio_file=audio_file_path, provider="google-stt") - # transcript=transcribe_audio(audio_file=audio_file_path, provider="openai", api_key=oai_api_key) - # transcript=transcribe_audio_with_chirp_v3(audio_file_path) - # print(transcript) - - # with open(stt_eval_file_path, "rb") as file: - # processed = process_batch_csv(file) - - # print(processed) From ebdda817098f08676a22c94e0d369b2341121992 Mon Sep 17 
00:00:00 2001
From: Prajna1999
Date: Wed, 14 Jan 2026 10:58:35 +0530
Subject: [PATCH 14/15] fix: add comprehensive expansion of 'tools' key while building evaluation_jsonl

---
 backend/app/crud/evaluations/batch.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py
index 6f537c063..2aad3b298 100644
--- a/backend/app/crud/evaluations/batch.py
+++ b/backend/app/crud/evaluations/batch.py
@@ -111,7 +111,13 @@ def build_evaluation_jsonl(
             "instructions": config.instructions,
             "temperature": config.temperature,
             "reasoning": {"effort": config.reasoning} if config.reasoning else None,
-            "tools": config.knowledge_base_ids if config.knowledge_base_ids else [],
+            "tools": [
+                {
+                    "type": "file_search",
+                    "vector_store_ids": config.knowledge_base_ids,
+                    "max_num_results": config.max_num_results or 20,
+                }
+            ],
             "input": question,  # Add input from dataset
         },
     }

From f00e7e088bb4e0ec9bc63afea5b8e553b33c01f9 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Sat, 24 Jan 2026 17:29:08 +0530
Subject: [PATCH 15/15] fixing endpoints

---
 backend/app/api/routes/evaluations/dataset.py    | 4 ++--
 backend/app/api/routes/evaluations/evaluation.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py
index d66ff71ce..1ce42742a 100644
--- a/backend/app/api/routes/evaluations/dataset.py
+++ b/backend/app/api/routes/evaluations/dataset.py
@@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse:
 
 
 @router.post(
-    "/",
+    "",
     description=load_description("evaluation/upload_dataset.md"),
     response_model=APIResponse[DatasetUploadResponse],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -87,7 +87,7 @@ async def upload_dataset(
 
 
 @router.get(
-    "/",
+    "",
     description=load_description("evaluation/list_datasets.md"),
     response_model=APIResponse[list[DatasetUploadResponse]],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py
index b0f1a1cf3..fe09edcec 100644
--- a/backend/app/api/routes/evaluations/evaluation.py
+++ b/backend/app/api/routes/evaluations/evaluation.py
@@ -30,7 +30,7 @@
 
 
 @router.post(
-    "/",
+    "",
     description=load_description("evaluation/create_evaluation.md"),
     response_model=APIResponse[EvaluationRunPublic],
     dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
@@ -66,7 +66,7 @@ def evaluate(
 
 
 @router.get(
-    "/",
+    "",
     description=load_description("evaluation/list_evaluations.md"),
    response_model=APIResponse[list[EvaluationRunPublic]],
    dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
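PATCH 14 stops passing knowledge_base_ids through as a bare list and instead wraps them in a file_search tool entry for every JSONL line. A sketch of the resulting shape; EvalConfig below is a stand-in for the stored evaluation config, and only the field names and the default of 20 results come from the hunk above.

import json
from dataclasses import dataclass

@dataclass
class EvalConfig:  # stand-in for the config object used by build_evaluation_jsonl
    knowledge_base_ids: list[str]
    max_num_results: int | None = None

def build_tools(config: EvalConfig) -> list[dict]:
    # Mirrors the "tools" block introduced in the hunk above.
    return [
        {
            "type": "file_search",
            "vector_store_ids": config.knowledge_base_ids,
            "max_num_results": config.max_num_results or 20,
        }
    ]

cfg = EvalConfig(knowledge_base_ids=["vs_abc123", "vs_def456"])
print(json.dumps(build_tools(cfg)))
# [{"type": "file_search", "vector_store_ids": ["vs_abc123", "vs_def456"], "max_num_results": 20}]

PATCH 15's switch from "/" to "" presumably lets the collection routes match the bare router prefix directly (e.g. POST /evaluations) instead of relying on FastAPI's trailing-slash redirect.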