From 39a9a7af6014121f9c9fb1f7ef7c71d1f275c9a4 Mon Sep 17 00:00:00 2001 From: Niladri Das Date: Tue, 16 Dec 2025 05:40:28 +0530 Subject: [PATCH 1/2] fix: update --- cli.py | 46 +++++++++----- config/benchmark.yaml | 23 +++++++ readme.md | 136 ++++++++++++++++++++++++++++++++++++++++- requirements.txt | 9 +-- tests/e2e/cli.py | 2 +- utils/validate_data.py | 46 ++++++++++++++ 6 files changed, 242 insertions(+), 20 deletions(-) create mode 100644 config/benchmark.yaml create mode 100644 utils/validate_data.py diff --git a/cli.py b/cli.py index 6e3123f..1f818f4 100755 --- a/cli.py +++ b/cli.py @@ -4,8 +4,14 @@ import sys from pathlib import Path +import yaml from dotenv import load_dotenv +try: + import wandb +except ImportError: + wandb = None + load_dotenv() sys.path.insert(0, str(Path(__file__).parent)) @@ -21,12 +27,11 @@ def benchmark(args): from tests.integration.benchmark import AdvancedModelBenchmark - models = [ - {"name": "gpt-4o", "type": "openai"}, - {"name": "deepseek-r1", "type": "deepseek"}, - {"name": "claude-3-5-sonnet-20241022", "type": "anthropic"}, - {"name": "gemini-2.0-flash-exp", "type": "google"}, - ] + config_path = Path(__file__).parent / "config" / "benchmark.yaml" + with open(config_path) as f: + config = yaml.safe_load(f) + + models = config["models"] try: benchmark = AdvancedModelBenchmark(models) @@ -35,6 +40,18 @@ def benchmark(args): for r in results: print(f"{r.model_name}: {r.avg_response_time:.2f}s, {r.task_success_rate:.1f}% success") print("completed. 
check reports/") + + if wandb and config.get("wandb", {}).get("enabled", False): + wandb.init(project=config["wandb"]["project"]) + for r in results: + wandb.log( + { + "model": r.model_name, + "response_time": r.avg_response_time, + "success_rate": r.task_success_rate, + } + ) + print("logged to wandb") except Exception as e: print(f"failed: {e}") @@ -51,14 +68,15 @@ def compare(args): def models(args): - models = [ - ("deepseek r1", "nvidia", "reasoning"), - ("gpt-4o", "openai", "advanced"), - ("claude-3.5-sonnet", "anthropic", "balanced"), - ("gemini-2.0-flash-exp", "google", "experimental"), - ] - for name, provider, desc in models: - print(f"{name}: {provider} ({desc})") + config_path = Path(__file__).parent / "config" / "benchmark.yaml" + with open(config_path) as f: + config = yaml.safe_load(f) + + for model in config["models"]: + name = model["name"] + type_ = model["type"] + desc = model.get("description", "") + print(f"{name}: {type_} ({desc})") def status(args): diff --git a/config/benchmark.yaml b/config/benchmark.yaml new file mode 100644 index 0000000..35ff223 --- /dev/null +++ b/config/benchmark.yaml @@ -0,0 +1,23 @@ +models: + - name: gpt-4o + type: openai + description: advanced + - name: deepseek-r1 + type: deepseek + description: reasoning + - name: claude-3-5-sonnet-20241022 + type: anthropic + description: balanced + - name: gemini-2.0-flash-exp + type: google + description: experimental + +# Benchmark settings +batch_size: 1 +eval_freq: 10 +log_freq: 5 + +# Wandb settings +wandb: + project: ai-benchmark + enabled: false \ No newline at end of file diff --git a/readme.md b/readme.md index 8be47a6..e626a90 100644 --- a/readme.md +++ b/readme.md @@ -1 +1,135 @@ -ruff check \ No newline at end of file +AI Model Benchmarking and Selection Tool + +This repository provides a lightweight, Python-based toolkit for benchmarking and comparing large language models across multiple providers, including OpenAI, Anthropic, Google, and NVIDIA. 
It is intended for developers who want a repeatable way to evaluate models using consistent tasks and measurable criteria, such as latency, success rate, and task difficulty, in order to select an appropriate model for a given use case. + +The tool assumes familiarity with Python, command-line workflows, YAML configuration files, and the use of third-party model APIs. You are expected to supply valid API credentials for any providers you benchmark. + +News + +Recent dependency changes were made to improve compatibility across Python versions. Version pins for numpy, pandas, scikit-learn, and matplotlib were removed because pinned versions (for example, numpy==1.26.0) required Python <3.13 and caused installation failures on newer Python releases such as Python 3.14. Allowing the latest compatible versions ensures broader compatibility without conflicts. + +Weights & Biases is no longer a required dependency, because importing it caused protobuf errors in some virtual environments. The CLI runs without wandb installed; benchmark results are logged to wandb only when the package is importable and `wandb.enabled` is set to `true` in `config/benchmark.yaml`. + +Requirements + +* Python 3.10 or newer +* pip +* API keys for the model providers you intend to benchmark, exposed via environment variables or a .env file + +Installation + +Clone the repository: + +``` +cd $HOME && git clone <repository-url> ai +cd ai +``` + +Install dependencies: + +``` +pip install -r requirements.txt +``` + +Model Configuration + +Models are defined in `config/benchmark.yaml`. Each entry specifies the model name, provider type, and a short description. The names must match the identifiers expected by the corresponding provider integration. + +Example: + +```yaml +models: + - name: new-model + type: provider + description: new description +``` + +Verify Configuration + +Before running benchmarks, validate your configuration: + +``` +python -m utils.validate_data +``` + +This step checks that `config/benchmark.yaml` exists, parses as valid YAML, defines at least one model, and that every model entry has the required `name` and `type` fields. 
+ +Start Benchmarking + +Basic Benchmark + +Run a benchmark across all configured models: + +``` +python cli.py benchmark +``` + +Each model is evaluated on the same tasks, and aggregate metrics are reported. + +Model Comparison + +Compare two specific models on a focused task and complexity level: + +``` +python cli.py compare gpt-4o claude-3-5-sonnet-20241022 --task "code generation" --complexity high +``` + +Custom Configuration + +Benchmark behavior can be adjusted in `config/benchmark.yaml`. + +Key parameters: + +* `models`: list of models to benchmark +* `batch_size`: number of tasks per batch +* `eval_freq`: evaluation frequency +* `log_freq`: logging frequency + +Dashboard + +A Streamlit dashboard is provided to visualize benchmark results: + +``` +streamlit run dashboard/dashboard.py +``` + +Examples + +Basic Benchmark + +``` +python cli.py benchmark +``` + +Example output: + +``` +benchmark results: +gpt-4o: 2.10s, 85.0% success +deepseek-r1: 1.80s, 92.0% success +... +``` + +Model Comparison + +``` +python cli.py compare gpt-4o gemini-2.0-flash-exp --task "math reasoning" --complexity extreme +``` + +Testing + +Run the test suite: + +``` +pytest +``` + +FAQ + +* How do I add a new model? Update `config/benchmark.yaml` with the model details. +* What metrics are used? Response time and task success rate, as implemented in the evaluation code. +* How are API limits handled? Provider integrations include basic rate limiting to reduce request failures. 
+ +License + +Apache 2.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7b6bb7e..7cf7baa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,15 +7,16 @@ pytest==8.2.0 pytest-cov==5.0.0 ruff==0.6.0 plotly==5.22.0 -numpy==1.26.0 +numpy anthropic==0.30.0 google-generativeai==0.8.0 streamlit==1.35.0 -pandas==2.2.0 -scikit-learn==1.5.0 +pandas +scikit-learn +pyyaml==6.0.1 # Performance Evaluation Dependencies -matplotlib==3.9.0 +matplotlib # Node.js dependencies (for package.json) # openai diff --git a/tests/e2e/cli.py b/tests/e2e/cli.py index 44d0ff4..f0eb95c 100644 --- a/tests/e2e/cli.py +++ b/tests/e2e/cli.py @@ -31,7 +31,7 @@ def test_cli_models(): [sys.executable, str(cli_path), "models"], capture_output=True, text=True ) assert result.returncode == 0 - assert "deepseek r1" in result.stdout.lower() + assert "deepseek-r1" in result.stdout.lower() assert "gpt-4o" in result.stdout.lower() diff --git a/utils/validate_data.py b/utils/validate_data.py new file mode 100644 index 0000000..9420ef8 --- /dev/null +++ b/utils/validate_data.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +""" +Data validation utility for benchmarking configurations. 
+""" + +import sys +from pathlib import Path + +import yaml + + +def validate_config(config_path): + """Validate the benchmark configuration file.""" + if not config_path.exists(): + print(f"Error: Config file {config_path} not found.") + return False + + try: + with open(config_path) as f: + config = yaml.safe_load(f) + except yaml.YAMLError as e: + print(f"Error parsing YAML: {e}") + return False + + models = config.get("models", []) + if not models: + print("Error: No models defined in config.") + return False + + for model in models: + if not all(key in model for key in ["name", "type"]): + print(f"Error: Model missing required fields: {model}") + return False + + print("Config validation passed.") + return True + + +def main(): + config_path = Path(__file__).parent.parent / "config" / "benchmark.yaml" + if not validate_config(config_path): + sys.exit(1) + + +if __name__ == "__main__": + main() From c6a13833fcc60fbceb58ab4d5a3af86a33d3cc3d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Dec 2025 00:12:24 +0000 Subject: [PATCH 2/2] Bump streamlit from 1.35.0 to 1.37.0 in /dashboard Bumps [streamlit](https://github.com/streamlit/streamlit) from 1.35.0 to 1.37.0. - [Release notes](https://github.com/streamlit/streamlit/releases) - [Commits](https://github.com/streamlit/streamlit/compare/1.35.0...1.37.0) --- updated-dependencies: - dependency-name: streamlit dependency-version: 1.37.0 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- dashboard/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/requirements.txt b/dashboard/requirements.txt index b7b8a58..2654c9b 100644 --- a/dashboard/requirements.txt +++ b/dashboard/requirements.txt @@ -1,4 +1,4 @@ -streamlit==1.35.0 +streamlit==1.37.0 plotly==5.22.0 pandas==2.2.0 sqlite3