Skip to content
This repository was archived by the owner on Jun 6, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,14 @@
import sys
from pathlib import Path

import yaml
from dotenv import load_dotenv

try:
import wandb
except ImportError:
wandb = None

load_dotenv()
sys.path.insert(0, str(Path(__file__).parent))

Expand All @@ -21,12 +27,11 @@ def benchmark(args):

from tests.integration.benchmark import AdvancedModelBenchmark

models = [
{"name": "gpt-4o", "type": "openai"},
{"name": "deepseek-r1", "type": "deepseek"},
{"name": "claude-3-5-sonnet-20241022", "type": "anthropic"},
{"name": "gemini-2.0-flash-exp", "type": "google"},
]
config_path = Path(__file__).parent / "config" / "benchmark.yaml"
with open(config_path) as f:
config = yaml.safe_load(f)

models = config["models"]

try:
benchmark = AdvancedModelBenchmark(models)
Expand All @@ -35,6 +40,18 @@ def benchmark(args):
for r in results:
print(f"{r.model_name}: {r.avg_response_time:.2f}s, {r.task_success_rate:.1f}% success")
print("completed. check reports/")

if wandb and config.get("wandb", {}).get("enabled", False):
wandb.init(project=config["wandb"]["project"])
for r in results:
wandb.log(
{
"model": r.model_name,
"response_time": r.avg_response_time,
"success_rate": r.task_success_rate,
}
)
print("logged to wandb")
except Exception as e:
print(f"failed: {e}")

Expand All @@ -51,14 +68,15 @@ def compare(args):


def models(args):
    """List the models configured in ``config/benchmark.yaml``.

    Prints one line per configured model in the form
    ``name: type (description)``. The ``description`` key is optional and
    defaults to an empty string. Nothing is printed when the config file is
    empty or has no ``models`` entries.

    Args:
        args: Parsed CLI arguments; not used by this command.
    """
    config_path = Path(__file__).parent / "config" / "benchmark.yaml"
    with open(config_path, encoding="utf-8") as f:
        # safe_load yields None for an empty document; fall back to an empty
        # mapping so the lookup below does not fail.
        config = yaml.safe_load(f) or {}

    # The config file is the single source of truth for model names, so the
    # output always matches what the benchmark command will actually run.
    for model in config.get("models") or []:
        name = model["name"]
        type_ = model["type"]
        desc = model.get("description", "")
        print(f"{name}: {type_} ({desc})")


def status(args):
Expand Down
23 changes: 23 additions & 0 deletions config/benchmark.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
models:
- name: gpt-4o
type: openai
description: advanced
- name: deepseek-r1
type: deepseek
description: reasoning
- name: claude-3-5-sonnet-20241022
type: anthropic
description: balanced
- name: gemini-2.0-flash-exp
type: google
description: experimental

# Benchmark settings
batch_size: 1
eval_freq: 10
log_freq: 5

# Wandb settings
wandb:
project: ai-benchmark
enabled: false
2 changes: 1 addition & 1 deletion dashboard/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
streamlit==1.35.0
streamlit==1.37.0
plotly==5.22.0
pandas==2.2.0
sqlite3
Expand Down
136 changes: 135 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
@@ -1 +1,135 @@
ruff check
AI Model Benchmarking and Selection Tool

This repository provides a lightweight, Python-based toolkit for benchmarking and comparing large language models across multiple providers, including OpenAI, Anthropic, Google, and NVIDIA. It is intended for developers who want a repeatable way to evaluate models using consistent tasks and measurable criteria, such as latency, success rate, and task difficulty, in order to select an appropriate model for a given use case.

The tool assumes familiarity with Python, command-line workflows, YAML configuration files, and the use of third‑party model APIs. You are expected to supply valid API credentials for any providers you benchmark.

News

Recent dependency changes were made to improve compatibility across Python versions. Version pins for numpy, pandas, scikit-learn, and matplotlib were removed because pinned versions (for example, numpy==1.26.0) required Python <3.13 and caused installation failures on newer Python releases such as Python 3.14. Allowing the latest compatible versions ensures broader compatibility without conflicts.

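Weights & Biases is no longer a required dependency because of protobuf import errors encountered in virtual environments. `cli.py` imports `wandb` only if it is installed, and logs benchmark results to it only when `wandb.enabled` is set to `true` in `config/benchmark.yaml`.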

Requirements

* Python 3.10 or newer
* pip
* API keys for the model providers you intend to benchmark, exposed via environment variables or a .env file

Installation

Clone the repository:

```
cd $HOME && git clone <repo-url>
cd ai
```

Install dependencies:

```
pip install -r requirements.txt
```

Model Configuration

Models are defined in `config/benchmark.yaml`. Each entry specifies the model name, provider type, and a short description. The names must match the identifiers expected by the corresponding provider integration.

Example:

```yaml
models:
- name: new-model
type: provider
description: new description
```

Verify Configuration

Before running benchmarks, validate your configuration:

```
python -m utils.validate_data
```

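This step checks that `config/benchmark.yaml` exists, parses as valid YAML, defines at least one model, and that every model entry has the required `name` and `type` fields. It exits with a non-zero status if any check fails.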

Start Benchmarking

Basic Benchmark

Run a benchmark across all configured models:

```
python cli.py benchmark
```

Each model is evaluated on the same tasks, and aggregate metrics are reported.

Model Comparison

Compare two specific models on a focused task and complexity level:

```
python cli.py compare gpt-4o claude-3-5-sonnet --task "code generation" --complexity high
```

Custom Configuration

Benchmark behavior can be adjusted in `config/benchmark.yaml`.

Key parameters:

* `models`: list of models to benchmark
* `batch_size`: number of tasks per batch
* `eval_freq`: evaluation frequency
* `log_freq`: logging frequency

Dashboard

A Streamlit dashboard is provided to visualize benchmark results:

```
streamlit run dashboard/dashboard.py
```

Examples

Basic Benchmark

```
python cli.py benchmark
```

Example output:

```
benchmark results:
gpt-4o: 2.10s, 85.0% success
deepseek-r1: 1.80s, 92.0% success
...
```

Model Comparison

```
python cli.py compare gpt-4o gemini-2.0-flash-exp --task "math reasoning" --complexity extreme
```

Testing

Run the test suite:

```
pytest
```

FAQ

* How do I add a new model? Update `config/benchmark.yaml` with the model details.
* What metrics are used? Response time and task success rate, as implemented in the evaluation code.
* How are API limits handled? Provider integrations include basic rate limiting to reduce request failures.

License

Apache 2.0
9 changes: 5 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@ pytest==8.2.0
pytest-cov==5.0.0
ruff==0.6.0
plotly==5.22.0
numpy==1.26.0
numpy
anthropic==0.30.0
google-generativeai==0.8.0
streamlit==1.35.0
pandas==2.2.0
scikit-learn==1.5.0
pandas
scikit-learn
pyyaml==6.0.1

# Performance Evaluation Dependencies
matplotlib==3.9.0
matplotlib

# Node.js dependencies (for package.json)
# openai
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_cli_models():
[sys.executable, str(cli_path), "models"], capture_output=True, text=True
)
assert result.returncode == 0
assert "deepseek r1" in result.stdout.lower()
assert "deepseek-r1" in result.stdout.lower()
assert "gpt-4o" in result.stdout.lower()


Expand Down
46 changes: 46 additions & 0 deletions utils/validate_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""
Data validation utility for benchmarking configurations.
"""

import sys
from pathlib import Path

import yaml


def validate_config(config_path):
    """Validate the benchmark configuration file.

    Checks that the file exists, parses as YAML, has a mapping at the top
    level, and defines a non-empty ``models`` list in which every entry is a
    mapping containing the ``name`` and ``type`` keys. Each problem is
    reported on stdout.

    Args:
        config_path: ``pathlib.Path`` of the YAML file to check.

    Returns:
        True if the configuration is valid, False otherwise.
    """
    if not config_path.exists():
        print(f"Error: Config file {config_path} not found.")
        return False

    try:
        with open(config_path, encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML: {e}")
        return False

    # safe_load returns None for an empty document; treat it as an empty
    # mapping so it is reported as "no models" instead of crashing on .get().
    if config is None:
        config = {}
    if not isinstance(config, dict):
        print("Error: Config must be a mapping with a 'models' key.")
        return False

    models = config.get("models")
    if not models:
        print("Error: No models defined in config.")
        return False
    if not isinstance(models, list):
        print("Error: 'models' must be a list of model entries.")
        return False

    required = ("name", "type")
    for model in models:
        # Require a mapping: on a plain string, `key in model` would be a
        # substring test and could pass by accident.
        if not isinstance(model, dict) or any(key not in model for key in required):
            print(f"Error: Model missing required fields: {model}")
            return False

    print("Config validation passed.")
    return True


def main():
    """Validate the repository's benchmark config; exit with status 1 on failure."""
    project_root = Path(__file__).parent.parent
    benchmark_yaml = project_root / "config" / "benchmark.yaml"
    is_valid = validate_config(benchmark_yaml)
    if is_valid:
        return
    sys.exit(1)


if __name__ == "__main__":
main()
Loading