palmshed · dependabot · Dec 16, 2025 · Dec 16, 2025
diff --git a/cli.py b/cli.py
@@ -4,8 +4,14 @@
 import sys
 from pathlib import Path
 
+import yaml
 from dotenv import load_dotenv
 
+try:
+    import wandb
+except ImportError:
+    wandb = None
+
 load_dotenv()
 sys.path.insert(0, str(Path(__file__).parent))
 
@@ -21,12 +27,11 @@ def benchmark(args):
 
     from tests.integration.benchmark import AdvancedModelBenchmark
 
-    models = [
-        {"name": "gpt-4o", "type": "openai"},
-        {"name": "deepseek-r1", "type": "deepseek"},
-        {"name": "claude-3-5-sonnet-20241022", "type": "anthropic"},
-        {"name": "gemini-2.0-flash-exp", "type": "google"},
-    ]
+    config_path = Path(__file__).parent / "config" / "benchmark.yaml"
+    with open(config_path) as f:
+        config = yaml.safe_load(f)
+
+    models = config["models"]
 
     try:
         benchmark = AdvancedModelBenchmark(models)
@@ -35,6 +40,18 @@ def benchmark(args):
         for r in results:
             print(f"{r.model_name}: {r.avg_response_time:.2f}s, {r.task_success_rate:.1f}% success")
         print("completed. check reports/")
+
+        if wandb and config.get("wandb", {}).get("enabled", False):
+            wandb.init(project=config["wandb"]["project"])
+            for r in results:
+                wandb.log(
+                    {
+                        "model": r.model_name,
+                        "response_time": r.avg_response_time,
+                        "success_rate": r.task_success_rate,
+                    }
+                )
+            print("logged to wandb")
     except Exception as e:
         print(f"failed: {e}")
 
@@ -51,14 +68,15 @@ def compare(args):
 
 
 def models(args):
-    models = [
-        ("deepseek r1", "nvidia", "reasoning"),
-        ("gpt-4o", "openai", "advanced"),
-        ("claude-3.5-sonnet", "anthropic", "balanced"),
-        ("gemini-2.0-flash-exp", "google", "experimental"),
-    ]
-    for name, provider, desc in models:
-        print(f"{name}: {provider} ({desc})")
+    """Print one "name: type (description)" line per model in config/benchmark.yaml."""
+    config_path = Path(__file__).parent / "config" / "benchmark.yaml"
+    with open(config_path, encoding="utf-8") as f:
+        # safe_load returns None for an empty file; treat it as an empty config.
+        config = yaml.safe_load(f) or {}
+
+    # A missing or null "models" key lists nothing instead of raising.
+    for model in config.get("models") or []:
+        print(f"{model['name']}: {model['type']} ({model.get('description', '')})")
 
 
 def status(args):

diff --git a/config/benchmark.yaml b/config/benchmark.yaml
@@ -0,0 +1,23 @@
+models:
+  - name: gpt-4o
+    type: openai
+    description: advanced
+  - name: deepseek-r1
+    type: deepseek
+    description: reasoning
+  - name: claude-3-5-sonnet-20241022
+    type: anthropic
+    description: balanced
+  - name: gemini-2.0-flash-exp
+    type: google
+    description: experimental
+
+# Benchmark settings
+batch_size: 1
+eval_freq: 10
+log_freq: 5
+
+# Wandb settings
+wandb:
+  project: ai-benchmark
+  enabled: false
diff --git a/dashboard/requirements.txt b/dashboard/requirements.txt
@@ -1,4 +1,4 @@
-streamlit==1.35.0
+streamlit==1.37.0
 plotly==5.22.0
 pandas==2.2.0
 sqlite3

diff --git a/readme.md b/readme.md
@@ -1 +1,135 @@
-ruff check
+AI Model Benchmarking and Selection Tool
+
+This repository provides a lightweight, Python-based toolkit for benchmarking and comparing large language models across multiple providers, including OpenAI, Anthropic, Google, and NVIDIA. It is intended for developers who want a repeatable way to evaluate models using consistent tasks and measurable criteria, such as latency, success rate, and task difficulty, in order to select an appropriate model for a given use case.
+
+The tool assumes familiarity with Python, command-line workflows, YAML configuration files, and the use of third‑party model APIs. You are expected to supply valid API credentials for any providers you benchmark.
+
+News
+
+Recent dependency changes were made to improve compatibility across Python versions. Version pins for numpy, pandas, scikit-learn, and matplotlib were removed because pinned versions (for example, numpy==1.26.0) required Python <3.13 and caused installation failures on newer Python releases such as Python 3.14. Allowing the latest compatible versions ensures broader compatibility without conflicts.
+
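+Weights & Biases is no longer a required dependency, because importing it caused protobuf import errors in virtual environments. `cli.py` imports wandb only if it is installed, and logs benchmark results to it only when `wandb.enabled` is set to `true` in `config/benchmark.yaml`.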
+
+Requirements
+
+* Python 3.10 or newer
+* pip
+* API keys for the model providers you intend to benchmark, exposed via environment variables or a .env file
+
+Installation
+
+Clone the repository:
+
+```
+cd $HOME && git clone <repo-url>
+cd ai
+```
+
+Install dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+Model Configuration
+
+Models are defined in `config/benchmark.yaml`. Each entry specifies the model name, provider type, and a short description. The names must match the identifiers expected by the corresponding provider integration.
+
+Example:
+
+```yaml
+models:
+  - name: new-model
+    type: provider
+    description: new description
+```
+
+Verify Configuration
+
+Before running benchmarks, validate your configuration:
+
+```
+python -m utils.validate_data
+```
+
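+This step checks that `config/benchmark.yaml` exists, parses as valid YAML, defines at least one model, and that every model entry has the required `name` and `type` fields.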
+
+Start Benchmarking
+
+Basic Benchmark
+
+Run a benchmark across all configured models:
+
+```
+python cli.py benchmark
+```
+
+Each model is evaluated on the same tasks, and aggregate metrics are reported.
+
+Model Comparison
+
+Compare two specific models on a focused task and complexity level:
+
+```
+python cli.py compare gpt-4o claude-3-5-sonnet --task "code generation" --complexity high
+```
+
+Custom Configuration
+
+Benchmark behavior can be adjusted in `config/benchmark.yaml`.
+
+Key parameters:
+
+* `models`: list of models to benchmark
+* `batch_size`: number of tasks per batch
+* `eval_freq`: evaluation frequency
+* `log_freq`: logging frequency
+
+Dashboard
+
+A Streamlit dashboard is provided to visualize benchmark results:
+
+```
+streamlit run dashboard/dashboard.py
+```
+
+Examples
+
+Basic Benchmark
+
+```
+python cli.py benchmark
+```
+
+Example output:
+
+```
+benchmark results:
+gpt-4o: 2.10s, 85.0% success
+deepseek-r1: 1.80s, 92.0% success
+...
+```
+
+Model Comparison
+
+```
+python cli.py compare gpt-4o gemini-2.0-flash-exp --task "math reasoning" --complexity extreme
+```
+
+Testing
+
+Run the test suite:
+
+```
+pytest
+```
+
+FAQ
+
+* How do I add a new model? Update `config/benchmark.yaml` with the model details.
+* What metrics are used? Response time and task success rate, as implemented in the evaluation code.
+* How are API limits handled? Provider integrations include basic rate limiting to reduce request failures.
+
+License
+
+Apache 2.0
diff --git a/requirements.txt b/requirements.txt
@@ -7,15 +7,16 @@ pytest==8.2.0
 pytest-cov==5.0.0
 ruff==0.6.0
 plotly==5.22.0
-numpy==1.26.0
+numpy
 anthropic==0.30.0
 google-generativeai==0.8.0
 streamlit==1.35.0
-pandas==2.2.0
-scikit-learn==1.5.0
+pandas
+scikit-learn
+pyyaml==6.0.1
 
 # Performance Evaluation Dependencies
-matplotlib==3.9.0
+matplotlib
 
 # Node.js dependencies (for package.json)
 # openai

diff --git a/tests/e2e/cli.py b/tests/e2e/cli.py
@@ -31,7 +31,7 @@ def test_cli_models():
         [sys.executable, str(cli_path), "models"], capture_output=True, text=True
     )
     assert result.returncode == 0
-    assert "deepseek r1" in result.stdout.lower()
+    assert "deepseek-r1" in result.stdout.lower()
     assert "gpt-4o" in result.stdout.lower()
 
 

diff --git a/utils/validate_data.py b/utils/validate_data.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""
+Data validation utility for benchmarking configurations.
+"""
+
+import sys
+from pathlib import Path
+
+import yaml
+
+
+def validate_config(config_path):
+    """Check the benchmark config; print the first problem and return False, else True."""
+    if not config_path.exists():
+        print(f"Error: Config file {config_path} not found.")
+        return False
+
+    try:
+        with open(config_path, encoding="utf-8") as f:
+            config = yaml.safe_load(f)
+    except yaml.YAMLError as e:
+        print(f"Error parsing YAML: {e}")
+        return False
+
+    # safe_load returns None for an empty file; only a mapping can hold a "models" list.
+    models = config.get("models") if isinstance(config, dict) else None
+    if not isinstance(models, list) or not models:
+        print("Error: No models defined in config.")
+        return False
+    for model in models:
+        # Entries must be mappings: `"name" in "a string"` is only a substring test.
+        if not (isinstance(model, dict) and "name" in model and "type" in model):
+            print(f"Error: Model missing required fields: {model}")
+            return False
+    print("Config validation passed.")
+    return True
+
+
+def main():
+    """Validate <repo>/config/benchmark.yaml; exit with status 1 if it is invalid."""
+    if not validate_config(Path(__file__).parent.parent / "config" / "benchmark.yaml"):
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()