advikdivekar · HeetRanpura · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/README.md b/README.md
diff --git a/inference.py b/inference.py
@@ -23,10 +23,11 @@
 # ENVIRONMENT CONFIGURATION
 # All credentials read from environment — never hardcoded.
 # =========================================================
-API_BASE_URL   = os.getenv("API_BASE_URL",   "https://router.huggingface.co/v1")
-MODEL_NAME     = os.getenv("MODEL_NAME",     "Qwen/Qwen2.5-7B-Instruct")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") or os.getenv("HF_TOKEN", "")  # FIX A1
-ENV_URL        = os.getenv("ENV_URL",        "http://localhost:7860")
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-7B-Instruct")
+HF_TOKEN = os.getenv("HF_TOKEN")
+LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
+ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
 
 INFERENCE_TEMPERATURE = float(os.getenv("INFERENCE_TEMPERATURE", "0.0"))
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "1500"))
@@ -79,12 +80,12 @@ def normalize_provider_config(base_url: str, model_name: str) -> tuple[str, str]
 
 
 API_BASE_URL, MODEL_NAME = normalize_provider_config(API_BASE_URL, MODEL_NAME)
-client = OpenAI(base_url=API_BASE_URL, api_key=OPENAI_API_KEY)
+client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
 
 
-if "huggingface.co" in API_BASE_URL and not OPENAI_API_KEY:
+if "huggingface.co" in API_BASE_URL and not HF_TOKEN:
     print(
-        "[CONFIG] Missing HF_TOKEN / OPENAI_API_KEY for Hugging Face Router. "
+        "[CONFIG] Missing HF_TOKEN for the configured endpoint. "
         "Set HF_TOKEN in your environment or .env file.",
         flush=True,
     )

diff --git a/reports/README.txt b/reports/README.txt
@@ -0,0 +1,44 @@
+OpenEnv scheme_env Benchmark — Baseline Report
+================================================
+
+Files in this directory:
+
+  leaderboard.csv
+      Model rankings sorted by average score (descending).
+      Columns: Model, Size, Task1, Task2, Task3, Task4, Task5, Average.
+
+  results.json
+      Full results for all models including per-task scores and standard
+      deviations. Useful for programmatic downstream analysis.
+
+  average_scores.png
+      Horizontal bar chart of each model's average score across all 5 tasks.
+      Bars are colour-coded: red < 0.50, orange 0.50–0.75, green > 0.75.
+
+  task_heatmap.png
+      Heatmap with models as rows and tasks as columns.
+      Colour scale: red = 0.0, yellow = 0.5, green = 1.0 (RdYlGn).
+      Cell values show the exact score.
+
+  efficiency_scatter.png
+      Scatter plot of average score (x) vs Task 4 score (y).
+      Task 4 is the escalation-dilemma task and tests protocol adherence.
+      Each point is labelled with the short model name.
+
+  difficulty_profile.png
+      Line chart of the mean score per task across all 8 models, with error bars
+      of ±1 standard deviation. Reveals which tasks are hardest / easiest on average.
+
+  summary.txt
+      Plain-text summary: best/worst model, hardest/easiest task, and any
+      model that scored 1.0 on every task.
+
+  README.txt
+      This file.
+
+Tasks:
+  Task 1 — Basic eligibility check
+  Task 2 — Multi-criterion scheme selection
+  Task 3 — Income-threshold boundary case
+  Task 4 — Escalation dilemma (employment data conflict)
+  Task 5 — Document-verification age conflict
diff --git a/reports/average_scores.png b/reports/average_scores.png
diff --git a/reports/baseline_report/README.txt b/reports/baseline_report/README.txt
@@ -0,0 +1,44 @@
+OpenEnv scheme_env Benchmark — Baseline Report
+================================================
+
+Files in this directory:
+
+  leaderboard.csv
+      Model rankings sorted by average score (descending).
+      Columns: Model, Size, Task1, Task2, Task3, Task4, Task5, Average.
+
+  results.json
+      Full results for all models including per-task scores and standard
+      deviations. Useful for programmatic downstream analysis.
+
+  average_scores.png
+      Horizontal bar chart of each model's average score across all 5 tasks.
+      Bars are colour-coded: red < 0.50, orange 0.50–0.75, green > 0.75.
+
+  task_heatmap.png
+      Heatmap with models as rows and tasks as columns.
+      Colour scale: red = 0.0, yellow = 0.5, green = 1.0 (RdYlGn).
+      Cell values show the exact score.
+
+  efficiency_scatter.png
+      Scatter plot of average score (x) vs Task 4 score (y).
+      Task 4 is the escalation-dilemma task and tests protocol adherence.
+      Each point is labelled with the short model name.
+
+  difficulty_profile.png
+      Line chart of the mean score per task across all 8 models, with error bars
+      of ±1 standard deviation. Reveals which tasks are hardest / easiest on average.
+
+  summary.txt
+      Plain-text summary: best/worst model, hardest/easiest task, and any
+      model that scored 1.0 on every task.
+
+  README.txt
+      This file.
+
+Tasks:
+  Task 1 — Basic eligibility check
+  Task 2 — Multi-criterion scheme selection
+  Task 3 — Income-threshold boundary case
+  Task 4 — Escalation dilemma (employment data conflict)
+  Task 5 — Document-verification age conflict
diff --git a/reports/baseline_report/average_scores.png b/reports/baseline_report/average_scores.png
diff --git a/reports/baseline_report/difficulty_profile.png b/reports/baseline_report/difficulty_profile.png
diff --git a/reports/baseline_report/efficiency_scatter.png b/reports/baseline_report/efficiency_scatter.png