
Commit 57267a7

Implement prompt learning
1 parent 5861739 commit 57267a7

13 files changed

Lines changed: 661 additions & 24 deletions


README.md

Lines changed: 58 additions & 1 deletion
````diff
@@ -28,9 +28,11 @@ CodexOpt turns these edits into measurable runs with artifacts you can inspect a
 ## Features
 
 - Project scan with issue detection for agents and skills.
-- Heuristic benchmark scoring.
+- Benchmark scoring with sub-scores and natural-language feedback.
+- Optional evidence inputs from repo task files and issue exports.
 - Optimization engine `heuristic` (default, local and deterministic).
 - Optional optimization engine `gepa` (via `gepa.optimize_anything`).
+- Explicit reporting when a GEPA-requested run falls back to heuristic optimization.
 - Safe apply flow with automatic backups.
 - Markdown reporting from latest runs.
 - Minimal OSS CI (lint, test, build).
@@ -90,6 +92,44 @@ uv run codexopt apply --kind agents
 uv run codexopt report --output codexopt-report.md
 ```
 
+## How Teams Use CodexOpt
+
+Developers use CodexOpt in the repository that contains their Codex instruction assets:
+
+- `AGENTS.md`
+- `.codex/skills/**/SKILL.md`
+
+Optional evidence can also be added to improve benchmarking and optimization quality:
+
+- task files (`tasks.md`, task lists, or JSON fixtures)
+- issue/review exports (`issues.md` or JSON exports)
+
+Typical workflow:
+
+1. Run `scan` and `benchmark` to measure the current instruction assets.
+2. Run `optimize agents` and `optimize skills` to generate improved candidates.
+3. Review the generated diffs and report artifacts under `.codexopt/runs/`.
+4. Run `apply --dry-run` first, then apply accepted changes.
+5. Commit the updated instruction files and, if useful, attach the report to a PR.
+
+Example with optional evidence configured in `codexopt.yaml`:
+
+```yaml
+evidence:
+  task_files:
+    - tasks.md
+  issue_files:
+    - issues.md
+```
+
+With that config in place, `benchmark` and `optimize` use:
+
+- static prompt-quality checks
+- repo task alignment
+- recurring issue/review themes
+
+Today, task and issue files influence scoring and feedback. CodexOpt does not yet execute full agent task simulations.
+
 Use `codexopt.example.yaml` as a starting point for committed team config.
 
 ## Command Reference
@@ -186,6 +226,9 @@ targets:
   - "reference/**"
 output:
   root_dir: ".codexopt"
+evidence:
+  task_files: []
+  issue_files: []
 optimization:
   engine: "heuristic"
   min_apply_delta: 0.01
@@ -199,6 +242,8 @@ Config notes:
 - `targets.skills_globs`: glob patterns for `SKILL.md` targets.
 - `targets.exclude_globs`: paths ignored during scan.
 - `output.root_dir`: run artifacts and backups location.
+- `evidence.task_files`: optional markdown/json task lists used for repo-alignment scoring.
+- `evidence.issue_files`: optional markdown/json issue or review exports used for theme-aware feedback.
 - `optimization.engine`: default optimization engine.
 - `optimization.min_apply_delta`: minimum score gain required to apply.
 - `optimization.max_metric_calls`: GEPA metric budget.
@@ -213,13 +258,24 @@ AGENTS scoring factors include:
 - Too short or too long content penalties.
 - Token-heaviness estimate penalty.
 - Empty file penalty.
+- Contradictory guidance penalties.
+- Missing workflow / verification / output-format guidance penalties.
+- Repo-context and task-alignment signals when evidence files are configured.
 
 SKILL scoring factors include:
 
 - Missing frontmatter penalties.
 - Missing `name` / `description` penalties.
 - Overly long frontmatter fields penalties.
 - Too short or too long content penalties.
+- Weak trigger/workflow/verification guidance penalties.
+- Repo task alignment signals when evidence files are configured.
+
+Each benchmarked file also includes:
+
+- criterion-level sub-scores
+- natural-language feedback
+- optional evidence summary from configured task/issue files
 
 ## Optimization Behavior
 
@@ -246,6 +302,7 @@ Requirements:
 Fallback behavior:
 
 - If GEPA is unavailable or errors, CodexOpt falls back to heuristic optimization.
+- Fallbacks are recorded in optimization artifacts, CLI summaries, and reports.
 
 ## Artifacts and State
 
````
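Note: the config notes above describe `optimization.min_apply_delta` as the minimum score gain required before a candidate is applied. The apply gate itself is not part of this commit; the sketch below only illustrates the documented rule, and `should_apply` is a hypothetical helper name.

```python
# Hedged sketch of the documented min_apply_delta rule; the real gate lives
# in CodexOpt's optimize/apply flow, which this commit does not touch.
# should_apply is a hypothetical name used for illustration only.
def should_apply(original_score: float, candidate_score: float, min_apply_delta: float = 0.01) -> bool:
    # Apply a candidate only when it beats the original score by at least the delta.
    return (candidate_score - original_score) >= min_apply_delta


assert should_apply(0.71, 0.74)        # +0.03 gain clears the 0.01 bar
assert not should_apply(0.71, 0.715)   # +0.005 gain does not
```

The scores compared here are the 0-1 benchmark scores produced by `run_benchmark` in `src/codexopt/benchmark.py` below.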
codexopt.example.yaml

Lines changed: 3 additions & 0 deletions
```diff
@@ -15,6 +15,9 @@ targets:
   - "reference/**"
 output:
   root_dir: ".codexopt"
+evidence:
+  task_files: []
+  issue_files: []
 optimization:
   engine: "heuristic"
   min_apply_delta: 0.01
```
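The new `evidence` keys are consumed by the `load_evidence` helper added in `src/codexopt/benchmark.py` below, which reads them through `getattr(config.evidence, ...)`. A minimal sketch of that call, assuming the package is importable and using `SimpleNamespace` as a stand-in for the real config classes (which are not part of this diff):

```python
# Sketch: feeding the evidence config into the new loader. SimpleNamespace
# stands in for codexopt's real config classes, which this commit does not
# show; load_evidence only needs attribute access to config.evidence.
from pathlib import Path
from types import SimpleNamespace

from codexopt.benchmark import load_evidence

cfg = SimpleNamespace(
    evidence=SimpleNamespace(task_files=["tasks.md"], issue_files=["issues.md"])
)
evidence = load_evidence(Path.cwd(), cfg)

# Keys returned by this commit's load_evidence: task_paths, issue_paths,
# task_statements, task_keywords, task_themes, issue_themes.
print(evidence["task_paths"], evidence["issue_paths"])
```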

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "codexopt"
-version = "0.1.0"
+version = "0.1.1"
 description = "CodexOpt: Optimize your Agents.MD and Skills for Codex with GEPA"
 readme = "README.md"
 requires-python = ">=3.10"
```

src/codexopt/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
 """codexopt package."""
 
 __all__ = ["__version__"]
-__version__ = "0.1.0"
+__version__ = "0.1.1"
```

src/codexopt/benchmark.py

Lines changed: 131 additions & 7 deletions
```diff
@@ -1,16 +1,54 @@
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Any
 
 from .types import FileScore
+from .quality import build_feedback
+from .quality import extract_keywords
+from .quality import load_issue_texts
+from .quality import load_task_statements
+from .quality import resolve_evidence_paths
+from .quality import summarize_text_themes
+from .quality import task_keyword_coverage
+
+
+def load_evidence(cwd: Any, config: Any) -> dict[str, Any]:
+    task_paths = resolve_evidence_paths(cwd, list(getattr(config.evidence, "task_files", [])))
+    issue_paths = resolve_evidence_paths(cwd, list(getattr(config.evidence, "issue_files", [])))
+    task_statements = load_task_statements(task_paths)
+    issue_texts = load_issue_texts(issue_paths)
+    return {
+        "task_paths": [str(path) for path in task_paths],
+        "issue_paths": [str(path) for path in issue_paths],
+        "task_statements": task_statements,
+        "task_keywords": extract_keywords(task_statements),
+        "task_themes": summarize_text_themes(task_statements),
+        "issue_themes": summarize_text_themes(issue_texts),
+    }
+
 
+def _load_entry_text(entry: dict[str, Any]) -> str:
+    if isinstance(entry.get("text"), str):
+        return str(entry["text"])
+    path = Path(str(entry.get("path", "")))
+    if not path.exists():
+        return ""
+    return path.read_text(encoding="utf-8", errors="replace")
 
-def _score_agents(entry: dict[str, Any]) -> FileScore:
+
+def _score_agents(entry: dict[str, Any], evidence: dict[str, Any] | None = None) -> FileScore:
     score = 1.0
     issues: list[str] = []
     details: dict[str, Any] = {}
     words = int(entry.get("words", 0))
     tokens = int(entry.get("token_estimate", 0))
+    metadata = dict(entry.get("metadata", {}))
+    flags = dict(metadata.get("instruction_flags", {}))
+    contradictions = list(metadata.get("contradictions", []))
+    duplicate_count = int(metadata.get("duplicate_nonempty_line_count", 0))
+    evidence = evidence or {}
+    task_coverage = task_keyword_coverage(_load_entry_text(entry), evidence.get("task_keywords", []))
 
     if words < 80:
         score -= 0.2
@@ -24,9 +62,53 @@ def _score_agents(entry: dict[str, Any]) -> FileScore:
     if "empty_agents" in entry.get("issues", []):
         score -= 0.6
         issues.append("empty")
+    if contradictions:
+        score -= min(0.25, 0.08 * len(contradictions))
+        issues.append("contradictions")
+    if duplicate_count:
+        score -= min(0.12, 0.03 * duplicate_count)
+        issues.append("duplicate_lines")
+    if not flags.get("has_role"):
+        score -= 0.08
+        issues.append("missing_role")
+    if not flags.get("has_constraints"):
+        score -= 0.1
+        issues.append("missing_constraints")
+    if not flags.get("has_workflow"):
+        score -= 0.1
+        issues.append("missing_workflow")
+    if not flags.get("has_verification"):
+        score -= 0.14
+        issues.append("missing_verification")
+    if not flags.get("has_output_contract"):
+        score -= 0.08
+        issues.append("missing_output_contract")
+    if not flags.get("has_repo_context"):
+        score -= 0.06
+        issues.append("missing_repo_context")
+    if task_coverage < 0.2 and evidence.get("task_keywords"):
+        score -= 0.08
+        issues.append("weak_task_alignment")
 
     details["words"] = words
     details["token_estimate"] = tokens
+    details["criteria_scores"] = {
+        "clarity": round(max(0.0, 1.0 - (0.16 if contradictions else 0.0) - (0.05 if duplicate_count else 0.0)), 4),
+        "safety": round(max(0.0, 1.0 - (0.2 if not flags.get("has_constraints") else 0.0)), 4),
+        "workflow": round(max(0.0, 1.0 - (0.18 if not flags.get("has_workflow") else 0.0)), 4),
+        "verification": round(max(0.0, 1.0 - (0.22 if not flags.get("has_verification") else 0.0)), 4),
+        "output_contract": round(max(0.0, 1.0 - (0.18 if not flags.get("has_output_contract") else 0.0)), 4),
+        "repo_specificity": round(max(0.0, 0.5 + min(task_coverage, 0.5) - (0.1 if not flags.get("has_repo_context") else 0.0)), 4),
+        "token_efficiency": round(max(0.0, min(1.0, 1.0 - max(0, words - 2400) / 4000.0)), 4),
+    }
+    details["task_keyword_coverage"] = round(task_coverage, 4)
+    details["feedback"] = build_feedback(
+        kind="agents",
+        metadata=metadata,
+        task_themes=evidence.get("task_themes", {}),
+        issue_themes=evidence.get("issue_themes", {}),
+        task_coverage=task_coverage,
+    )
     return FileScore(
         path=entry["path"],
         kind="agents",
@@ -36,12 +118,17 @@ def _score_agents(entry: dict[str, Any]) -> FileScore:
     )
 
 
-def _score_skill(entry: dict[str, Any]) -> FileScore:
+def _score_skill(entry: dict[str, Any], evidence: dict[str, Any] | None = None) -> FileScore:
     score = 1.0
     issues: list[str] = []
     details: dict[str, Any] = {}
     words = int(entry.get("words", 0))
     entry_issues = set(entry.get("issues", []))
+    metadata = dict(entry.get("metadata", {}))
+    flags = dict(metadata.get("instruction_flags", {}))
+    duplicate_count = int(metadata.get("duplicate_nonempty_line_count", 0))
+    evidence = evidence or {}
+    task_coverage = task_keyword_coverage(_load_entry_text(entry), evidence.get("task_keywords", []))
 
     if "missing_frontmatter" in entry_issues:
         score -= 0.6
@@ -64,9 +151,40 @@ def _score_skill(entry: dict[str, Any]) -> FileScore:
     if words > 1800:
         score -= min(0.25, (words - 1800) / 10000.0)
         issues.append("too_long")
+    if duplicate_count:
+        score -= min(0.12, 0.03 * duplicate_count)
+        issues.append("duplicate_lines")
+    if not flags.get("has_trigger_phrase"):
+        score -= 0.1
+        issues.append("missing_trigger")
+    if not flags.get("has_workflow"):
+        score -= 0.08
+        issues.append("missing_workflow")
+    if not flags.get("has_verification"):
+        score -= 0.1
+        issues.append("missing_verification")
+    if task_coverage < 0.15 and evidence.get("task_keywords"):
+        score -= 0.06
+        issues.append("weak_task_alignment")
 
     details["words"] = words
     details["frontmatter_present"] = entry.get("metadata", {}).get("frontmatter_present", False)
+    details["criteria_scores"] = {
+        "metadata": round(max(0.0, 1.0 - (0.3 if "missing_frontmatter" in entry_issues else 0.0)), 4),
+        "trigger_clarity": round(max(0.0, 1.0 - (0.2 if not flags.get("has_trigger_phrase") else 0.0)), 4),
+        "workflow": round(max(0.0, 1.0 - (0.16 if not flags.get("has_workflow") else 0.0)), 4),
+        "verification": round(max(0.0, 1.0 - (0.2 if not flags.get("has_verification") else 0.0)), 4),
+        "token_efficiency": round(max(0.0, min(1.0, 1.0 - max(0, words - 1800) / 3000.0)), 4),
+        "repo_specificity": round(max(0.0, 0.5 + min(task_coverage, 0.5)), 4),
+    }
+    details["task_keyword_coverage"] = round(task_coverage, 4)
+    details["feedback"] = build_feedback(
+        kind="skill",
+        metadata=metadata,
+        task_themes=evidence.get("task_themes", {}),
+        issue_themes=evidence.get("issue_themes", {}),
+        task_coverage=task_coverage,
+    )
     return FileScore(
         path=entry["path"],
         kind="skill",
@@ -76,22 +194,23 @@ def _score_skill(entry: dict[str, Any]) -> FileScore:
     )
 
 
-def score_entry(entry: dict[str, Any]) -> FileScore:
+def score_entry(entry: dict[str, Any], evidence: dict[str, Any] | None = None) -> FileScore:
     kind = entry.get("kind")
     if kind == "agents":
-        return _score_agents(entry)
-    return _score_skill(entry)
+        return _score_agents(entry, evidence=evidence)
+    return _score_skill(entry, evidence=evidence)
 
 
-def run_benchmark(scan_result: dict[str, Any]) -> dict[str, Any]:
-    scores: list[FileScore] = [score_entry(entry) for entry in scan_result["entries"]]
+def run_benchmark(scan_result: dict[str, Any], evidence: dict[str, Any] | None = None) -> dict[str, Any]:
+    scores: list[FileScore] = [score_entry(entry, evidence=evidence) for entry in scan_result["entries"]]
     if scores:
         overall = sum(item.score for item in scores) / len(scores)
     else:
         overall = 0.0
 
     return {
         "counts": scan_result["counts"],
+        "evidence": evidence or {},
         "overall_score": round(overall, 4),
         "files": [
             {
@@ -108,6 +227,11 @@ def run_benchmark(scan_result: dict[str, Any]) -> dict[str, Any]:
 
 def print_benchmark_summary(result: dict[str, Any]) -> None:
     print(f"overall_score: {result['overall_score']:.4f}")
+    evidence = result.get("evidence", {})
+    if evidence.get("task_paths"):
+        print(f"task_files: {len(evidence['task_paths'])}")
+    if evidence.get("issue_paths"):
+        print(f"issue_files: {len(evidence['issue_paths'])}")
     for file_result in result["files"]:
         issues = ", ".join(file_result["issues"]) if file_result["issues"] else "ok"
         print(f"- {file_result['kind']}: {file_result['path']}")
```
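A hedged usage sketch of the new scoring path: it hand-builds one AGENTS entry instead of calling `scan_project`, and assumes the `.quality` helpers imported above are installed and tolerate an empty keyword list (as the default no-evidence path requires). The entry fields mirror what `_score_agents` reads.

```python
from codexopt.benchmark import print_benchmark_summary, run_benchmark

# One hand-built AGENTS entry; an inline "text" value means _load_entry_text
# never touches the filesystem. Empty metadata leaves every instruction flag
# unset, so the missing_* penalties all fire.
scan_result = {
    "counts": {"agents": 1, "skills": 0},
    "entries": [
        {
            "kind": "agents",
            "path": "AGENTS.md",
            "words": 40,            # under the 80-word floor
            "token_estimate": 60,
            "issues": [],
            "metadata": {},
            "text": "Be concise.",
        }
    ],
}

result = run_benchmark(scan_result, evidence=None)
print(result["files"][0]["issues"])   # expect missing_role, missing_constraints, ...
print_benchmark_summary(result)       # no task/issue counts without evidence
```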

src/codexopt/cli.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -12,6 +12,7 @@
 from .artifacts import resolve_run_id
 from .artifacts import set_latest_run
 from .artifacts import write_json
+from .benchmark import load_evidence
 from .benchmark import print_benchmark_summary
 from .benchmark import run_benchmark
 from .config import DEFAULT_CONFIG_FILENAME
@@ -55,7 +56,8 @@ def cmd_benchmark(args: argparse.Namespace) -> int:
     cfg, _cfg_path = _resolve_config(args.config)
     cwd = Path.cwd()
     scan_result = scan_project(cwd, cfg)
-    benchmark_result = run_benchmark(scan_result)
+    evidence = load_evidence(cwd, cfg)
+    benchmark_result = run_benchmark(scan_result, evidence=evidence)
 
     output_root = Path(cfg.output.root_dir)
     run_id, run_dir = new_run_dir(output_root, "benchmark")
@@ -82,13 +84,15 @@ def _optimize(args: argparse.Namespace, kind: str) -> int:
         agents_files=agents_patterns,
         skills_globs=skills_patterns,
     )
+    evidence = load_evidence(cwd, cfg)
     result = optimize_entries(
         entries=scan_result["entries"],
         kind=kind,
         engine=args.engine or cfg.optimization.engine,
         min_delta=cfg.optimization.min_apply_delta,
         reflection_model=args.reflection_model or cfg.optimization.reflection_model,
         max_metric_calls=args.max_metric_calls or cfg.optimization.max_metric_calls,
+        evidence=evidence,
     )
 
     output_root = Path(cfg.output.root_dir)
```
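Finally, a hypothetical smoke test for the new wiring, grounded in the `benchmark.py` diff above: `run_benchmark` embeds the evidence dict in its result, and `print_benchmark_summary` reports the evidence file counts. It assumes `codexopt` is importable in the test environment.

```python
from codexopt.benchmark import print_benchmark_summary, run_benchmark


def test_benchmark_result_carries_evidence() -> None:
    # No entries: overall score falls back to 0.0 and "files" stays empty.
    scan_result = {"counts": {}, "entries": []}
    evidence = {"task_paths": ["tasks.md"], "issue_paths": [], "task_keywords": []}

    result = run_benchmark(scan_result, evidence=evidence)

    assert result["evidence"] == evidence
    assert result["overall_score"] == 0.0
    print_benchmark_summary(result)  # prints overall_score and "task_files: 1"
```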
