 from __future__ import annotations
 
+from pathlib import Path
 from typing import Any
 
 from .types import FileScore
+from .quality import build_feedback
+from .quality import extract_keywords
+from .quality import load_issue_texts
+from .quality import load_task_statements
+from .quality import resolve_evidence_paths
+from .quality import summarize_text_themes
+from .quality import task_keyword_coverage
+
+
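+# Collect task/issue evidence once; scorers consume the resulting plain dict.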
+def load_evidence(cwd: Any, config: Any) -> dict[str, Any]:
+    task_paths = resolve_evidence_paths(cwd, list(getattr(config.evidence, "task_files", [])))
+    issue_paths = resolve_evidence_paths(cwd, list(getattr(config.evidence, "issue_files", [])))
+    task_statements = load_task_statements(task_paths)
+    issue_texts = load_issue_texts(issue_paths)
+    return {
+        "task_paths": [str(path) for path in task_paths],
+        "issue_paths": [str(path) for path in issue_paths],
+        "task_statements": task_statements,
+        "task_keywords": extract_keywords(task_statements),
+        "task_themes": summarize_text_themes(task_statements),
+        "issue_themes": summarize_text_themes(issue_texts),
+    }
+
 
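+# Prefer text captured during scanning; otherwise read the file from disk, tolerating decode errors.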
+def _load_entry_text(entry: dict[str, Any]) -> str:
+    if isinstance(entry.get("text"), str):
+        return str(entry["text"])
+    path = Path(str(entry.get("path", "")))
+    if not path.exists():
+        return ""
+    return path.read_text(encoding="utf-8", errors="replace")
 
-def _score_agents(entry: dict[str, Any]) -> FileScore:
+
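+# Agents files start at a perfect 1.0 and take bounded deductions per detected issue.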
+def _score_agents(entry: dict[str, Any], evidence: dict[str, Any] | None = None) -> FileScore:
     score = 1.0
     issues: list[str] = []
     details: dict[str, Any] = {}
     words = int(entry.get("words", 0))
     tokens = int(entry.get("token_estimate", 0))
+    metadata = dict(entry.get("metadata", {}))
+    flags = dict(metadata.get("instruction_flags", {}))
+    contradictions = list(metadata.get("contradictions", []))
+    duplicate_count = int(metadata.get("duplicate_nonempty_line_count", 0))
+    evidence = evidence or {}
+    task_coverage = task_keyword_coverage(_load_entry_text(entry), evidence.get("task_keywords", []))
 
     if words < 80:
         score -= 0.2
@@ -24,9 +62,53 @@ def _score_agents(entry: dict[str, Any]) -> FileScore:
     if "empty_agents" in entry.get("issues", []):
         score -= 0.6
         issues.append("empty")
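+    # Repeat-count deductions are capped via min(); missing-section flags take fixed penalties.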
+    if contradictions:
+        score -= min(0.25, 0.08 * len(contradictions))
+        issues.append("contradictions")
+    if duplicate_count:
+        score -= min(0.12, 0.03 * duplicate_count)
+        issues.append("duplicate_lines")
+    if not flags.get("has_role"):
+        score -= 0.08
+        issues.append("missing_role")
+    if not flags.get("has_constraints"):
+        score -= 0.1
+        issues.append("missing_constraints")
+    if not flags.get("has_workflow"):
+        score -= 0.1
+        issues.append("missing_workflow")
+    if not flags.get("has_verification"):
+        score -= 0.14
+        issues.append("missing_verification")
+    if not flags.get("has_output_contract"):
+        score -= 0.08
+        issues.append("missing_output_contract")
+    if not flags.get("has_repo_context"):
+        score -= 0.06
+        issues.append("missing_repo_context")
+    if task_coverage < 0.2 and evidence.get("task_keywords"):
+        score -= 0.08
+        issues.append("weak_task_alignment")
 
     details["words"] = words
     details["token_estimate"] = tokens
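+    # Per-criterion diagnostics: reported in details, not folded back into the headline score.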
+    details["criteria_scores"] = {
+        "clarity": round(max(0.0, 1.0 - (0.16 if contradictions else 0.0) - (0.05 if duplicate_count else 0.0)), 4),
+        "safety": round(max(0.0, 1.0 - (0.2 if not flags.get("has_constraints") else 0.0)), 4),
+        "workflow": round(max(0.0, 1.0 - (0.18 if not flags.get("has_workflow") else 0.0)), 4),
+        "verification": round(max(0.0, 1.0 - (0.22 if not flags.get("has_verification") else 0.0)), 4),
+        "output_contract": round(max(0.0, 1.0 - (0.18 if not flags.get("has_output_contract") else 0.0)), 4),
+        "repo_specificity": round(max(0.0, 0.5 + min(task_coverage, 0.5) - (0.1 if not flags.get("has_repo_context") else 0.0)), 4),
+        "token_efficiency": round(max(0.0, min(1.0, 1.0 - max(0, words - 2400) / 4000.0)), 4),
+    }
+    details["task_keyword_coverage"] = round(task_coverage, 4)
+    details["feedback"] = build_feedback(
+        kind="agents",
+        metadata=metadata,
+        task_themes=evidence.get("task_themes", {}),
+        issue_themes=evidence.get("issue_themes", {}),
+        task_coverage=task_coverage,
+    )
     return FileScore(
         path=entry["path"],
         kind="agents",
@@ -36,12 +118,17 @@ def _score_agents(entry: dict[str, Any]) -> FileScore:
     )
 
 
-def _score_skill(entry: dict[str, Any]) -> FileScore:
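+# Skill scoring mirrors the agents scorer but checks skill-specific flags such as trigger phrases.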
+def _score_skill(entry: dict[str, Any], evidence: dict[str, Any] | None = None) -> FileScore:
     score = 1.0
     issues: list[str] = []
     details: dict[str, Any] = {}
     words = int(entry.get("words", 0))
     entry_issues = set(entry.get("issues", []))
+    metadata = dict(entry.get("metadata", {}))
+    flags = dict(metadata.get("instruction_flags", {}))
+    duplicate_count = int(metadata.get("duplicate_nonempty_line_count", 0))
+    evidence = evidence or {}
+    task_coverage = task_keyword_coverage(_load_entry_text(entry), evidence.get("task_keywords", []))
 
     if "missing_frontmatter" in entry_issues:
         score -= 0.6
@@ -64,9 +151,40 @@ def _score_skill(entry: dict[str, Any]) -> FileScore:
     if words > 1800:
         score -= min(0.25, (words - 1800) / 10000.0)
         issues.append("too_long")
+    if duplicate_count:
+        score -= min(0.12, 0.03 * duplicate_count)
+        issues.append("duplicate_lines")
+    if not flags.get("has_trigger_phrase"):
+        score -= 0.1
+        issues.append("missing_trigger")
+    if not flags.get("has_workflow"):
+        score -= 0.08
+        issues.append("missing_workflow")
+    if not flags.get("has_verification"):
+        score -= 0.1
+        issues.append("missing_verification")
+    if task_coverage < 0.15 and evidence.get("task_keywords"):
+        score -= 0.06
+        issues.append("weak_task_alignment")
 
     details["words"] = words
     details["frontmatter_present"] = entry.get("metadata", {}).get("frontmatter_present", False)
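+    # Same diagnostic breakdown as the agents scorer, with skill-specific criteria.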
+    details["criteria_scores"] = {
+        "metadata": round(max(0.0, 1.0 - (0.3 if "missing_frontmatter" in entry_issues else 0.0)), 4),
+        "trigger_clarity": round(max(0.0, 1.0 - (0.2 if not flags.get("has_trigger_phrase") else 0.0)), 4),
+        "workflow": round(max(0.0, 1.0 - (0.16 if not flags.get("has_workflow") else 0.0)), 4),
+        "verification": round(max(0.0, 1.0 - (0.2 if not flags.get("has_verification") else 0.0)), 4),
+        "token_efficiency": round(max(0.0, min(1.0, 1.0 - max(0, words - 1800) / 3000.0)), 4),
+        "repo_specificity": round(max(0.0, 0.5 + min(task_coverage, 0.5)), 4),
+    }
+    details["task_keyword_coverage"] = round(task_coverage, 4)
+    details["feedback"] = build_feedback(
+        kind="skill",
+        metadata=metadata,
+        task_themes=evidence.get("task_themes", {}),
+        issue_themes=evidence.get("issue_themes", {}),
+        task_coverage=task_coverage,
+    )
     return FileScore(
         path=entry["path"],
         kind="skill",
@@ -76,22 +194,23 @@ def _score_skill(entry: dict[str, Any]) -> FileScore:
     )
 
 
-def score_entry(entry: dict[str, Any]) -> FileScore:
+def score_entry(entry: dict[str, Any], evidence: dict[str, Any] | None = None) -> FileScore:
     kind = entry.get("kind")
     if kind == "agents":
-        return _score_agents(entry)
-    return _score_skill(entry)
+        return _score_agents(entry, evidence=evidence)
+    return _score_skill(entry, evidence=evidence)
 
 
-def run_benchmark(scan_result: dict[str, Any]) -> dict[str, Any]:
-    scores: list[FileScore] = [score_entry(entry) for entry in scan_result["entries"]]
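+# Evidence flows through score_entry into both scorers and is echoed in the report.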
+def run_benchmark(scan_result: dict[str, Any], evidence: dict[str, Any] | None = None) -> dict[str, Any]:
+    scores: list[FileScore] = [score_entry(entry, evidence=evidence) for entry in scan_result["entries"]]
     if scores:
         overall = sum(item.score for item in scores) / len(scores)
     else:
         overall = 0.0
 
     return {
         "counts": scan_result["counts"],
+        "evidence": evidence or {},
         "overall_score": round(overall, 4),
         "files": [
             {
@@ -108,6 +227,11 @@ def run_benchmark(scan_result: dict[str, Any]) -> dict[str, Any]:
 
 def print_benchmark_summary(result: dict[str, Any]) -> None:
     print(f"overall_score: {result['overall_score']:.4f}")
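+    # Summarize how much evidence informed the run before the per-file listing.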
+    evidence = result.get("evidence", {})
+    if evidence.get("task_paths"):
+        print(f"task_files: {len(evidence['task_paths'])}")
+    if evidence.get("issue_paths"):
+        print(f"issue_files: {len(evidence['issue_paths'])}")
     for file_result in result["files"]:
         issues = ", ".join(file_result["issues"]) if file_result["issues"] else "ok"
         print(f"- {file_result['kind']}: {file_result['path']}")