@@ -288,7 +288,6 @@ def evaluate_with_llm_judge_groupwise(image_paths: List[str], requirements: List
288288Requirements:
289289{ requirements_text } """
290290
291-
292291 messages = [
293292 {
294293 "role" : "user" ,
@@ -302,7 +301,9 @@ def evaluate_with_llm_judge_groupwise(image_paths: List[str], requirements: List
302301 for image_path in image_paths :
303302 with open (image_path , "rb" ) as f :
304303 image_data = base64 .b64encode (f .read ()).decode ("utf-8" )
305- messages [0 ]["content" ].append ({"type" : "image_url" , "image_url" : {"url" : f"data:image/png;base64,{ image_data } " }})
304+ messages [0 ]["content" ].append (
305+ {"type" : "image_url" , "image_url" : {"url" : f"data:image/png;base64,{ image_data } " }}
306+ )
306307
307308 # Use GPT-4.1 for vision capabilities to match project's OpenAI model preference
308309 response = litellm .completion (
@@ -331,7 +332,6 @@ def evaluate_with_llm_judge_groupwise(image_paths: List[str], requirements: List
331332 raise ValueError ("Missing required field in response" )
332333
333334
334-
335335@evaluation_test (
336336 input_dataset = ["tests/pytest/data/svgbench_dataset.jsonl" ],
337337 dataset_adapter = svgbench_to_evaluation_row ,
@@ -540,15 +540,14 @@ def test_svg_generation_evaluation_groupwise(rows: List[EvaluationRow]) -> List[
540540 row .evaluation_result = EvaluateResult (score = 0.0 , reason = f"Evaluation error: { str (e )} " )
541541
542542 judge_result = evaluate_with_llm_judge_groupwise (image_paths , requirements )
543- print (f' ********** judge_result: { judge_result } **********' )
543+ print (f" ********** judge_result: { judge_result } **********" )
544544 if judge_result .get ("best_image_index" ) == 0 :
545545 rows [0 ].evaluation_result = EvaluateResult (score = 1.0 , reason = judge_result .get ("reasoning" , "" ))
546546 rows [1 ].evaluation_result = EvaluateResult (score = 0.0 , reason = judge_result .get ("reasoning" , "" ))
547547 else :
548548 rows [0 ].evaluation_result = EvaluateResult (score = 0.0 , reason = judge_result .get ("reasoning" , "" ))
549549 rows [1 ].evaluation_result = EvaluateResult (score = 1.0 , reason = judge_result .get ("reasoning" , "" ))
550-
551-
550+
552551 # Clean up temporary PNG files (only if not saving debug files)
553552 if not save_debug_files :
554553 for png_path in image_paths :
0 commit comments