From 665911cdcba094545755aff06cc723cc47c0ed3e Mon Sep 17 00:00:00 2001 From: Octopus Date: Fri, 3 Apr 2026 09:46:27 +0800 Subject: [PATCH] fix: apply regex-based trailing comma removal before JSON parse (fixes #195) - Add missing 'import re' (re was already used in get_first/last_start_page_from_text but not imported, causing NameError if those functions were called) - Move trailing comma cleanup to before the first json.loads() attempt so valid-except-for-trailing-commas JSON succeeds on the first parse - Replace the simple str.replace() calls for ',]' and ',}' with re.sub(r',\s*([}\]])', ...) to handle whitespace between the trailing comma and closing bracket, which smaller LLMs (e.g. qwen2.5:7B) frequently produce --- pageindex/utils.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pageindex/utils.py b/pageindex/utils.py index f00ccf3a7..3df2198fa 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -1,6 +1,7 @@ import litellm import logging import os +import re import textwrap from datetime import datetime import time @@ -113,18 +114,15 @@ def extract_json(content): json_content = json_content.replace('\n', ' ').replace('\r', ' ') # Remove newlines json_content = ' '.join(json_content.split()) # Normalize whitespace + # Remove any trailing commas before closing brackets/braces (handles whitespace variants) + json_content = re.sub(r',\s*([}\]])', r'\1', json_content) + # Attempt to parse and return the JSON object return json.loads(json_content) except json.JSONDecodeError as e: logging.error(f"Failed to extract JSON: {e}") - # Try to clean up the content further if initial parsing fails - try: - # Remove any trailing commas before closing brackets/braces - json_content = json_content.replace(',]', ']').replace(',}', '}') - return json.loads(json_content) - except: - logging.error("Failed to parse JSON even after cleanup") - return {} + logging.error("Failed to parse JSON even after cleanup") + return {} except Exception as e: 
logging.error(f"Unexpected error while extracting JSON: {e}") return {}