From 6702ca46b04f5c4cd6e809e4a948f38918ee71b2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 28 Aug 2025 14:18:16 +0000
Subject: [PATCH 1/2] Initial plan

From e439182a755beec559db095a90eff67583d63719 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 28 Aug 2025 14:25:58 +0000
Subject: [PATCH 2/2] Add OCR verification to check for text extraction after OCR completion

Co-authored-by: maxi07 <7480270+maxi07@users.noreply.github.com>
---
 ocr_service/main.py            | 20 +++++++++--
 tests/test_ocr_verification.py | 62 ++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_ocr_verification.py

diff --git a/ocr_service/main.py b/ocr_service/main.py
index 7268564..8f452bc 100644
--- a/ocr_service/main.py
+++ b/ocr_service/main.py
@@ -1,9 +1,10 @@
 from scansynclib.logging import logger
 from scansynclib.ProcessItem import ProcessItem, ProcessStatus, OCRStatus
 from scansynclib.sqlite_wrapper import update_scanneddata_database
-from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq
+from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq, extract_text
 import pickle
 import ocrmypdf
+import os
 from datetime import datetime
 import time
 import pika.exceptions
@@ -36,13 +37,26 @@ def start_processing(item: ProcessItem):
 
     try:
         result = ocrmypdf.ocr(item.local_file_path, item.ocr_file, output_type='pdfa', skip_text=True, rotate_pages=True, jpg_quality=80, png_quality=80, optimize=2, language=["eng", "deu"], tesseract_timeout=120)
+        logger.debug(f"OCR exited with code {result}")
+
         if result != 0:
             logger.error(f"OCR exited with code {result}")
             item.ocr_status = OCRStatus.FAILED
         else:
             logger.info(f"OCR processing completed: {item.filename}")
-            logger.debug(f"OCR exited with code {result}")
-            item.ocr_status = OCRStatus.COMPLETED
+
+            # Verify that the OCR file actually contains text
+            if os.path.exists(item.ocr_file):
+                extracted_text = extract_text(item.ocr_file).strip()
+                if extracted_text:
+                    logger.info(f"OCR verification successful: extracted {len(extracted_text)} characters from {item.filename}")
+                    item.ocr_status = OCRStatus.COMPLETED
+                else:
+                    logger.warning(f"OCR verification failed: no text found in OCR output file {item.ocr_file}")
+                    item.ocr_status = OCRStatus.FAILED
+            else:
+                logger.error(f"OCR output file not found: {item.ocr_file}")
+                item.ocr_status = OCRStatus.OUTPUT_ERROR
     except ocrmypdf.UnsupportedImageFormatError:
         logger.error(f"Unsupported image format: {item.local_file_path}")
         item.ocr_status = OCRStatus.UNSUPPORTED
diff --git a/tests/test_ocr_verification.py b/tests/test_ocr_verification.py
new file mode 100644
index 0000000..9f1c12d
--- /dev/null
+++ b/tests/test_ocr_verification.py
@@ -0,0 +1,62 @@
+import pytest
+import os
+import tempfile
+from unittest.mock import Mock, patch, mock_open
+from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus
+
+
+class TestOCRTextVerification:
+    """Test OCR text verification functionality without importing the main OCR service."""
+
+    def test_extract_text_returns_empty_string_on_empty_pdf(self):
+        """Test that extract_text returns empty string for a PDF with no text."""
+        from scansynclib.helpers import extract_text
+
+        # Create a temporary file that simulates an empty PDF
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file.write(b"%PDF-1.4\n")  # Minimal PDF header
+            temp_file_path = temp_file.name
+
+        try:
+            # extract_text should return empty string for malformed/empty PDF
+            result = extract_text(temp_file_path)
+            assert result == ""
+        finally:
+            os.unlink(temp_file_path)
+
+    def test_extract_text_returns_empty_string_on_nonexistent_file(self):
+        """Test that extract_text returns empty string for non-existent file."""
+        from scansynclib.helpers import extract_text
+
+        result = extract_text("/nonexistent/file.pdf")
+        assert result == ""
+
+    @patch('scansynclib.helpers.PdfReader')
+    def test_extract_text_preserves_whitespace(self, mock_pdf_reader):
+        """Test that extract_text returns page text verbatim, without stripping whitespace."""
+        from scansynclib.helpers import extract_text
+
+        # Mock the PDF reader to return text with whitespace
+        mock_page = Mock()
+        mock_page.extract_text.return_value = " \n\t Some text \n\t "
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page]
+        mock_pdf_reader.return_value = mock_reader
+
+        result = extract_text("dummy_path.pdf")
+        assert result == " \n\t Some text \n\t "  # Should return raw text, not stripped
+
+    def test_process_item_has_ocr_file_attribute(self):
+        """Test that ProcessItem correctly sets the OCR file path."""
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file_path = temp_file.name
+
+        try:
+            item = ProcessItem(temp_file_path, ItemType.PDF)
+
+            # Verify OCR file path is set correctly
+            assert hasattr(item, 'ocr_file')
+            assert item.ocr_file.endswith('_OCR.pdf')
+            assert item.ocr_status == OCRStatus.UNKNOWN
+        finally:
+            os.unlink(temp_file_path)