From 6702ca46b04f5c4cd6e809e4a948f38918ee71b2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 28 Aug 2025 14:18:16 +0000
Subject: [PATCH 1/2] Initial plan


From e439182a755beec559db095a90eff67583d63719 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 28 Aug 2025 14:25:58 +0000
Subject: [PATCH 2/2] Add OCR verification to check for text extraction after
 OCR completion

Co-authored-by: maxi07 <7480270+maxi07@users.noreply.github.com>
---
 ocr_service/main.py            | 20 +++++++++--
 tests/test_ocr_verification.py | 62 ++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_ocr_verification.py

diff --git a/ocr_service/main.py b/ocr_service/main.py
index 7268564..8f452bc 100644
--- a/ocr_service/main.py
+++ b/ocr_service/main.py
@@ -1,9 +1,10 @@
 from scansynclib.logging import logger
 from scansynclib.ProcessItem import ProcessItem, ProcessStatus, OCRStatus
 from scansynclib.sqlite_wrapper import update_scanneddata_database
-from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq
+from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq, extract_text
 import pickle
 import ocrmypdf
+import os
 from datetime import datetime
 import time
 import pika.exceptions
@@ -36,13 +37,26 @@ def start_processing(item: ProcessItem):
 
     try:
         result = ocrmypdf.ocr(item.local_file_path, item.ocr_file, output_type='pdfa', skip_text=True, rotate_pages=True, jpg_quality=80, png_quality=80, optimize=2, language=["eng", "deu"], tesseract_timeout=120)
+        logger.debug(f"OCR exited with code {result}")
+        
         if result != 0:
             logger.error(f"OCR exited with code {result}")
             item.ocr_status = OCRStatus.FAILED
         else:
             logger.info(f"OCR processing completed: {item.filename}")
-        logger.debug(f"OCR exited with code {result}")
-        item.ocr_status = OCRStatus.COMPLETED
+            
+            # Verify that the OCR file actually contains text
+            if os.path.exists(item.ocr_file):
+                extracted_text = extract_text(item.ocr_file).strip()
+                if extracted_text:
+                    logger.info(f"OCR verification successful: extracted {len(extracted_text)} characters from {item.filename}")
+                    item.ocr_status = OCRStatus.COMPLETED
+                else:
+                    logger.warning(f"OCR verification failed: no text found in OCR output file {item.ocr_file}")
+                    item.ocr_status = OCRStatus.FAILED
+            else:
+                logger.error(f"OCR output file not found: {item.ocr_file}")
+                item.ocr_status = OCRStatus.OUTPUT_ERROR
     except ocrmypdf.UnsupportedImageFormatError:
         logger.error(f"Unsupported image format: {item.local_file_path}")
         item.ocr_status = OCRStatus.UNSUPPORTED
diff --git a/tests/test_ocr_verification.py b/tests/test_ocr_verification.py
new file mode 100644
index 0000000..9f1c12d
--- /dev/null
+++ b/tests/test_ocr_verification.py
@@ -0,0 +1,62 @@
+import pytest
+import os
+import tempfile
+from unittest.mock import Mock, patch, mock_open
+from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus
+
+
+class TestOCRTextVerification:
+    """Tests for the extract_text helper and ProcessItem defaults; ocr_service.main is never imported here."""
+
+    def test_extract_text_returns_empty_string_on_empty_pdf(self):
+        """Test that extract_text returns an empty string for a file holding only a PDF header."""
+        from scansynclib.helpers import extract_text
+        
+        # Write a header-only temp file; delete=False keeps it on disk after the with-block
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file.write(b"%PDF-1.4\n")  # PDF version header only, nothing after it
+            temp_file_path = temp_file.name
+        
+        try:
+            # Expect an empty string (not an exception) for this malformed/empty PDF
+            result = extract_text(temp_file_path)
+            assert result == ""
+        finally:
+            os.unlink(temp_file_path)
+
+    def test_extract_text_returns_empty_string_on_nonexistent_file(self):
+        """Test that extract_text returns an empty string (rather than raising) for a missing path."""
+        from scansynclib.helpers import extract_text
+        
+        result = extract_text("/nonexistent/file.pdf")
+        assert result == ""
+
+    @patch('scansynclib.helpers.PdfReader')
+    def test_extract_text_strips_whitespace(self, mock_pdf_reader):
+        """Test that extract_text returns page text verbatim; despite the test name, nothing is stripped."""
+        from scansynclib.helpers import extract_text
+        
+        # Patch PdfReader so a single fake page yields text padded with whitespace
+        mock_page = Mock()
+        mock_page.extract_text.return_value = "  \n\t  Some text  \n\t  "
+        mock_reader = Mock()
+        mock_reader.pages = [mock_page]
+        mock_pdf_reader.return_value = mock_reader
+        
+        result = extract_text("dummy_path.pdf")
+        assert result == "  \n\t  Some text  \n\t  "  # Raw text expected; stripping is left to the caller
+
+    def test_process_item_has_ocr_file_attribute(self):
+        """Test that a new ProcessItem has an ocr_file ending in '_OCR.pdf' and an UNKNOWN OCR status."""
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file_path = temp_file.name
+        
+        try:
+            item = ProcessItem(temp_file_path, ItemType.PDF)
+            
+            # Check the OCR path suffix and the initial OCR status
+            assert hasattr(item, 'ocr_file')
+            assert item.ocr_file.endswith('_OCR.pdf')
+            assert item.ocr_status == OCRStatus.UNKNOWN
+        finally:
+            os.unlink(temp_file_path)
\ No newline at end of file