Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 80 additions & 30 deletions decimer_segmentation/decimer_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def _load_model_internal() -> modellib.MaskRCNN:
logger.debug(f"Some optimizer options not available: {e}")

model_path = pystow.join("DECIMER-Segmentation_model")
print(model_path)
if not os.path.exists(str(model_path) + "/" + MODEL_FILENAME):
logger.debug("Model path: %s", model_path)
if not os.path.exists(os.path.join(model_path, MODEL_FILENAME)):
logger.info("Downloading model weights...")
download_trained_weights(MODEL_DOWNLOAD_URL, str(model_path))
logger.info("Successfully downloaded the segmentation model weights!")
Expand Down Expand Up @@ -184,60 +184,103 @@ def segment_chemical_structures_from_file(
file_path: str,
expand: bool = True,
return_bboxes: bool = False,
return_page_numbers: bool = False,
**kwargs,
) -> Union[List[np.ndarray], Tuple[List[np.ndarray], List[Tuple[int, int, int, int]]]]:
) -> Union[
List[np.ndarray],
Tuple[List[np.ndarray], List[Tuple[int, int, int, int]]],
Tuple[List[np.ndarray], List[int]],
Tuple[List[np.ndarray], List[Tuple[int, int, int, int]], List[int]],
]:
"""
Segment chemical structures from a PDF or image file.

Args:
file_path: Path to input file (PDF or image)
expand: Whether to expand masks to capture complete structures
return_bboxes: Whether to return bounding boxes along with segments
return_page_numbers: Whether to return 1-indexed page numbers per segment

Returns:
List of segmented chemical structure images as numpy arrays.
If return_bboxes is True, returns a tuple of (segments, bboxes).
If return_page_numbers is True, returns (segments, page_numbers).
If both are True, returns (segments, bboxes, page_numbers).
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"Input file not found: {file_path}")

if "poppler_path" in kwargs:
logger.warning("poppler_path parameter is deprecated and ignored")

images = _load_images_from_file(file_path)
images_with_pages = _load_images_from_file(file_path)

if not images:
if not images_with_pages:
logger.warning(f"No images could be extracted from {file_path}")
return ([], []) if return_bboxes else []
if return_bboxes and return_page_numbers:
return ([], [], [])
elif return_bboxes:
return ([], [])
elif return_page_numbers:
return ([], [])
else:
return []

# Process all images sequentially (model can't parallelize)
all_segments = []
all_bboxes = []
for image in images:
all_page_numbers = []
for image, page_number in images_with_pages:
segments, bboxes = segment_chemical_structures(
image, expand, return_bboxes=True
)
all_segments.extend(segments)
all_bboxes.extend(bboxes)
all_page_numbers.extend([page_number] * len(segments))

if return_bboxes and return_page_numbers:
return (all_segments, all_bboxes, all_page_numbers)
elif return_bboxes:
return (all_segments, all_bboxes)
elif return_page_numbers:
return (all_segments, all_page_numbers)
else:
return all_segments

return (all_segments, all_bboxes) if return_bboxes else all_segments

def _load_images_from_file(
file_path: str,
) -> List[Tuple[np.ndarray, int]]:
"""Load images from PDF or image file.

def _load_images_from_file(file_path: str) -> List[np.ndarray]:
"""Load images from PDF or image file."""
Returns:
List of (image, page_number) tuples. Page numbers are 1-indexed.
"""
if file_path.lower().endswith(".pdf"):
return _load_pdf_pages(file_path)
else:
return _load_single_image(file_path)
return [(img, 1) for img in _load_single_image(file_path)]


def _load_pdf_pages(pdf_path: str) -> List[np.ndarray]:
"""Load all pages from a PDF as images using PyMuPDF."""
pdf_document = pymupdf.open(pdf_path)
page_count = pdf_document.page_count
def _load_pdf_pages(pdf_path: str) -> List[Tuple[np.ndarray, int]]:
"""Load all pages from a PDF as images using PyMuPDF.

Returns:
List of (image, page_number) tuples. Page numbers are 1-indexed.
"""
# Quick check for page count to determine strategy
with pymupdf.open(pdf_path) as doc:
page_count = doc.page_count

if page_count == 1:
# Single page - no threading overhead
return _load_pdf_single_page(pdf_path)
else:
return _load_pdf_multipage(pdf_path, page_count)


def _load_pdf_single_page(pdf_path: str) -> List[Tuple[np.ndarray, int]]:
"""Load a single-page PDF without threading overhead."""
with pymupdf.open(pdf_path) as pdf_document:
page = pdf_document[0]
matrix = pymupdf.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=matrix, alpha=False)
Expand All @@ -246,31 +289,38 @@ def _load_pdf_pages(pdf_path: str) -> List[np.ndarray]:
)
if pix.n == 3:
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
pdf_document.close()
return [img_array.copy()]
return [(img_array.copy(), 1)]


# Multiple pages - use threading for I/O
def _load_pdf_multipage(pdf_path: str, page_count: int) -> List[Tuple[np.ndarray, int]]:
"""Load multi-page PDF using threading with separate document handles per thread."""
images = [None] * page_count

def render_page(page_num: int) -> Tuple[int, np.ndarray]:
page = pdf_document[page_num]
matrix = pymupdf.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=matrix, alpha=False)
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.h, pix.w, pix.n
)
if pix.n == 3:
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
return page_num, img_array.copy()
# Each thread opens its own document handle for thread safety
with pymupdf.open(pdf_path) as doc:
page = doc[page_num]
matrix = pymupdf.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=matrix, alpha=False)
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.h, pix.w, pix.n
)
if pix.n == 3:
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
return page_num, img_array.copy()
Comment on lines 299 to +310
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In _load_pdf_multipage(), each page render opens the PDF anew (pymupdf.open(pdf_path)) which means page_count opens for a multi-page document. This adds significant overhead for larger PDFs. If thread-safety requires separate handles, consider reusing one handle per worker thread (e.g., via threading.local() cache or ThreadPoolExecutor(initializer=...)) so each worker opens the document once and renders multiple pages.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kohulan I can address this issue, but I'll have to tackle it sometime in the coming weeks.

Copy link
Copy Markdown
Owner

@Kohulan Kohulan Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to properly check what you are exactly doing since I don't want to break anything with new implementations. Once I have a clear overview, I can merge this.

Meanwhile, if you could fix this issue, then I can check over that. Thank you.


with ThreadPoolExecutor(max_workers=min(4, page_count)) as executor:
futures = [executor.submit(render_page, i) for i in range(page_count)]
for future in futures:
page_num, img_array = future.result()
images[page_num] = img_array

pdf_document.close()
return [img for img in images if img is not None]
# Convert to (image, page_number) tuples with 1-indexed page numbers
result = []
for page_num, img in enumerate(images):
if img is not None:
result.append((img, page_num + 1))
return result


def _load_single_image(image_path: str) -> List[np.ndarray]:
Expand Down
40 changes: 32 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import warnings
import pytest
import numpy as np
import pystow
import tempfile

# Suppress SWIG-related deprecation warnings from third-party libraries (e.g., OpenCV)
Expand Down Expand Up @@ -91,14 +92,40 @@ def temp_image_file(sample_image):
os.unlink(f.name)


@pytest.fixture
def mock_model_detection(monkeypatch):
    """Replace the Mask R-CNN inference call with a deterministic stub.

    The stub reports two detections for every input image, letting tests
    assert exact segment counts, bounding boxes, and scores without the
    real model weights being present.
    """

    def _fake_detection(image):
        """Produce two hard-coded detections sized to the input image."""
        height, width = image.shape[:2]

        # Two rectangular boolean masks at fixed positions.
        fake_masks = np.zeros((height, width, 2), dtype=bool)
        fake_masks[50:150, 50:150, 0] = True
        fake_masks[200:300, 200:300, 1] = True

        # Bounding boxes in (y0, x0, y1, x1) order.
        fake_bboxes = [(50, 50, 150, 150), (200, 200, 300, 300)]
        fake_scores = [0.95, 0.92]

        return fake_masks, fake_bboxes, fake_scores

    # Swap the real inference function out for the stub on the module.
    import decimer_segmentation.decimer_segmentation as ds

    monkeypatch.setattr(ds, "get_mrcnn_results", _fake_detection)
Comment on lines +118 to +121
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mock_model_detection imports decimer_segmentation.decimer_segmentation to patch get_mrcnn_results, but that module eagerly initializes segmentation_model = get_model() at import time (which can download/load weights). This undermines the goal of running these tests without model weights/network and can make the suite slow/flaky. Consider making model initialization fully lazy in the library (e.g., call get_model() inside get_mrcnn_results rather than at module import) or otherwise provide a test hook to bypass model loading before importing the module.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kohulan I also noticed that you are loading the model the moment DECIMER gets imported, instead of lazy-loading it.
Is there any reason for this?

Copy link
Copy Markdown
Owner

@Kohulan Kohulan Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to keep the model loaded when it is initialised since lazy loading takes time and this affects performance, keeping it in memory allows one to run the segmentation faster.



@pytest.fixture(scope="session")
def model_available():
"""Check if the model weights are available."""
import os

model_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
model_path = os.path.join(
model_dir, "decimer_segmentation", "mask_rcnn_molecule.h5"
str(pystow.join("DECIMER-Segmentation_model")), "mask_rcnn_molecule.h5"
)
return os.path.exists(model_path)

Expand All @@ -118,11 +145,8 @@ def pytest_configure(config):
# Skip model-dependent tests if model not available
def pytest_collection_modifyitems(config, items):
"""Modify test collection based on available resources."""
import os

model_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
model_path = os.path.join(
model_dir, "decimer_segmentation", "mask_rcnn_molecule.h5"
str(pystow.join("DECIMER-Segmentation_model")), "mask_rcnn_molecule.h5"
)
model_available = os.path.exists(model_path)

Expand Down
37 changes: 37 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Test helper functions for DECIMER Segmentation tests.
"""

import tempfile
import pymupdf


def create_test_pdf(num_pages: int = 1) -> str:
    """Build a throwaway PDF whose pages each contain one filled black square.

    The squares stand in for chemical-structure drawings; each page places
    its square at a page-dependent offset so the pages look distinct.

    Args:
        num_pages: Number of pages the generated PDF should contain.

    Returns:
        Filesystem path of the temporary PDF (caller must delete it).
    """
    # Reserve a named temp file up front; only its path is needed.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    out_path = tmp.name
    tmp.close()

    with pymupdf.open() as doc:
        for page_index in range(num_pages):
            new_page = doc.new_page(width=500, height=500)
            # Shift the square by 100 pt per page, wrapping within the page
            # (x and y offsets are identical by construction).
            shift = 50 + (page_index * 100) % 300
            square = pymupdf.Rect(shift, shift, shift + 100, shift + 100)
            new_page.draw_rect(square, color=(0, 0, 0), fill=(0, 0, 0))

        doc.save(out_path)

    return out_path
95 changes: 72 additions & 23 deletions tests/test_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from tests.helpers import create_test_pdf


class TestImageProcessing:
"""Tests for image processing functions."""
Expand Down Expand Up @@ -541,32 +543,79 @@ def test_full_pipeline_empty_image(self):
# Expected if model not available
pass

def test_segment_from_file_with_return_bboxes(self):
"""Test segment_chemical_structures_from_file returns bboxes when return_bboxes=True."""
def test_segment_from_file_with_return_bboxes(self, mock_model_detection):
"""
Test segment_chemical_structures_from_file returns bboxes
when return_bboxes=True.
"""
from decimer_segmentation import segment_chemical_structures_from_file

image = np.ones((500, 500, 3), dtype=np.uint8) * 255
cv2.rectangle(image, (50, 50), (150, 150), (0, 0, 0), -1)
pdf_path = create_test_pdf(num_pages=1)
try:
result = segment_chemical_structures_from_file(
pdf_path, expand=False, return_bboxes=True
)
assert isinstance(result, tuple)
assert len(result) == 2
segments, bboxes = result
assert isinstance(segments, list)
assert isinstance(bboxes, list)
assert len(segments) == len(bboxes)
assert len(segments) == 2, "Mocked model should return 2 detections"
for bbox in bboxes:
assert len(bbox) == 4
finally:
os.unlink(pdf_path)

def test_segment_multipage_pdf_with_page_numbers(self, mock_model_detection):
"""
Test segment_chemical_structures_from_file returns page numbers
for multi-page PDFs.
"""
from decimer_segmentation import segment_chemical_structures_from_file

with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
cv2.imwrite(f.name, image)
try:
result = segment_chemical_structures_from_file(
f.name, expand=False, return_bboxes=True
)
assert isinstance(result, tuple)
assert len(result) == 2
segments, bboxes = result
assert isinstance(segments, list)
assert isinstance(bboxes, list)
assert len(segments) == len(bboxes)
for bbox in bboxes:
assert len(bbox) == 4
except Exception:
# Expected if model not available
pass
finally:
os.unlink(f.name)
pdf_path = create_test_pdf(num_pages=3)
try:
result = segment_chemical_structures_from_file(
pdf_path, expand=False, return_page_numbers=True
)
segments, page_numbers = result

# Mock returns 2 detections per page: pages should be [1, 1, 2, 2, 3, 3]
assert page_numbers == [1, 1, 2, 2, 3, 3], (
f"Expected page numbers [1, 1, 2, 2, 3, 3], got {page_numbers}"
)

# Verify we got 6 valid segments
assert len(segments) == 6
for i, seg in enumerate(segments):
assert isinstance(seg, np.ndarray), f"Segment {i} should be numpy array"
assert seg.size > 0, f"Segment {i} should not be empty"
finally:
os.unlink(pdf_path)

def test_segment_pdf_with_bboxes_and_page_numbers(self, mock_model_detection):
"""
Test segment_chemical_structures_from_file returns both bboxes
and page numbers.
"""
from decimer_segmentation import segment_chemical_structures_from_file

pdf_path = create_test_pdf(num_pages=1)
try:
result = segment_chemical_structures_from_file(
pdf_path, expand=False, return_bboxes=True, return_page_numbers=True
)
segments, bboxes, page_numbers = result

# Mock returns exact known values
assert page_numbers == [1, 1]
assert bboxes == [(50, 50, 150, 150), (200, 200, 300, 300)]
assert len(segments) == 2
for seg in segments:
assert isinstance(seg, np.ndarray) and seg.size > 0
finally:
os.unlink(pdf_path)


# Benchmark tests (optional, for performance monitoring)
Expand Down