Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 80 additions & 30 deletions decimer_segmentation/decimer_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def _load_model_internal() -> modellib.MaskRCNN:
logger.debug(f"Some optimizer options not available: {e}")

model_path = pystow.join("DECIMER-Segmentation_model")
print(model_path)
if not os.path.exists(str(model_path) + "/" + MODEL_FILENAME):
logger.debug("Model path: %s", model_path)
if not os.path.exists(os.path.join(model_path, MODEL_FILENAME)):
logger.info("Downloading model weights...")
download_trained_weights(MODEL_DOWNLOAD_URL, str(model_path))
logger.info("Successfully downloaded the segmentation model weights!")
Expand Down Expand Up @@ -184,60 +184,103 @@ def segment_chemical_structures_from_file(
file_path: str,
expand: bool = True,
return_bboxes: bool = False,
return_page_numbers: bool = False,
**kwargs,
) -> Union[List[np.ndarray], Tuple[List[np.ndarray], List[Tuple[int, int, int, int]]]]:
) -> Union[
List[np.ndarray],
Tuple[List[np.ndarray], List[Tuple[int, int, int, int]]],
Tuple[List[np.ndarray], List[int]],
Tuple[List[np.ndarray], List[Tuple[int, int, int, int]], List[int]],
]:
"""
Segment chemical structures from a PDF or image file.

Args:
file_path: Path to input file (PDF or image)
expand: Whether to expand masks to capture complete structures
return_bboxes: Whether to return bounding boxes along with segments
return_page_numbers: Whether to return 1-indexed page numbers per segment

Returns:
List of segmented chemical structure images as numpy arrays.
If return_bboxes is True, returns a tuple of (segments, bboxes).
If return_page_numbers is True, returns (segments, page_numbers).
If both are True, returns (segments, bboxes, page_numbers).
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"Input file not found: {file_path}")

if "poppler_path" in kwargs:
logger.warning("poppler_path parameter is deprecated and ignored")

images = _load_images_from_file(file_path)
images_with_pages = _load_images_from_file(file_path)

if not images:
if not images_with_pages:
logger.warning(f"No images could be extracted from {file_path}")
return ([], []) if return_bboxes else []
if return_bboxes and return_page_numbers:
return ([], [], [])
elif return_bboxes:
return ([], [])
elif return_page_numbers:
return ([], [])
else:
return []

# Process all images sequentially (model can't parallelize)
all_segments = []
all_bboxes = []
for image in images:
all_page_numbers = []
for image, page_number in images_with_pages:
segments, bboxes = segment_chemical_structures(
image, expand, return_bboxes=True
)
all_segments.extend(segments)
all_bboxes.extend(bboxes)
all_page_numbers.extend([page_number] * len(segments))

if return_bboxes and return_page_numbers:
return (all_segments, all_bboxes, all_page_numbers)
elif return_bboxes:
return (all_segments, all_bboxes)
elif return_page_numbers:
return (all_segments, all_page_numbers)
else:
return all_segments

return (all_segments, all_bboxes) if return_bboxes else all_segments

def _load_images_from_file(
file_path: str,
) -> List[Tuple[np.ndarray, int]]:
"""Load images from PDF or image file.

def _load_images_from_file(file_path: str) -> List[np.ndarray]:
"""Load images from PDF or image file."""
Returns:
List of (image, page_number) tuples. Page numbers are 1-indexed.
"""
if file_path.lower().endswith(".pdf"):
return _load_pdf_pages(file_path)
else:
return _load_single_image(file_path)
return [(img, 1) for img in _load_single_image(file_path)]


def _load_pdf_pages(pdf_path: str) -> List[np.ndarray]:
"""Load all pages from a PDF as images using PyMuPDF."""
pdf_document = pymupdf.open(pdf_path)
page_count = pdf_document.page_count
def _load_pdf_pages(pdf_path: str) -> List[Tuple[np.ndarray, int]]:
"""Load all pages from a PDF as images using PyMuPDF.

Returns:
List of (image, page_number) tuples. Page numbers are 1-indexed.
"""
# Quick check for page count to determine strategy
with pymupdf.open(pdf_path) as doc:
page_count = doc.page_count

if page_count == 1:
# Single page - no threading overhead
return _load_pdf_single_page(pdf_path)
else:
return _load_pdf_multipage(pdf_path, page_count)


def _load_pdf_single_page(pdf_path: str) -> List[Tuple[np.ndarray, int]]:
"""Load a single-page PDF without threading overhead."""
with pymupdf.open(pdf_path) as pdf_document:
page = pdf_document[0]
matrix = pymupdf.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=matrix, alpha=False)
Expand All @@ -246,31 +289,38 @@ def _load_pdf_pages(pdf_path: str) -> List[np.ndarray]:
)
if pix.n == 3:
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
pdf_document.close()
return [img_array.copy()]
return [(img_array.copy(), 1)]


# Multiple pages - use threading for I/O
def _load_pdf_multipage(pdf_path: str, page_count: int) -> List[Tuple[np.ndarray, int]]:
"""Load multi-page PDF using threading with separate document handles per thread."""
images = [None] * page_count

def render_page(page_num: int) -> Tuple[int, np.ndarray]:
page = pdf_document[page_num]
matrix = pymupdf.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=matrix, alpha=False)
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.h, pix.w, pix.n
)
if pix.n == 3:
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
return page_num, img_array.copy()
# Each thread opens its own document handle for thread safety
with pymupdf.open(pdf_path) as doc:
page = doc[page_num]
matrix = pymupdf.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=matrix, alpha=False)
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.h, pix.w, pix.n
)
if pix.n == 3:
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
return page_num, img_array.copy()
Comment on lines 299 to +310
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In _load_pdf_multipage(), each page render opens the PDF anew (pymupdf.open(pdf_path)) which means page_count opens for a multi-page document. This adds significant overhead for larger PDFs. If thread-safety requires separate handles, consider reusing one handle per worker thread (e.g., via threading.local() cache or ThreadPoolExecutor(initializer=...)) so each worker opens the document once and renders multiple pages.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kohulan I can address this issue, but I'll have to tackle it sometime in the coming weeks.

Copy link
Copy Markdown
Owner

@Kohulan Kohulan Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to properly check what you are exactly doing since I don't want to break anything with new implementations. Once I have a clear overview, I can merge this.

Meanwhile, if you could fix this issue, then I can check over that. Thank you.


with ThreadPoolExecutor(max_workers=min(4, page_count)) as executor:
futures = [executor.submit(render_page, i) for i in range(page_count)]
for future in futures:
page_num, img_array = future.result()
images[page_num] = img_array

pdf_document.close()
return [img for img in images if img is not None]
# Convert to (image, page_number) tuples with 1-indexed page numbers
result = []
for page_num, img in enumerate(images):
if img is not None:
result.append((img, page_num + 1))
return result


def _load_single_image(image_path: str) -> List[np.ndarray]:
Expand Down
40 changes: 32 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import warnings
import pytest
import numpy as np
import pystow
import tempfile

# Suppress SWIG-related deprecation warnings from third-party libraries (e.g., OpenCV)
Expand Down Expand Up @@ -91,14 +92,40 @@ def temp_image_file(sample_image):
os.unlink(f.name)


@pytest.fixture
def mock_model_detection(monkeypatch):
    """Replace the Mask R-CNN inference call with a deterministic stub.

    The stub reports two detections for every input image, letting tests
    assert exact segment counts, bounding boxes, and scores without the
    real model weights being present.
    """

    def _fake_detection(image):
        """Produce two hard-coded detections sized to the input image."""
        height, width = image.shape[:2]

        # Two rectangular boolean masks at fixed positions.
        fake_masks = np.zeros((height, width, 2), dtype=bool)
        fake_masks[50:150, 50:150, 0] = True
        fake_masks[200:300, 200:300, 1] = True

        # Bounding boxes in (y0, x0, y1, x1) order.
        fake_bboxes = [(50, 50, 150, 150), (200, 200, 300, 300)]
        fake_scores = [0.95, 0.92]

        return fake_masks, fake_bboxes, fake_scores

    # Swap the real inference function out for the stub on the module.
    import decimer_segmentation.decimer_segmentation as ds

    monkeypatch.setattr(ds, "get_mrcnn_results", _fake_detection)
Comment on lines +118 to +121
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mock_model_detection imports decimer_segmentation.decimer_segmentation to patch get_mrcnn_results, but that module eagerly initializes segmentation_model = get_model() at import time (which can download/load weights). This undermines the goal of running these tests without model weights/network and can make the suite slow/flaky. Consider making model initialization fully lazy in the library (e.g., call get_model() inside get_mrcnn_results rather than at module import) or otherwise provide a test hook to bypass model loading before importing the module.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kohulan I also noticed that you are loading the model the moment DECIMER gets imported, instead of lazy-loading it.
Is there any reason for this?

Copy link
Copy Markdown
Owner

@Kohulan Kohulan Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to keep the model loaded when it is initialised since lazy loading takes time and this affects performance, keeping it in memory allows one to run the segmentation faster.



@pytest.fixture(scope="session")
def model_available():
"""Check if the model weights are available."""
import os

model_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
model_path = os.path.join(
model_dir, "decimer_segmentation", "mask_rcnn_molecule.h5"
str(pystow.join("DECIMER-Segmentation_model")), "mask_rcnn_molecule.h5"
)
return os.path.exists(model_path)

Expand All @@ -118,11 +145,8 @@ def pytest_configure(config):
# Skip model-dependent tests if model not available
def pytest_collection_modifyitems(config, items):
"""Modify test collection based on available resources."""
import os

model_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
model_path = os.path.join(
model_dir, "decimer_segmentation", "mask_rcnn_molecule.h5"
str(pystow.join("DECIMER-Segmentation_model")), "mask_rcnn_molecule.h5"
)
model_available = os.path.exists(model_path)

Expand Down
37 changes: 37 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Test helper functions for DECIMER Segmentation tests.
"""

import tempfile
import pymupdf


def create_test_pdf(num_pages: int = 1) -> str:
    """Build a throwaway PDF whose pages each contain one filled black square.

    The squares stand in for chemical-structure drawings; each page places
    its square at a page-dependent offset so the pages look distinct.

    Args:
        num_pages: Number of pages the generated PDF should contain.

    Returns:
        Filesystem path of the temporary PDF (caller must delete it).
    """
    # Reserve a named temp file up front; only its path is needed.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    out_path = tmp.name
    tmp.close()

    with pymupdf.open() as doc:
        for page_index in range(num_pages):
            new_page = doc.new_page(width=500, height=500)
            # Shift the square by 100 pt per page, wrapping within the page
            # (x and y offsets are identical by construction).
            shift = 50 + (page_index * 100) % 300
            square = pymupdf.Rect(shift, shift, shift + 100, shift + 100)
            new_page.draw_rect(square, color=(0, 0, 0), fill=(0, 0, 0))

        doc.save(out_path)

    return out_path
95 changes: 72 additions & 23 deletions tests/test_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from tests.helpers import create_test_pdf


class TestImageProcessing:
"""Tests for image processing functions."""
Expand Down Expand Up @@ -541,32 +543,79 @@ def test_full_pipeline_empty_image(self):
# Expected if model not available
pass

def test_segment_from_file_with_return_bboxes(self):
"""Test segment_chemical_structures_from_file returns bboxes when return_bboxes=True."""
def test_segment_from_file_with_return_bboxes(self, mock_model_detection):
"""
Test segment_chemical_structures_from_file returns bboxes
when return_bboxes=True.
"""
from decimer_segmentation import segment_chemical_structures_from_file

image = np.ones((500, 500, 3), dtype=np.uint8) * 255
cv2.rectangle(image, (50, 50), (150, 150), (0, 0, 0), -1)
pdf_path = create_test_pdf(num_pages=1)
try:
result = segment_chemical_structures_from_file(
pdf_path, expand=False, return_bboxes=True
)
assert isinstance(result, tuple)
assert len(result) == 2
segments, bboxes = result
assert isinstance(segments, list)
assert isinstance(bboxes, list)
assert len(segments) == len(bboxes)
assert len(segments) == 2, "Mocked model should return 2 detections"
for bbox in bboxes:
assert len(bbox) == 4
finally:
os.unlink(pdf_path)

def test_segment_multipage_pdf_with_page_numbers(self, mock_model_detection):
"""
Test segment_chemical_structures_from_file returns page numbers
for multi-page PDFs.
"""
from decimer_segmentation import segment_chemical_structures_from_file

with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
cv2.imwrite(f.name, image)
try:
result = segment_chemical_structures_from_file(
f.name, expand=False, return_bboxes=True
)
assert isinstance(result, tuple)
assert len(result) == 2
segments, bboxes = result
assert isinstance(segments, list)
assert isinstance(bboxes, list)
assert len(segments) == len(bboxes)
for bbox in bboxes:
assert len(bbox) == 4
except Exception:
# Expected if model not available
pass
finally:
os.unlink(f.name)
pdf_path = create_test_pdf(num_pages=3)
try:
result = segment_chemical_structures_from_file(
pdf_path, expand=False, return_page_numbers=True
)
segments, page_numbers = result

# Mock returns 2 detections per page: pages should be [1, 1, 2, 2, 3, 3]
assert page_numbers == [1, 1, 2, 2, 3, 3], (
f"Expected page numbers [1, 1, 2, 2, 3, 3], got {page_numbers}"
)

# Verify we got 6 valid segments
assert len(segments) == 6
for i, seg in enumerate(segments):
assert isinstance(seg, np.ndarray), f"Segment {i} should be numpy array"
assert seg.size > 0, f"Segment {i} should not be empty"
finally:
os.unlink(pdf_path)

def test_segment_pdf_with_bboxes_and_page_numbers(self, mock_model_detection):
"""
Test segment_chemical_structures_from_file returns both bboxes
and page numbers.
"""
from decimer_segmentation import segment_chemical_structures_from_file

pdf_path = create_test_pdf(num_pages=1)
try:
result = segment_chemical_structures_from_file(
pdf_path, expand=False, return_bboxes=True, return_page_numbers=True
)
segments, bboxes, page_numbers = result

# Mock returns exact known values
assert page_numbers == [1, 1]
assert bboxes == [(50, 50, 150, 150), (200, 200, 300, 300)]
assert len(segments) == 2
for seg in segments:
assert isinstance(seg, np.ndarray) and seg.size > 0
finally:
os.unlink(pdf_path)


# Benchmark tests (optional, for performance monitoring)
Expand Down