-
Notifications
You must be signed in to change notification settings - Fork 38
feat: Add page number tracking and fix PyMuPDF thread-safety issues #142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development
Are you sure you want to change the base?
Changes from all commits
6914cac
428cc2a
44c7fe5
72737f4
901117b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| import warnings | ||
| import pytest | ||
| import numpy as np | ||
| import pystow | ||
| import tempfile | ||
|
|
||
| # Suppress SWIG-related deprecation warnings from third-party libraries (e.g., OpenCV) | ||
|
|
@@ -91,14 +92,40 @@ def temp_image_file(sample_image): | |
| os.unlink(f.name) | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def mock_model_detection(monkeypatch): | ||
| """ | ||
| Mock the model detection to return fake results without running the actual model. | ||
| Returns 2 fake detections per image for testing. | ||
| """ | ||
|
|
||
| def mock_get_mrcnn_results(image): | ||
| """Return fake detection results.""" | ||
| h, w = image.shape[:2] | ||
| # Create 2 fake masks | ||
| masks = np.zeros((h, w, 2), dtype=bool) | ||
| masks[50:150, 50:150, 0] = True | ||
| masks[200:300, 200:300, 1] = True | ||
|
|
||
| # Fake bounding boxes (y0, x0, y1, x1) | ||
| bboxes = [(50, 50, 150, 150), (200, 200, 300, 300)] | ||
|
|
||
| # Fake confidence scores | ||
| scores = [0.95, 0.92] | ||
|
|
||
| return masks, bboxes, scores | ||
|
|
||
| # Patch the get_mrcnn_results function | ||
| import decimer_segmentation.decimer_segmentation as ds | ||
|
|
||
| monkeypatch.setattr(ds, "get_mrcnn_results", mock_get_mrcnn_results) | ||
|
Comment on lines
+118
to
+121
|
||
|
|
||
|
|
||
| @pytest.fixture(scope="session") | ||
| def model_available(): | ||
| """Check if the model weights are available.""" | ||
| import os | ||
|
|
||
| model_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||
| model_path = os.path.join( | ||
| model_dir, "decimer_segmentation", "mask_rcnn_molecule.h5" | ||
| str(pystow.join("DECIMER-Segmentation_model")), "mask_rcnn_molecule.h5" | ||
| ) | ||
| return os.path.exists(model_path) | ||
|
|
||
|
|
@@ -118,11 +145,8 @@ def pytest_configure(config): | |
| # Skip model-dependent tests if model not available | ||
| def pytest_collection_modifyitems(config, items): | ||
| """Modify test collection based on available resources.""" | ||
| import os | ||
|
|
||
| model_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||
| model_path = os.path.join( | ||
| model_dir, "decimer_segmentation", "mask_rcnn_molecule.h5" | ||
| str(pystow.join("DECIMER-Segmentation_model")), "mask_rcnn_molecule.h5" | ||
| ) | ||
| model_available = os.path.exists(model_path) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| """ | ||
| Test helper functions for DECIMER Segmentation tests. | ||
| """ | ||
|
|
||
| import tempfile | ||
| import pymupdf | ||
|
|
||
|
|
||
| def create_test_pdf(num_pages: int = 1) -> str: | ||
| """ | ||
| Create a temporary PDF with black rectangles (simulating chemical structures). | ||
|
|
||
| Args: | ||
| num_pages: Number of pages to create | ||
|
|
||
| Returns: | ||
| Path to temporary PDF file (caller must delete) | ||
| """ | ||
| f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) | ||
| pdf_path = f.name | ||
| f.close() | ||
|
|
||
| with pymupdf.open() as pdf: | ||
| for i in range(num_pages): | ||
| page = pdf.new_page(width=500, height=500) | ||
| # Place structure at different positions on each page | ||
| x_offset = 50 + (i * 100) % 300 | ||
| y_offset = 50 + (i * 100) % 300 | ||
| page.draw_rect( | ||
| pymupdf.Rect(x_offset, y_offset, x_offset + 100, y_offset + 100), | ||
| color=(0, 0, 0), | ||
| fill=(0, 0, 0), | ||
| ) | ||
|
|
||
| pdf.save(pdf_path) | ||
|
|
||
| return pdf_path |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In
_load_pdf_multipage(), each page render opens the PDF anew (pymupdf.open(pdf_path)) which meanspage_countopens for a multi-page document. This adds significant overhead for larger PDFs. If thread-safety requires separate handles, consider reusing one handle per worker thread (e.g., viathreading.local()cache orThreadPoolExecutor(initializer=...)) so each worker opens the document once and renders multiple pages.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Kohulan i can address this issue but ill have to tackle it somewhere in the coming weeks.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I need to properly check what you are exactly doing since I don't want to break anything with new implementations. Once I have a clear overview, I can merge this.
Meanwhile, if you could fix this issue, then I can check over that. Thank you.