From dc1ea7f32f464594d1f00ea704384b9bdda9b8a7 Mon Sep 17 00:00:00 2001 From: Julian Pollmann Date: Wed, 21 Jan 2026 16:48:16 +0100 Subject: [PATCH 1/6] Parallelize Mol generation --- chemap/fingerprint_computation.py | 50 +++++++++++++++++++++++-------- pyproject.toml | 1 + 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/chemap/fingerprint_computation.py b/chemap/fingerprint_computation.py index 798233d..70eb5a7 100644 --- a/chemap/fingerprint_computation.py +++ b/chemap/fingerprint_computation.py @@ -5,6 +5,7 @@ import scipy.sparse as sp from rdkit import Chem from tqdm import tqdm +from joblib import Parallel, delayed # ----------------------------- @@ -91,6 +92,7 @@ def compute_fingerprints( config: FingerprintConfig = FingerprintConfig(), *, show_progress: bool = False, + n_jobs: int = 8, ) -> FingerprintResult: """ Compute fingerprints for a sequence of SMILES. @@ -114,7 +116,7 @@ def compute_fingerprints( _quick_smiles_check(smiles) if _looks_like_rdkit_fpgen(fpgen): - return _compute_rdkit(smiles, fpgen, config, show_progress=show_progress) + return _compute_rdkit(smiles, fpgen, config, show_progress=show_progress, n_jobs=n_jobs) if _looks_like_sklearn_transformer(fpgen): return _compute_sklearn(smiles, fpgen, config, show_progress=show_progress) @@ -254,6 +256,26 @@ def _mol_from_smiles_robust(smiles: str) -> Optional["Chem.Mol"]: return mol +def _compute_mols_parallel(smiles: Sequence[str], n_jobs: int, show_progress: bool) -> List[Optional["Chem.Mol"]]: + """ + Compute RDKit molecules from SMILES in parallel. + """ + if n_jobs == 1: + return [_mol_from_smiles_robust(s) for s in tqdm(smiles, disable=not show_progress, desc="Generating molecules")] + + results = Parallel(n_jobs=n_jobs, batch_size="auto")( + delayed(_mol_from_smiles_robust)(s) + for s in tqdm( + smiles, + total=len(smiles), + desc="Generating molecules (Parallel)", + disable=not show_progress + ) + ) + + return results + + def _infer_fp_size_folded(fpgen: Any, mol: "Chem.Mol", count: bool) -> int: """ Infer folded vector length for RDKit generator from a molecule. @@ -271,14 +293,15 @@ def _compute_rdkit( cfg: FingerprintConfig, *, show_progress: bool, + n_jobs: int, ) -> FingerprintResult: if not cfg.folded: - return _rdkit_unfolded(smiles, fpgen, cfg, show_progress=show_progress) + return _rdkit_unfolded(smiles, fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs) if cfg.return_csr: - return _rdkit_folded_csr(smiles, fpgen, cfg, show_progress=show_progress) + return _rdkit_folded_csr(smiles, fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs) - return _rdkit_folded_dense(smiles, fpgen, cfg, show_progress=show_progress) + return _rdkit_folded_dense(smiles, fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs) def _rdkit_unfolded( @@ -287,6 +310,7 @@ def _rdkit_unfolded( cfg: FingerprintConfig, *, show_progress: bool, + n_jobs: int, ) -> FingerprintResult: """ Unfolded output for RDKit: use fpgen.GetSparseCountFingerprint(mol) to obtain feature IDs. @@ -294,10 +318,11 @@ def _rdkit_unfolded( - count=False: List[np.ndarray[int64]] feature IDs - count=True : List[(keys:int64, vals:float32)] feature IDs + counts (optionally scaled/weighted) """ + mols = _compute_mols_parallel(smiles, n_jobs, show_progress) + if cfg.count: out: UnfoldedCount = [] - for s in tqdm(smiles, disable=(not show_progress)): - mol = _mol_from_smiles_robust(s) + for s, mol in zip(smiles, mols): if mol is None: _handle_invalid(cfg.invalid_policy, s) if cfg.invalid_policy == "keep": @@ -314,8 +339,7 @@ def _rdkit_unfolded( return out out: UnfoldedBinary = [] - for s in tqdm(smiles, disable=(not show_progress)): - mol = _mol_from_smiles_robust(s) + for s, mol in zip(smiles, mols): if mol is None: _handle_invalid(cfg.invalid_policy, s) if cfg.invalid_policy == "keep": @@ -335,16 +359,17 @@ def _rdkit_folded_dense( cfg: FingerprintConfig, *, show_progress: bool, + n_jobs: int, ) -> np.ndarray: """ Dense folded output (N, D) float32 for RDKit generators. """ + mols = _compute_mols_parallel(smiles, n_jobs, show_progress) rows: List[np.ndarray] = [] n_features: Optional[int] = None pending_invalid: List[int] = [] # indices in `rows` that need backfill after we learn D - for s in tqdm(smiles, disable=(not show_progress)): - mol = _mol_from_smiles_robust(s) + for s, mol in zip(smiles, mols): if mol is None: _handle_invalid(cfg.invalid_policy, s) if cfg.invalid_policy == "keep": @@ -385,6 +410,7 @@ def _rdkit_folded_csr( cfg: FingerprintConfig, *, show_progress: bool, + n_jobs: int, ) -> sp.csr_matrix: """ Folded CSR output for RDKit generators. @@ -396,6 +422,7 @@ def _rdkit_folded_csr( - keep: row is kept as all-zeros (output aligned to input) - raise: raises ValueError """ + mols = _compute_mols_parallel(smiles, n_jobs, show_progress) n_features: Optional[int] = None idx_chunks: List[np.ndarray] = [] @@ -406,8 +433,7 @@ def _rdkit_folded_csr( if cfg.folded_weights is not None: w = np.asarray(cfg.folded_weights, dtype=np.float32).ravel() - for s in tqdm(smiles, disable=(not show_progress)): - mol = _mol_from_smiles_robust(s) + for s, mol in zip(smiles, mols): if mol is None: _handle_invalid(cfg.invalid_policy, s) diff --git a/pyproject.toml b/pyproject.toml index 81857e1..af2bc2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "scikit-fingerprints>=1.15.0", "tqdm>=4.67.1", "pooch>=1.8.2", + "joblib>=1.3.2", ] [dependency-groups] From d2dbaf977d7f5d88d9d4934552c74d117a3d2a2b Mon Sep 17 00:00:00 2001 From: Julian Pollmann Date: Wed, 21 Jan 2026 16:49:58 +0100 Subject: [PATCH 2/6] ruff --- chemap/fingerprint_computation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/chemap/fingerprint_computation.py b/chemap/fingerprint_computation.py index 70eb5a7..c097518 100644 --- a/chemap/fingerprint_computation.py +++ b/chemap/fingerprint_computation.py @@ -3,9 +3,9 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, Sequence, Tuple, Union import numpy as np import scipy.sparse as sp +from joblib import Parallel, delayed from rdkit import Chem from tqdm import tqdm -from joblib import Parallel, delayed # ----------------------------- @@ -261,7 +261,9 @@ def _compute_mols_parallel(smiles: Sequence[str], n_jobs: int, show_progress: bo Compute RDKit molecules from SMILES in parallel. """ if n_jobs == 1: - return [_mol_from_smiles_robust(s) for s in tqdm(smiles, disable=not show_progress, desc="Generating molecules")] + return [ + _mol_from_smiles_robust(s) for s in tqdm(smiles, disable=not show_progress, desc="Generating molecules") + ] results = Parallel(n_jobs=n_jobs, batch_size="auto")( delayed(_mol_from_smiles_robust)(s) From 12865468e5c31a84f452f7ec26258b843ff8067c Mon Sep 17 00:00:00 2001 From: Julian Pollmann Date: Wed, 21 Jan 2026 23:49:11 +0100 Subject: [PATCH 3/6] use sklearn-fingerprints concurrency --- chemap/fingerprint_computation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/chemap/fingerprint_computation.py b/chemap/fingerprint_computation.py index c097518..52cc77f 100644 --- a/chemap/fingerprint_computation.py +++ b/chemap/fingerprint_computation.py @@ -119,7 +119,7 @@ def compute_fingerprints( return _compute_rdkit(smiles, fpgen, config, show_progress=show_progress, n_jobs=n_jobs) if _looks_like_sklearn_transformer(fpgen): - return _compute_sklearn(smiles, fpgen, config, show_progress=show_progress) + return _compute_sklearn(smiles, fpgen, config, show_progress=show_progress, n_jobs=n_jobs) raise TypeError( "Unsupported fpgen. Expected an RDKit rdFingerprintGenerator-like object " @@ -510,6 +510,7 @@ def _skfp_configure_output( cfg: FingerprintConfig, *, show_progress: bool, + n_jobs: int, ) -> SklearnTransformer: """ Configure scikit-fingerprints/sklearn transformer to match (folded, return_csr). @@ -524,6 +525,9 @@ def _skfp_configure_output( if "verbose" in params: updates["verbose"] = 1 if show_progress else 0 + if "n_jobs" in params: + updates["n_jobs"] = n_jobs + if not cfg.folded: if "variant" not in params: raise NotImplementedError( @@ -554,8 +558,9 @@ def _compute_sklearn( cfg: FingerprintConfig, *, show_progress: bool = False, + n_jobs: int, ) -> FingerprintResult: - fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress) + fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs) X = fp.transform(smiles) if not cfg.folded: From c03a1b1a0d525e5164237b73b6937c2234b23d18 Mon Sep 17 00:00:00 2001 From: Julian Pollmann Date: Fri, 23 Jan 2026 01:12:02 +0100 Subject: [PATCH 4/6] Fix tests with correct SMILES + add fit to Transformer --- tests/test_fingerprint_computation.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_fingerprint_computation.py b/tests/test_fingerprint_computation.py index dcc9809..93fc6a7 100644 --- a/tests/test_fingerprint_computation.py +++ b/tests/test_fingerprint_computation.py @@ -2,6 +2,7 @@ import numpy as np import pytest import scipy.sparse as sp +from sklearn.base import BaseEstimator, TransformerMixin from chemap import FingerprintConfig, compute_fingerprints @@ -95,7 +96,7 @@ class DummyUnsupported: # Clone-safe sklearn/scikit-fingerprints transformer fakes # ----------------------------------------------------------------------------- -class FakeTransformer: +class FakeTransformer(BaseEstimator, TransformerMixin): """ Clone-safe sklearn/scikit-fingerprints-like transformer fake. @@ -120,6 +121,7 @@ def __init__( variant: str | None = None, mode: str = "onehot", n_features: int = 6, + n_jobs: int = 1, ): self._params = { "sparse": sparse, @@ -127,8 +129,12 @@ def __init__( "variant": variant, "mode": mode, "n_features": int(n_features), + "n_jobs": n_jobs, } + def fit(self, X, y=None): + return self + def get_params(self, deep: bool = False): return dict(self._params) @@ -398,7 +404,7 @@ def test_sklearn_folded_dense_scaling_log_applies_when_count_true(): ) cfg = FingerprintConfig(count=True, folded=True, return_csr=False, scaling="log") - X = compute_fingerprints(["A", "B"], fp, cfg) + X = compute_fingerprints(["C", "CC"], fp, cfg) expected = np.log1p(np.array([[0, 2], [3, 0]], dtype=np.float32)).astype(np.float32) np.testing.assert_allclose(X, expected, rtol=1e-6, atol=1e-6) @@ -412,7 +418,7 @@ def test_sklearn_folded_dense_weights_applies(): w = np.array([1.0, 10.0, 0.5], dtype=np.float32) cfg = FingerprintConfig(count=True, folded=True, return_csr=False, folded_weights=w) - X = compute_fingerprints(["A"], fp, cfg) + X = compute_fingerprints(["C"], fp, cfg) np.testing.assert_allclose(X[0], np.array([1, 20, 1.5], dtype=np.float32), rtol=1e-6, atol=1e-6) @@ -433,7 +439,7 @@ def test_sklearn_unfolded_sets_variant_raw_bits_and_returns_unfolded_binary(): ) cfg = FingerprintConfig(count=False, folded=False) - out = compute_fingerprints(["A"], fp, cfg) + out = compute_fingerprints(["C"], fp, cfg) assert isinstance(out, list) assert out[0].dtype == np.int64 assert list(out[0]) == [1, 4] @@ -448,7 +454,7 @@ def test_sklearn_unfolded_count_scaling_and_unfolded_weights(): ) cfg = FingerprintConfig(count=True, folded=False, scaling="log", unfolded_weights={2: 10.0}) - out = compute_fingerprints(["A"], fp, cfg) + out = compute_fingerprints(["C"], fp, cfg) keys, vals = out[0] assert list(keys) == [2, 5] expected = np.log1p(np.array([4.0, 2.0], dtype=np.float32)) * np.array([10.0, 1.0], dtype=np.float32) From 534cdbd5dc1fb0fa0226ea3525c8a515491a7387 Mon Sep 17 00:00:00 2001 From: Julian Pollmann Date: Fri, 23 Jan 2026 01:12:43 +0100 Subject: [PATCH 5/6] Add MolTransformer to handle invalid SMILES --- chemap/fingerprint_computation.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/chemap/fingerprint_computation.py b/chemap/fingerprint_computation.py index 52cc77f..89b9638 100644 --- a/chemap/fingerprint_computation.py +++ b/chemap/fingerprint_computation.py @@ -5,6 +5,7 @@ import scipy.sparse as sp from joblib import Parallel, delayed from rdkit import Chem +from sklearn.base import BaseEstimator, TransformerMixin from tqdm import tqdm @@ -75,6 +76,9 @@ class FingerprintConfig: class SklearnTransformer(Protocol): """Protocol for sklearn-like fingerprint transformers (including scikit-fingerprints).""" + def fit(self, X: Any, y: Any = None) -> "SklearnTransformer": + ... + def transform(self, X: Sequence[str]) -> Any: ... @@ -82,6 +86,19 @@ def get_params(self, deep: bool = False) -> Dict[str, Any]: ... +class RobustMolTransformer(BaseEstimator, TransformerMixin): + def __init__(self, n_jobs=-1): + self.n_jobs = n_jobs + + def fit(self, X, y=None): + return self + + def transform(self, X): + results = Parallel(n_jobs=self.n_jobs)( + delayed(_mol_from_smiles_robust)(s) for s in X + ) + return results + # ----------------------------- # Public entry point # ----------------------------- @@ -561,7 +578,11 @@ def _compute_sklearn( n_jobs: int, ) -> FingerprintResult: fp = _skfp_configure_output(fpgen, cfg, show_progress=show_progress, n_jobs=n_jobs) - X = fp.transform(smiles) + mol_transformer = RobustMolTransformer(n_jobs=n_jobs) + mols = mol_transformer.transform(smiles) + valid_mols = [m for m in mols if m is not None] + fp.fit(valid_mols) + X = fp.transform(valid_mols) if not cfg.folded: # unfolded output From 766a22a16676703c8da9ba3ac2d35620418d3dbb Mon Sep 17 00:00:00 2001 From: Florian Huber <36473328+florian-huber@users.noreply.github.com> Date: Sat, 24 Jan 2026 23:03:05 +0100 Subject: [PATCH 6/6] Change n_jobs default value to -1 for fingerprint computation --- chemap/fingerprint_computation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chemap/fingerprint_computation.py b/chemap/fingerprint_computation.py index 89b9638..6c7229e 100644 --- a/chemap/fingerprint_computation.py +++ b/chemap/fingerprint_computation.py @@ -109,7 +109,7 @@ def compute_fingerprints( config: FingerprintConfig = FingerprintConfig(), *, show_progress: bool = False, - n_jobs: int = 8, + n_jobs: int = -1, ) -> FingerprintResult: """ Compute fingerprints for a sequence of SMILES.