From b44dce59bc8c28d817178a976161597199efb40e Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Wed, 14 Jan 2026 14:14:03 +0000 Subject: [PATCH] perf(gribjump): fix two performance bottlenecks in the gribjump source Avoid recomputing per-field data by caching/precomputing: - Cache reference lat/lon when fetch_coords_from_fdb=True to avoid re-reading the reference field's geography per retrieved field. - Pre-convert index lists to ranges once to avoid repeated calls to ExtractionRequest.from_indices. --- src/earthkit/data/sources/gribjump.py | 24 +++++++++++++++++++----- tests/sources/test_gribjump.py | 1 - 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index a510d4913..df1681a74 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -242,6 +242,12 @@ def from_mars_requests( ranges = mask_to_ranges(mask) mask = None + if indices is not None: + # We do the same small optimization for indices. Optimally, we + # would do similar optimizations in pygribjump and remove this. + ranges = [(i, i + 1) for i in indices] + indices = None + extraction_requests = [build_extraction_request(req, ranges, mask, indices) for req in mars_requests] return cls(extraction_requests) @@ -286,7 +292,11 @@ def __init__( # These attributes are set lazily after loading the data. self._loaded = False self._grid_indices = None + + # Cached reference metadata for coordinates self._reference_metadata: Optional[GribMetadata] = None + self._latitudes: Optional[np.ndarray] = None + self._longitudes: Optional[np.ndarray] = None super().__init__(fields=None) @@ -334,10 +344,10 @@ def _load(self): def _load_reference_metadata(self): """Loads the reference metadata from the FDB retriever if available.""" - if self._fdb_retriever is None: - return None if self._reference_metadata is not None: return self._reference_metadata + if self._fdb_retriever is None: + return None fields = self._fdb_retriever.get(self._requests[0].request) metadatas = fields.metadata() @@ -355,9 +365,13 @@ def _enrich_metadata_with_coordinates(self, indices: np.ndarray, metadata: UserM if (reference_metadata := self._load_reference_metadata()) is None: return metadata - reference_geography = reference_metadata.geography - grid_latitudes = reference_geography.latitudes()[indices] - grid_longitudes = reference_geography.longitudes()[indices] + if self._latitudes is None or self._longitudes is None: + self._latitudes = reference_metadata.geography.latitudes() + self._longitudes = reference_metadata.geography.longitudes() + + grid_latitudes = self._latitudes[indices] + grid_longitudes = self._longitudes[indices] + metadata = metadata.override( { "latitudes": grid_latitudes, diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 020a8ae09..3622ec8e7 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -192,7 +192,6 @@ def ds_expected_with_coords(): "levelist": "1000", "levtype": "pl", "stream": "oper", - "param": "129", "time": "1200", "type": "fc", "Conventions": "CF-1.8",