From fcde2bc60a67890f278decb7d86d0ec1e8201e00 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 11:13:32 -0400 Subject: [PATCH 1/8] feat: add DoubleSparse sparse storage backend Add a hash-map backed `DoubleSparse` storage that only allocates filled cells, making histograms over very large or high-dimensional axis spaces feasible. Backed by `storage_adaptor>`. Scoped to `double` only for now: accumulator-backed sparse storages need boostorg/histogram#421, which ships in Boost 1.92 (the repo vendors 1.90). Data access is COO-first and deliberately not a full drop-in: - `.view()` raises (no contiguous buffer to view into). - `Histogram.to_coo(flow=False)` returns the filled cells as `(per-axis index tuple, values)`, numpy-nonzero style. - The copying accessors `values()`/`variances()`/`counts()` and `np.asarray(h)` densify into a fresh array, so they keep working. - `at()` reads, slicing, projection, sum and histogram addition stay sparse. Serialization: pickle round-trips via free save/load on `double_sparse` (COO key/value arrays); UHI to_uhi/from_uhi store COO under a `"sparse"` payload plus writer_info, reconstructed via a C++ `_from_coo` bulk setter. Assisted-by: ClaudeCode:claude-opus-4.8 --- .pre-commit-config.yaml | 2 +- include/bh_python/histogram.hpp | 142 +++++++++++ include/bh_python/register_histogram.hpp | 26 ++ include/bh_python/storage.hpp | 56 +++++ src/boost_histogram/_core/hist.pyi | 16 ++ src/boost_histogram/_core/storage.pyi | 1 + src/boost_histogram/histogram.py | 79 +++++- src/boost_histogram/serialization/__init__.py | 41 +++- src/boost_histogram/serialization/_storage.py | 4 +- src/boost_histogram/storage.py | 12 + src/register_histograms.cpp | 6 + src/register_storage.cpp | 5 + tests/test_sparse_storage.py | 226 ++++++++++++++++++ 13 files changed, 603 insertions(+), 13 deletions(-) create mode 100644 tests/test_sparse_storage.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 30d38b06..47e71d3c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,7 @@ repos: rev: v2.4.2 hooks: - id: codespell - args: ["-L", "hist,nd,circularly,ba,fo", "-w"] + args: ["-L", "hist,nd,circularly,ba,fo,coo", "-w"] exclude: ^(notebooks/xarray.ipynb|notebooks/BoostHistogramHandsOn.ipynb|.*\.svg)$ - repo: local diff --git a/include/bh_python/histogram.hpp b/include/bh_python/histogram.hpp index 8ccda08f..56071873 100644 --- a/include/bh_python/histogram.hpp +++ b/include/bh_python/histogram.hpp @@ -12,11 +12,15 @@ #include #include #include +#include #include #include #include +#include +#include + namespace pybind11 { /// The descriptor for atomic_* is the same as the descriptor for *, as long this uses @@ -80,6 +84,17 @@ py::buffer_info make_buffer(bh::histogram>& h, bool flow return detail::make_buffer_impl(axes, flow, &storage[0]); } +/// Sparse storage keeps only filled cells in a hash map, so there is no +/// contiguous buffer to expose. Callers should use ``to_coo()`` instead. We +/// throw rather than return an empty buffer so ``.view()`` / ``np.asarray`` give +/// a single, clear error. +template +py::buffer_info make_buffer(bh::histogram>& /* h */, + bool /* flow */) { + throw py::type_error( + "sparse storage does not support .view() or buffer access; use .to_coo()"); +} + /// Specialization for unlimited_buffer template py::buffer_info make_buffer(bh::histogram>& h, @@ -123,6 +138,133 @@ py::buffer_info make_buffer(bh::histogram>& h, bool flow) { std_as_const(new_axes), flow, static_cast(storage.get_buffer())); } +/// Trait identifying our sparse (hash-map backed) storages. +template +struct is_sparse_storage : std::false_type {}; + +template +struct is_sparse_storage> : std::true_type {}; + +namespace detail { +/// Per-axis layout used to (un)ravel a flat sparse storage index. The flat index +/// counts flow bins, with underflow (when present) at position 0, exactly like +/// make_buffer_impl above. +struct axis_layout { + std::size_t extent; // size + has_underflow + has_overflow + std::size_t size; // number of normal (in-range) bins + std::size_t underflow; // 1 if this axis has an underflow bin, else 0 +}; + +template +std::vector make_axis_layout(const Axes& axes) { + std::vector layout; + bh::detail::for_each_axis(axes, [&layout](const auto& ax) { + const bool has_underflow + = bh::axis::traits::options(ax) & bh::axis::option::underflow; + layout.push_back({static_cast(bh::axis::traits::extent(ax)), + static_cast(ax.size()), + has_underflow ? std::size_t{1} : std::size_t{0}}); + }); + return layout; +} +} // namespace detail + +/// Extract the filled cells of a sparse histogram in COO form. Returns a tuple +/// ``(indices, values)`` where ``indices`` is an ``(ndim, n)`` integer array and +/// ``values`` an ``n``-length array. With ``flow=False`` the flow cells are +/// dropped and indices run 0..size-1; with ``flow=True`` indices run +/// 0..extent-1 (underflow at 0), matching the ``view(flow=True)`` grid. +template +py::tuple histogram_to_coo(bh::histogram>& h, bool flow) { + const auto& axes = bh::unsafe_access::axes(h); + auto& storage = bh::unsafe_access::storage(h); + const auto layout = detail::make_axis_layout(axes); + const auto rank = layout.size(); + + // storage_adaptor publicly inherits map_impl, which publicly inherits the + // underlying map, so this upcast exposes only the filled cells. + using map_type = std::unordered_map; + const auto& map = static_cast(storage); + + std::vector> idx_cols(rank); + std::vector vals; + vals.reserve(map.size()); + + for(const auto& kv : map) { + std::size_t lin = kv.first; + std::vector multi(rank); + bool keep = true; + for(std::size_t ax = 0; ax < rank; ++ax) { + const auto& info = layout[ax]; + const std::size_t iflow = lin % info.extent; + lin /= info.extent; + if(flow) { + multi[ax] = static_cast(iflow); + } else { + const py::ssize_t real = static_cast(iflow) + - static_cast(info.underflow); + if(real < 0 || real >= static_cast(info.size)) { + keep = false; + break; + } + multi[ax] = real; + } + } + if(!keep) + continue; + for(std::size_t ax = 0; ax < rank; ++ax) + idx_cols[ax].push_back(multi[ax]); + vals.push_back(kv.second); + } + + const auto n = static_cast(vals.size()); + py::array_t indices({static_cast(rank), n}); + if(n > 0) { + auto ind = indices.template mutable_unchecked<2>(); + for(std::size_t ax = 0; ax < rank; ++ax) + for(py::ssize_t j = 0; j < n; ++j) + ind(static_cast(ax), j) + = idx_cols[ax][static_cast(j)]; + } + + py::array_t values(n); + std::copy(vals.begin(), vals.end(), values.mutable_data()); + + return py::make_tuple(std::move(indices), std::move(values)); +} + +/// Inverse of histogram_to_coo: scatter ``(indices, values)`` back into a sparse +/// histogram. Uses the same flow convention as histogram_to_coo. This is the +/// only bulk write path for sparse storage, since the numpy-buffer based +/// __setitem__ cannot densify the map. +template +void histogram_from_coo(bh::histogram>& h, + py::array_t indices, + py::array_t values, + bool flow) { + const auto& axes = bh::unsafe_access::axes(h); + const auto layout = detail::make_axis_layout(axes); + const auto rank = layout.size(); + + auto ind = indices.template unchecked<2>(); + auto val = values.template unchecked<1>(); + if(static_cast(ind.shape(0)) != rank) + throw py::value_error("indices first dimension must equal histogram rank"); + + const auto n = val.shape(0); + std::vector at_index(rank); + for(py::ssize_t j = 0; j < n; ++j) { + for(std::size_t ax = 0; ax < rank; ++ax) { + const py::ssize_t coord = ind(static_cast(ax), j); + // Convert back to boost's at() signed index (underflow -1, overflow size). + const py::ssize_t signed_idx + = flow ? coord - static_cast(layout[ax].underflow) : coord; + at_index[ax] = static_cast(signed_idx); + } + h.at(at_index) = val(j); + } +} + /// Compute the bin of an array from a runtime list /// For example, [1,3,2] will return that bin of an array template diff --git a/include/bh_python/register_histogram.hpp b/include/bh_python/register_histogram.hpp index 1a3a0522..02566ef7 100644 --- a/include/bh_python/register_histogram.hpp +++ b/include/bh_python/register_histogram.hpp @@ -30,6 +30,30 @@ #include #include +/// Bind the COO get/set helpers, but only for sparse storages (no-op otherwise). +/// Sparse histograms cannot expose a numpy buffer, so these are the supported way +/// to read filled cells (``_to_coo``) and to bulk-populate them (``_from_coo``, +/// used by serialization). +template +void register_sparse_coo(PyClass& hist, std::true_type) { + hist.def( + "_to_coo", + [](histogram_t& self, bool flow) { return histogram_to_coo(self, flow); }, + "flow"_a = false) + .def( + "_from_coo", + [](histogram_t& self, + py::array_t indices, + py::array_t values, + bool flow) { histogram_from_coo(self, indices, values, flow); }, + "indices"_a, + "values"_a, + "flow"_a = true); +} + +template +void register_sparse_coo(PyClass& /* hist */, std::false_type) {} + template auto register_histogram(py::module& m, const char* name, const char* desc) { using histogram_t = bh::histogram; @@ -207,6 +231,8 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { ; + register_sparse_coo(hist, is_sparse_storage{}); + return hist; } diff --git a/include/bh_python/storage.hpp b/include/bh_python/storage.hpp index b0e56397..d3c664b7 100644 --- a/include/bh_python/storage.hpp +++ b/include/bh_python/storage.hpp @@ -19,9 +19,17 @@ #include #include #include +#include namespace storage { +// Sparse storage only allocates cells that are actually filled, by keeping them +// in a hash map keyed on the (flattened) bin index. Accumulator-backed sparse +// storages need https://github.com/boostorg/histogram/pull/421 (Boost 1.92), so +// for now only the arithmetic double_sparse is exposed. +template +using sparse_storage = bh::storage_adaptor>; + // Names match Python names using int64 = bh::dense_storage; using atomic_int64 = bh::dense_storage>; @@ -31,6 +39,7 @@ using weight = bh::dense_storage>; using multi_cell = bh::multi_cell; using mean = bh::dense_storage>; using weighted_mean = bh::dense_storage>; +using double_sparse = sparse_storage; // Allow repr to show python name template @@ -78,6 +87,11 @@ inline const char* name() { return "weighted_mean"; } +template <> +inline const char* name() { + return "double_sparse"; +} + } // namespace storage // It is very important that storages with accumulators have specialized serialization. @@ -187,6 +201,48 @@ void load(Archive& ar, std::copy(a.data(), a.data() + a.size(), reinterpret_cast(s.data())); } +// Sparse storage cannot be flat-serialized: its hash map only holds filled +// cells. We instead store the logical size plus parallel (key, value) arrays. +// storage_adaptor publicly inherits the underlying map, so the upcast exposes +// just the filled cells. +template +void save(Archive& ar, const storage::double_sparse& s, unsigned /* version */) { + using map_type = std::unordered_map; + const auto& map = static_cast(s); + const auto n = static_cast(map.size()); + + py::array_t keys(n); + py::array_t values(n); + auto* kp = keys.mutable_data(); + auto* vp = values.mutable_data(); + for(const auto& kv : map) { + *kp++ = static_cast(kv.first); + *vp++ = kv.second; + } + + ar << static_cast(s.size()); + ar << keys; + ar << values; +} + +template +void load(Archive& ar, storage::double_sparse& s, unsigned /* version */) { + using map_type = std::unordered_map; + std::uint64_t logical_size = 0; + py::array_t keys; + py::array_t values; + ar >> logical_size; + ar >> keys; + ar >> values; + + s.reset(static_cast(logical_size)); + auto& map = static_cast(s); + auto k = keys.unchecked<1>(); + auto v = values.unchecked<1>(); + for(py::ssize_t i = 0; i < v.shape(0); ++i) + map.emplace(static_cast(k(i)), v(i)); +} + namespace pybind11 { namespace detail { /// Allow a Python int to implicitly convert to an atomic int in C++ diff --git a/src/boost_histogram/_core/hist.pyi b/src/boost_histogram/_core/hist.pyi index ca40839d..0e66b5f3 100644 --- a/src/boost_histogram/_core/hist.pyi +++ b/src/boost_histogram/_core/hist.pyi @@ -66,6 +66,22 @@ class any_atomic_int64(_BaseHistogram): def _at_set(self, value: int, *args: int) -> None: ... def sum(self, flow: bool = ...) -> int: ... +class any_double_sparse(_BaseHistogram): + def __idiv__(self, other: any_double_sparse) -> Self: ... + def __imul__(self, other: any_double_sparse) -> Self: ... + def at(self, *args: int) -> float: ... + def _at_set(self, value: float, *args: int) -> None: ... + def sum(self, flow: bool = ...) -> float: ... + def _to_coo( + self, flow: bool = ... + ) -> tuple[np.typing.NDArray[np.intp], np.typing.NDArray[np.float64]]: ... + def _from_coo( + self, + indices: np.typing.NDArray[np.intp], + values: np.typing.NDArray[np.float64], + flow: bool = ..., + ) -> None: ... + class any_weight(_BaseHistogram): def __idiv__(self, other: any_weight) -> Self: ... def __imul__(self, other: any_weight) -> Self: ... diff --git a/src/boost_histogram/_core/storage.pyi b/src/boost_histogram/_core/storage.pyi index 6afe44e9..88d2d524 100644 --- a/src/boost_histogram/_core/storage.pyi +++ b/src/boost_histogram/_core/storage.pyi @@ -16,6 +16,7 @@ class unlimited(_BaseStorage): ... class weight(_BaseStorage): ... class mean(_BaseStorage): ... class weighted_mean(_BaseStorage): ... +class double_sparse(_BaseStorage): ... class multi_cell(_BaseStorage): def __init__(self, nelem: int, /): ... diff --git a/src/boost_histogram/histogram.py b/src/boost_histogram/histogram.py index c6830a60..255506f0 100644 --- a/src/boost_histogram/histogram.py +++ b/src/boost_histogram/histogram.py @@ -95,6 +95,7 @@ def __dir__() -> list[str]: _core.hist.any_mean, _core.hist.any_weighted_mean, _core.hist.any_multi_cell, + _core.hist.any_double_sparse, } logger = logging.getLogger(__name__) @@ -552,6 +553,11 @@ def view( self: Histogram[bhs.WeightedMean], flow: bool = False ) -> WeightedMeanView: ... + @typing.overload + def view( + self: Histogram[bhs.DoubleSparse], flow: bool = False + ) -> typing.NoReturn: ... + @typing.overload def view( self: Histogram[Any], flow: bool = False @@ -574,9 +580,64 @@ def view( ): """ Return a view into the data, optionally with overflow turned on. + + Sparse storage (:class:`~boost_histogram.storage.DoubleSparse`) does not + support this, since there is no dense buffer to view into; use + :meth:`to_coo` to read the filled cells, or :meth:`values` (and the other + copying accessors) if a dense array is acceptable. """ return _to_view(self._hist.view(flow)) + def _densify(self, flow: bool = False) -> np.typing.NDArray[np.float64]: + """ + Build a dense array of values from sparse (COO) storage. Unlike + :meth:`view`, this is a freshly allocated copy, so writing to it does not + change the histogram. Densifying a very large axis space can exhaust + memory; that is the caller's choice (use :meth:`to_coo` to avoid it). + """ + shape = tuple( + ax.size + + (int(ax.traits.underflow) + int(ax.traits.overflow) if flow else 0) + for ax in self.axes + ) + arr = np.zeros(shape, dtype=np.float64) + indices, values = self.to_coo(flow=flow) + if values.size: + arr[indices] = values + return arr + + def _view_or_dense(self, flow: bool) -> Any: + """ + Like :meth:`view`, but densifies sparse storage into a copy instead of + raising. Used by the copying accessors (:meth:`values` etc.) so they keep + working for sparse histograms, while :meth:`view` stays buffer-only. + """ + if isinstance(self._hist, _core.hist.any_double_sparse): + return self._densify(flow) + return self.view(flow) + + def to_coo( + self, flow: bool = False + ) -> tuple[tuple[np.typing.NDArray[np.intp], ...], np.typing.NDArray[np.float64]]: + """ + Return the filled cells of a sparse histogram in coordinate (COO) form. + + This is the sparse-storage replacement for :meth:`view`. It returns a + tuple ``(indices, values)`` where ``indices`` is a tuple of per-axis + index arrays (one per dimension, in the style of :func:`numpy.nonzero`, + so it can be used directly as a fancy index) and ``values`` is the array + of corresponding cell values. Only filled cells are returned. + + :param flow: If True, include flow bins, with indices running over the + with-flow grid (underflow at 0); if False (default), flow cells are + dropped and indices run ``0..size-1`` per axis. + """ + if not isinstance(self._hist, _core.hist.any_double_sparse): + msg = "to_coo() is only available for sparse storage; use view() instead" + raise TypeError(msg) + indices, values = self._hist._to_coo(flow) + return tuple(indices), values + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -588,7 +649,7 @@ def __array__( kwargs = {} if copy is not None: kwargs["copy"] = copy - return np.asarray(self.view(False), dtype=dtype, **kwargs) # type: ignore[call-overload, no-any-return] + return np.asarray(self._view_or_dense(False), dtype=dtype, **kwargs) # type: ignore[call-overload, no-any-return] __hash__ = None # type: ignore[assignment] @@ -941,8 +1002,11 @@ def __str__(self) -> str: still experimental. Do not rely on any particular rendering. """ # TODO check the terminal width and adjust the presentation - # only use for 1D, fall back to repr for ND - if self._hist.rank() != 1: + # only use for 1D, fall back to repr for ND. Sparse storage also falls + # back to repr, since the C++ rendering would iterate every (dense) bin. + if self._hist.rank() != 1 or isinstance( + self._hist, _core.hist.any_double_sparse + ): return repr(self) s = str(self._hist) # get rid of first line and last character @@ -1740,6 +1804,7 @@ def values( @typing.overload def values( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Unlimited] | Histogram[bhs.Weight] | Histogram[bhs.Mean] @@ -1769,7 +1834,7 @@ def values( :return: "np.typing.NDArray[Any]"[np.float64] """ - view: Any = self.view(flow) + view: Any = self._view_or_dense(flow) # TODO: Might be a NumPy typing bug if len(view.dtype) == 0: return view # type: ignore[no-any-return] @@ -1783,6 +1848,7 @@ def variances( @typing.overload def variances( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Unlimited] | Histogram[bhs.MultiCell], flow: bool = ..., @@ -1825,7 +1891,7 @@ def variances( :return: "np.typing.NDArray[Any]"[np.float64] """ - view: Any = self.view(flow) + view: Any = self._view_or_dense(flow) if len(view.dtype) == 0: return view if self._variance_known else None @@ -1856,6 +1922,7 @@ def counts( @typing.overload def counts( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Unlimited] | Histogram[bhs.Weight] | Histogram[bhs.Mean] @@ -1894,7 +1961,7 @@ def counts( :return: "np.typing.NDArray[Any]"[np.float64] """ - view: Any = self.view(flow) + view: Any = self._view_or_dense(flow) if len(view.dtype) == 0: return view # type: ignore[no-any-return] diff --git a/src/boost_histogram/serialization/__init__.py b/src/boost_histogram/serialization/__init__.py index 8c4b0cbe..799609ea 100644 --- a/src/boost_histogram/serialization/__init__.py +++ b/src/boost_histogram/serialization/__init__.py @@ -69,9 +69,13 @@ def to_uhi( # Convert the histogram to a dictionary writer_info = {"boost-histogram": {"version": version.version}} - # Store storage type info for AtomicInt64 and Unlimited (they serialize as int/double) + # Store storage type info for types that serialize as a plain int/double, so + # the exact storage can be restored (AtomicInt64, Unlimited, DoubleSparse). storage_type_str = _storage_type_to_str(h.storage_type()) - if isinstance(h.storage_type(), (storage.AtomicInt64, storage.Unlimited)): + is_sparse = isinstance(h.storage_type(), storage.DoubleSparse) + if isinstance( + h.storage_type(), (storage.AtomicInt64, storage.Unlimited, storage.DoubleSparse) + ): writer_info["boost-histogram"]["storage_type"] = type(h.storage_type()).__name__ data = { @@ -79,10 +83,26 @@ def to_uhi( "writer_info": writer_info, "axes": [_axis_to_dict(axis) for axis in h.axes], } - if keep_storage: - data["storage"] = _storage_to_dict(h.storage_type(), h.view(flow=True)) - else: + if not keep_storage: data["storage"] = {"type": storage_type_str} + elif is_sparse: + # Sparse storage has no dense view; serialize only the filled cells (COO), + # with flow so the round-trip is exact. This encoding is boost-histogram + # specific (other UHI consumers will not understand "sparse"). + idx_tuple, values = h.to_coo(flow=True) + indices = ( + np.stack(idx_tuple) + if idx_tuple + else np.empty((0, values.shape[0]), dtype=np.intp) + ) + data["storage"] = { + "type": storage_type_str, + "sparse": True, + "indices": indices, + "values": values, + } + else: + data["storage"] = _storage_to_dict(h.storage_type(), h.view(flow=True)) data["metadata"] = serialize_metadata(h.__dict__) return data @@ -98,6 +118,17 @@ def from_uhi(data: dict[str, Any], /) -> histogram.Histogram[Any]: h = histogram.Histogram[Any](*axis, storage=storage_) h.__dict__ = data.get("metadata", {}) + # Sparse storage is stored as COO (filled cells only) and cannot go through + # the dense view-based reconstruction below. Scatter the cells back directly. + if storage_data.get("sparse"): + indices = np.asarray(storage_data["indices"], dtype=np.intp) + values = np.asarray(storage_data["values"], dtype=np.float64) + # JSON round-trips can collapse a zero-length 2D array to 1D; restore it. + if indices.ndim == 1: + indices = indices.reshape(len(h.axes), values.shape[0]) + h._hist._from_coo(indices, values, True) # type: ignore[attr-defined] + return h + # Check if storage has data (if not, it's a structure-only histogram) # Validate required keys per storage type before deciding to skip data loading storage_type = storage_data["type"] diff --git a/src/boost_histogram/serialization/_storage.py b/src/boost_histogram/serialization/_storage.py index 116dd4bd..5b8bb596 100644 --- a/src/boost_histogram/serialization/_storage.py +++ b/src/boost_histogram/serialization/_storage.py @@ -24,7 +24,7 @@ def _storage_type_to_str(_storage: storage.Storage, /) -> str: match _storage: case storage.Int64(): return "int" - case storage.Double(): + case storage.Double() | storage.DoubleSparse(): return "double" case storage.AtomicInt64(): return "int" @@ -114,6 +114,8 @@ def _storage_from_dict( return storage.AtomicInt64() if orig_type == "Unlimited": return storage.Unlimited() + if orig_type == "DoubleSparse": + return storage.DoubleSparse() storage_type = data["type"] if storage_type == "int": diff --git a/src/boost_histogram/storage.py b/src/boost_histogram/storage.py index 5a6d9a82..171f4cf4 100644 --- a/src/boost_histogram/storage.py +++ b/src/boost_histogram/storage.py @@ -10,6 +10,7 @@ __all__ = [ "AtomicInt64", "Double", + "DoubleSparse", "Int64", "Mean", "MultiCell", @@ -56,6 +57,17 @@ class Double(store.double, Storage, family=boost_histogram): accumulator = float +class DoubleSparse(store.double_sparse, Storage, family=boost_histogram): + """ + Sparse storage of doubles: only filled cells are kept (in a hash map), so + histograms over very large or high-dimensional axis spaces stay small in + memory. It does not support ``.view()`` / dense-array access; use + :meth:`Histogram.to_coo` to read the filled cells instead. + """ + + accumulator = float + + class AtomicInt64(store.atomic_int64, Storage, family=boost_histogram): accumulator = int diff --git a/src/register_histograms.cpp b/src/register_histograms.cpp index 5450c4de..ead1318f 100644 --- a/src/register_histograms.cpp +++ b/src/register_histograms.cpp @@ -56,4 +56,10 @@ void register_histograms(py::module& hist) { "any_multi_cell", "N-dimensional histogram for storing multiple cells at once with any axis " "types."); + + register_histogram( + hist, + "any_double_sparse", + "N-dimensional histogram for real-valued data using sparse storage (only " + "filled cells are stored) with any axis types."); } diff --git a/src/register_storage.cpp b/src/register_storage.cpp index c6082926..3b7b10f6 100644 --- a/src/register_storage.cpp +++ b/src/register_storage.cpp @@ -40,4 +40,9 @@ void register_storages(py::module& storage) { storage, "multi_cell", "Dense storage which tracks sums of cell entries for multiple cells per entry"); + + register_storage( + storage, + "double_sparse", + "Sparse storage for doubles (only filled cells are stored)"); } diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py new file mode 100644 index 00000000..a0786670 --- /dev/null +++ b/tests/test_sparse_storage.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import copy +from pickle import dumps, loads + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import boost_histogram as bh +from boost_histogram.serialization import from_uhi, to_uhi + + +def coo_dict(h): + """Map of (multi-index) -> value for the filled cells, order-independent.""" + indices, values = h.to_coo() + return {tuple(int(i[j]) for i in indices): v for j, v in enumerate(values)} + + +def pickle_roundtrip(x): + return loads(dumps(x)) + + +def test_construct_and_fill(): + h = bh.Histogram(bh.axis.Regular(1000, 0, 1000), storage=bh.storage.DoubleSparse()) + assert h.storage_type is bh.storage.DoubleSparse + + h.fill([1.5, 1.5, 5.5, 900.5]) + assert h.sum() == 4.0 + # Point reads go through C++ at(), not the (unavailable) dense view. + assert h[1] == 2.0 + assert h[5] == 1.0 + assert h[900] == 1.0 + assert h[2] == 0.0 + + +def test_stays_sparse_after_ops(): + h = bh.Histogram(bh.axis.Regular(100, 0, 100), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 50.5, 50.5]) + + # Slicing and projection use C++ algorithms and must keep sparse storage. + sliced = h[10:60] + assert sliced.storage_type is bh.storage.DoubleSparse + assert sliced.sum() == 2.0 + + h2 = bh.Histogram( + bh.axis.Regular(5, 0, 5), + bh.axis.Regular(5, 0, 5), + storage=bh.storage.DoubleSparse(), + ) + h2.fill([1.5, 1.5], [2.5, 2.5]) + projected = h2.project(0) + assert projected.storage_type is bh.storage.DoubleSparse + assert projected.sum() == 2.0 + + +def test_to_coo_filled_cells_only(): + h = bh.Histogram( + bh.axis.Regular(5, 0, 5), + bh.axis.IntCategory([10, 20, 30]), + storage=bh.storage.DoubleSparse(), + ) + h.fill([1.5, 1.5, 3.5], [10, 10, 30]) + + indices, values = h.to_coo() + assert isinstance(indices, tuple) + assert len(indices) == 2 + # One entry per distinct filled bin. + assert len(values) == 2 + assert coo_dict(h) == {(1, 0): 2.0, (3, 2): 1.0} + + +def test_to_coo_flow(): + h = bh.Histogram(bh.axis.Regular(10, 0, 10), storage=bh.storage.DoubleSparse()) + h.fill([-5, 5.5, 100]) # underflow, normal bin 5, overflow + + # Without flow, the flow cells are dropped, indices run 0..size-1. + indices, values = h.to_coo(flow=False) + assert coo_dict(h) == {(5,): 1.0} + + # With flow, underflow sits at index 0, overflow at extent-1 (== 11 here). + indices_f, values_f = h.to_coo(flow=True) + flow_map = {int(indices_f[0][j]): v for j, v in enumerate(values_f)} + assert flow_map == {0: 1.0, 6: 1.0, 11: 1.0} + + +def test_view_raises_but_copies_densify(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5]) + + # Only the zero-copy buffer view is unsupported. + with pytest.raises(TypeError, match="to_coo"): + h.view() + + # The copying accessors densify and keep working. + assert_array_equal(np.asarray(h), [0, 1, 0, 0, 0]) + assert_array_equal(h.values(), [0, 1, 0, 0, 0]) + assert_array_equal(h.counts(), [0, 1, 0, 0, 0]) + assert_array_equal(h.variances(), [0, 1, 0, 0, 0]) + + +def test_copy_accessors_match_dense(): + axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) + x = np.array([-1, 0.5, 0.5, 3.5, 9.5, 100]) + y = np.array([5, 6, 6, 7, 5, 6]) + + dense = bh.Histogram(*axes, storage=bh.storage.Double()) + dense.fill(x, y) + sparse = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + sparse.fill(x, y) + + for flow in (False, True): + assert_array_equal(sparse.values(flow=flow), dense.values(flow=flow)) + assert_array_equal(sparse.counts(flow=flow), dense.counts(flow=flow)) + assert_array_equal(np.asarray(sparse), np.asarray(dense)) + + +def test_to_coo_requires_sparse(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.Double()) + with pytest.raises(TypeError, match="sparse"): + h.to_coo() + + +def test_repr_and_str_do_not_densify(): + # A huge axis would be impossible to densify; repr/str must still work. + h = bh.Histogram( + bh.axis.Regular(10**9, 0, 1), storage=bh.storage.DoubleSparse() + ) + h.fill([0.5, 0.5]) + assert "DoubleSparse" in repr(h) + assert "Sum: 2" in repr(h) + # __str__ falls back to repr for sparse rather than rendering every bin. + assert "DoubleSparse" in str(h) + + +@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy, pickle_roundtrip]) +def test_copy_pickle_roundtrip(copy_fn): + h = bh.Histogram( + bh.axis.Regular(20, 0, 20), + bh.axis.IntCategory([1, 2, 3]), + storage=bh.storage.DoubleSparse(), + ) + h.fill([-1, 5.5, 5.5, 100], [1, 2, 2, 3]) + + h2 = copy_fn(h) + assert h2 == h + assert h2.sum(flow=True) == h.sum(flow=True) + assert coo_dict(h2) == coo_dict(h) + + +def test_uhi_roundtrip_with_data(): + h = bh.Histogram( + bh.axis.Regular(10, 0, 10), + bh.axis.Regular(8, 0, 8), + storage=bh.storage.DoubleSparse(), + ) + h.fill([-1, 2.5, 2.5, 100], [3, 4, 4, 200]) + + data = to_uhi(h) + assert data["storage"]["type"] == "double" + assert data["storage"]["sparse"] is True + assert data["writer_info"]["boost-histogram"]["storage_type"] == "DoubleSparse" + + h2 = from_uhi(data) + assert h2.storage_type is bh.storage.DoubleSparse + assert h2 == h + assert h2.sum(flow=True) == h.sum(flow=True) + + +def test_uhi_roundtrip_through_json(): + import json + + h = bh.Histogram( + bh.axis.Regular(10, 0, 10), + bh.axis.Regular(6, 0, 6), + storage=bh.storage.DoubleSparse(), + ) + h.fill([2.5, 2.5, 8.5], [1.5, 1.5, 3.5]) + + def to_jsonable(obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, dict): + return {k: to_jsonable(v) for k, v in obj.items()} + if isinstance(obj, list): + return [to_jsonable(v) for v in obj] + return obj + + data = json.loads(json.dumps(to_jsonable(to_uhi(h)))) + h2 = from_uhi(data) + assert h2 == h + + +def test_uhi_structure_only(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 1.5]) + + data = to_uhi(h, keep_storage=False) + h2 = from_uhi(data) + assert h2.storage_type is bh.storage.DoubleSparse + assert h2.sum(flow=True) == 0.0 + + +def test_uhi_roundtrip_empty(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h2 = from_uhi(to_uhi(h)) + assert h2 == h + assert h2.sum() == 0.0 + + +def test_matches_dense_values(): + # The filled cells should agree with what a dense Double histogram records. + axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) + x = np.array([0.5, 0.5, 3.5, 9.5]) + y = np.array([5, 6, 6, 7]) + + dense = bh.Histogram(*axes, storage=bh.storage.Double()) + dense.fill(x, y) + sparse = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + sparse.fill(x, y) + + dense_view = dense.view() + rebuilt = np.zeros_like(dense_view) + indices, values = sparse.to_coo() + rebuilt[indices] = values + assert_array_equal(rebuilt, dense_view) From a9d5e504a088508c1c2237a27ba3042df5eb57a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 15:14:37 +0000 Subject: [PATCH 2/8] style: pre-commit fixes --- tests/test_sparse_storage.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py index a0786670..6a87ade9 100644 --- a/tests/test_sparse_storage.py +++ b/tests/test_sparse_storage.py @@ -123,9 +123,7 @@ def test_to_coo_requires_sparse(): def test_repr_and_str_do_not_densify(): # A huge axis would be impossible to densify; repr/str must still work. - h = bh.Histogram( - bh.axis.Regular(10**9, 0, 1), storage=bh.storage.DoubleSparse() - ) + h = bh.Histogram(bh.axis.Regular(10**9, 0, 1), storage=bh.storage.DoubleSparse()) h.fill([0.5, 0.5]) assert "DoubleSparse" in repr(h) assert "Sum: 2" in repr(h) From 7e70d450b462348d5db258805aad3e1f3ff77c57 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 11:24:26 -0400 Subject: [PATCH 3/8] fix: sparse np.asarray raises; densify only explicit accessors; add docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit np.asarray(h) now raises for sparse storage like .view() does, rather than silently densifying — implicit array conversion of a potentially huge sparse histogram is a memory hazard. The explicit copying accessors (values/variances/ counts) still densify on demand. Add a "Sparse storages" section to the storage user guide, including how to build a scipy.sparse.coo_array from to_coo() output. Assisted-by: ClaudeCode:claude-opus-4.8 --- docs/user-guide/storage.rst | 84 ++++++++++++++++++++++++++++++++ src/boost_histogram/histogram.py | 11 +++-- tests/test_sparse_storage.py | 15 +++--- 3 files changed, 98 insertions(+), 12 deletions(-) diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index 4fe183a2..af417130 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -66,6 +66,90 @@ This storage is like ``Int64()``, but also provides a thread safety guarantee. You can fill a single histogram from multiple threads. +Sparse storages +--------------- + +DoubleSparse +^^^^^^^^^^^^ + +The ``DoubleSparse()`` storage is like ``Double()``, but only the bins that are +actually filled are stored (in a hash map keyed on the bin index). This makes +histograms over very large or high-dimensional axis spaces feasible, where a +dense grid would not fit in memory even though only a small fraction of the bins +are ever filled. + +.. code-block:: python3 + + import boost_histogram as bh + + # A 5-dimensional grid; dense storage would need 10**15 cells. + axes = [bh.axis.Regular(1000, 0, 1) for _ in range(5)] + h = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + h.fill(*data) # only the filled cells are allocated + +Because the data is not stored as a dense array, **sparse storage is not a +drop-in replacement for a dense histogram**: + +* ``h.view()`` and ``np.asarray(h)`` raise, since there is no contiguous buffer + to view into. +* The copying accessors -- ``h.values()``, ``h.variances()``, ``h.counts()`` -- + *densify* into a freshly allocated array, so they keep working. This is only a + good idea when the dense shape actually fits in memory; for a genuinely huge + grid, use :meth:`~boost_histogram.Histogram.to_coo` instead. +* Reading a single bin (``h[bh.loc(...)]``), slicing, projection, ``.sum()``, and + adding two histograms together all keep the result sparse. +* Writes are fill-only: ``h[...] = value`` is not supported (it would need a live + buffer). + +Only ``double`` sparse storage is available for now; accumulator-backed sparse +storages (the equivalent of ``Weight``, ``Mean``, ...) are waiting on a +Boost.Histogram fix that ships in Boost 1.92. + +Reading the filled cells +"""""""""""""""""""""""" + +:meth:`~boost_histogram.Histogram.to_coo` returns the filled cells in coordinate +(COO) form -- a tuple of per-axis index arrays (in the style of +:func:`numpy.nonzero`, so they can be used directly as a fancy index) and the +array of corresponding values: + +.. code-block:: python3 + + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 1.5, 3.5]) + + indices, values = h.to_coo() + # indices == (array([1, 3]),) values == array([2., 1.]) + +Pass ``flow=True`` to include the underflow/overflow bins (the indices then run +over the with-flow grid, with underflow at ``0``). + +Converting to a SciPy sparse array +"""""""""""""""""""""""""""""""""" + +For a 2D histogram, the COO output maps directly onto a +:class:`scipy.sparse.coo_array`. The per-axis index arrays are the row and +column coordinates, and ``h.axes.size`` is the dense shape: + +.. code-block:: python3 + + from scipy.sparse import coo_array + + h = bh.Histogram( + bh.axis.Regular(1000, 0, 1000), + bh.axis.Regular(1000, 0, 1000), + storage=bh.storage.DoubleSparse(), + ) + h.fill(xs, ys) + + (rows, cols), values = h.to_coo() + matrix = coo_array((values, (rows, cols)), shape=h.axes.size) + +(SciPy sparse arrays are two-dimensional, so this applies to 2D histograms; for +higher dimensions, use the ``(indices, values)`` pair directly, e.g. with an +N-dimensional sparse library such as ``sparse``.) + + Accumulator storages -------------------- diff --git a/src/boost_histogram/histogram.py b/src/boost_histogram/histogram.py index 255506f0..d17085de 100644 --- a/src/boost_histogram/histogram.py +++ b/src/boost_histogram/histogram.py @@ -582,9 +582,10 @@ def view( Return a view into the data, optionally with overflow turned on. Sparse storage (:class:`~boost_histogram.storage.DoubleSparse`) does not - support this, since there is no dense buffer to view into; use - :meth:`to_coo` to read the filled cells, or :meth:`values` (and the other - copying accessors) if a dense array is acceptable. + support this, since there is no dense buffer to view into (``np.asarray`` + raises for the same reason); use :meth:`to_coo` to read the filled cells, + or :meth:`values` (and the other copying accessors) if an explicitly + densified array is acceptable. """ return _to_view(self._hist.view(flow)) @@ -649,7 +650,9 @@ def __array__( kwargs = {} if copy is not None: kwargs["copy"] = copy - return np.asarray(self._view_or_dense(False), dtype=dtype, **kwargs) # type: ignore[call-overload, no-any-return] + # Implicit array conversion goes through view(), so it raises for sparse + # storage rather than silently densifying. Use .values() or .to_coo(). + return np.asarray(self.view(False), dtype=dtype, **kwargs) # type: ignore[call-overload, no-any-return] __hash__ = None # type: ignore[assignment] diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py index 6a87ade9..bee2523e 100644 --- a/tests/test_sparse_storage.py +++ b/tests/test_sparse_storage.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import json from pickle import dumps, loads import numpy as np @@ -75,7 +76,6 @@ def test_to_coo_flow(): h.fill([-5, 5.5, 100]) # underflow, normal bin 5, overflow # Without flow, the flow cells are dropped, indices run 0..size-1. - indices, values = h.to_coo(flow=False) assert coo_dict(h) == {(5,): 1.0} # With flow, underflow sits at index 0, overflow at extent-1 (== 11 here). @@ -84,16 +84,17 @@ def test_to_coo_flow(): assert flow_map == {0: 1.0, 6: 1.0, 11: 1.0} -def test_view_raises_but_copies_densify(): +def test_view_and_asarray_raise_but_copies_densify(): h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) h.fill([1.5]) - # Only the zero-copy buffer view is unsupported. + # The zero-copy buffer view (and implicit array conversion) are unsupported. with pytest.raises(TypeError, match="to_coo"): h.view() + with pytest.raises(TypeError): + np.asarray(h) - # The copying accessors densify and keep working. - assert_array_equal(np.asarray(h), [0, 1, 0, 0, 0]) + # The explicit copying accessors densify and keep working. assert_array_equal(h.values(), [0, 1, 0, 0, 0]) assert_array_equal(h.counts(), [0, 1, 0, 0, 0]) assert_array_equal(h.variances(), [0, 1, 0, 0, 0]) @@ -112,7 +113,7 @@ def test_copy_accessors_match_dense(): for flow in (False, True): assert_array_equal(sparse.values(flow=flow), dense.values(flow=flow)) assert_array_equal(sparse.counts(flow=flow), dense.counts(flow=flow)) - assert_array_equal(np.asarray(sparse), np.asarray(dense)) + assert_array_equal(sparse.variances(flow=flow), dense.variances(flow=flow)) def test_to_coo_requires_sparse(): @@ -166,8 +167,6 @@ def test_uhi_roundtrip_with_data(): def test_uhi_roundtrip_through_json(): - import json - h = bh.Histogram( bh.axis.Regular(10, 0, 10), bh.axis.Regular(6, 0, 6), From 3b282c7d617bd040a547b04879d380cad6d2aa18 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 12:07:07 -0400 Subject: [PATCH 4/8] fix: validate coo shapes and avoid allocation in sparse loop - Add missing indices.shape(1) vs values.shape(0) validation in histogram_from_coo to prevent out-of-bounds reads. - Hoist std::vector allocation out of the map loop in histogram_to_coo. - Add tests for setitem raising on sparse and sparse+histogram addition. Assisted-by: opencode:NRP/kimi --- include/bh_python/histogram.hpp | 6 ++++-- tests/test_sparse_storage.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/bh_python/histogram.hpp b/include/bh_python/histogram.hpp index 56071873..a0a6aa57 100644 --- a/include/bh_python/histogram.hpp +++ b/include/bh_python/histogram.hpp @@ -190,10 +190,10 @@ py::tuple histogram_to_coo(bh::histogram>& h, bool std::vector vals; vals.reserve(map.size()); + std::vector multi(rank); for(const auto& kv : map) { std::size_t lin = kv.first; - std::vector multi(rank); - bool keep = true; + bool keep = true; for(std::size_t ax = 0; ax < rank; ++ax) { const auto& info = layout[ax]; const std::size_t iflow = lin % info.extent; @@ -252,6 +252,8 @@ void histogram_from_coo(bh::histogram>& h, throw py::value_error("indices first dimension must equal histogram rank"); const auto n = val.shape(0); + if(ind.shape(1) != n) + throw py::value_error("indices second dimension must match values length"); std::vector at_index(rank); for(py::ssize_t j = 0; j < n; ++j) { for(std::size_t ax = 0; ax < rank; ++ax) { diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py index bee2523e..7d232d5b 100644 --- a/tests/test_sparse_storage.py +++ b/tests/test_sparse_storage.py @@ -205,6 +205,34 @@ def test_uhi_roundtrip_empty(): assert h2.sum() == 0.0 +def test_setitem_raises_on_sparse(): + """Sparse storage does not support direct assignment (no dense buffer).""" + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + with pytest.raises(TypeError, match="buffer"): + h[0] = 1.0 + + +def test_sparse_histogram_addition(): + """Adding two sparse histograms stays sparse and combines counts correctly.""" + h1 = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h1.fill([1.5, 3.5]) + + h2 = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h2.fill([1.5, 4.5]) + + h_sum = h1 + h2 + assert h_sum.storage_type is bh.storage.DoubleSparse + assert h_sum[1] == 2.0 + assert h_sum[3] == 1.0 + assert h_sum[4] == 1.0 + assert h_sum[2] == 0.0 + + # densified view should match a dense histogram with the same fills + dense = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.Double()) + dense.fill([1.5, 1.5, 3.5, 4.5]) + assert_array_equal(h_sum.values(), dense.values()) + + def test_matches_dense_values(): # The filled cells should agree with what a dense Double histogram records. axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) From 7d3598dc8cf05531c83337a2be21cbc4539748b3 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 12:16:41 -0400 Subject: [PATCH 5/8] fix: avoid densifying edges in repr and clang-tidy value params Regular._repr_args_ built the full edges array just to read the first and last edge; for a very large axis (e.g. the sparse-storage repr test with 1e9 bins) this allocates gigabytes, OOM-killing PyPy CI and overflowing the max array size on 32-bit Windows. Use value() for the endpoints instead, which is O(1). Also pass the coo arrays to histogram_from_coo by const reference to satisfy clang-tidy's performance-unnecessary-value-param check. Assisted-by: ClaudeCode:claude-opus-4.8 --- include/bh_python/histogram.hpp | 4 ++-- include/bh_python/register_histogram.hpp | 4 ++-- src/boost_histogram/axis/__init__.py | 9 ++++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/bh_python/histogram.hpp b/include/bh_python/histogram.hpp index a0a6aa57..1221830f 100644 --- a/include/bh_python/histogram.hpp +++ b/include/bh_python/histogram.hpp @@ -239,8 +239,8 @@ py::tuple histogram_to_coo(bh::histogram>& h, bool /// __setitem__ cannot densify the map. template void histogram_from_coo(bh::histogram>& h, - py::array_t indices, - py::array_t values, + const py::array_t& indices, + const py::array_t& values, bool flow) { const auto& axes = bh::unsafe_access::axes(h); const auto layout = detail::make_axis_layout(axes); diff --git a/include/bh_python/register_histogram.hpp b/include/bh_python/register_histogram.hpp index 02566ef7..143ebd9b 100644 --- a/include/bh_python/register_histogram.hpp +++ b/include/bh_python/register_histogram.hpp @@ -43,8 +43,8 @@ void register_sparse_coo(PyClass& hist, std::true_type) { .def( "_from_coo", [](histogram_t& self, - py::array_t indices, - py::array_t values, + const py::array_t& indices, + const py::array_t& values, bool flow) { histogram_from_coo(self, indices, values, flow); }, "indices"_a, "values"_a, diff --git a/src/boost_histogram/axis/__init__.py b/src/boost_histogram/axis/__init__.py index f7435413..5e90ea7b 100644 --- a/src/boost_histogram/axis/__init__.py +++ b/src/boost_histogram/axis/__init__.py @@ -404,7 +404,14 @@ def __init__( def _repr_args_(self) -> list[str]: "Return inner part of signature for use in repr" - ret = [f"{self.size:g}", f"{self.edges[0]:g}", f"{self.edges[-1]:g}"] + # Use value() for the endpoints rather than building the full edges + # array, which would be enormous (and unrepresentable on 32-bit) for a + # very large axis. + ret = [ + f"{self.size:g}", + f"{self._ax.value(0):g}", + f"{self._ax.value(self.size):g}", + ] if self.traits.growth: ret.append("growth=True") From 4dbb789c17e5a2bb1d03152d20992b15f71b7047 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 13:32:18 -0400 Subject: [PATCH 6/8] feat: keep scalar mul/div sparse and document sparse op support Route scalar * and / on sparse storage through the COO get/set path so they scale only the filled cells (0*x == 0 preserves sparsity) instead of densifying via the NumPy view. Scalar +/- are refused with a clear error since they would fill every cell. Document which operations stay sparse, densify, or raise, and note mixed int+slice indexing as a follow-up. Assisted-by: ClaudeCode:claude-opus-4.8 --- docs/user-guide/storage.rst | 46 ++++++++++++----- src/boost_histogram/histogram.py | 24 +++++++++ tests/test_sparse_storage.py | 88 ++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 12 deletions(-) diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index af417130..d0b7d92a 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -88,18 +88,40 @@ are ever filled. h.fill(*data) # only the filled cells are allocated Because the data is not stored as a dense array, **sparse storage is not a -drop-in replacement for a dense histogram**: - -* ``h.view()`` and ``np.asarray(h)`` raise, since there is no contiguous buffer - to view into. -* The copying accessors -- ``h.values()``, ``h.variances()``, ``h.counts()`` -- - *densify* into a freshly allocated array, so they keep working. This is only a - good idea when the dense shape actually fits in memory; for a genuinely huge - grid, use :meth:`~boost_histogram.Histogram.to_coo` instead. -* Reading a single bin (``h[bh.loc(...)]``), slicing, projection, ``.sum()``, and - adding two histograms together all keep the result sparse. -* Writes are fill-only: ``h[...] = value`` is not supported (it would need a live - buffer). +drop-in replacement for a dense histogram**. Operations that can be expressed +without materializing the full grid keep the result sparse; operations that need +a dense buffer raise a clear ``TypeError`` (pointing you at +:meth:`~boost_histogram.Histogram.to_coo`) rather than silently densifying. + +Stays sparse (or returns a scalar) -- delegates to the C++ storage: + +* Reading a single bin: ``h[bh.loc(...)]``, ``h[5]``. +* Integer slicing (``h[10:60]``), rebinning (``h[:: bh.rebin(2)]``), integration + (``h[10:60:sum]``), and projection (``h.project(0)``). +* ``.sum()``, adding two histograms (``h1 + h2``), and scalar ``*`` / ``/`` + (which only scale the filled cells, so ``0`` stays ``0``). +* ``copy``/``deepcopy``, pickling, and UHI serialization. + +Densifies into a fresh array (works, but allocates the full dense shape): + +* The copying accessors ``h.values()``, ``h.variances()``, ``h.counts()``. Only + use these when the dense shape fits in memory; otherwise prefer + :meth:`~boost_histogram.Histogram.to_coo`. + +Unsupported -- raises ``TypeError``: + +* ``h.view()`` and ``np.asarray(h)`` -- there is no contiguous buffer to view. +* Writes: ``h[...] = value`` (filling with ``h.fill(...)`` is the only write path). +* Scalar ``+`` / ``-`` -- these would add to *every* cell, filling the grid. +* Fancy indexing that gathers through a dense view: mixed integer + slice + (``h[2, :]``), categorical list picks (``h[:, [0, 2]]``), and NumPy array + indexing (``h[np.array([...])]``). + +.. note:: + + Mixed integer + slice indexing (``h[2, :]``) is a common pattern and a likely + follow-up to support natively on sparse storage; for now, slice without + collapsing the axis (``h[2:3, :]``) to stay sparse, or densify first. Only ``double`` sparse storage is available for now; accumulator-backed sparse storages (the equivalent of ``Weight``, ``Mean``, ...) are waiting on a diff --git a/src/boost_histogram/histogram.py b/src/boost_histogram/histogram.py index d17085de..0e279e2d 100644 --- a/src/boost_histogram/histogram.py +++ b/src/boost_histogram/histogram.py @@ -860,6 +860,8 @@ def _compute_inplace_op( else: msg = f"Wrong shape {other.shape}, expected {self.shape} or {self.axes.extent}" raise ValueError(msg) + elif isinstance(self._hist, _core.hist.any_double_sparse): + self._sparse_scalar_inplace_op(name, other) else: view = self.view(flow=True) getattr(view, name)(other) @@ -867,6 +869,28 @@ def _compute_inplace_op( self._variance_known = False return self + def _sparse_scalar_inplace_op( + self, name: str, other: np.typing.NDArray[Any] | float + ) -> None: + # Sparse storage has no dense buffer to mutate in place. ``*`` and ``/`` + # only scale the already-filled cells (0 * x == 0), so sparsity is + # preserved -- rescale them through the COO get/set path. ``+`` and + # ``-`` would have to touch every cell, densifying the storage, so they + # are refused rather than silently materializing the full grid. + if name in {"__iadd__", "__isub__"}: + symbol = _INPLACE_OP_SYMBOLS[name] + msg = ( + f"Sparse storage does not support scalar {symbol!r}; it would " + "fill every cell. Densify with .values() or use a dense storage." + ) + raise TypeError(msg) + indices, values = self._hist._to_coo(True) # type: ignore[attr-defined] + if name in {"__itruediv__", "__idiv__"}: + values = values / other + else: # __imul__ + values = values * other + self._hist._from_coo(indices, values, True) # type: ignore[attr-defined] + # TODO: Marked as too complex by flake8. Should be factored out a bit. def fill( self, diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py index 7d232d5b..a8ef3d8d 100644 --- a/tests/test_sparse_storage.py +++ b/tests/test_sparse_storage.py @@ -2,6 +2,8 @@ import copy import json +import operator +import warnings from pickle import dumps, loads import numpy as np @@ -233,6 +235,92 @@ def test_sparse_histogram_addition(): assert_array_equal(h_sum.values(), dense.values()) +@pytest.mark.parametrize("op", ["mul", "truediv"]) +def test_scalar_mul_div_stays_sparse(op): + """Scalar * and / scale filled cells in place and preserve sparsity.""" + fn = getattr(operator, op) + sparse = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + sparse.fill([-1, 1.5, 1.5, 3.5, 100]) # includes flow cells + dense = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.Double()) + dense.fill([-1, 1.5, 1.5, 3.5, 100]) + + result = fn(sparse, 2.0) + assert result.storage_type is bh.storage.DoubleSparse + # Original is untouched (operator returns a copy). + assert_array_equal(sparse.values(flow=True), dense.values(flow=True)) + for flow in (False, True): + assert_array_equal(result.values(flow=flow), fn(dense, 2.0).values(flow=flow)) + + +def test_scalar_imul_in_place_stays_sparse(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 1.5, 3.5]) + h *= 3 + assert h.storage_type is bh.storage.DoubleSparse + assert_array_equal(h.values(), [0, 6, 0, 3, 0]) + + +@pytest.mark.parametrize("op", ["+", "-"]) +def test_scalar_add_sub_refused(op): + """Scalar +/- would fill every cell, so sparse storage refuses them.""" + fn = {"+": operator.add, "-": operator.sub}[op] + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5]) + with pytest.raises(TypeError, match="fill every cell"): + fn(h, 2.0) + + +def _mixed_axes_hist(): + h = bh.Histogram( + bh.axis.Regular(5, 0, 5), + bh.axis.IntCategory([10, 20, 30]), + storage=bh.storage.DoubleSparse(), + ) + h.fill([1.5, 1.5, 3.5], [10, 10, 30]) + return h + + +def test_mixed_int_slice_indexing_raises(): + """Mixed integer + slice indexing routes through .view() and raises.""" + h = _mixed_axes_hist() + with pytest.raises(TypeError, match=r"to_coo|sparse"): + _ = h[2, :] + + +def test_list_pick_indexing_raises(): + """Categorical list selection routes through .view() and raises.""" + h = _mixed_axes_hist() + # The list-pick path is experimental and warns before it reaches the view. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(TypeError, match=r"to_coo|sparse"): + _ = h[:, [0, 2]] + + +def test_vectorized_array_indexing_raises(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 3.5]) + with pytest.raises(TypeError, match=r"to_coo|sparse"): + _ = h[np.array([0, 1, 2])] + + +@pytest.mark.parametrize( + "make_index", + [ + pytest.param(lambda: slice(1, 4), id="slice"), + pytest.param(lambda: slice(None, None, bh.rebin(2)), id="rebin"), + pytest.param(lambda: slice(None, None, sum), id="sum"), + ], +) +def test_reduce_indexing_stays_sparse(make_index): + """Slicing, rebin, and integration delegate to C++ and stay sparse.""" + h = bh.Histogram(bh.axis.Regular(10, 0, 10), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 5.5, 5.5, 8.5]) + result = h[make_index()] + if isinstance(result, bh.Histogram): + assert result.storage_type is bh.storage.DoubleSparse + + def test_matches_dense_values(): # The filled cells should agree with what a dense Double histogram records. axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) From ef75137c5f9942bacd182409214678c9485a667f Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 15:26:26 -0400 Subject: [PATCH 7/8] fix: iterate only filled cells in sparse sum/empty/equality The storage iterator of a map-backed histogram visits every logical cell, so bh::algorithm::sum/empty and histogram::operator== were O(dense size) -- repr (which calls sum twice) on the documented 10**15-cell example would never return. Dispatch sum, empty, and ==/!= through overloads that walk only the hash map for sparse storage, treating explicit zeros (left behind by -=) the same as absent cells. Assisted-by: ClaudeCode:claude-fable-5 --- docs/user-guide/storage.rst | 6 +- include/bh_python/histogram.hpp | 97 ++++++++++++++++++++++++ include/bh_python/register_histogram.hpp | 20 +++-- src/boost_histogram/histogram.py | 1 + tests/test_sparse_storage.py | 47 ++++++++++++ 5 files changed, 161 insertions(+), 10 deletions(-) diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index d0b7d92a..2aaa9498 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -98,8 +98,10 @@ Stays sparse (or returns a scalar) -- delegates to the C++ storage: * Reading a single bin: ``h[bh.loc(...)]``, ``h[5]``. * Integer slicing (``h[10:60]``), rebinning (``h[:: bh.rebin(2)]``), integration (``h[10:60:sum]``), and projection (``h.project(0)``). -* ``.sum()``, adding two histograms (``h1 + h2``), and scalar ``*`` / ``/`` - (which only scale the filled cells, so ``0`` stays ``0``). +* ``.sum()``, ``.empty()``, and ``==`` comparison -- these iterate only the + filled cells, never the dense grid. +* Adding two histograms (``h1 + h2``) and scalar ``*`` / ``/`` (which only + scale the filled cells, so ``0`` stays ``0``). * ``copy``/``deepcopy``, pickling, and UHI serialization. Densifies into a fresh array (works, but allocates the full dense shape): diff --git a/include/bh_python/histogram.hpp b/include/bh_python/histogram.hpp index 1221830f..ccc25aaa 100644 --- a/include/bh_python/histogram.hpp +++ b/include/bh_python/histogram.hpp @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include #include @@ -167,6 +169,17 @@ std::vector make_axis_layout(const Axes& axes) { }); return layout; } + +/// True if a flat sparse storage key addresses an in-range (non-flow) cell. +inline bool is_inner_cell(std::size_t lin, const std::vector& layout) { + for(const auto& info : layout) { + const std::size_t iflow = lin % info.extent; + lin /= info.extent; + if(iflow < info.underflow || iflow >= info.underflow + info.size) + return false; + } + return true; +} } // namespace detail /// Extract the filled cells of a sparse histogram in COO form. Returns a tuple @@ -267,6 +280,90 @@ void histogram_from_coo(bh::histogram>& h, } } +template +auto histogram_sum(const bh::histogram& h, bool flow) { + return bh::algorithm::sum(h, flow ? bh::coverage::all : bh::coverage::inner); +} + +/// The storage iterator of a map-backed histogram visits every *logical* cell +/// (hash lookup per cell), so bh::algorithm::sum would be O(dense size) — for +/// the huge axis spaces sparse storage targets, that never finishes. Sum only +/// the filled cells instead; flow=false unravels each key to skip flow cells. +template +T histogram_sum(const bh::histogram>& h, bool flow) { + using map_type = std::unordered_map; + const auto& map = static_cast(bh::unsafe_access::storage(h)); + + T total{}; + if(flow) { + for(const auto& kv : map) + total += kv.second; + return total; + } + + const auto layout = detail::make_axis_layout(bh::unsafe_access::axes(h)); + for(const auto& kv : map) + if(detail::is_inner_cell(kv.first, layout)) + total += kv.second; + return total; +} + +template +bool histogram_empty(const bh::histogram& h, bool flow) { + return bh::algorithm::empty(h, flow ? bh::coverage::all : bh::coverage::inner); +} + +/// Sparse counterpart of bh::algorithm::empty, iterating only the filled cells +/// (see histogram_sum). Explicit zeros (left behind by e.g. -=) count as empty. +template +bool histogram_empty(const bh::histogram>& h, bool flow) { + using map_type = std::unordered_map; + const auto& map = static_cast(bh::unsafe_access::storage(h)); + + if(flow) { + for(const auto& kv : map) + if(kv.second != T{}) + return false; + return true; + } + + const auto layout = detail::make_axis_layout(bh::unsafe_access::axes(h)); + for(const auto& kv : map) + if(kv.second != T{} && detail::is_inner_cell(kv.first, layout)) + return false; + return true; +} + +template +bool histogram_equal(const bh::histogram& a, const bh::histogram& b) { + return a == b; +} + +/// Sparse counterpart of histogram::operator==, whose storage comparison walks +/// every logical cell (see histogram_sum). Compare the two maps directly, +/// treating absent keys as zero — the maps are not structurally identical for +/// equal histograms, since -= can leave explicit zeros behind. +template +bool histogram_equal(const bh::histogram>& a, + const bh::histogram>& b) { + if(!bh::detail::axes_equal(bh::unsafe_access::axes(a), bh::unsafe_access::axes(b))) + return false; + + using map_type = std::unordered_map; + const auto& ma = static_cast(bh::unsafe_access::storage(a)); + const auto& mb = static_cast(bh::unsafe_access::storage(b)); + + for(const auto& kv : ma) { + const auto it = mb.find(kv.first); + if(kv.second != (it == mb.end() ? T{} : it->second)) + return false; + } + for(const auto& kv : mb) + if(kv.second != T{} && ma.find(kv.first) == ma.end()) + return false; + return true; +} + /// Compute the bin of an array from a runtime list /// For example, [1,3,2] will return that bin of an array template diff --git a/include/bh_python/register_histogram.hpp b/include/bh_python/register_histogram.hpp index 143ebd9b..8521a6d9 100644 --- a/include/bh_python/register_histogram.hpp +++ b/include/bh_python/register_histogram.hpp @@ -84,10 +84,12 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { .def(py::self += py::self) + // histogram_equal dispatches to a sparse-aware comparison; the default + // operator== walks every logical cell, which sparse storage cannot afford. .def("__eq__", [](const histogram_t& self, const py::object& other) { try { - return self == py::cast(other); + return histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return false; } @@ -95,7 +97,7 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { .def("__ne__", [](const histogram_t& self, const py::object& other) { try { - return self != py::cast(other); + return !histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return true; } @@ -192,12 +194,13 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { .def("__repr__", &shift_to_string) + // histogram_sum/histogram_empty dispatch to filled-cells-only versions + // for sparse storage; the bh::algorithm versions are O(dense size). .def( "sum", [](const histogram_t& self, bool flow) { const py::gil_scoped_release release; - return bh::algorithm::sum( - self, flow ? bh::coverage::all : bh::coverage::inner); + return histogram_sum(self, flow); }, "flow"_a = false) @@ -205,8 +208,7 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { "empty", [](const histogram_t& self, bool flow) { const py::gil_scoped_release release; - return bh::algorithm::empty( - self, flow ? bh::coverage::all : bh::coverage::inner); + return histogram_empty(self, flow); }, "flow"_a = false) @@ -280,10 +282,12 @@ auto inline register_histogram>(py::module& m, .def(py::self += py::self) + // histogram_equal dispatches to a sparse-aware comparison; the default + // operator== walks every logical cell, which sparse storage cannot afford. .def("__eq__", [](const histogram_t& self, const py::object& other) { try { - return self == py::cast(other); + return histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return false; } @@ -291,7 +295,7 @@ auto inline register_histogram>(py::module& m, .def("__ne__", [](const histogram_t& self, const py::object& other) { try { - return self != py::cast(other); + return !histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return true; } diff --git a/src/boost_histogram/histogram.py b/src/boost_histogram/histogram.py index 0e279e2d..b8f652f1 100644 --- a/src/boost_histogram/histogram.py +++ b/src/boost_histogram/histogram.py @@ -1319,6 +1319,7 @@ def empty(self, flow: bool = False) -> bool: @typing.overload def sum( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Int64] | Histogram[bhs.AtomicInt64] | Histogram[bhs.Unlimited], diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py index a8ef3d8d..ae232238 100644 --- a/tests/test_sparse_storage.py +++ b/tests/test_sparse_storage.py @@ -134,6 +134,53 @@ def test_repr_and_str_do_not_densify(): assert "DoubleSparse" in str(h) +def test_huge_axis_ops_iterate_filled_cells_only(): + # sum/empty/== (and repr, which calls sum) must iterate the filled cells, + # not the logical grid -- a dense iteration over 10**15 cells would never + # finish, defeating the entire point of sparse storage. + axes = [bh.axis.Regular(100_000, 0, 1) for _ in range(3)] + h = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + assert h.empty() + assert h.empty(flow=True) + assert h.sum() == 0.0 + + h.fill([0.5, 0.5, 2.0], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) # one overflow + assert h.sum() == 2.0 + assert h.sum(flow=True) == 3.0 + assert not h.empty() + assert h == h.copy() + assert h != bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + + +def test_sum_and_empty_match_dense(): + axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) + x = [-1, 0.5, 0.5, 3.5, 9.5, 100] + y = [5, 6, 6, 7, 5, 6] + + dense = bh.Histogram(*axes, storage=bh.storage.Double()) + dense.fill(x, y) + sparse = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + sparse.fill(x, y) + + for flow in (False, True): + assert sparse.sum(flow=flow) == dense.sum(flow=flow) + assert sparse.empty(flow=flow) == dense.empty(flow=flow) + + +def test_equality_ignores_explicit_zeros(): + # Subtracting histograms leaves explicit zeros in the hash map; equality, + # sum, and empty must treat those the same as absent cells. + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 3.5]) + h -= h.copy() + + fresh = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + assert h == fresh + assert fresh == h + assert h.empty(flow=True) + assert h.sum(flow=True) == 0.0 + + @pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy, pickle_roundtrip]) def test_copy_pickle_roundtrip(copy_fn): h = bh.Histogram( From e3d8766e1254d2c2ae61592dbc9b77b7d9da7d31 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Tue, 9 Jun 2026 15:36:31 -0400 Subject: [PATCH 8/8] fix: use std::all_of in sparse empty/equal and skip huge-axis test on 32-bit Replace manual early-return loops in histogram_empty and histogram_equal with std::all_of to satisfy the readability-use-anyofallof clang-tidy rule. Skip test_huge_axis_ops_iterate_filled_cells_only on 32-bit Python: a 100000^3 histogram has ~10^15 cells, overflowing 32-bit std::size_t linear keys, so is_inner_cell misclassifies filled cells on those platforms. Assisted-by: ClaudeCode:claude-sonnet-4-6 --- include/bh_python/histogram.hpp | 24 ++++++++++-------------- tests/test_sparse_storage.py | 2 ++ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/bh_python/histogram.hpp b/include/bh_python/histogram.hpp index ccc25aaa..5b2daba7 100644 --- a/include/bh_python/histogram.hpp +++ b/include/bh_python/histogram.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -320,18 +321,14 @@ bool histogram_empty(const bh::histogram>& h, bool using map_type = std::unordered_map; const auto& map = static_cast(bh::unsafe_access::storage(h)); - if(flow) { - for(const auto& kv : map) - if(kv.second != T{}) - return false; - return true; - } + if(flow) + return std::all_of( + map.begin(), map.end(), [](const auto& kv) { return kv.second == T{}; }); const auto layout = detail::make_axis_layout(bh::unsafe_access::axes(h)); - for(const auto& kv : map) - if(kv.second != T{} && detail::is_inner_cell(kv.first, layout)) - return false; - return true; + return std::all_of(map.begin(), map.end(), [&layout](const auto& kv) { + return kv.second == T{} || !detail::is_inner_cell(kv.first, layout); + }); } template @@ -358,10 +355,9 @@ bool histogram_equal(const bh::histogram>& a, if(kv.second != (it == mb.end() ? T{} : it->second)) return false; } - for(const auto& kv : mb) - if(kv.second != T{} && ma.find(kv.first) == ma.end()) - return false; - return true; + return std::all_of(mb.begin(), mb.end(), [&ma](const auto& kv) { + return kv.second == T{} || ma.find(kv.first) != ma.end(); + }); } /// Compute the bin of an array from a runtime list diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py index ae232238..7cd5539e 100644 --- a/tests/test_sparse_storage.py +++ b/tests/test_sparse_storage.py @@ -3,6 +3,7 @@ import copy import json import operator +import sys import warnings from pickle import dumps, loads @@ -134,6 +135,7 @@ def test_repr_and_str_do_not_densify(): assert "DoubleSparse" in str(h) +@pytest.mark.skipif(sys.maxsize <= 2**32, reason="linear keys overflow on 32-bit") def test_huge_axis_ops_iterate_filled_cells_only(): # sum/empty/== (and repr, which calls sum) must iterate the filled cells, # not the logical grid -- a dense iteration over 10**15 cells would never