diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 30d38b06..47e71d3c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,7 @@ repos: rev: v2.4.2 hooks: - id: codespell - args: ["-L", "hist,nd,circularly,ba,fo", "-w"] + args: ["-L", "hist,nd,circularly,ba,fo,coo", "-w"] exclude: ^(notebooks/xarray.ipynb|notebooks/BoostHistogramHandsOn.ipynb|.*\.svg)$ - repo: local diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index 4fe183a2..2aaa9498 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -66,6 +66,114 @@ This storage is like ``Int64()``, but also provides a thread safety guarantee. You can fill a single histogram from multiple threads. +Sparse storages +--------------- + +DoubleSparse +^^^^^^^^^^^^ + +The ``DoubleSparse()`` storage is like ``Double()``, but only the bins that are +actually filled are stored (in a hash map keyed on the bin index). This makes +histograms over very large or high-dimensional axis spaces feasible, where a +dense grid would not fit in memory even though only a small fraction of the bins +are ever filled. + +.. code-block:: python3 + + import boost_histogram as bh + + # A 5-dimensional grid; dense storage would need 10**15 cells. + axes = [bh.axis.Regular(1000, 0, 1) for _ in range(5)] + h = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + h.fill(*data) # only the filled cells are allocated + +Because the data is not stored as a dense array, **sparse storage is not a +drop-in replacement for a dense histogram**. Operations that can be expressed +without materializing the full grid keep the result sparse; operations that need +a dense buffer raise a clear ``TypeError`` (pointing you at +:meth:`~boost_histogram.Histogram.to_coo`) rather than silently densifying. + +Stays sparse (or returns a scalar) -- delegates to the C++ storage: + +* Reading a single bin: ``h[bh.loc(...)]``, ``h[5]``. +* Integer slicing (``h[10:60]``), rebinning (``h[:: bh.rebin(2)]``), integration + (``h[10:60:sum]``), and projection (``h.project(0)``). +* ``.sum()``, ``.empty()``, and ``==`` comparison -- these iterate only the + filled cells, never the dense grid. +* Adding two histograms (``h1 + h2``) and scalar ``*`` / ``/`` (which only + scale the filled cells, so ``0`` stays ``0``). +* ``copy``/``deepcopy``, pickling, and UHI serialization. + +Densifies into a fresh array (works, but allocates the full dense shape): + +* The copying accessors ``h.values()``, ``h.variances()``, ``h.counts()``. Only + use these when the dense shape fits in memory; otherwise prefer + :meth:`~boost_histogram.Histogram.to_coo`. + +Unsupported -- raises ``TypeError``: + +* ``h.view()`` and ``np.asarray(h)`` -- there is no contiguous buffer to view. +* Writes: ``h[...] = value`` (filling with ``h.fill(...)`` is the only write path). +* Scalar ``+`` / ``-`` -- these would add to *every* cell, filling the grid. +* Fancy indexing that gathers through a dense view: mixed integer + slice + (``h[2, :]``), categorical list picks (``h[:, [0, 2]]``), and NumPy array + indexing (``h[np.array([...])]``). + +.. note:: + + Mixed integer + slice indexing (``h[2, :]``) is a common pattern and a likely + follow-up to support natively on sparse storage; for now, slice without + collapsing the axis (``h[2:3, :]``) to stay sparse, or densify first. + +Only ``double`` sparse storage is available for now; accumulator-backed sparse +storages (the equivalent of ``Weight``, ``Mean``, ...) are waiting on a +Boost.Histogram fix that ships in Boost 1.92. + +Reading the filled cells +"""""""""""""""""""""""" + +:meth:`~boost_histogram.Histogram.to_coo` returns the filled cells in coordinate +(COO) form -- a tuple of per-axis index arrays (in the style of +:func:`numpy.nonzero`, so they can be used directly as a fancy index) and the +array of corresponding values: + +.. code-block:: python3 + + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 1.5, 3.5]) + + indices, values = h.to_coo() + # indices == (array([1, 3]),) values == array([2., 1.]) + +Pass ``flow=True`` to include the underflow/overflow bins (the indices then run +over the with-flow grid, with underflow at ``0``). + +Converting to a SciPy sparse array +"""""""""""""""""""""""""""""""""" + +For a 2D histogram, the COO output maps directly onto a +:class:`scipy.sparse.coo_array`. The per-axis index arrays are the row and +column coordinates, and ``h.axes.size`` is the dense shape: + +.. code-block:: python3 + + from scipy.sparse import coo_array + + h = bh.Histogram( + bh.axis.Regular(1000, 0, 1000), + bh.axis.Regular(1000, 0, 1000), + storage=bh.storage.DoubleSparse(), + ) + h.fill(xs, ys) + + (rows, cols), values = h.to_coo() + matrix = coo_array((values, (rows, cols)), shape=h.axes.size) + +(SciPy sparse arrays are two-dimensional, so this applies to 2D histograms; for +higher dimensions, use the ``(indices, values)`` pair directly, e.g. with an +N-dimensional sparse library such as ``sparse``.) + + Accumulator storages -------------------- diff --git a/include/bh_python/histogram.hpp b/include/bh_python/histogram.hpp index 8ccda08f..5b2daba7 100644 --- a/include/bh_python/histogram.hpp +++ b/include/bh_python/histogram.hpp @@ -12,11 +12,18 @@ #include #include #include +#include +#include +#include #include #include #include +#include +#include +#include + namespace pybind11 { /// The descriptor for atomic_* is the same as the descriptor for *, as long this uses @@ -80,6 +87,17 @@ py::buffer_info make_buffer(bh::histogram>& h, bool flow return detail::make_buffer_impl(axes, flow, &storage[0]); } +/// Sparse storage keeps only filled cells in a hash map, so there is no +/// contiguous buffer to expose. Callers should use ``to_coo()`` instead. We +/// throw rather than return an empty buffer so ``.view()`` / ``np.asarray`` give +/// a single, clear error. +template +py::buffer_info make_buffer(bh::histogram>& /* h */, + bool /* flow */) { + throw py::type_error( + "sparse storage does not support .view() or buffer access; use .to_coo()"); +} + /// Specialization for unlimited_buffer template py::buffer_info make_buffer(bh::histogram>& h, @@ -123,6 +141,225 @@ py::buffer_info make_buffer(bh::histogram>& h, bool flow) { std_as_const(new_axes), flow, static_cast(storage.get_buffer())); } +/// Trait identifying our sparse (hash-map backed) storages. +template +struct is_sparse_storage : std::false_type {}; + +template +struct is_sparse_storage> : std::true_type {}; + +namespace detail { +/// Per-axis layout used to (un)ravel a flat sparse storage index. The flat index +/// counts flow bins, with underflow (when present) at position 0, exactly like +/// make_buffer_impl above. +struct axis_layout { + std::size_t extent; // size + has_underflow + has_overflow + std::size_t size; // number of normal (in-range) bins + std::size_t underflow; // 1 if this axis has an underflow bin, else 0 +}; + +template +std::vector make_axis_layout(const Axes& axes) { + std::vector layout; + bh::detail::for_each_axis(axes, [&layout](const auto& ax) { + const bool has_underflow + = bh::axis::traits::options(ax) & bh::axis::option::underflow; + layout.push_back({static_cast(bh::axis::traits::extent(ax)), + static_cast(ax.size()), + has_underflow ? std::size_t{1} : std::size_t{0}}); + }); + return layout; +} + +/// True if a flat sparse storage key addresses an in-range (non-flow) cell. +inline bool is_inner_cell(std::size_t lin, const std::vector& layout) { + for(const auto& info : layout) { + const std::size_t iflow = lin % info.extent; + lin /= info.extent; + if(iflow < info.underflow || iflow >= info.underflow + info.size) + return false; + } + return true; +} +} // namespace detail + +/// Extract the filled cells of a sparse histogram in COO form. Returns a tuple +/// ``(indices, values)`` where ``indices`` is an ``(ndim, n)`` integer array and +/// ``values`` an ``n``-length array. With ``flow=False`` the flow cells are +/// dropped and indices run 0..size-1; with ``flow=True`` indices run +/// 0..extent-1 (underflow at 0), matching the ``view(flow=True)`` grid. +template +py::tuple histogram_to_coo(bh::histogram>& h, bool flow) { + const auto& axes = bh::unsafe_access::axes(h); + auto& storage = bh::unsafe_access::storage(h); + const auto layout = detail::make_axis_layout(axes); + const auto rank = layout.size(); + + // storage_adaptor publicly inherits map_impl, which publicly inherits the + // underlying map, so this upcast exposes only the filled cells. + using map_type = std::unordered_map; + const auto& map = static_cast(storage); + + std::vector> idx_cols(rank); + std::vector vals; + vals.reserve(map.size()); + + std::vector multi(rank); + for(const auto& kv : map) { + std::size_t lin = kv.first; + bool keep = true; + for(std::size_t ax = 0; ax < rank; ++ax) { + const auto& info = layout[ax]; + const std::size_t iflow = lin % info.extent; + lin /= info.extent; + if(flow) { + multi[ax] = static_cast(iflow); + } else { + const py::ssize_t real = static_cast(iflow) + - static_cast(info.underflow); + if(real < 0 || real >= static_cast(info.size)) { + keep = false; + break; + } + multi[ax] = real; + } + } + if(!keep) + continue; + for(std::size_t ax = 0; ax < rank; ++ax) + idx_cols[ax].push_back(multi[ax]); + vals.push_back(kv.second); + } + + const auto n = static_cast(vals.size()); + py::array_t indices({static_cast(rank), n}); + if(n > 0) { + auto ind = indices.template mutable_unchecked<2>(); + for(std::size_t ax = 0; ax < rank; ++ax) + for(py::ssize_t j = 0; j < n; ++j) + ind(static_cast(ax), j) + = idx_cols[ax][static_cast(j)]; + } + + py::array_t values(n); + std::copy(vals.begin(), vals.end(), values.mutable_data()); + + return py::make_tuple(std::move(indices), std::move(values)); +} + +/// Inverse of histogram_to_coo: scatter ``(indices, values)`` back into a sparse +/// histogram. Uses the same flow convention as histogram_to_coo. This is the +/// only bulk write path for sparse storage, since the numpy-buffer based +/// __setitem__ cannot densify the map. +template +void histogram_from_coo(bh::histogram>& h, + const py::array_t& indices, + const py::array_t& values, + bool flow) { + const auto& axes = bh::unsafe_access::axes(h); + const auto layout = detail::make_axis_layout(axes); + const auto rank = layout.size(); + + auto ind = indices.template unchecked<2>(); + auto val = values.template unchecked<1>(); + if(static_cast(ind.shape(0)) != rank) + throw py::value_error("indices first dimension must equal histogram rank"); + + const auto n = val.shape(0); + if(ind.shape(1) != n) + throw py::value_error("indices second dimension must match values length"); + std::vector at_index(rank); + for(py::ssize_t j = 0; j < n; ++j) { + for(std::size_t ax = 0; ax < rank; ++ax) { + const py::ssize_t coord = ind(static_cast(ax), j); + // Convert back to boost's at() signed index (underflow -1, overflow size). + const py::ssize_t signed_idx + = flow ? coord - static_cast(layout[ax].underflow) : coord; + at_index[ax] = static_cast(signed_idx); + } + h.at(at_index) = val(j); + } +} + +template +auto histogram_sum(const bh::histogram& h, bool flow) { + return bh::algorithm::sum(h, flow ? bh::coverage::all : bh::coverage::inner); +} + +/// The storage iterator of a map-backed histogram visits every *logical* cell +/// (hash lookup per cell), so bh::algorithm::sum would be O(dense size) — for +/// the huge axis spaces sparse storage targets, that never finishes. Sum only +/// the filled cells instead; flow=false unravels each key to skip flow cells. +template +T histogram_sum(const bh::histogram>& h, bool flow) { + using map_type = std::unordered_map; + const auto& map = static_cast(bh::unsafe_access::storage(h)); + + T total{}; + if(flow) { + for(const auto& kv : map) + total += kv.second; + return total; + } + + const auto layout = detail::make_axis_layout(bh::unsafe_access::axes(h)); + for(const auto& kv : map) + if(detail::is_inner_cell(kv.first, layout)) + total += kv.second; + return total; +} + +template +bool histogram_empty(const bh::histogram& h, bool flow) { + return bh::algorithm::empty(h, flow ? bh::coverage::all : bh::coverage::inner); +} + +/// Sparse counterpart of bh::algorithm::empty, iterating only the filled cells +/// (see histogram_sum). Explicit zeros (left behind by e.g. -=) count as empty. +template +bool histogram_empty(const bh::histogram>& h, bool flow) { + using map_type = std::unordered_map; + const auto& map = static_cast(bh::unsafe_access::storage(h)); + + if(flow) + return std::all_of( + map.begin(), map.end(), [](const auto& kv) { return kv.second == T{}; }); + + const auto layout = detail::make_axis_layout(bh::unsafe_access::axes(h)); + return std::all_of(map.begin(), map.end(), [&layout](const auto& kv) { + return kv.second == T{} || !detail::is_inner_cell(kv.first, layout); + }); +} + +template +bool histogram_equal(const bh::histogram& a, const bh::histogram& b) { + return a == b; +} + +/// Sparse counterpart of histogram::operator==, whose storage comparison walks +/// every logical cell (see histogram_sum). Compare the two maps directly, +/// treating absent keys as zero — the maps are not structurally identical for +/// equal histograms, since -= can leave explicit zeros behind. +template +bool histogram_equal(const bh::histogram>& a, + const bh::histogram>& b) { + if(!bh::detail::axes_equal(bh::unsafe_access::axes(a), bh::unsafe_access::axes(b))) + return false; + + using map_type = std::unordered_map; + const auto& ma = static_cast(bh::unsafe_access::storage(a)); + const auto& mb = static_cast(bh::unsafe_access::storage(b)); + + for(const auto& kv : ma) { + const auto it = mb.find(kv.first); + if(kv.second != (it == mb.end() ? T{} : it->second)) + return false; + } + return std::all_of(mb.begin(), mb.end(), [&ma](const auto& kv) { + return kv.second == T{} || ma.find(kv.first) != ma.end(); + }); +} + /// Compute the bin of an array from a runtime list /// For example, [1,3,2] will return that bin of an array template diff --git a/include/bh_python/register_histogram.hpp b/include/bh_python/register_histogram.hpp index 1a3a0522..8521a6d9 100644 --- a/include/bh_python/register_histogram.hpp +++ b/include/bh_python/register_histogram.hpp @@ -30,6 +30,30 @@ #include #include +/// Bind the COO get/set helpers, but only for sparse storages (no-op otherwise). +/// Sparse histograms cannot expose a numpy buffer, so these are the supported way +/// to read filled cells (``_to_coo``) and to bulk-populate them (``_from_coo``, +/// used by serialization). +template +void register_sparse_coo(PyClass& hist, std::true_type) { + hist.def( + "_to_coo", + [](histogram_t& self, bool flow) { return histogram_to_coo(self, flow); }, + "flow"_a = false) + .def( + "_from_coo", + [](histogram_t& self, + const py::array_t& indices, + const py::array_t& values, + bool flow) { histogram_from_coo(self, indices, values, flow); }, + "indices"_a, + "values"_a, + "flow"_a = true); +} + +template +void register_sparse_coo(PyClass& /* hist */, std::false_type) {} + template auto register_histogram(py::module& m, const char* name, const char* desc) { using histogram_t = bh::histogram; @@ -60,10 +84,12 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { .def(py::self += py::self) + // histogram_equal dispatches to a sparse-aware comparison; the default + // operator== walks every logical cell, which sparse storage cannot afford. .def("__eq__", [](const histogram_t& self, const py::object& other) { try { - return self == py::cast(other); + return histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return false; } @@ -71,7 +97,7 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { .def("__ne__", [](const histogram_t& self, const py::object& other) { try { - return self != py::cast(other); + return !histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return true; } @@ -168,12 +194,13 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { .def("__repr__", &shift_to_string) + // histogram_sum/histogram_empty dispatch to filled-cells-only versions + // for sparse storage; the bh::algorithm versions are O(dense size). .def( "sum", [](const histogram_t& self, bool flow) { const py::gil_scoped_release release; - return bh::algorithm::sum( - self, flow ? bh::coverage::all : bh::coverage::inner); + return histogram_sum(self, flow); }, "flow"_a = false) @@ -181,8 +208,7 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { "empty", [](const histogram_t& self, bool flow) { const py::gil_scoped_release release; - return bh::algorithm::empty( - self, flow ? bh::coverage::all : bh::coverage::inner); + return histogram_empty(self, flow); }, "flow"_a = false) @@ -207,6 +233,8 @@ auto register_histogram(py::module& m, const char* name, const char* desc) { ; + register_sparse_coo(hist, is_sparse_storage{}); + return hist; } @@ -254,10 +282,12 @@ auto inline register_histogram>(py::module& m, .def(py::self += py::self) + // histogram_equal dispatches to a sparse-aware comparison; the default + // operator== walks every logical cell, which sparse storage cannot afford. .def("__eq__", [](const histogram_t& self, const py::object& other) { try { - return self == py::cast(other); + return histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return false; } @@ -265,7 +295,7 @@ auto inline register_histogram>(py::module& m, .def("__ne__", [](const histogram_t& self, const py::object& other) { try { - return self != py::cast(other); + return !histogram_equal(self, py::cast(other)); } catch(const py::cast_error&) { return true; } diff --git a/include/bh_python/storage.hpp b/include/bh_python/storage.hpp index b0e56397..d3c664b7 100644 --- a/include/bh_python/storage.hpp +++ b/include/bh_python/storage.hpp @@ -19,9 +19,17 @@ #include #include #include +#include namespace storage { +// Sparse storage only allocates cells that are actually filled, by keeping them +// in a hash map keyed on the (flattened) bin index. Accumulator-backed sparse +// storages need https://github.com/boostorg/histogram/pull/421 (Boost 1.92), so +// for now only the arithmetic double_sparse is exposed. +template +using sparse_storage = bh::storage_adaptor>; + // Names match Python names using int64 = bh::dense_storage; using atomic_int64 = bh::dense_storage>; @@ -31,6 +39,7 @@ using weight = bh::dense_storage>; using multi_cell = bh::multi_cell; using mean = bh::dense_storage>; using weighted_mean = bh::dense_storage>; +using double_sparse = sparse_storage; // Allow repr to show python name template @@ -78,6 +87,11 @@ inline const char* name() { return "weighted_mean"; } +template <> +inline const char* name() { + return "double_sparse"; +} + } // namespace storage // It is very important that storages with accumulators have specialized serialization. @@ -187,6 +201,48 @@ void load(Archive& ar, std::copy(a.data(), a.data() + a.size(), reinterpret_cast(s.data())); } +// Sparse storage cannot be flat-serialized: its hash map only holds filled +// cells. We instead store the logical size plus parallel (key, value) arrays. +// storage_adaptor publicly inherits the underlying map, so the upcast exposes +// just the filled cells. +template +void save(Archive& ar, const storage::double_sparse& s, unsigned /* version */) { + using map_type = std::unordered_map; + const auto& map = static_cast(s); + const auto n = static_cast(map.size()); + + py::array_t keys(n); + py::array_t values(n); + auto* kp = keys.mutable_data(); + auto* vp = values.mutable_data(); + for(const auto& kv : map) { + *kp++ = static_cast(kv.first); + *vp++ = kv.second; + } + + ar << static_cast(s.size()); + ar << keys; + ar << values; +} + +template +void load(Archive& ar, storage::double_sparse& s, unsigned /* version */) { + using map_type = std::unordered_map; + std::uint64_t logical_size = 0; + py::array_t keys; + py::array_t values; + ar >> logical_size; + ar >> keys; + ar >> values; + + s.reset(static_cast(logical_size)); + auto& map = static_cast(s); + auto k = keys.unchecked<1>(); + auto v = values.unchecked<1>(); + for(py::ssize_t i = 0; i < v.shape(0); ++i) + map.emplace(static_cast(k(i)), v(i)); +} + namespace pybind11 { namespace detail { /// Allow a Python int to implicitly convert to an atomic int in C++ diff --git a/src/boost_histogram/_core/hist.pyi b/src/boost_histogram/_core/hist.pyi index ca40839d..0e66b5f3 100644 --- a/src/boost_histogram/_core/hist.pyi +++ b/src/boost_histogram/_core/hist.pyi @@ -66,6 +66,22 @@ class any_atomic_int64(_BaseHistogram): def _at_set(self, value: int, *args: int) -> None: ... def sum(self, flow: bool = ...) -> int: ... +class any_double_sparse(_BaseHistogram): + def __idiv__(self, other: any_double_sparse) -> Self: ... + def __imul__(self, other: any_double_sparse) -> Self: ... + def at(self, *args: int) -> float: ... + def _at_set(self, value: float, *args: int) -> None: ... + def sum(self, flow: bool = ...) -> float: ... + def _to_coo( + self, flow: bool = ... + ) -> tuple[np.typing.NDArray[np.intp], np.typing.NDArray[np.float64]]: ... + def _from_coo( + self, + indices: np.typing.NDArray[np.intp], + values: np.typing.NDArray[np.float64], + flow: bool = ..., + ) -> None: ... + class any_weight(_BaseHistogram): def __idiv__(self, other: any_weight) -> Self: ... def __imul__(self, other: any_weight) -> Self: ... diff --git a/src/boost_histogram/_core/storage.pyi b/src/boost_histogram/_core/storage.pyi index 6afe44e9..88d2d524 100644 --- a/src/boost_histogram/_core/storage.pyi +++ b/src/boost_histogram/_core/storage.pyi @@ -16,6 +16,7 @@ class unlimited(_BaseStorage): ... class weight(_BaseStorage): ... class mean(_BaseStorage): ... class weighted_mean(_BaseStorage): ... +class double_sparse(_BaseStorage): ... class multi_cell(_BaseStorage): def __init__(self, nelem: int, /): ... diff --git a/src/boost_histogram/axis/__init__.py b/src/boost_histogram/axis/__init__.py index f7435413..5e90ea7b 100644 --- a/src/boost_histogram/axis/__init__.py +++ b/src/boost_histogram/axis/__init__.py @@ -404,7 +404,14 @@ def __init__( def _repr_args_(self) -> list[str]: "Return inner part of signature for use in repr" - ret = [f"{self.size:g}", f"{self.edges[0]:g}", f"{self.edges[-1]:g}"] + # Use value() for the endpoints rather than building the full edges + # array, which would be enormous (and unrepresentable on 32-bit) for a + # very large axis. + ret = [ + f"{self.size:g}", + f"{self._ax.value(0):g}", + f"{self._ax.value(self.size):g}", + ] if self.traits.growth: ret.append("growth=True") diff --git a/src/boost_histogram/histogram.py b/src/boost_histogram/histogram.py index c6830a60..b8f652f1 100644 --- a/src/boost_histogram/histogram.py +++ b/src/boost_histogram/histogram.py @@ -95,6 +95,7 @@ def __dir__() -> list[str]: _core.hist.any_mean, _core.hist.any_weighted_mean, _core.hist.any_multi_cell, + _core.hist.any_double_sparse, } logger = logging.getLogger(__name__) @@ -552,6 +553,11 @@ def view( self: Histogram[bhs.WeightedMean], flow: bool = False ) -> WeightedMeanView: ... + @typing.overload + def view( + self: Histogram[bhs.DoubleSparse], flow: bool = False + ) -> typing.NoReturn: ... + @typing.overload def view( self: Histogram[Any], flow: bool = False @@ -574,9 +580,65 @@ def view( ): """ Return a view into the data, optionally with overflow turned on. + + Sparse storage (:class:`~boost_histogram.storage.DoubleSparse`) does not + support this, since there is no dense buffer to view into (``np.asarray`` + raises for the same reason); use :meth:`to_coo` to read the filled cells, + or :meth:`values` (and the other copying accessors) if an explicitly + densified array is acceptable. """ return _to_view(self._hist.view(flow)) + def _densify(self, flow: bool = False) -> np.typing.NDArray[np.float64]: + """ + Build a dense array of values from sparse (COO) storage. Unlike + :meth:`view`, this is a freshly allocated copy, so writing to it does not + change the histogram. Densifying a very large axis space can exhaust + memory; that is the caller's choice (use :meth:`to_coo` to avoid it). + """ + shape = tuple( + ax.size + + (int(ax.traits.underflow) + int(ax.traits.overflow) if flow else 0) + for ax in self.axes + ) + arr = np.zeros(shape, dtype=np.float64) + indices, values = self.to_coo(flow=flow) + if values.size: + arr[indices] = values + return arr + + def _view_or_dense(self, flow: bool) -> Any: + """ + Like :meth:`view`, but densifies sparse storage into a copy instead of + raising. Used by the copying accessors (:meth:`values` etc.) so they keep + working for sparse histograms, while :meth:`view` stays buffer-only. + """ + if isinstance(self._hist, _core.hist.any_double_sparse): + return self._densify(flow) + return self.view(flow) + + def to_coo( + self, flow: bool = False + ) -> tuple[tuple[np.typing.NDArray[np.intp], ...], np.typing.NDArray[np.float64]]: + """ + Return the filled cells of a sparse histogram in coordinate (COO) form. + + This is the sparse-storage replacement for :meth:`view`. It returns a + tuple ``(indices, values)`` where ``indices`` is a tuple of per-axis + index arrays (one per dimension, in the style of :func:`numpy.nonzero`, + so it can be used directly as a fancy index) and ``values`` is the array + of corresponding cell values. Only filled cells are returned. + + :param flow: If True, include flow bins, with indices running over the + with-flow grid (underflow at 0); if False (default), flow cells are + dropped and indices run ``0..size-1`` per axis. + """ + if not isinstance(self._hist, _core.hist.any_double_sparse): + msg = "to_coo() is only available for sparse storage; use view() instead" + raise TypeError(msg) + indices, values = self._hist._to_coo(flow) + return tuple(indices), values + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -588,6 +650,8 @@ def __array__( kwargs = {} if copy is not None: kwargs["copy"] = copy + # Implicit array conversion goes through view(), so it raises for sparse + # storage rather than silently densifying. Use .values() or .to_coo(). return np.asarray(self.view(False), dtype=dtype, **kwargs) # type: ignore[call-overload, no-any-return] __hash__ = None # type: ignore[assignment] @@ -796,6 +860,8 @@ def _compute_inplace_op( else: msg = f"Wrong shape {other.shape}, expected {self.shape} or {self.axes.extent}" raise ValueError(msg) + elif isinstance(self._hist, _core.hist.any_double_sparse): + self._sparse_scalar_inplace_op(name, other) else: view = self.view(flow=True) getattr(view, name)(other) @@ -803,6 +869,28 @@ def _compute_inplace_op( self._variance_known = False return self + def _sparse_scalar_inplace_op( + self, name: str, other: np.typing.NDArray[Any] | float + ) -> None: + # Sparse storage has no dense buffer to mutate in place. ``*`` and ``/`` + # only scale the already-filled cells (0 * x == 0), so sparsity is + # preserved -- rescale them through the COO get/set path. ``+`` and + # ``-`` would have to touch every cell, densifying the storage, so they + # are refused rather than silently materializing the full grid. + if name in {"__iadd__", "__isub__"}: + symbol = _INPLACE_OP_SYMBOLS[name] + msg = ( + f"Sparse storage does not support scalar {symbol!r}; it would " + "fill every cell. Densify with .values() or use a dense storage." + ) + raise TypeError(msg) + indices, values = self._hist._to_coo(True) # type: ignore[attr-defined] + if name in {"__itruediv__", "__idiv__"}: + values = values / other + else: # __imul__ + values = values * other + self._hist._from_coo(indices, values, True) # type: ignore[attr-defined] + # TODO: Marked as too complex by flake8. Should be factored out a bit. def fill( self, @@ -941,8 +1029,11 @@ def __str__(self) -> str: still experimental. Do not rely on any particular rendering. """ # TODO check the terminal width and adjust the presentation - # only use for 1D, fall back to repr for ND - if self._hist.rank() != 1: + # only use for 1D, fall back to repr for ND. Sparse storage also falls + # back to repr, since the C++ rendering would iterate every (dense) bin. + if self._hist.rank() != 1 or isinstance( + self._hist, _core.hist.any_double_sparse + ): return repr(self) s = str(self._hist) # get rid of first line and last character @@ -1228,6 +1319,7 @@ def empty(self, flow: bool = False) -> bool: @typing.overload def sum( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Int64] | Histogram[bhs.AtomicInt64] | Histogram[bhs.Unlimited], @@ -1740,6 +1832,7 @@ def values( @typing.overload def values( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Unlimited] | Histogram[bhs.Weight] | Histogram[bhs.Mean] @@ -1769,7 +1862,7 @@ def values( :return: "np.typing.NDArray[Any]"[np.float64] """ - view: Any = self.view(flow) + view: Any = self._view_or_dense(flow) # TODO: Might be a NumPy typing bug if len(view.dtype) == 0: return view # type: ignore[no-any-return] @@ -1783,6 +1876,7 @@ def variances( @typing.overload def variances( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Unlimited] | Histogram[bhs.MultiCell], flow: bool = ..., @@ -1825,7 +1919,7 @@ def variances( :return: "np.typing.NDArray[Any]"[np.float64] """ - view: Any = self.view(flow) + view: Any = self._view_or_dense(flow) if len(view.dtype) == 0: return view if self._variance_known else None @@ -1856,6 +1950,7 @@ def counts( @typing.overload def counts( self: Histogram[bhs.Double] + | Histogram[bhs.DoubleSparse] | Histogram[bhs.Unlimited] | Histogram[bhs.Weight] | Histogram[bhs.Mean] @@ -1894,7 +1989,7 @@ def counts( :return: "np.typing.NDArray[Any]"[np.float64] """ - view: Any = self.view(flow) + view: Any = self._view_or_dense(flow) if len(view.dtype) == 0: return view # type: ignore[no-any-return] diff --git a/src/boost_histogram/serialization/__init__.py b/src/boost_histogram/serialization/__init__.py index 8c4b0cbe..799609ea 100644 --- a/src/boost_histogram/serialization/__init__.py +++ b/src/boost_histogram/serialization/__init__.py @@ -69,9 +69,13 @@ def to_uhi( # Convert the histogram to a dictionary writer_info = {"boost-histogram": {"version": version.version}} - # Store storage type info for AtomicInt64 and Unlimited (they serialize as int/double) + # Store storage type info for types that serialize as a plain int/double, so + # the exact storage can be restored (AtomicInt64, Unlimited, DoubleSparse). storage_type_str = _storage_type_to_str(h.storage_type()) - if isinstance(h.storage_type(), (storage.AtomicInt64, storage.Unlimited)): + is_sparse = isinstance(h.storage_type(), storage.DoubleSparse) + if isinstance( + h.storage_type(), (storage.AtomicInt64, storage.Unlimited, storage.DoubleSparse) + ): writer_info["boost-histogram"]["storage_type"] = type(h.storage_type()).__name__ data = { @@ -79,10 +83,26 @@ def to_uhi( "writer_info": writer_info, "axes": [_axis_to_dict(axis) for axis in h.axes], } - if keep_storage: - data["storage"] = _storage_to_dict(h.storage_type(), h.view(flow=True)) - else: + if not keep_storage: data["storage"] = {"type": storage_type_str} + elif is_sparse: + # Sparse storage has no dense view; serialize only the filled cells (COO), + # with flow so the round-trip is exact. This encoding is boost-histogram + # specific (other UHI consumers will not understand "sparse"). + idx_tuple, values = h.to_coo(flow=True) + indices = ( + np.stack(idx_tuple) + if idx_tuple + else np.empty((0, values.shape[0]), dtype=np.intp) + ) + data["storage"] = { + "type": storage_type_str, + "sparse": True, + "indices": indices, + "values": values, + } + else: + data["storage"] = _storage_to_dict(h.storage_type(), h.view(flow=True)) data["metadata"] = serialize_metadata(h.__dict__) return data @@ -98,6 +118,17 @@ def from_uhi(data: dict[str, Any], /) -> histogram.Histogram[Any]: h = histogram.Histogram[Any](*axis, storage=storage_) h.__dict__ = data.get("metadata", {}) + # Sparse storage is stored as COO (filled cells only) and cannot go through + # the dense view-based reconstruction below. Scatter the cells back directly. + if storage_data.get("sparse"): + indices = np.asarray(storage_data["indices"], dtype=np.intp) + values = np.asarray(storage_data["values"], dtype=np.float64) + # JSON round-trips can collapse a zero-length 2D array to 1D; restore it. + if indices.ndim == 1: + indices = indices.reshape(len(h.axes), values.shape[0]) + h._hist._from_coo(indices, values, True) # type: ignore[attr-defined] + return h + # Check if storage has data (if not, it's a structure-only histogram) # Validate required keys per storage type before deciding to skip data loading storage_type = storage_data["type"] diff --git a/src/boost_histogram/serialization/_storage.py b/src/boost_histogram/serialization/_storage.py index 116dd4bd..5b8bb596 100644 --- a/src/boost_histogram/serialization/_storage.py +++ b/src/boost_histogram/serialization/_storage.py @@ -24,7 +24,7 @@ def _storage_type_to_str(_storage: storage.Storage, /) -> str: match _storage: case storage.Int64(): return "int" - case storage.Double(): + case storage.Double() | storage.DoubleSparse(): return "double" case storage.AtomicInt64(): return "int" @@ -114,6 +114,8 @@ def _storage_from_dict( return storage.AtomicInt64() if orig_type == "Unlimited": return storage.Unlimited() + if orig_type == "DoubleSparse": + return storage.DoubleSparse() storage_type = data["type"] if storage_type == "int": diff --git a/src/boost_histogram/storage.py b/src/boost_histogram/storage.py index 5a6d9a82..171f4cf4 100644 --- a/src/boost_histogram/storage.py +++ b/src/boost_histogram/storage.py @@ -10,6 +10,7 @@ __all__ = [ "AtomicInt64", "Double", + "DoubleSparse", "Int64", "Mean", "MultiCell", @@ -56,6 +57,17 @@ class Double(store.double, Storage, family=boost_histogram): accumulator = float +class DoubleSparse(store.double_sparse, Storage, family=boost_histogram): + """ + Sparse storage of doubles: only filled cells are kept (in a hash map), so + histograms over very large or high-dimensional axis spaces stay small in + memory. It does not support ``.view()`` / dense-array access; use + :meth:`Histogram.to_coo` to read the filled cells instead. + """ + + accumulator = float + + class AtomicInt64(store.atomic_int64, Storage, family=boost_histogram): accumulator = int diff --git a/src/register_histograms.cpp b/src/register_histograms.cpp index 5450c4de..ead1318f 100644 --- a/src/register_histograms.cpp +++ b/src/register_histograms.cpp @@ -56,4 +56,10 @@ void register_histograms(py::module& hist) { "any_multi_cell", "N-dimensional histogram for storing multiple cells at once with any axis " "types."); + + register_histogram( + hist, + "any_double_sparse", + "N-dimensional histogram for real-valued data using sparse storage (only " + "filled cells are stored) with any axis types."); } diff --git a/src/register_storage.cpp b/src/register_storage.cpp index c6082926..3b7b10f6 100644 --- a/src/register_storage.cpp +++ b/src/register_storage.cpp @@ -40,4 +40,9 @@ void register_storages(py::module& storage) { storage, "multi_cell", "Dense storage which tracks sums of cell entries for multiple cells per entry"); + + register_storage( + storage, + "double_sparse", + "Sparse storage for doubles (only filled cells are stored)"); } diff --git a/tests/test_sparse_storage.py b/tests/test_sparse_storage.py new file mode 100644 index 00000000..7cd5539e --- /dev/null +++ b/tests/test_sparse_storage.py @@ -0,0 +1,388 @@ +from __future__ import annotations + +import copy +import json +import operator +import sys +import warnings +from pickle import dumps, loads + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import boost_histogram as bh +from boost_histogram.serialization import from_uhi, to_uhi + + +def coo_dict(h): + """Map of (multi-index) -> value for the filled cells, order-independent.""" + indices, values = h.to_coo() + return {tuple(int(i[j]) for i in indices): v for j, v in enumerate(values)} + + +def pickle_roundtrip(x): + return loads(dumps(x)) + + +def test_construct_and_fill(): + h = bh.Histogram(bh.axis.Regular(1000, 0, 1000), storage=bh.storage.DoubleSparse()) + assert h.storage_type is bh.storage.DoubleSparse + + h.fill([1.5, 1.5, 5.5, 900.5]) + assert h.sum() == 4.0 + # Point reads go through C++ at(), not the (unavailable) dense view. + assert h[1] == 2.0 + assert h[5] == 1.0 + assert h[900] == 1.0 + assert h[2] == 0.0 + + +def test_stays_sparse_after_ops(): + h = bh.Histogram(bh.axis.Regular(100, 0, 100), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 50.5, 50.5]) + + # Slicing and projection use C++ algorithms and must keep sparse storage. + sliced = h[10:60] + assert sliced.storage_type is bh.storage.DoubleSparse + assert sliced.sum() == 2.0 + + h2 = bh.Histogram( + bh.axis.Regular(5, 0, 5), + bh.axis.Regular(5, 0, 5), + storage=bh.storage.DoubleSparse(), + ) + h2.fill([1.5, 1.5], [2.5, 2.5]) + projected = h2.project(0) + assert projected.storage_type is bh.storage.DoubleSparse + assert projected.sum() == 2.0 + + +def test_to_coo_filled_cells_only(): + h = bh.Histogram( + bh.axis.Regular(5, 0, 5), + bh.axis.IntCategory([10, 20, 30]), + storage=bh.storage.DoubleSparse(), + ) + h.fill([1.5, 1.5, 3.5], [10, 10, 30]) + + indices, values = h.to_coo() + assert isinstance(indices, tuple) + assert len(indices) == 2 + # One entry per distinct filled bin. + assert len(values) == 2 + assert coo_dict(h) == {(1, 0): 2.0, (3, 2): 1.0} + + +def test_to_coo_flow(): + h = bh.Histogram(bh.axis.Regular(10, 0, 10), storage=bh.storage.DoubleSparse()) + h.fill([-5, 5.5, 100]) # underflow, normal bin 5, overflow + + # Without flow, the flow cells are dropped, indices run 0..size-1. + assert coo_dict(h) == {(5,): 1.0} + + # With flow, underflow sits at index 0, overflow at extent-1 (== 11 here). + indices_f, values_f = h.to_coo(flow=True) + flow_map = {int(indices_f[0][j]): v for j, v in enumerate(values_f)} + assert flow_map == {0: 1.0, 6: 1.0, 11: 1.0} + + +def test_view_and_asarray_raise_but_copies_densify(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5]) + + # The zero-copy buffer view (and implicit array conversion) are unsupported. + with pytest.raises(TypeError, match="to_coo"): + h.view() + with pytest.raises(TypeError): + np.asarray(h) + + # The explicit copying accessors densify and keep working. + assert_array_equal(h.values(), [0, 1, 0, 0, 0]) + assert_array_equal(h.counts(), [0, 1, 0, 0, 0]) + assert_array_equal(h.variances(), [0, 1, 0, 0, 0]) + + +def test_copy_accessors_match_dense(): + axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) + x = np.array([-1, 0.5, 0.5, 3.5, 9.5, 100]) + y = np.array([5, 6, 6, 7, 5, 6]) + + dense = bh.Histogram(*axes, storage=bh.storage.Double()) + dense.fill(x, y) + sparse = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + sparse.fill(x, y) + + for flow in (False, True): + assert_array_equal(sparse.values(flow=flow), dense.values(flow=flow)) + assert_array_equal(sparse.counts(flow=flow), dense.counts(flow=flow)) + assert_array_equal(sparse.variances(flow=flow), dense.variances(flow=flow)) + + +def test_to_coo_requires_sparse(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.Double()) + with pytest.raises(TypeError, match="sparse"): + h.to_coo() + + +def test_repr_and_str_do_not_densify(): + # A huge axis would be impossible to densify; repr/str must still work. + h = bh.Histogram(bh.axis.Regular(10**9, 0, 1), storage=bh.storage.DoubleSparse()) + h.fill([0.5, 0.5]) + assert "DoubleSparse" in repr(h) + assert "Sum: 2" in repr(h) + # __str__ falls back to repr for sparse rather than rendering every bin. + assert "DoubleSparse" in str(h) + + +@pytest.mark.skipif(sys.maxsize <= 2**32, reason="linear keys overflow on 32-bit") +def test_huge_axis_ops_iterate_filled_cells_only(): + # sum/empty/== (and repr, which calls sum) must iterate the filled cells, + # not the logical grid -- a dense iteration over 10**15 cells would never + # finish, defeating the entire point of sparse storage. + axes = [bh.axis.Regular(100_000, 0, 1) for _ in range(3)] + h = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + assert h.empty() + assert h.empty(flow=True) + assert h.sum() == 0.0 + + h.fill([0.5, 0.5, 2.0], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) # one overflow + assert h.sum() == 2.0 + assert h.sum(flow=True) == 3.0 + assert not h.empty() + assert h == h.copy() + assert h != bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + + +def test_sum_and_empty_match_dense(): + axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) + x = [-1, 0.5, 0.5, 3.5, 9.5, 100] + y = [5, 6, 6, 7, 5, 6] + + dense = bh.Histogram(*axes, storage=bh.storage.Double()) + dense.fill(x, y) + sparse = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + sparse.fill(x, y) + + for flow in (False, True): + assert sparse.sum(flow=flow) == dense.sum(flow=flow) + assert sparse.empty(flow=flow) == dense.empty(flow=flow) + + +def test_equality_ignores_explicit_zeros(): + # Subtracting histograms leaves explicit zeros in the hash map; equality, + # sum, and empty must treat those the same as absent cells. + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 3.5]) + h -= h.copy() + + fresh = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + assert h == fresh + assert fresh == h + assert h.empty(flow=True) + assert h.sum(flow=True) == 0.0 + + +@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy, pickle_roundtrip]) +def test_copy_pickle_roundtrip(copy_fn): + h = bh.Histogram( + bh.axis.Regular(20, 0, 20), + bh.axis.IntCategory([1, 2, 3]), + storage=bh.storage.DoubleSparse(), + ) + h.fill([-1, 5.5, 5.5, 100], [1, 2, 2, 3]) + + h2 = copy_fn(h) + assert h2 == h + assert h2.sum(flow=True) == h.sum(flow=True) + assert coo_dict(h2) == coo_dict(h) + + +def test_uhi_roundtrip_with_data(): + h = bh.Histogram( + bh.axis.Regular(10, 0, 10), + bh.axis.Regular(8, 0, 8), + storage=bh.storage.DoubleSparse(), + ) + h.fill([-1, 2.5, 2.5, 100], [3, 4, 4, 200]) + + data = to_uhi(h) + assert data["storage"]["type"] == "double" + assert data["storage"]["sparse"] is True + assert data["writer_info"]["boost-histogram"]["storage_type"] == "DoubleSparse" + + h2 = from_uhi(data) + assert h2.storage_type is bh.storage.DoubleSparse + assert h2 == h + assert h2.sum(flow=True) == h.sum(flow=True) + + +def test_uhi_roundtrip_through_json(): + h = bh.Histogram( + bh.axis.Regular(10, 0, 10), + bh.axis.Regular(6, 0, 6), + storage=bh.storage.DoubleSparse(), + ) + h.fill([2.5, 2.5, 8.5], [1.5, 1.5, 3.5]) + + def to_jsonable(obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, dict): + return {k: to_jsonable(v) for k, v in obj.items()} + if isinstance(obj, list): + return [to_jsonable(v) for v in obj] + return obj + + data = json.loads(json.dumps(to_jsonable(to_uhi(h)))) + h2 = from_uhi(data) + assert h2 == h + + +def test_uhi_structure_only(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 1.5]) + + data = to_uhi(h, keep_storage=False) + h2 = from_uhi(data) + assert h2.storage_type is bh.storage.DoubleSparse + assert h2.sum(flow=True) == 0.0 + + +def test_uhi_roundtrip_empty(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h2 = from_uhi(to_uhi(h)) + assert h2 == h + assert h2.sum() == 0.0 + + +def test_setitem_raises_on_sparse(): + """Sparse storage does not support direct assignment (no dense buffer).""" + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + with pytest.raises(TypeError, match="buffer"): + h[0] = 1.0 + + +def test_sparse_histogram_addition(): + """Adding two sparse histograms stays sparse and combines counts correctly.""" + h1 = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h1.fill([1.5, 3.5]) + + h2 = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h2.fill([1.5, 4.5]) + + h_sum = h1 + h2 + assert h_sum.storage_type is bh.storage.DoubleSparse + assert h_sum[1] == 2.0 + assert h_sum[3] == 1.0 + assert h_sum[4] == 1.0 + assert h_sum[2] == 0.0 + + # densified view should match a dense histogram with the same fills + dense = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.Double()) + dense.fill([1.5, 1.5, 3.5, 4.5]) + assert_array_equal(h_sum.values(), dense.values()) + + +@pytest.mark.parametrize("op", ["mul", "truediv"]) +def test_scalar_mul_div_stays_sparse(op): + """Scalar * and / scale filled cells in place and preserve sparsity.""" + fn = getattr(operator, op) + sparse = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + sparse.fill([-1, 1.5, 1.5, 3.5, 100]) # includes flow cells + dense = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.Double()) + dense.fill([-1, 1.5, 1.5, 3.5, 100]) + + result = fn(sparse, 2.0) + assert result.storage_type is bh.storage.DoubleSparse + # Original is untouched (operator returns a copy). + assert_array_equal(sparse.values(flow=True), dense.values(flow=True)) + for flow in (False, True): + assert_array_equal(result.values(flow=flow), fn(dense, 2.0).values(flow=flow)) + + +def test_scalar_imul_in_place_stays_sparse(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 1.5, 3.5]) + h *= 3 + assert h.storage_type is bh.storage.DoubleSparse + assert_array_equal(h.values(), [0, 6, 0, 3, 0]) + + +@pytest.mark.parametrize("op", ["+", "-"]) +def test_scalar_add_sub_refused(op): + """Scalar +/- would fill every cell, so sparse storage refuses them.""" + fn = {"+": operator.add, "-": operator.sub}[op] + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5]) + with pytest.raises(TypeError, match="fill every cell"): + fn(h, 2.0) + + +def _mixed_axes_hist(): + h = bh.Histogram( + bh.axis.Regular(5, 0, 5), + bh.axis.IntCategory([10, 20, 30]), + storage=bh.storage.DoubleSparse(), + ) + h.fill([1.5, 1.5, 3.5], [10, 10, 30]) + return h + + +def test_mixed_int_slice_indexing_raises(): + """Mixed integer + slice indexing routes through .view() and raises.""" + h = _mixed_axes_hist() + with pytest.raises(TypeError, match=r"to_coo|sparse"): + _ = h[2, :] + + +def test_list_pick_indexing_raises(): + """Categorical list selection routes through .view() and raises.""" + h = _mixed_axes_hist() + # The list-pick path is experimental and warns before it reaches the view. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(TypeError, match=r"to_coo|sparse"): + _ = h[:, [0, 2]] + + +def test_vectorized_array_indexing_raises(): + h = bh.Histogram(bh.axis.Regular(5, 0, 5), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 3.5]) + with pytest.raises(TypeError, match=r"to_coo|sparse"): + _ = h[np.array([0, 1, 2])] + + +@pytest.mark.parametrize( + "make_index", + [ + pytest.param(lambda: slice(1, 4), id="slice"), + pytest.param(lambda: slice(None, None, bh.rebin(2)), id="rebin"), + pytest.param(lambda: slice(None, None, sum), id="sum"), + ], +) +def test_reduce_indexing_stays_sparse(make_index): + """Slicing, rebin, and integration delegate to C++ and stay sparse.""" + h = bh.Histogram(bh.axis.Regular(10, 0, 10), storage=bh.storage.DoubleSparse()) + h.fill([1.5, 5.5, 5.5, 8.5]) + result = h[make_index()] + if isinstance(result, bh.Histogram): + assert result.storage_type is bh.storage.DoubleSparse + + +def test_matches_dense_values(): + # The filled cells should agree with what a dense Double histogram records. + axes = (bh.axis.Regular(10, 0, 10), bh.axis.IntCategory([5, 6, 7])) + x = np.array([0.5, 0.5, 3.5, 9.5]) + y = np.array([5, 6, 6, 7]) + + dense = bh.Histogram(*axes, storage=bh.storage.Double()) + dense.fill(x, y) + sparse = bh.Histogram(*axes, storage=bh.storage.DoubleSparse()) + sparse.fill(x, y) + + dense_view = dense.view() + rebuilt = np.zeros_like(dense_view) + indices, values = sparse.to_coo() + rebuilt[indices] = values + assert_array_equal(rebuilt, dense_view)