Merged
39 commits
b907554
feat: `AnnData.can_write` based on `AnnData.fold`
ilan-gold Mar 19, 2026
19daed5
chore: docs
ilan-gold Mar 19, 2026
4125375
refactor: use accessors
ilan-gold Mar 19, 2026
8be5ba2
fix: DFS order + fixes
ilan-gold Mar 19, 2026
0f4d1b0
chore: add test for `uns`
ilan-gold Mar 19, 2026
9338baa
Merge branch 'main' into ig/fold_can_write
ilan-gold Mar 23, 2026
69daf90
feat: `raw` + `uns` traversal
ilan-gold Mar 23, 2026
932d766
fix: `fold` -> `reduce`
ilan-gold Mar 23, 2026
e0f3ee2
chore: docs
ilan-gold Mar 23, 2026
436fc68
Merge branch 'main' into ig/fold_can_write
ilan-gold Mar 23, 2026
ee04741
fix: `meth` not `func`
ilan-gold Mar 23, 2026
6d6f454
fix: `fold` not `reduce` in relnote
ilan-gold Mar 23, 2026
1f77a4c
fix: nested
ilan-gold Mar 23, 2026
91adffe
chore: more `func` clarification
ilan-gold Mar 23, 2026
928b72a
fix: link
ilan-gold Mar 23, 2026
19a915d
fix: link
ilan-gold Mar 23, 2026
c0886fe
refactor: simpler
ilan-gold Mar 23, 2026
6cffc05
fix: relnote number
ilan-gold Mar 23, 2026
44890eb
Merge branch 'main' into ig/fold_can_write
ilan-gold Apr 2, 2026
39800aa
refactor: use `iter`
ilan-gold Apr 8, 2026
6cb401b
fix: oops
ilan-gold Apr 8, 2026
1dfdd96
fix: why was this deleted?
ilan-gold Apr 8, 2026
9ad937f
fix: doc string
ilan-gold Apr 8, 2026
0c03ffb
fix: docs
ilan-gold Apr 8, 2026
f00db89
fix: remove `parent_type`
ilan-gold Apr 8, 2026
cabc914
Merge branch 'main' into ig/fold_can_write
ilan-gold Apr 10, 2026
9fa978a
fix: writing none
ilan-gold Apr 10, 2026
e7b201f
fix: API changes
ilan-gold Apr 10, 2026
95136a2
fix use `set`
ilan-gold Apr 13, 2026
4eba690
fix: docs
ilan-gold Apr 13, 2026
fdd6b7c
fix: remove unused docs / private type
ilan-gold Apr 13, 2026
5760cb2
fix: nexting
ilan-gold Apr 14, 2026
7382b67
fix: ok
ilan-gold Apr 14, 2026
3c747e1
fix: handle bad categoricals
ilan-gold Apr 14, 2026
371d535
fix: handle index / awkward
ilan-gold Apr 14, 2026
d8f66c3
Merge branch 'main' into ig/fold_can_write
ilan-gold Apr 15, 2026
5a0fded
refactor: `can_write` -> `unwriteable`
ilan-gold Apr 15, 2026
a6374ce
Merge branch 'ig/fold_can_write' of github.com:scverse/anndata into i…
ilan-gold Apr 15, 2026
1ad82f4
Merge branch 'main' into ig/fold_can_write
ilan-gold Apr 15, 2026
1 change: 1 addition & 0 deletions docs/api.md
@@ -92,6 +92,7 @@ Writing a complete {class}`AnnData` object to disk in anndata’s native formats

AnnData.write_h5ad
AnnData.write_zarr
AnnData.unwriteable


..
5 changes: 2 additions & 3 deletions docs/concatenation.rst
@@ -26,7 +26,6 @@ Let's start off with an example:
AnnData object with n_obs × n_vars = 700 × 765
obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
Member: why is this gone?

Contributor Author: It got moved to the end, absorbed by obsp: ...

Member: hmm, maybe replace obsp: ... with ... then to make that clear

obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: ...
@@ -165,9 +164,9 @@ First, our example case:
>>> blobs
AnnData object with n_obs × n_vars = 640 × 30
obs: 'blobs'
uns: 'pca'
obsm: 'X_pca'
varm: 'PCs'
uns: 'pca'

Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies.

@@ -181,9 +180,9 @@ Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies.
>>> adatas[0]
AnnData object with n_obs × n_vars = 128 × 30
obs: 'blobs'
uns: 'pca'
obsm: 'X_pca', 'qc'
varm: 'PCs', '0_qc'
uns: 'pca'

`adatas` is now a list of datasets with disjoint sets of observations and a common set of variables.
Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset.
1 change: 1 addition & 0 deletions docs/release-notes/2372.feat.md
@@ -0,0 +1 @@
New {meth}`anndata.AnnData.unwriteable` for checking if an `AnnData` can be written {user}`ilan-gold`
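A hedged usage sketch of the new method (not part of the PR): it assumes the keyword-only `unwriteable(store_type=...)` signature added in `anndata.py` below, and note that the boolean semantics were still being settled here (see the `can_write` -> `unwriteable` rename commit).

```python
# Sketch only. Per the current docstring the return value reflects writeability,
# though the naming was still in flux in this PR.
import numpy as np
from anndata import AnnData

adata = AnnData(X=np.random.rand(10, 5).astype(np.float32))
adata.uns["handle"] = open  # a builtin function has no registered writer

print(adata.unwriteable(store_type="zarr"))  # check against a zarr store
print(adata.unwriteable(store_type=None))    # None: accept either h5 or zarr
```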
196 changes: 143 additions & 53 deletions src/anndata/_core/anndata.py
@@ -4,7 +4,7 @@

from __future__ import annotations

from collections import OrderedDict
from collections import OrderedDict, defaultdict
from collections.abc import Mapping, MutableMapping, Sequence
from copy import copy, deepcopy
from functools import singledispatchmethod
@@ -26,8 +26,10 @@
from .. import utils
from .._settings import settings
from ..compat import (
AwkArray,
DaskArray,
IndexManager,
XDataset,
ZarrArray,
_move_adj_mtx,
has_xp,
@@ -39,6 +41,7 @@
axis_len,
deprecation_msg,
ensure_df_homogeneous,
iter_outer,
raise_value_error_if_multiindex_columns,
set_module,
warn,
@@ -62,9 +65,12 @@
from scipy import sparse
from zarr.storage import StoreLike

from anndata.typing import RWAble

from .._types import ReduceFunc
from ..acc import AdRef, Array, MapAcc, RefAcc
from ..compat import XDataset
from ..typing import Index, Index1D, _Index1DNorm, _XDataType
from ..compat import CSArray, CSMatrix
from ..typing import AxisStorable, Index, Index1D, _Index1DNorm, _XDataType
from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView


@@ -512,53 +518,54 @@ def _init_as_actual( # noqa: PLR0912, PLR0913, PLR0915
def __sizeof__(
self, *, show_stratified: bool = False, with_disk: bool = False
) -> int:
def get_size(X) -> int:
def cs_to_bytes(X) -> int:
return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
def cs_to_bytes(X: CSArray | CSMatrix) -> int:
return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)

def get_size(X: RWAble) -> int:
if isinstance(X, h5py.Dataset) and with_disk:
return int(np.array(X.shape).prod() * X.dtype.itemsize)
elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
return cs_to_bytes(X._to_backed())
elif issparse(X):
return cs_to_bytes(X)
elif isinstance(X, dict | MutableMapping):
return sum(get_size(v) for v in X.values())
else:
return X.__sizeof__()

sizes = {}
attrs = ["X", "_obs", "_var"]
attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]
Contributor Author: FWIW this never handled raw anyway

for attr in attrs + attrs_multi:
if attr in attrs_multi:
keys = getattr(self, attr).keys()
s = sum(get_size(getattr(self, attr)[k]) for k in keys)
def fold_size(
elem: _XDataType | AxisStorable | pd.DataFrame | XDataset,
*,
accumulate: dict[str, int],
attr_name: str | None, # TODO: type
):
if elem is None:
size = 0
elif elem is self.raw:
size = (
get_size(elem.X)
+ get_size(elem.var)
+ sum(get_size(v) for v in elem.varm.values())
)
else:
s = get_size(getattr(self, attr))
if s > 0 and show_stratified:
size = get_size(elem)
accumulate[attr_name] = size
if size > 0 and show_stratified:
from tqdm import tqdm

print(
f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}"
)
sizes[attr] = s
return sum(sizes.values())
print(f"Size of {attr_name}: {tqdm.format_sizeof(size, 'B')}")
return accumulate

return sum(self._reduce(fold_size, init=defaultdict(int)).values())

def _gen_repr(self, n_obs, n_vars) -> str:
backed_at = f" backed at {str(self.filename)!r}" if self.isbacked else ""
descr = f"AnnData object with n_obs × n_vars = {n_obs} × {n_vars}{backed_at}"
for attr in [
"obs",
"var",
"uns",
"obsm",
"varm",
"layers",
"obsp",
"varp",
]:
keys = getattr(self, attr).keys()
if len(keys) > 0:
descr += f"\n {attr}: {str(list(keys))[1:-1]}"
for attr_name, elem in iter_outer(self):
if attr_name not in {"raw", "X"}:
keys = elem.keys()
if len(keys) > 0:
descr += f"\n {attr_name}: {str(list(keys))[1:-1]}"
return descr

def __repr__(self) -> str:
@@ -1383,27 +1390,16 @@ def to_memory(self, *, copy: bool = False) -> AnnData:
mem = backed[backed.obs["cluster"] == "a", :].to_memory()
"""
new = {}
for attr_name in [
"X",
"obs",
"var",
"obsm",
"varm",
"obsp",
"varp",
"layers",
"uns",
]:
attr = getattr(self, attr_name, None)
for attr_name, attr in iter_outer(self):
if attr is not None:
new[attr_name] = to_memory(attr, copy=copy)

if self.raw is not None:
new["raw"] = {
"X": to_memory(self.raw.X, copy=copy),
"var": to_memory(self.raw.var, copy=copy),
"varm": to_memory(self.raw.varm, copy=copy),
}
if attr is self.raw:
new["raw"] = {
"X": to_memory(self.raw.X, copy=copy),
"var": to_memory(self.raw.var, copy=copy),
"varm": to_memory(self.raw.varm, copy=copy),
}
else:
new[attr_name] = to_memory(attr, copy=copy)

if self.isbacked:
self.file.close()
@@ -1436,6 +1432,100 @@ def copy(self, filename: PathLike[str] | str | None = None) -> AnnData:
write_h5ad(filename, self)
return read_h5ad(filename, backed=mode)

def _reduce[T](
self,
func: ReduceFunc[T],
*,
init: T,
) -> T:
"""Accumulate a value starting from init by iterating over the parent "elems"of the AnnData object i.e., raw, obs, varp etc.

Parameters
----------
func
The function that performs the accumulation.
init
The starting value.

Returns
-------
An accumulated value
"""
accumulate = init
for attr_name, attr in iter_outer(self):
accumulate = func(attr, accumulate=accumulate, attr_name=attr_name)
return accumulate

def unwriteable(self, *, store_type: Literal["h5", "zarr"] | None) -> bool:
"""Whether or not an `AnnData` object can be written to disk for a given store type.

Parameters
----------
store_type
Which backing store - `None` indicates that the object can be written to either.

Returns
-------
Whether or not this object is writeable.
While the return type may change to include richer output about which elements cannot be written,
this new type's evaluation as a boolean will not change from the current behavior, i.e.,
`bool(adata.unwriteable())` will always evaluate the same.
"""

from anndata._io.specs.registry import _REGISTRY

writeable_elems = {
src_type
for (dest_type, src_type, __) in _REGISTRY.write
if store_type is None or store_type in dest_type.__module__
}

def predicate( # noqa: PLR0911
elem: RWAble,
*,
accumulate: bool,
attr_name: str | None = None, # TODO: type
):
if elem is None:
return accumulate
if isinstance(elem, AnnData):
return accumulate and elem.unwriteable(store_type=store_type)
if isinstance(elem, pd.Categorical):
return accumulate and predicate(elem.categories, accumulate=accumulate)
if isinstance(elem, pd.Series | pd.Index):
# matches behavior in methods.py
return accumulate and predicate(elem._values, accumulate=accumulate)
if isinstance(elem, AwkArray):
import awkward as ak

container = ak.to_buffers(ak.to_packed(elem))
return accumulate and all(
predicate(v, accumulate=accumulate) for v in container[2].values()
)
if attr_name == "raw":
accumulate = accumulate and type(elem.X) in writeable_elems
return accumulate and all(
predicate(e[attr], accumulate=accumulate)
for e in [elem.var, elem.varm]
for attr in e
)
if attr_name in {
"obs",
"obsm",
"varm",
"var",
"layers",
"varp",
"obsp",
"uns",
} or isinstance(elem, pd.DataFrame | XDataset | MutableMapping):
return accumulate and all(
predicate(elem[k], accumulate=accumulate) for k in elem
)
return accumulate and type(elem) in writeable_elems

return self._reduce(predicate, init=True)

def var_names_make_unique(self, join: str = "-") -> None:
# Important to go through the setter so obsm dataframes are updated too
self.var_names = utils.make_index_unique(self.var.index, join)
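For orientation (not part of the diff): `_reduce` above just threads an accumulator through `iter_outer`, so per-attribute summaries can be written as small reducers. The reducer name `count_keys` below is hypothetical, and both `_reduce` and `iter_outer` are internal helpers introduced in this PR, so treat this as a sketch.

```python
# Illustrative sketch only: `AnnData._reduce` is private and its signature may change.
from collections import defaultdict

import numpy as np
from anndata import AnnData


def count_keys(elem, *, accumulate, attr_name):
    # Record how many keys each mapping-like attribute holds; skip X and raw,
    # which are not mappings, mirroring the `_gen_repr` refactor above.
    if elem is None or attr_name in {"X", "raw"}:
        return accumulate
    accumulate[attr_name] = len(elem.keys())
    return accumulate


adata = AnnData(X=np.ones((4, 3), dtype=np.float32), obsm={"X_pca": np.ones((4, 2))})
print(dict(adata._reduce(count_keys, init=defaultdict(int))))
# e.g. {'obs': 0, 'var': 0, 'obsm': 1, 'varm': 0, 'layers': 0, 'obsp': 0, 'varp': 0, 'uns': 0}
```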
40 changes: 22 additions & 18 deletions src/anndata/_io/h5ad.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import re
from collections.abc import MutableMapping
from functools import partial
from pathlib import Path
from types import MappingProxyType
@@ -23,7 +24,7 @@
_from_fixed_length_strings,
)
from ..experimental import read_dispatched
from ..utils import warn
from ..utils import iter_outer, warn
from .specs import read_elem, write_elem
from .specs.registry import IOSpec, write_spec
from .utils import (
@@ -84,23 +85,26 @@ def write_h5ad(
f = cast("h5py.Group", f["/"])
f.attrs.setdefault("encoding-type", "anndata")
f.attrs.setdefault("encoding-version", "0.1.0")

_write_x(
f,
adata, # accessing adata.X reopens adata.file if it’s backed
is_backed=adata.isbacked and adata.filename == filepath,
as_dense=as_dense,
dataset_kwargs=dataset_kwargs,
)
_write_raw(f, adata.raw, as_dense=as_dense, dataset_kwargs=dataset_kwargs)
write_elem(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs)
write_elem(f, "var", adata.var, dataset_kwargs=dataset_kwargs)
write_elem(f, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs)
write_elem(f, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs)
write_elem(f, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs)
write_elem(f, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs)
write_elem(f, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs)
write_elem(f, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs)
for k, elem in iter_outer(adata):
if k == "X":
_write_x(
f,
adata, # accessing adata.X reopens adata.file if it’s backed
is_backed=adata.isbacked and adata.filename == filepath,
as_dense=as_dense,
dataset_kwargs=dataset_kwargs,
)
elif k == "raw":
_write_raw(
f, adata.raw, as_dense=as_dense, dataset_kwargs=dataset_kwargs
)
else:
write_elem(
f,
k,
dict(elem) if isinstance(elem, MutableMapping) else elem,
dataset_kwargs=dataset_kwargs,
)


def _write_x(
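For orientation (not part of the diff): the refactored writer now drives everything through `iter_outer`, which yields `(attr_name, elem)` pairs covering X, raw, and the mapping attributes. Below is a sketch of inspecting that iteration, assuming `iter_outer` stays importable from `anndata.utils` as the imports above suggest; the exact yield order is an assumption here.

```python
# Sketch only: `iter_outer` is an internal helper added in this PR, so its
# location and yield order may change.
import numpy as np
from anndata import AnnData
from anndata.utils import iter_outer

adata = AnnData(X=np.zeros((2, 2), dtype=np.float32), uns={"meta": {"version": 1}})
for attr_name, elem in iter_outer(adata):
    print(attr_name, type(elem).__name__)
# Expected to cover X, obs, var, the aligned mappings (obsm/varm/layers/obsp/varp),
# uns, and raw: the same set the writer loop above dispatches on.
```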