From 8c873303424c7b8845d7be1fbcef441aae4c3ad2 Mon Sep 17 00:00:00 2001 From: anon Date: Mon, 8 Jun 2026 19:41:51 +0200 Subject: [PATCH 1/8] Add `measure_obs`: persist per-cell centroid/area/equivalent diameter into the annotating table `measure_obs(sdata, element=None, ...)` computes one centroid, area and equivalent diameter per instance of a shapes or 2D-labels element and writes them, squidpy-style, into the annotating AnnData table: centroids to `obsm["spatial"]` (the canonical (n_obs, 2) array), area and equivalent diameter to `obs`. Values are stored in the element's intrinsic coordinates/units; equivalent diameter is `2*sqrt(area/pi)`. Labels use a streaming bincount aggregator that processes the raster block by block (one chunk plus O(n_labels) accumulators), so it stays out-of-core and scales to Xenium-size masks where a whole-array regionprops table would run out of memory; area (the per-label pixel count) is a free by-product. Shapes use shapely's vectorized centroid/area. The function is idempotent: outputs already present and current are not recomputed, a pre-existing `obsm["spatial"]` is trusted and never overwritten, and an instance-count change invalidates the cache. `inplace` follows the scanpy convention (mutate and return None, or operate on a deep copy and return it). Per-cell measurements require an annotating table to write into. Render-side wiring (routing `as_points` through these measurements for footprint dot sizing) is intentionally deferred to a follow-up PR. --- src/spatialdata_plot/pl/__init__.py | 2 + src/spatialdata_plot/pl/utils.py | 364 +++++++++++++++++++++++++++- tests/pl/test_utils.py | 118 ++++++++- 3 files changed, 482 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_plot/pl/__init__.py b/src/spatialdata_plot/pl/__init__.py index 178b4914..77c74024 100644 --- a/src/spatialdata_plot/pl/__init__.py +++ b/src/spatialdata_plot/pl/__init__.py @@ -1,8 +1,10 @@ from ._palette import make_palette, make_palette_from_data from .basic import PlotAccessor +from .utils import measure_obs __all__ = [ "PlotAccessor", "make_palette", "make_palette_from_data", + "measure_obs", ] diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py index 5ddefdb9..f1208c7d 100644 --- a/src/spatialdata_plot/pl/utils.py +++ b/src/spatialdata_plot/pl/utils.py @@ -26,6 +26,7 @@ import spatialdata as sd from anndata import AnnData from cycler import Cycler, cycler +from dask.array.core import slices_from_chunks from datashader.core import Canvas from geopandas import GeoDataFrame from matplotlib import colors, patheffects, rcParams @@ -59,6 +60,7 @@ from skimage.util import map_array from spatialdata import ( SpatialData, + deepcopy as sd_deepcopy, get_element_annotators, get_extent, get_values, @@ -67,7 +69,14 @@ ) from spatialdata._core.query.relational_query import _locate_value from spatialdata._types import ArrayLike -from spatialdata.models import Image2DModel, Labels2DModel, SpatialElement, get_table_keys +from spatialdata.models import ( + Image2DModel, + Labels2DModel, + ShapesModel, + SpatialElement, + get_model, + get_table_keys, +) from spatialdata.transformations.operations import get_transformation from spatialdata.transformations.transformations import Scale, Translation from spatialdata.transformations.transformations import Sequence as TransformSequence @@ -4417,3 +4426,356 @@ def _convert_alpha_to_datashader_range(alpha: float) -> float: """Convert alpha from the range [0, 1] to the range [0, 255] used in datashader.""" # prevent a value of 255, bc that led to fully colored test plots instead of just colored points/shapes return min([254, alpha * 255]) + + +# --- Per-cell measurements into the annotating table (centroid / area / equivalent diameter) --- + +# squidpy/scanpy canonical key for per-cell coordinates: an N x 2 array, row-aligned to obs. +_CENTROID_OBSM_KEY = "spatial" +# Our provenance namespace in ``table.uns``. Measurements are stored in the element's *intrinsic* +# coordinates/units (coordinate-system-independent), so the marker records the resolution level +# and the instance count (to invalidate when cells are added), not a coordinate system. +_CENTROID_PROVENANCE_UNS_KEY = "spatialdata_plot" +_CENTROID_SCALE_LEVEL = "scale0" # rasters are reduced at full resolution (decision: exact) + + +def _transform_carrier(element: SpatialElement) -> Any: + """Return the object carrying ``element``'s data (the ``scale0`` level for multiscale rasters).""" + if isinstance(element, DataTree): + return next(iter(element[_CENTROID_SCALE_LEVEL].values())) + return element + + +def _stream_label_centroid_stats(data: Any) -> tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]: + """Per-label ``(labels, mean_x_index, mean_y_index, area)`` via a streaming bincount aggregator. + + Streams the raster block by block — one chunk in memory at a time for a dask array, a + bounded row-block at a time for a numpy array — accumulating per-label ``count`` (= area), + ``sum_x`` and ``sum_y``. The reduction is additive, so it is exact across block boundaries + and uses only O(n_labels) memory regardless of raster size. Background label 0 is excluded. + """ + n_rows, n_cols = data.shape + is_dask = hasattr(data, "chunks") and not isinstance(data, np.ndarray) + if is_dask: + n_labels = int(data.max().compute()) + 1 + block_slices = slices_from_chunks(data.chunks) + else: + data = np.asarray(data) + n_labels = int(data.max()) + 1 + # bound the per-block coordinate-weight arrays to ~8M pixels + step = max(1, min(n_rows, (8 << 20) // max(1, n_cols))) + block_slices = [(slice(r0, min(r0 + step, n_rows)), slice(0, n_cols)) for r0 in range(0, n_rows, step)] + + count = np.zeros(n_labels, dtype=np.float64) + sum_x = np.zeros(n_labels, dtype=np.float64) + sum_y = np.zeros(n_labels, dtype=np.float64) + for row_sl, col_sl in block_slices: + block = data[row_sl, col_sl] + block = np.asarray(block.compute() if hasattr(block, "compute") else block) + block_rows, block_cols = block.shape + flat = block.reshape(-1) + cols = np.tile(np.arange(col_sl.start, col_sl.start + block_cols, dtype=np.float64), block_rows) + rows = np.repeat(np.arange(row_sl.start, row_sl.start + block_rows, dtype=np.float64), block_cols) + count += np.bincount(flat, minlength=n_labels) + sum_x += np.bincount(flat, weights=cols, minlength=n_labels) + sum_y += np.bincount(flat, weights=rows, minlength=n_labels) + + labels = np.flatnonzero(count) + labels = labels[labels != 0] # drop background and absent labels + return labels, sum_x[labels] / count[labels], sum_y[labels] / count[labels], count[labels] + + +def _compute_label_measurements(element: DataArray | DataTree) -> pd.DataFrame: + """Per-label intrinsic centroid + pixel area via the streaming bincount aggregator. + + Returns a DataFrame indexed by label value with columns ``["x", "y", "area"]`` in the + element's *intrinsic* coordinates; ``area`` is the pixel count, a free by-product of the + aggregator. The aggregator streams the (possibly huge, dask-backed) raster block by block, + holding only one chunk plus O(n_labels) accumulators, so it scales to Xenium-size masks + where ``regionprops`` (whole-array, per-label table) would run out of memory. + """ + raster = _transform_carrier(element) + labels, x_idx, y_idx, area = _stream_label_centroid_stats(raster.data) + + # bincount gives mean 0-based pixel indices; map them onto the raster's intrinsic coordinate + # arrays (spatialdata uses pixel-center coords 0.5, 1.5, ... and possibly non-unit spacing). + def _index_to_coord(idx: ArrayLike, coord: ArrayLike) -> ArrayLike: + spacing = (coord[1] - coord[0]) if len(coord) > 1 else 1.0 + return coord[0] + idx * spacing + + xcoord = np.asarray(raster.coords["x"].values) + ycoord = np.asarray(raster.coords["y"].values) + return pd.DataFrame( + { + "x": _index_to_coord(x_idx, xcoord), + "y": _index_to_coord(y_idx, ycoord), + "area": np.asarray(area, dtype=float), + }, + index=labels, + ) + + +def _compute_element_measurements(sdata: SpatialData, element_name: str) -> pd.DataFrame: + """One row per instance with intrinsic ``["x", "y", "area"]``, indexed by instance id. + + Shapes use shapely's vectorized ``.centroid`` / ``.area`` (area in intrinsic squared units); + 2D labels use the streaming bincount aggregator (area = pixel count). Other element types are + rejected explicitly. Area meaning therefore differs across element types but is consistent + within one element. + """ + element = sdata[element_name] + model = get_model(element) + if model is ShapesModel: + centroids = element.geometry.centroid + return pd.DataFrame( + { + "x": centroids.x.to_numpy(), + "y": centroids.y.to_numpy(), + "area": element.geometry.area.to_numpy(), + }, + index=element.index, + ) + if model is Labels2DModel: + return _compute_label_measurements(element) + raise NotImplementedError( + f"Measurement is only supported for shapes and 2D labels; " + f"element {element_name!r} is a {model.__name__}." + ) + + +def _region_mask_and_keys(table: AnnData, element_name: str) -> tuple[str, ArrayLike]: + """Return ``(instance_key, mask)`` for the rows of ``table`` that annotate ``element_name``.""" + _, region_key, instance_key = get_table_keys(table) + mask = (table.obs[region_key].astype(str) == str(element_name)).to_numpy() + return instance_key, mask + + +def _valid_spatial_obsm(arr: ArrayLike, n_obs: int) -> bool: + """Whether ``arr`` is a usable ``obsm["spatial"]``: a 2D ``(n_obs, 2)`` coordinate grid.""" + return bool(arr.ndim == 2 and arr.shape == (n_obs, 2)) + + +def _measurement_provenance(table: AnnData, element_name: str) -> dict[str, Any] | None: + """Our measurement provenance marker for ``element_name``, or None if absent/malformed.""" + root = table.uns.get(_CENTROID_PROVENANCE_UNS_KEY) + if not isinstance(root, dict): + return None + marker = root.get("centroids", {}).get(element_name) + return marker if isinstance(marker, dict) else None + + +def _record_measurement_provenance(table: AnnData, element_name: str, n: int) -> None: + """Record ``(n, scale_level)`` for ``element_name`` so a later instance-count change invalidates.""" + root = table.uns.get(_CENTROID_PROVENANCE_UNS_KEY) + if not isinstance(root, dict): + root = {} + table.uns[_CENTROID_PROVENANCE_UNS_KEY] = root + root.setdefault("centroids", {})[element_name] = {"n": int(n), "scale_level": _CENTROID_SCALE_LEVEL} + + +def _obs_region_finite(table: AnnData, key: str, mask: ArrayLike) -> bool: + """Whether ``obs[key]`` exists and is finite for every row in ``mask`` (already populated).""" + if key not in table.obs: + return False + vals = np.asarray(table.obs[key].to_numpy(), dtype=float)[mask] + return bool(vals.size and np.isfinite(vals).all()) + + +def _obsm_region_finite(table: AnnData, key: str, mask: ArrayLike) -> bool: + """Whether ``obsm[key]`` is a valid ``(n_obs, 2)`` grid and finite for every row in ``mask``.""" + if key not in table.obsm: + return False + arr = np.asarray(table.obsm[key]) + if not _valid_spatial_obsm(arr, table.n_obs): + return False + region = arr[mask].astype(float) + return bool(region.size and np.isfinite(region).all()) + + +def _write_obs_region(table: AnnData, mask: ArrayLike, key: str, values: ArrayLike) -> None: + """Write ``values`` into ``obs[key]`` at ``mask`` rows; other rows stay/are NaN.""" + if key in table.obs: + col = np.asarray(table.obs[key].to_numpy(), dtype=float) + else: + col = np.full(table.n_obs, np.nan, dtype=float) + col[mask] = np.asarray(values, dtype=float) + table.obs[key] = col + + +def _write_obsm_region(table: AnnData, mask: ArrayLike, key: str, xy: ArrayLike) -> None: + """Write intrinsic centroids ``xy`` into ``obsm[key]`` at ``mask`` rows; other rows stay/are NaN. + + Never reshapes an incompatible existing ``obsm[key]`` (e.g. a 3-column xyz array): that would + silently drop data, so it raises instead. + """ + if key in table.obsm: + existing = np.asarray(table.obsm[key]) + if not _valid_spatial_obsm(existing, table.n_obs): + raise ValueError( + f"Refusing to overwrite obsm[{key!r}] with shape {existing.shape}; expected " + f"({table.n_obs}, 2). Remove it first if you want it replaced." + ) + arr = existing.astype(float, copy=True) + else: + arr = np.full((table.n_obs, 2), np.nan, dtype=float) + arr[mask] = np.asarray(xy, dtype=float) + table.obsm[key] = arr + + +def _measure_into_table( + sdata: SpatialData, + element_name: str, + table_name: str, + *, + centroids: bool, + area: bool, + diameter: bool, + obsm_key: str, + area_key: str, + diameter_key: str, + force: bool, +) -> None: + """Compute and persist the requested measurements for one element into its annotating table.""" + table = sdata.tables[table_name] + instance_key, mask = _region_mask_and_keys(table, element_name) + if not mask.any(): + raise ValueError(f"Table {table_name!r} does not annotate element {element_name!r} (no matching rows).") + n = int(mask.sum()) + + prov = _measurement_provenance(table, element_name) + stale = prov is not None and prov.get("n") != n # cells added/removed since we last wrote + want_centroids = centroids and (force or stale or not _obsm_region_finite(table, obsm_key, mask)) + want_area = area and (force or stale or not _obs_region_finite(table, area_key, mask)) + want_diameter = diameter and (force or stale or not _obs_region_finite(table, diameter_key, mask)) + if not (want_centroids or want_area or want_diameter): + return # everything requested is already present and current + + meas = _compute_element_measurements(sdata, element_name) # intrinsic [x, y, area] + aligned = meas.reindex(table.obs[instance_key].to_numpy()[mask]) + if want_centroids: + _write_obsm_region(table, mask, obsm_key, aligned[["x", "y"]].to_numpy()) + if want_area: + _write_obs_region(table, mask, area_key, aligned["area"].to_numpy()) + if want_diameter: + _write_obs_region(table, mask, diameter_key, 2.0 * np.sqrt(aligned["area"].to_numpy() / np.pi)) + _record_measurement_provenance(table, element_name, n) + + +def _measurable_elements(sdata: SpatialData) -> list[str]: + """Shapes / 2D-labels elements with exactly one annotating table (where measurements persist).""" + names: list[str] = [] + for name in list(sdata.shapes.keys()) + list(sdata.labels.keys()): + if len(get_element_annotators(sdata, name)) != 1: + continue + if get_model(sdata[name]) in (ShapesModel, Labels2DModel): + names.append(name) + return names + + +def _resolve_measure_table(sdata: SpatialData, element_name: str, table_name: str | None) -> str: + """Resolve the single annotating table for ``element_name`` (where measurements are written).""" + if table_name is not None: + if table_name not in sdata.tables: + raise KeyError(f"Table {table_name!r} not found in `sdata.tables`.") + return table_name + annotators = sorted(get_element_annotators(sdata, element_name)) + if not annotators: + raise ValueError( + f"Element {element_name!r} has no annotating table; per-cell measurements need a table " + f"to write into. Pass `table_name=` or annotate the element first." + ) + if len(annotators) > 1: + raise ValueError( + f"Element {element_name!r} is annotated by multiple tables ({', '.join(annotators)}); " + f"pass `table_name=` to pick one." + ) + return annotators[0] + + +def measure_obs( + sdata: SpatialData, + element: str | None = None, + *, + table_name: str | None = None, + centroids: bool = True, + area: bool = True, + diameter: bool = True, + obsm_key: str = _CENTROID_OBSM_KEY, + area_key: str = "area", + diameter_key: str = "equivalent_diameter", + force: bool = False, + inplace: bool = True, +) -> SpatialData | None: + """Measure per-cell centroids, area and equivalent diameter into an element's annotating table. + + Computes one centroid, area and equivalent diameter per instance of a shapes or 2D-labels + element and writes them, squidpy-style, into the annotating :class:`~anndata.AnnData` table: + centroids go to ``table.obsm[obsm_key]`` (an ``(n_obs, 2)`` array, the canonical + ``obsm["spatial"]``), area and diameter to ``table.obs``. Values are stored in the element's + *intrinsic* coordinates/units (coordinate-system independent). Labels area is the pixel count; + shapes area is ``geometry.area``; equivalent diameter is ``2 * sqrt(area / pi)``. Persisting + them once lets later renders (and downstream tools such as squidpy) reuse them instead of + recomputing. + + The function is idempotent: outputs already present and current for an element are not + recomputed. A pre-existing ``obsm[obsm_key]`` (e.g. from a reader) is trusted and never + overwritten. Adding cells (which leaves new rows unmeasured) triggers a recompute; pass + ``force=True`` to recompute unconditionally. + + Per-cell measurements need a table to write into, so an element without an annotating table + cannot be measured (this raises). The function never densifies the raster: the label path + streams it block by block, scaling to Xenium-size masks. + + Parameters + ---------- + sdata + The ``SpatialData`` object holding the element and its annotating table. + element + Name of the shapes/2D-labels element to measure. If ``None``, every shapes/2D-labels + element that has exactly one annotating table is measured. + table_name + Name of the annotating table to write into. If ``None``, it is inferred from the element's + annotators (an error is raised when there are zero or several). + centroids, area, diameter + Which measurements to compute/persist. At least one must be ``True``. + obsm_key + ``obsm`` key for the centroids (default ``"spatial"``, the squidpy convention). + area_key, diameter_key + ``obs`` column names for area and equivalent diameter. + force + Recompute and overwrite even when current values are already present. + inplace + If ``True`` (default), mutate ``sdata``'s table in place and return ``None``. If ``False``, + operate on a deep copy and return the modified ``SpatialData``. + + Returns + ------- + ``None`` if ``inplace`` is ``True``, otherwise the modified deep-copied ``SpatialData``. + """ + if not (centroids or area or diameter): + raise ValueError("Nothing to measure: set at least one of `centroids`, `area`, `diameter` to True.") + target = sdata if inplace else sd_deepcopy(sdata) + if element is None: + names = _measurable_elements(target) + if not names: + raise ValueError( + "No shapes/2D-labels element with a single annotating table was found; nothing to " + "measure. Pass an explicit `element=` (and `table_name=` if ambiguous)." + ) + else: + names = [element] + for name in names: + resolved_table = _resolve_measure_table(target, name, table_name) + _measure_into_table( + target, + name, + resolved_table, + centroids=centroids, + area=area, + diameter=diameter, + obsm_key=obsm_key, + area_key=area_key, + diameter_key=diameter_key, + force=force, + ) + return None if inplace else target diff --git a/tests/pl/test_utils.py b/tests/pl/test_utils.py index 4a7c8a5b..33afb550 100644 --- a/tests/pl/test_utils.py +++ b/tests/pl/test_utils.py @@ -8,7 +8,7 @@ import xarray as xr from anndata import AnnData from shapely.geometry import Point -from spatialdata import SpatialData +from spatialdata import SpatialData, get_centroids from spatialdata.models import PointsModel, ShapesModel, TableModel import spatialdata_plot @@ -17,6 +17,7 @@ _apply_cmap_alpha_to_datashader_result, _datashader_map_aggregate_to_color, _set_outline, + measure_obs, set_zero_in_cmap_to_transparent, ) from tests.conftest import DPI, PlotTester, PlotTesterMeta @@ -496,3 +497,118 @@ def fake_rasterize(*args, **kwargs): assert captured["target_unit_to_pixels"] == pytest.approx(0.15, rel=1e-6), ( f"Expected world-unit basis (0.15), got {captured['target_unit_to_pixels']}" ) + + +def _add_shapes_table(sdata: SpatialData, element: str = "blobs_polygons", name: str = "shapes_table") -> SpatialData: + """Add a table annotating a shapes element so it can be measured.""" + gdf = sdata[element] + adata = AnnData(np.zeros((len(gdf), 1), dtype=np.float32)) + adata.obs["instance_id"] = list(gdf.index) + adata.obs["region"] = element + sdata[name] = TableModel.parse( + adata, region_key="region", instance_key="instance_id", region=element + ) + return sdata + + +class TestMeasureObs: + """`measure_obs` materializes centroids/area/equivalent diameter into the annotating table.""" + + def test_writes_centroid_area_diameter_for_labels(self, sdata_blobs: SpatialData) -> None: + ret = measure_obs(sdata_blobs, "blobs_labels") + assert ret is None # inplace default + table = sdata_blobs["table"] + + assert "spatial" in table.obsm + coords = table.obsm["spatial"] + assert coords.shape == (table.n_obs, 2) + assert np.isfinite(coords).all() + assert "area" in table.obs and "equivalent_diameter" in table.obs + + # centroids match spatialdata's get_centroids (blobs_labels has the identity transform, + # so global == intrinsic here) + gc = get_centroids(sdata_blobs["blobs_labels"], coordinate_system="global").compute() + expected = gc.reindex(table.obs["instance_id"].to_numpy())[["x", "y"]].to_numpy() + np.testing.assert_allclose(coords, expected, rtol=1e-6) + + # area is the pixel count (positive integers); diameter = 2*sqrt(area/pi) + area = table.obs["area"].to_numpy() + assert (area > 0).all() + np.testing.assert_allclose( + table.obs["equivalent_diameter"].to_numpy(), 2.0 * np.sqrt(area / np.pi), rtol=1e-12 + ) + + def test_writes_for_shapes(self, sdata_blobs: SpatialData) -> None: + _add_shapes_table(sdata_blobs, "blobs_polygons") + measure_obs(sdata_blobs, "blobs_polygons", table_name="shapes_table") + table = sdata_blobs["shapes_table"] + + gdf = sdata_blobs["blobs_polygons"] + expected_xy = np.column_stack([gdf.geometry.centroid.x, gdf.geometry.centroid.y]) + order = table.obs["instance_id"].to_numpy() + expected_xy = expected_xy[[list(gdf.index).index(i) for i in order]] + np.testing.assert_allclose(table.obsm["spatial"], expected_xy, rtol=1e-9) + np.testing.assert_allclose( + table.obs["area"].to_numpy(), + gdf.geometry.area.to_numpy()[[list(gdf.index).index(i) for i in order]], + rtol=1e-9, + ) + + def test_inplace_false_leaves_original_untouched(self, sdata_blobs: SpatialData) -> None: + out = measure_obs(sdata_blobs, "blobs_labels", inplace=False) + assert isinstance(out, SpatialData) + assert "spatial" in out["table"].obsm + assert "spatial" not in sdata_blobs["table"].obsm # original not mutated + + def test_idempotent_trusts_existing_unless_forced(self, sdata_blobs: SpatialData) -> None: + measure_obs(sdata_blobs, "blobs_labels") + table = sdata_blobs["table"] + # a user edit to a populated row is trusted (no recompute) on a second call... + table.obsm["spatial"][0] = [999.0, 999.0] + measure_obs(sdata_blobs, "blobs_labels") + assert tuple(table.obsm["spatial"][0]) == (999.0, 999.0) + # ...but force recomputes it + measure_obs(sdata_blobs, "blobs_labels", force=True) + assert tuple(table.obsm["spatial"][0]) != (999.0, 999.0) + + def test_preexisting_obsm_is_trusted(self, sdata_blobs: SpatialData) -> None: + table = sdata_blobs["table"] + sentinel = np.arange(table.n_obs * 2, dtype=float).reshape(table.n_obs, 2) + table.obsm["spatial"] = sentinel.copy() + measure_obs(sdata_blobs, "blobs_labels") # centroids already present -> not overwritten + np.testing.assert_array_equal(table.obsm["spatial"], sentinel) + + def test_stale_instance_count_triggers_recompute(self, sdata_blobs: SpatialData) -> None: + measure_obs(sdata_blobs, "blobs_labels") + table = sdata_blobs["table"] + table.obsm["spatial"][:] = -1.0 # corrupt all rows + # pretend the instance count changed since we wrote -> provenance mismatch + table.uns["spatialdata_plot"]["centroids"]["blobs_labels"]["n"] = table.n_obs + 1 + measure_obs(sdata_blobs, "blobs_labels") # stale -> recompute despite finite values + assert not np.any(table.obsm["spatial"] == -1.0) + + def test_flags_select_outputs(self, sdata_blobs: SpatialData) -> None: + measure_obs(sdata_blobs, "blobs_labels", area=False, diameter=False) + table = sdata_blobs["table"] + assert "spatial" in table.obsm + assert "area" not in table.obs and "equivalent_diameter" not in table.obs + + def test_no_annotating_table_raises(self, sdata_blobs: SpatialData) -> None: + # blobs_circles is not annotated by any table + with pytest.raises(ValueError, match="no annotating table"): + measure_obs(sdata_blobs, "blobs_circles") + + def test_ambiguous_table_raises(self, sdata_blobs: SpatialData) -> None: + _add_shapes_table(sdata_blobs, "blobs_polygons", name="table_a") + _add_shapes_table(sdata_blobs, "blobs_polygons", name="table_b") + with pytest.raises(ValueError, match="multiple tables"): + measure_obs(sdata_blobs, "blobs_polygons") + + def test_nothing_to_measure_raises(self, sdata_blobs: SpatialData) -> None: + with pytest.raises(ValueError, match="at least one"): + measure_obs(sdata_blobs, "blobs_labels", centroids=False, area=False, diameter=False) + + def test_element_none_measures_single_table_elements(self, sdata_blobs: SpatialData) -> None: + # default blobs: only blobs_labels has a single annotating table + measure_obs(sdata_blobs) + assert "spatial" in sdata_blobs["table"].obsm From 121abd1d3eeb4ee25ad24fd5750620438fdcbd55 Mon Sep 17 00:00:00 2001 From: anon Date: Mon, 8 Jun 2026 23:41:43 +0200 Subject: [PATCH 2/8] Fix circle-shapes area in measure_obs: use pi*r**2, not shapely Point.area (=0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Circles are stored as `Point` geometries with a `radius` column, for which shapely `.area` is 0 — so `measure_obs` wrote area=0 and equivalent_diameter=0 for every circle (surfaced on the real Visium spots dataset, all circles). Compute their area as `pi * r**2`; equivalent diameter then equals the true diameter `2*r`. Polygons/multipolygons still use the geometric area. Adds a regression test on `blobs_circles`. --- src/spatialdata_plot/pl/utils.py | 13 ++++++++----- tests/pl/test_utils.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py index f1208c7d..29f1a403 100644 --- a/src/spatialdata_plot/pl/utils.py +++ b/src/spatialdata_plot/pl/utils.py @@ -4527,12 +4527,15 @@ def _compute_element_measurements(sdata: SpatialData, element_name: str) -> pd.D model = get_model(element) if model is ShapesModel: centroids = element.geometry.centroid + # Circles are stored as ``Point`` geometries with a ``radius`` column, for which + # shapely ``.area`` is 0; their area is ``pi * r**2``. Polygons/multipolygons use + # the true geometric area. + if "radius" in element.columns: + area = np.pi * np.asarray(element["radius"], dtype=float) ** 2 + else: + area = element.geometry.area.to_numpy() return pd.DataFrame( - { - "x": centroids.x.to_numpy(), - "y": centroids.y.to_numpy(), - "area": element.geometry.area.to_numpy(), - }, + {"x": centroids.x.to_numpy(), "y": centroids.y.to_numpy(), "area": area}, index=element.index, ) if model is Labels2DModel: diff --git a/tests/pl/test_utils.py b/tests/pl/test_utils.py index 33afb550..42f9723d 100644 --- a/tests/pl/test_utils.py +++ b/tests/pl/test_utils.py @@ -554,6 +554,18 @@ def test_writes_for_shapes(self, sdata_blobs: SpatialData) -> None: rtol=1e-9, ) + def test_circles_area_is_pi_r_squared(self, sdata_blobs: SpatialData) -> None: + # Circles are stored as Point geometries (radius column); shapely .area is 0 for them, + # so area must be pi * r**2 and equivalent diameter must equal the true diameter 2*r. + _add_shapes_table(sdata_blobs, "blobs_circles", name="circles_table") + measure_obs(sdata_blobs, "blobs_circles", table_name="circles_table") + table = sdata_blobs["circles_table"] + gdf = sdata_blobs["blobs_circles"] + order = table.obs["instance_id"].to_numpy() + r = gdf["radius"].to_numpy()[[list(gdf.index).index(i) for i in order]] + np.testing.assert_allclose(table.obs["area"].to_numpy(), np.pi * r**2, rtol=1e-9) + np.testing.assert_allclose(table.obs["equivalent_diameter"].to_numpy(), 2.0 * r, rtol=1e-9) + def test_inplace_false_leaves_original_untouched(self, sdata_blobs: SpatialData) -> None: out = measure_obs(sdata_blobs, "blobs_labels", inplace=False) assert isinstance(out, SpatialData) From 982619b51d7e28c9e805eca2a9f490edabdbe2b4 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 9 Jun 2026 00:16:22 +0200 Subject: [PATCH 3/8] Simplify measure_obs: drop the idempotency/provenance layer (KISS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit measure_obs now just computes and writes the requested measurements, overwriting existing values for the element's rows — the scanpy `calculate_qc_metrics` model. Removed the provenance marker, staleness tracking, per-row finiteness checks, the want_*/stale gating and the `force` parameter (5 helpers + 2 uns constants, ~85 net lines). Reuse belongs on the render read-path (read obsm if present, else compute), not in this writer; `centroids=False` keeps a pre-existing obsm["spatial"]. Merged the one-call `_compute_label_measurements` into `_compute_element_measurements`. Kept: the masked partial write (a table may annotate several elements), the incompatible-obsm-shape guard, and element=None / table resolution. Tests updated to the overwrite contract (recompute-overwrites, centroids-keeps-obsm, incompatible-shape-raises) replacing the idempotency/staleness tests. --- src/spatialdata_plot/pl/utils.py | 175 +++++++++---------------------- tests/pl/test_utils.py | 26 ++--- 2 files changed, 59 insertions(+), 142 deletions(-) diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py index 29f1a403..6ff8c252 100644 --- a/src/spatialdata_plot/pl/utils.py +++ b/src/spatialdata_plot/pl/utils.py @@ -4431,21 +4431,23 @@ def _convert_alpha_to_datashader_range(alpha: float) -> float: # --- Per-cell measurements into the annotating table (centroid / area / equivalent diameter) --- # squidpy/scanpy canonical key for per-cell coordinates: an N x 2 array, row-aligned to obs. +# Measurements are stored intrinsic (coordinate-system independent). _CENTROID_OBSM_KEY = "spatial" -# Our provenance namespace in ``table.uns``. Measurements are stored in the element's *intrinsic* -# coordinates/units (coordinate-system-independent), so the marker records the resolution level -# and the instance count (to invalidate when cells are added), not a coordinate system. -_CENTROID_PROVENANCE_UNS_KEY = "spatialdata_plot" -_CENTROID_SCALE_LEVEL = "scale0" # rasters are reduced at full resolution (decision: exact) def _transform_carrier(element: SpatialElement) -> Any: """Return the object carrying ``element``'s data (the ``scale0`` level for multiscale rasters).""" if isinstance(element, DataTree): - return next(iter(element[_CENTROID_SCALE_LEVEL].values())) + return next(iter(element["scale0"].values())) return element +def _pixel_to_coord(idx: ArrayLike, coord: ArrayLike) -> ArrayLike: + """Map fractional pixel indices to intrinsic coordinates along one axis (handles non-unit spacing).""" + spacing = (coord[1] - coord[0]) if len(coord) > 1 else 1.0 + return coord[0] + np.asarray(idx) * spacing + + def _stream_label_centroid_stats(data: Any) -> tuple[ArrayLike, ArrayLike, ArrayLike, ArrayLike]: """Per-label ``(labels, mean_x_index, mean_y_index, area)`` via a streaming bincount aggregator. @@ -4485,64 +4487,39 @@ def _stream_label_centroid_stats(data: Any) -> tuple[ArrayLike, ArrayLike, Array return labels, sum_x[labels] / count[labels], sum_y[labels] / count[labels], count[labels] -def _compute_label_measurements(element: DataArray | DataTree) -> pd.DataFrame: - """Per-label intrinsic centroid + pixel area via the streaming bincount aggregator. - - Returns a DataFrame indexed by label value with columns ``["x", "y", "area"]`` in the - element's *intrinsic* coordinates; ``area`` is the pixel count, a free by-product of the - aggregator. The aggregator streams the (possibly huge, dask-backed) raster block by block, - holding only one chunk plus O(n_labels) accumulators, so it scales to Xenium-size masks - where ``regionprops`` (whole-array, per-label table) would run out of memory. - """ - raster = _transform_carrier(element) - labels, x_idx, y_idx, area = _stream_label_centroid_stats(raster.data) - - # bincount gives mean 0-based pixel indices; map them onto the raster's intrinsic coordinate - # arrays (spatialdata uses pixel-center coords 0.5, 1.5, ... and possibly non-unit spacing). - def _index_to_coord(idx: ArrayLike, coord: ArrayLike) -> ArrayLike: - spacing = (coord[1] - coord[0]) if len(coord) > 1 else 1.0 - return coord[0] + idx * spacing - - xcoord = np.asarray(raster.coords["x"].values) - ycoord = np.asarray(raster.coords["y"].values) - return pd.DataFrame( - { - "x": _index_to_coord(x_idx, xcoord), - "y": _index_to_coord(y_idx, ycoord), - "area": np.asarray(area, dtype=float), - }, - index=labels, - ) - - def _compute_element_measurements(sdata: SpatialData, element_name: str) -> pd.DataFrame: """One row per instance with intrinsic ``["x", "y", "area"]``, indexed by instance id. - Shapes use shapely's vectorized ``.centroid`` / ``.area`` (area in intrinsic squared units); - 2D labels use the streaming bincount aggregator (area = pixel count). Other element types are - rejected explicitly. Area meaning therefore differs across element types but is consistent - within one element. + Shapes use shapely's vectorized centroid; circles (``Point`` + ``radius``) have ``area = pi*r**2`` + (shapely ``.area`` is 0 for them), polygons use the geometric area. 2D labels use the streaming + bincount aggregator (``area`` = pixel count) — it holds one chunk plus O(n_labels) accumulators, + so it scales to Xenium-size masks where a whole-array ``regionprops`` table would run out of + memory. Area meaning differs across element types but is consistent within one element. """ element = sdata[element_name] model = get_model(element) if model is ShapesModel: centroids = element.geometry.centroid - # Circles are stored as ``Point`` geometries with a ``radius`` column, for which - # shapely ``.area`` is 0; their area is ``pi * r**2``. Polygons/multipolygons use - # the true geometric area. - if "radius" in element.columns: + if "radius" in element.columns: # circles are Points; shapely .area is 0, so use pi * r**2 area = np.pi * np.asarray(element["radius"], dtype=float) ** 2 else: area = element.geometry.area.to_numpy() + xy = {"x": centroids.x.to_numpy(), "y": centroids.y.to_numpy(), "area": area} + return pd.DataFrame(xy, index=element.index) + if model is Labels2DModel: + raster = _transform_carrier(element) + labels, x_idx, y_idx, area = _stream_label_centroid_stats(raster.data) + # bincount gives mean 0-based pixel indices; map them onto the raster's intrinsic coords. return pd.DataFrame( - {"x": centroids.x.to_numpy(), "y": centroids.y.to_numpy(), "area": area}, - index=element.index, + { + "x": _pixel_to_coord(x_idx, raster.coords["x"].values), + "y": _pixel_to_coord(y_idx, raster.coords["y"].values), + "area": np.asarray(area, dtype=float), + }, + index=labels, ) - if model is Labels2DModel: - return _compute_label_measurements(element) raise NotImplementedError( - f"Measurement is only supported for shapes and 2D labels; " - f"element {element_name!r} is a {model.__name__}." + f"Measurement is only supported for shapes and 2D labels; element {element_name!r} is a {model.__name__}." ) @@ -4558,43 +4535,6 @@ def _valid_spatial_obsm(arr: ArrayLike, n_obs: int) -> bool: return bool(arr.ndim == 2 and arr.shape == (n_obs, 2)) -def _measurement_provenance(table: AnnData, element_name: str) -> dict[str, Any] | None: - """Our measurement provenance marker for ``element_name``, or None if absent/malformed.""" - root = table.uns.get(_CENTROID_PROVENANCE_UNS_KEY) - if not isinstance(root, dict): - return None - marker = root.get("centroids", {}).get(element_name) - return marker if isinstance(marker, dict) else None - - -def _record_measurement_provenance(table: AnnData, element_name: str, n: int) -> None: - """Record ``(n, scale_level)`` for ``element_name`` so a later instance-count change invalidates.""" - root = table.uns.get(_CENTROID_PROVENANCE_UNS_KEY) - if not isinstance(root, dict): - root = {} - table.uns[_CENTROID_PROVENANCE_UNS_KEY] = root - root.setdefault("centroids", {})[element_name] = {"n": int(n), "scale_level": _CENTROID_SCALE_LEVEL} - - -def _obs_region_finite(table: AnnData, key: str, mask: ArrayLike) -> bool: - """Whether ``obs[key]`` exists and is finite for every row in ``mask`` (already populated).""" - if key not in table.obs: - return False - vals = np.asarray(table.obs[key].to_numpy(), dtype=float)[mask] - return bool(vals.size and np.isfinite(vals).all()) - - -def _obsm_region_finite(table: AnnData, key: str, mask: ArrayLike) -> bool: - """Whether ``obsm[key]`` is a valid ``(n_obs, 2)`` grid and finite for every row in ``mask``.""" - if key not in table.obsm: - return False - arr = np.asarray(table.obsm[key]) - if not _valid_spatial_obsm(arr, table.n_obs): - return False - region = arr[mask].astype(float) - return bool(region.size and np.isfinite(region).all()) - - def _write_obs_region(table: AnnData, mask: ArrayLike, key: str, values: ArrayLike) -> None: """Write ``values`` into ``obs[key]`` at ``mask`` rows; other rows stay/are NaN.""" if key in table.obs: @@ -4636,32 +4576,23 @@ def _measure_into_table( obsm_key: str, area_key: str, diameter_key: str, - force: bool, ) -> None: - """Compute and persist the requested measurements for one element into its annotating table.""" + """Compute and write the requested measurements for one element into its annotating table. + + Overwrites existing values for the element's rows. A table may annotate several elements, so + only the rows belonging to ``element_name`` are touched. + """ table = sdata.tables[table_name] instance_key, mask = _region_mask_and_keys(table, element_name) if not mask.any(): raise ValueError(f"Table {table_name!r} does not annotate element {element_name!r} (no matching rows).") - n = int(mask.sum()) - - prov = _measurement_provenance(table, element_name) - stale = prov is not None and prov.get("n") != n # cells added/removed since we last wrote - want_centroids = centroids and (force or stale or not _obsm_region_finite(table, obsm_key, mask)) - want_area = area and (force or stale or not _obs_region_finite(table, area_key, mask)) - want_diameter = diameter and (force or stale or not _obs_region_finite(table, diameter_key, mask)) - if not (want_centroids or want_area or want_diameter): - return # everything requested is already present and current - - meas = _compute_element_measurements(sdata, element_name) # intrinsic [x, y, area] - aligned = meas.reindex(table.obs[instance_key].to_numpy()[mask]) - if want_centroids: - _write_obsm_region(table, mask, obsm_key, aligned[["x", "y"]].to_numpy()) - if want_area: - _write_obs_region(table, mask, area_key, aligned["area"].to_numpy()) - if want_diameter: - _write_obs_region(table, mask, diameter_key, 2.0 * np.sqrt(aligned["area"].to_numpy() / np.pi)) - _record_measurement_provenance(table, element_name, n) + meas = _compute_element_measurements(sdata, element_name).reindex(table.obs[instance_key].to_numpy()[mask]) + if centroids: + _write_obsm_region(table, mask, obsm_key, meas[["x", "y"]].to_numpy()) + if area: + _write_obs_region(table, mask, area_key, meas["area"].to_numpy()) + if diameter: + _write_obs_region(table, mask, diameter_key, 2.0 * np.sqrt(meas["area"].to_numpy() / np.pi)) def _measurable_elements(sdata: SpatialData) -> list[str]: @@ -4706,7 +4637,6 @@ def measure_obs( obsm_key: str = _CENTROID_OBSM_KEY, area_key: str = "area", diameter_key: str = "equivalent_diameter", - force: bool = False, inplace: bool = True, ) -> SpatialData | None: """Measure per-cell centroids, area and equivalent diameter into an element's annotating table. @@ -4716,18 +4646,15 @@ def measure_obs( centroids go to ``table.obsm[obsm_key]`` (an ``(n_obs, 2)`` array, the canonical ``obsm["spatial"]``), area and diameter to ``table.obs``. Values are stored in the element's *intrinsic* coordinates/units (coordinate-system independent). Labels area is the pixel count; - shapes area is ``geometry.area``; equivalent diameter is ``2 * sqrt(area / pi)``. Persisting - them once lets later renders (and downstream tools such as squidpy) reuse them instead of - recomputing. - - The function is idempotent: outputs already present and current for an element are not - recomputed. A pre-existing ``obsm[obsm_key]`` (e.g. from a reader) is trusted and never - overwritten. Adding cells (which leaves new rows unmeasured) triggers a recompute; pass - ``force=True`` to recompute unconditionally. + shapes area is ``geometry.area`` (``pi*r**2`` for circles); equivalent diameter is + ``2 * sqrt(area / pi)``. Persisting them once lets later renders (and downstream tools such as + squidpy) reuse them instead of recomputing. - Per-cell measurements need a table to write into, so an element without an annotating table - cannot be measured (this raises). The function never densifies the raster: the label path - streams it block by block, scaling to Xenium-size masks. + Existing values for the measured rows are overwritten; pass ``centroids=False`` to keep a + pre-existing ``obsm[obsm_key]`` (e.g. from a reader). Per-cell measurements need a table to + write into, so an element without an annotating table cannot be measured (this raises). The + label path never densifies the raster — it streams it block by block, scaling to Xenium-size + masks. Parameters ---------- @@ -4740,13 +4667,11 @@ def measure_obs( Name of the annotating table to write into. If ``None``, it is inferred from the element's annotators (an error is raised when there are zero or several). centroids, area, diameter - Which measurements to compute/persist. At least one must be ``True``. + Which measurements to compute/write. At least one must be ``True``. obsm_key ``obsm`` key for the centroids (default ``"spatial"``, the squidpy convention). area_key, diameter_key ``obs`` column names for area and equivalent diameter. - force - Recompute and overwrite even when current values are already present. inplace If ``True`` (default), mutate ``sdata``'s table in place and return ``None``. If ``False``, operate on a deep copy and return the modified ``SpatialData``. @@ -4768,17 +4693,15 @@ def measure_obs( else: names = [element] for name in names: - resolved_table = _resolve_measure_table(target, name, table_name) _measure_into_table( target, name, - resolved_table, + _resolve_measure_table(target, name, table_name), centroids=centroids, area=area, diameter=diameter, obsm_key=obsm_key, area_key=area_key, diameter_key=diameter_key, - force=force, ) return None if inplace else target diff --git a/tests/pl/test_utils.py b/tests/pl/test_utils.py index 42f9723d..f2773293 100644 --- a/tests/pl/test_utils.py +++ b/tests/pl/test_utils.py @@ -572,32 +572,26 @@ def test_inplace_false_leaves_original_untouched(self, sdata_blobs: SpatialData) assert "spatial" in out["table"].obsm assert "spatial" not in sdata_blobs["table"].obsm # original not mutated - def test_idempotent_trusts_existing_unless_forced(self, sdata_blobs: SpatialData) -> None: + def test_recompute_overwrites(self, sdata_blobs: SpatialData) -> None: measure_obs(sdata_blobs, "blobs_labels") table = sdata_blobs["table"] - # a user edit to a populated row is trusted (no recompute) on a second call... - table.obsm["spatial"][0] = [999.0, 999.0] - measure_obs(sdata_blobs, "blobs_labels") - assert tuple(table.obsm["spatial"][0]) == (999.0, 999.0) - # ...but force recomputes it - measure_obs(sdata_blobs, "blobs_labels", force=True) + table.obsm["spatial"][0] = [999.0, 999.0] # corrupt one row + measure_obs(sdata_blobs, "blobs_labels") # second call overwrites with the real centroid assert tuple(table.obsm["spatial"][0]) != (999.0, 999.0) - def test_preexisting_obsm_is_trusted(self, sdata_blobs: SpatialData) -> None: + def test_centroids_false_keeps_existing_obsm(self, sdata_blobs: SpatialData) -> None: table = sdata_blobs["table"] sentinel = np.arange(table.n_obs * 2, dtype=float).reshape(table.n_obs, 2) table.obsm["spatial"] = sentinel.copy() - measure_obs(sdata_blobs, "blobs_labels") # centroids already present -> not overwritten + measure_obs(sdata_blobs, "blobs_labels", centroids=False) # only area/diameter written np.testing.assert_array_equal(table.obsm["spatial"], sentinel) + assert "area" in table.obs - def test_stale_instance_count_triggers_recompute(self, sdata_blobs: SpatialData) -> None: - measure_obs(sdata_blobs, "blobs_labels") + def test_incompatible_obsm_shape_raises(self, sdata_blobs: SpatialData) -> None: table = sdata_blobs["table"] - table.obsm["spatial"][:] = -1.0 # corrupt all rows - # pretend the instance count changed since we wrote -> provenance mismatch - table.uns["spatialdata_plot"]["centroids"]["blobs_labels"]["n"] = table.n_obs + 1 - measure_obs(sdata_blobs, "blobs_labels") # stale -> recompute despite finite values - assert not np.any(table.obsm["spatial"] == -1.0) + table.obsm["spatial"] = np.zeros((table.n_obs, 3)) # e.g. xyz; cannot write 2D centroids over it + with pytest.raises(ValueError, match="Refusing to overwrite"): + measure_obs(sdata_blobs, "blobs_labels") def test_flags_select_outputs(self, sdata_blobs: SpatialData) -> None: measure_obs(sdata_blobs, "blobs_labels", area=False, diameter=False) From 3f072b5fbd86bbbff754c7797a4dbb73297fd4c4 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 9 Jun 2026 00:26:11 +0200 Subject: [PATCH 4/8] Keep measure_obs in pl.utils only, not the pl namespace Match the set_zero_in_cmap_to_transparent convention: measure_obs is a plain public function in pl/utils.py, accessed via `from spatialdata_plot.pl.utils import measure_obs` rather than promoted to `sdp.pl.measure_obs`. --- src/spatialdata_plot/pl/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/spatialdata_plot/pl/__init__.py b/src/spatialdata_plot/pl/__init__.py index 77c74024..178b4914 100644 --- a/src/spatialdata_plot/pl/__init__.py +++ b/src/spatialdata_plot/pl/__init__.py @@ -1,10 +1,8 @@ from ._palette import make_palette, make_palette_from_data from .basic import PlotAccessor -from .utils import measure_obs __all__ = [ "PlotAccessor", "make_palette", "make_palette_from_data", - "measure_obs", ] From 556ba6c6ca4c9d7b365b180abeb8d852ec3b06ba Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 9 Jun 2026 00:31:46 +0200 Subject: [PATCH 5/8] Expose measure_obs like make_palette: re-export from pl/__init__ Follow the established public-helper pattern (make_palette is defined under pl/ and re-exported in pl/__init__) rather than inventing a top-level spatialdata_plot.utils module. Public form: `from spatialdata_plot.pl import measure_obs`. --- src/spatialdata_plot/pl/__init__.py | 2 ++ tests/pl/test_utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_plot/pl/__init__.py b/src/spatialdata_plot/pl/__init__.py index 178b4914..77c74024 100644 --- a/src/spatialdata_plot/pl/__init__.py +++ b/src/spatialdata_plot/pl/__init__.py @@ -1,8 +1,10 @@ from ._palette import make_palette, make_palette_from_data from .basic import PlotAccessor +from .utils import measure_obs __all__ = [ "PlotAccessor", "make_palette", "make_palette_from_data", + "measure_obs", ] diff --git a/tests/pl/test_utils.py b/tests/pl/test_utils.py index f2773293..682d5d47 100644 --- a/tests/pl/test_utils.py +++ b/tests/pl/test_utils.py @@ -12,12 +12,12 @@ from spatialdata.models import PointsModel, ShapesModel, TableModel import spatialdata_plot +from spatialdata_plot.pl import measure_obs from spatialdata_plot.pl.render_params import Color, ColorLike from spatialdata_plot.pl.utils import ( _apply_cmap_alpha_to_datashader_result, _datashader_map_aggregate_to_color, _set_outline, - measure_obs, set_zero_in_cmap_to_transparent, ) from tests.conftest import DPI, PlotTester, PlotTesterMeta From 398fa05cacf55220ebf67390a65db78df1439040 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 9 Jun 2026 21:26:44 +0200 Subject: [PATCH 6/8] Address review blockers in measure_obs (#1-#5, #7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - #1 no-clobber: a populated obsm["spatial"] (reader- or prior-call-provided) is no longer overwritten — warn and skip that element's centroids. Coords stay in the element's intrinsic pixel space (documented); area/diameter still overwrite our own columns. Restores the per-element finiteness guard. - #2 unmatched instance ids: instances annotated in the table but absent from the element (e.g. str-vs-int id dtype mismatch) now warn instead of silently writing NaN. - #3 float-dtype labels: the dense relabelling bincounts integer searchsorted indices, never the raw labels, so a float-typed (integer-valued) mask no longer crashes np.bincount. - #4 atomic writes: validate every obs target (non-numeric column collision) before the first mutation, so a bad column never leaves a half-written table. - #5 O(n_labels) memory: relabel labels to a dense 0..k-1 range, so the aggregator's memory scales with the number of distinct labels, not the maximum label id (sparse/global ids no longer blow it up). Single max() pass replaced by a unique() pass; same pass count. - #7 circle area: dispatch on geometry TYPE (all-Point) rather than the presence of a "radius" column, so a polygon element carrying a radius column uses geometry.area, not pi*r**2. Tests updated to the new contract and extended for each fix. --- src/spatialdata_plot/pl/utils.py | 145 +++++++++++++++++++++++-------- tests/pl/test_utils.py | 73 ++++++++++++++-- 2 files changed, 174 insertions(+), 44 deletions(-) diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py index 6ff8c252..9ca1d10c 100644 --- a/src/spatialdata_plot/pl/utils.py +++ b/src/spatialdata_plot/pl/utils.py @@ -4453,57 +4453,72 @@ def _stream_label_centroid_stats(data: Any) -> tuple[ArrayLike, ArrayLike, Array Streams the raster block by block — one chunk in memory at a time for a dask array, a bounded row-block at a time for a numpy array — accumulating per-label ``count`` (= area), - ``sum_x`` and ``sum_y``. The reduction is additive, so it is exact across block boundaries - and uses only O(n_labels) memory regardless of raster size. Background label 0 is excluded. + ``sum_x`` and ``sum_y``. Labels are relabelled to a dense ``0..k-1`` range, so memory is + O(number of distinct labels), independent of the raster size *and* of the label-id magnitude + (sparse/global ids do not blow it up). The reduction is additive, so it is exact across block + boundaries. Background label 0 is excluded. """ n_rows, n_cols = data.shape - is_dask = hasattr(data, "chunks") and not isinstance(data, np.ndarray) - if is_dask: - n_labels = int(data.max().compute()) + 1 + if hasattr(data, "chunks"): # dask block_slices = slices_from_chunks(data.chunks) else: data = np.asarray(data) - n_labels = int(data.max()) + 1 # bound the per-block coordinate-weight arrays to ~8M pixels step = max(1, min(n_rows, (8 << 20) // max(1, n_cols))) block_slices = [(slice(r0, min(r0 + step, n_rows)), slice(0, n_cols)) for r0 in range(0, n_rows, step)] - count = np.zeros(n_labels, dtype=np.float64) - sum_x = np.zeros(n_labels, dtype=np.float64) - sum_y = np.zeros(n_labels, dtype=np.float64) - for row_sl, col_sl in block_slices: + def _load(row_sl: slice, col_sl: slice) -> np.ndarray: block = data[row_sl, col_sl] block = np.asarray(block.compute() if hasattr(block, "compute") else block) + # label rasters are integer-valued even when stored as float; cast so np.unique/searchsorted + # stay integer (and the dense bincount indices below are always int). + return block.astype(np.int64) if block.dtype.kind == "f" else block + + # Pass 1: the sorted set of present label values -> dense relabelling (keeps memory O(n_labels)). + uniq = np.zeros(0, dtype=np.int64) + for row_sl, col_sl in block_slices: + uniq = np.union1d(uniq, np.unique(_load(row_sl, col_sl))) + k = uniq.size + + # Pass 2: additive per-(dense-)label count / sum_x / sum_y. + count = np.zeros(k) + sum_x = np.zeros(k) + sum_y = np.zeros(k) + for row_sl, col_sl in block_slices: + block = _load(row_sl, col_sl) block_rows, block_cols = block.shape - flat = block.reshape(-1) + idx = np.searchsorted(uniq, block.reshape(-1)) # dense 0..k-1 indices (always int) cols = np.tile(np.arange(col_sl.start, col_sl.start + block_cols, dtype=np.float64), block_rows) rows = np.repeat(np.arange(row_sl.start, row_sl.start + block_rows, dtype=np.float64), block_cols) - count += np.bincount(flat, minlength=n_labels) - sum_x += np.bincount(flat, weights=cols, minlength=n_labels) - sum_y += np.bincount(flat, weights=rows, minlength=n_labels) + count += np.bincount(idx, minlength=k) + sum_x += np.bincount(idx, weights=cols, minlength=k) + sum_y += np.bincount(idx, weights=rows, minlength=k) - labels = np.flatnonzero(count) - labels = labels[labels != 0] # drop background and absent labels - return labels, sum_x[labels] / count[labels], sum_y[labels] / count[labels], count[labels] + keep = uniq != 0 # drop background; every kept label has count >= 1 + return uniq[keep], sum_x[keep] / count[keep], sum_y[keep] / count[keep], count[keep] def _compute_element_measurements(sdata: SpatialData, element_name: str) -> pd.DataFrame: """One row per instance with intrinsic ``["x", "y", "area"]``, indexed by instance id. - Shapes use shapely's vectorized centroid; circles (``Point`` + ``radius``) have ``area = pi*r**2`` - (shapely ``.area`` is 0 for them), polygons use the geometric area. 2D labels use the streaming - bincount aggregator (``area`` = pixel count) — it holds one chunk plus O(n_labels) accumulators, - so it scales to Xenium-size masks where a whole-array ``regionprops`` table would run out of - memory. Area meaning differs across element types but is consistent within one element. + Shapes use shapely's vectorized centroid; circles (``Point`` geometry + ``radius``) have + ``area = pi*r**2`` (shapely ``.area`` is 0 for them), polygons use the geometric area. 2D labels + use the streaming bincount aggregator (``area`` = pixel count) — it holds one chunk plus + O(n_labels) accumulators, so it scales to Xenium-size masks where a whole-array ``regionprops`` + table would run out of memory. Area meaning differs across element types but is consistent within + one element. """ element = sdata[element_name] model = get_model(element) if model is ShapesModel: - centroids = element.geometry.centroid - if "radius" in element.columns: # circles are Points; shapely .area is 0, so use pi * r**2 + geometry = element.geometry + centroids = geometry.centroid + # Dispatch on geometry TYPE, not a column name: circles are Point geometries (shapely .area + # is 0 for them) with a radius -> pi*r**2; everything else uses the true geometric area. + if (geometry.geom_type == "Point").all(): area = np.pi * np.asarray(element["radius"], dtype=float) ** 2 else: - area = element.geometry.area.to_numpy() + area = geometry.area.to_numpy() xy = {"x": centroids.x.to_numpy(), "y": centroids.y.to_numpy(), "area": area} return pd.DataFrame(xy, index=element.index) if model is Labels2DModel: @@ -4535,6 +4550,26 @@ def _valid_spatial_obsm(arr: ArrayLike, n_obs: int) -> bool: return bool(arr.ndim == 2 and arr.shape == (n_obs, 2)) +def _obsm_region_finite(table: AnnData, key: str, mask: ArrayLike) -> bool: + """Whether ``obsm[key]`` already holds finite coords for every ``mask`` row (already populated).""" + if key not in table.obsm: + return False + arr = np.asarray(table.obsm[key]) + if not _valid_spatial_obsm(arr, table.n_obs): + return False + region = arr[mask].astype(float) + return bool(region.size and np.isfinite(region).all()) + + +def _check_obs_numeric(table: AnnData, key: str) -> None: + """Raise if ``obs[key]`` exists but is non-numeric, before any mutation (avoids half-writes).""" + if key in table.obs and not is_numeric_dtype(table.obs[key]): + raise ValueError( + f"Cannot write measurements into obs[{key!r}]: the existing column is " + f"{table.obs[key].dtype} (not numeric). Pass a different key or drop the column first." + ) + + def _write_obs_region(table: AnnData, mask: ArrayLike, key: str, values: ArrayLike) -> None: """Write ``values`` into ``obs[key]`` at ``mask`` rows; other rows stay/are NaN.""" if key in table.obs: @@ -4579,14 +4614,46 @@ def _measure_into_table( ) -> None: """Compute and write the requested measurements for one element into its annotating table. - Overwrites existing values for the element's rows. A table may annotate several elements, so - only the rows belonging to ``element_name`` are touched. + Only the rows belonging to ``element_name`` are touched (a table may annotate several elements). + Centroids already present for those rows (e.g. reader-provided ``obsm["spatial"]``) are not + overwritten; ``area``/``diameter`` overwrite our own columns. All targets are validated before + the first write, so a bad column never leaves the table half-written. """ table = sdata.tables[table_name] instance_key, mask = _region_mask_and_keys(table, element_name) if not mask.any(): raise ValueError(f"Table {table_name!r} does not annotate element {element_name!r} (no matching rows).") - meas = _compute_element_measurements(sdata, element_name).reindex(table.obs[instance_key].to_numpy()[mask]) + + # #1: never clobber centroids already populated for this element's rows (reader/prior-call coords). + if centroids and _obsm_region_finite(table, obsm_key, mask): + warnings.warn( + f"obsm[{obsm_key!r}] is already populated for element {element_name!r}; not overwriting " + f"its centroids. Remove it or use a different `obsm_key` to recompute.", + UserWarning, + stacklevel=3, + ) + centroids = False + if not (centroids or area or diameter): + return + + keys = table.obs[instance_key].to_numpy()[mask] + meas = _compute_element_measurements(sdata, element_name).reindex(keys) + # #2: instance ids annotated in the table but absent from the element reindex to NaN -> warn. + missing = int(meas[["x", "y"]].isna().any(axis=1).sum()) + if missing: + warnings.warn( + f"{missing}/{len(keys)} instances annotated for {element_name!r} have no match in the " + f"element (instance-id dtype mismatch, e.g. str vs int?); writing NaN for them.", + UserWarning, + stacklevel=3, + ) + + # #4: validate every obs target up front so an existing non-numeric column raises before any write. + if area: + _check_obs_numeric(table, area_key) + if diameter: + _check_obs_numeric(table, diameter_key) + if centroids: _write_obsm_region(table, mask, obsm_key, meas[["x", "y"]].to_numpy()) if area: @@ -4645,16 +4712,18 @@ def measure_obs( element and writes them, squidpy-style, into the annotating :class:`~anndata.AnnData` table: centroids go to ``table.obsm[obsm_key]`` (an ``(n_obs, 2)`` array, the canonical ``obsm["spatial"]``), area and diameter to ``table.obs``. Values are stored in the element's - *intrinsic* coordinates/units (coordinate-system independent). Labels area is the pixel count; - shapes area is ``geometry.area`` (``pi*r**2`` for circles); equivalent diameter is - ``2 * sqrt(area / pi)``. Persisting them once lets later renders (and downstream tools such as - squidpy) reuse them instead of recomputing. - - Existing values for the measured rows are overwritten; pass ``centroids=False`` to keep a - pre-existing ``obsm[obsm_key]`` (e.g. from a reader). Per-cell measurements need a table to - write into, so an element without an annotating table cannot be measured (this raises). The - label path never densifies the raster — it streams it block by block, scaling to Xenium-size - masks. + *intrinsic* pixel coordinates/units (which align directly with the element's own raster/geometry). + Labels area is the pixel count; shapes area is ``geometry.area`` (``pi*r**2`` for circles); + equivalent diameter is ``2 * sqrt(area / pi)``. Persisting them once lets later renders (and + downstream tools such as squidpy) reuse them instead of recomputing. + + Centroids already present for an element's rows (e.g. a reader-provided ``obsm[obsm_key]``) are + **not** overwritten — a warning is emitted and that element's centroid write is skipped (remove + the key or use a different ``obsm_key`` to recompute); ``area``/``diameter`` overwrite our own + columns. Instances annotated in the table but absent from the element are written as NaN with a + warning. Per-cell measurements need a table to write into, so an element without an annotating + table cannot be measured (this raises). The label path never densifies the raster — it streams + it block by block with memory O(n_labels), scaling to Xenium-size masks. Parameters ---------- diff --git a/tests/pl/test_utils.py b/tests/pl/test_utils.py index 682d5d47..87b41495 100644 --- a/tests/pl/test_utils.py +++ b/tests/pl/test_utils.py @@ -9,7 +9,7 @@ from anndata import AnnData from shapely.geometry import Point from spatialdata import SpatialData, get_centroids -from spatialdata.models import PointsModel, ShapesModel, TableModel +from spatialdata.models import Labels2DModel, PointsModel, ShapesModel, TableModel import spatialdata_plot from spatialdata_plot.pl import measure_obs @@ -511,6 +511,19 @@ def _add_shapes_table(sdata: SpatialData, element: str = "blobs_polygons", name: return sdata +def _labels_sdata(arr: np.ndarray, name: str = "lab", table: str = "t") -> SpatialData: + """Build a SpatialData with a single 2D-labels element annotated by a table.""" + ids = np.unique(arr) + ids = ids[ids != 0].astype(int) + adata = AnnData(np.zeros((len(ids), 1), dtype=np.float32)) + adata.obs["instance_id"] = ids + adata.obs["region"] = name + return SpatialData( + labels={name: Labels2DModel.parse(arr, dims=("y", "x"))}, + tables={table: TableModel.parse(adata, region_key="region", instance_key="instance_id", region=name)}, + ) + + class TestMeasureObs: """`measure_obs` materializes centroids/area/equivalent diameter into the annotating table.""" @@ -572,12 +585,15 @@ def test_inplace_false_leaves_original_untouched(self, sdata_blobs: SpatialData) assert "spatial" in out["table"].obsm assert "spatial" not in sdata_blobs["table"].obsm # original not mutated - def test_recompute_overwrites(self, sdata_blobs: SpatialData) -> None: - measure_obs(sdata_blobs, "blobs_labels") + def test_existing_centroids_not_clobbered(self, sdata_blobs: SpatialData) -> None: + # #1: a populated obsm["spatial"] (reader- or prior-call-provided) is not overwritten. table = sdata_blobs["table"] - table.obsm["spatial"][0] = [999.0, 999.0] # corrupt one row - measure_obs(sdata_blobs, "blobs_labels") # second call overwrites with the real centroid - assert tuple(table.obsm["spatial"][0]) != (999.0, 999.0) + sentinel = np.arange(table.n_obs * 2, dtype=float).reshape(table.n_obs, 2) + table.obsm["spatial"] = sentinel.copy() + with pytest.warns(UserWarning, match="already populated"): + measure_obs(sdata_blobs, "blobs_labels") + np.testing.assert_array_equal(table.obsm["spatial"], sentinel) # centroids untouched + assert "area" in table.obs # area/diameter still written def test_centroids_false_keeps_existing_obsm(self, sdata_blobs: SpatialData) -> None: table = sdata_blobs["table"] @@ -593,6 +609,51 @@ def test_incompatible_obsm_shape_raises(self, sdata_blobs: SpatialData) -> None: with pytest.raises(ValueError, match="Refusing to overwrite"): measure_obs(sdata_blobs, "blobs_labels") + def test_unmatched_instance_ids_warn_and_write_nan(self, sdata_blobs: SpatialData) -> None: + # #2: instance ids that don't match the element (e.g. str vs int) -> warn + NaN, not silent. + table = sdata_blobs["table"] + table.obs["instance_id"] = table.obs["instance_id"].astype(str) + with pytest.warns(UserWarning, match="no match"): + measure_obs(sdata_blobs, "blobs_labels") + assert np.isnan(table.obsm["spatial"]).all() + + def test_float_dtype_labels_supported(self, sdata_blobs: SpatialData) -> None: + # #3: a float-typed (but integer-valued) labels raster must not crash np.bincount. + arr = np.asarray(sdata_blobs["blobs_labels"].data).astype(np.float32) + sd = _labels_sdata(arr) + measure_obs(sd, "lab", table_name="t") + assert np.isfinite(sd["t"].obsm["spatial"]).all() + + def test_existing_nonnumeric_column_raises_before_any_write(self, sdata_blobs: SpatialData) -> None: + # #4: a non-numeric collision raises BEFORE obsm is mutated (no half-written table). + table = sdata_blobs["table"] + table.obs["area"] = pd.Categorical(["lo"] * table.n_obs) + with pytest.raises(ValueError, match="not numeric"): + measure_obs(sdata_blobs, "blobs_labels") + assert "spatial" not in table.obsm # atomic: nothing written + + def test_sparse_high_label_ids(self, sdata_blobs: SpatialData) -> None: + # #5: sparse/high label ids (max id >> n_labels) are measured correctly (dense relabelling). + arr = np.asarray(sdata_blobs["blobs_labels"].data) + hi = (arr.astype(np.int64) * 1000) # ids become 1000, 2000, ... ; max id is huge, few labels + measure_obs(sd_hi := _labels_sdata(hi), "lab", table_name="t") + measure_obs(sd_lo := _labels_sdata(arr.astype(np.int64)), "lab", table_name="t") + # relabelling values does not move pixels -> identical centroid set + np.testing.assert_allclose( + np.sort(sd_hi["t"].obsm["spatial"], axis=0), np.sort(sd_lo["t"].obsm["spatial"], axis=0) + ) + + def test_polygon_with_radius_column_uses_geometric_area(self, sdata_blobs: SpatialData) -> None: + # #7: dispatch on geometry type, not the "radius" column name -> polygons use geometry.area. + gdf = sdata_blobs["blobs_polygons"].copy() + gdf["radius"] = 5.0 # incidental column; must NOT trigger the circle (pi*r**2) branch + sdata_blobs["pr"] = ShapesModel.parse(gdf) + _add_shapes_table(sdata_blobs, "pr", name="pt") + measure_obs(sdata_blobs, "pr", table_name="pt") + order = sdata_blobs["pt"].obs["instance_id"].to_numpy() + expected = gdf.geometry.area.to_numpy()[[list(gdf.index).index(i) for i in order]] + np.testing.assert_allclose(sdata_blobs["pt"].obs["area"].to_numpy(), expected, rtol=1e-9) + def test_flags_select_outputs(self, sdata_blobs: SpatialData) -> None: measure_obs(sdata_blobs, "blobs_labels", area=False, diameter=False) table = sdata_blobs["table"] From 5ba78a786f2a8707e1a53480679e1c034bf5a65f Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 9 Jun 2026 21:57:07 +0200 Subject: [PATCH 7/8] Simplify measure_obs: consolidate writers, drop key kwargs (-35 LOC) - Collapse `_write_obs_region` + `_write_obsm_region` into one `_write_region` parameterized by `obsm=True/False` (same allocate-or-load -> masked-assign -> store-back, with the obsm shape guard). - Drop the `obsm_key`/`area_key`/`diameter_key` public kwargs and their threading; the destinations are now module constants (`_CENTROID_OBSM_KEY`/`_AREA_OBS_KEY`/`_DIAMETER_OBS_KEY`). A column-name collision still raises with an actionable message. - Inline the single-use `_transform_carrier`; collapse the throwaway `xy` dict; hoist `meas["area"].to_numpy()` so it's materialized once for area+diameter. Behavior unchanged; 17 TestMeasureObs + 63 non-visual test_utils green. --- src/spatialdata_plot/pl/utils.py | 123 +++++++++++-------------------- 1 file changed, 44 insertions(+), 79 deletions(-) diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py index 9ca1d10c..7de7edc6 100644 --- a/src/spatialdata_plot/pl/utils.py +++ b/src/spatialdata_plot/pl/utils.py @@ -4430,16 +4430,11 @@ def _convert_alpha_to_datashader_range(alpha: float) -> float: # --- Per-cell measurements into the annotating table (centroid / area / equivalent diameter) --- -# squidpy/scanpy canonical key for per-cell coordinates: an N x 2 array, row-aligned to obs. -# Measurements are stored intrinsic (coordinate-system independent). +# Destination keys (measurements are stored intrinsic, coordinate-system independent). +# obsm["spatial"] is the squidpy/scanpy convention for per-cell coordinates (an N x 2 array). _CENTROID_OBSM_KEY = "spatial" - - -def _transform_carrier(element: SpatialElement) -> Any: - """Return the object carrying ``element``'s data (the ``scale0`` level for multiscale rasters).""" - if isinstance(element, DataTree): - return next(iter(element["scale0"].values())) - return element +_AREA_OBS_KEY = "area" +_DIAMETER_OBS_KEY = "equivalent_diameter" def _pixel_to_coord(idx: ArrayLike, coord: ArrayLike) -> ArrayLike: @@ -4519,10 +4514,12 @@ def _compute_element_measurements(sdata: SpatialData, element_name: str) -> pd.D area = np.pi * np.asarray(element["radius"], dtype=float) ** 2 else: area = geometry.area.to_numpy() - xy = {"x": centroids.x.to_numpy(), "y": centroids.y.to_numpy(), "area": area} - return pd.DataFrame(xy, index=element.index) + return pd.DataFrame( + {"x": centroids.x.to_numpy(), "y": centroids.y.to_numpy(), "area": area}, index=element.index + ) if model is Labels2DModel: - raster = _transform_carrier(element) + # multiscale rasters carry their data on the scale0 level + raster = next(iter(element["scale0"].values())) if isinstance(element, DataTree) else element labels, x_idx, y_idx, area = _stream_label_centroid_stats(raster.data) # bincount gives mean 0-based pixel indices; map them onto the raster's intrinsic coords. return pd.DataFrame( @@ -4566,51 +4563,33 @@ def _check_obs_numeric(table: AnnData, key: str) -> None: if key in table.obs and not is_numeric_dtype(table.obs[key]): raise ValueError( f"Cannot write measurements into obs[{key!r}]: the existing column is " - f"{table.obs[key].dtype} (not numeric). Pass a different key or drop the column first." + f"{table.obs[key].dtype} (not numeric). Drop or rename the column first." ) -def _write_obs_region(table: AnnData, mask: ArrayLike, key: str, values: ArrayLike) -> None: - """Write ``values`` into ``obs[key]`` at ``mask`` rows; other rows stay/are NaN.""" - if key in table.obs: - col = np.asarray(table.obs[key].to_numpy(), dtype=float) - else: - col = np.full(table.n_obs, np.nan, dtype=float) - col[mask] = np.asarray(values, dtype=float) - table.obs[key] = col - - -def _write_obsm_region(table: AnnData, mask: ArrayLike, key: str, xy: ArrayLike) -> None: - """Write intrinsic centroids ``xy`` into ``obsm[key]`` at ``mask`` rows; other rows stay/are NaN. +def _write_region(table: AnnData, mask: ArrayLike, key: str, values: ArrayLike, *, obsm: bool) -> None: + """Write ``values`` into ``obsm[key]`` (2D) or ``obs[key]`` (1D) at ``mask`` rows; others stay/NaN. - Never reshapes an incompatible existing ``obsm[key]`` (e.g. a 3-column xyz array): that would - silently drop data, so it raises instead. + Refuses to overwrite an incompatible existing ``obsm[key]`` (e.g. a 3-column xyz array) rather + than silently dropping data. """ - if key in table.obsm: - existing = np.asarray(table.obsm[key]) - if not _valid_spatial_obsm(existing, table.n_obs): + store = table.obsm if obsm else table.obs + if key in store: + existing = np.asarray(store[key]) + if obsm and not _valid_spatial_obsm(existing, table.n_obs): raise ValueError( f"Refusing to overwrite obsm[{key!r}] with shape {existing.shape}; expected " f"({table.n_obs}, 2). Remove it first if you want it replaced." ) arr = existing.astype(float, copy=True) else: - arr = np.full((table.n_obs, 2), np.nan, dtype=float) - arr[mask] = np.asarray(xy, dtype=float) - table.obsm[key] = arr + arr = np.full((table.n_obs, 2) if obsm else table.n_obs, np.nan) + arr[mask] = np.asarray(values, dtype=float) + store[key] = arr def _measure_into_table( - sdata: SpatialData, - element_name: str, - table_name: str, - *, - centroids: bool, - area: bool, - diameter: bool, - obsm_key: str, - area_key: str, - diameter_key: str, + sdata: SpatialData, element_name: str, table_name: str, *, centroids: bool, area: bool, diameter: bool ) -> None: """Compute and write the requested measurements for one element into its annotating table. @@ -4625,10 +4604,10 @@ def _measure_into_table( raise ValueError(f"Table {table_name!r} does not annotate element {element_name!r} (no matching rows).") # #1: never clobber centroids already populated for this element's rows (reader/prior-call coords). - if centroids and _obsm_region_finite(table, obsm_key, mask): + if centroids and _obsm_region_finite(table, _CENTROID_OBSM_KEY, mask): warnings.warn( - f"obsm[{obsm_key!r}] is already populated for element {element_name!r}; not overwriting " - f"its centroids. Remove it or use a different `obsm_key` to recompute.", + f"obsm[{_CENTROID_OBSM_KEY!r}] is already populated for element {element_name!r}; not " + f"overwriting its centroids (remove it to recompute).", UserWarning, stacklevel=3, ) @@ -4648,18 +4627,19 @@ def _measure_into_table( stacklevel=3, ) - # #4: validate every obs target up front so an existing non-numeric column raises before any write. + # #4: validate obs targets up front so an existing non-numeric column raises before any write. if area: - _check_obs_numeric(table, area_key) + _check_obs_numeric(table, _AREA_OBS_KEY) if diameter: - _check_obs_numeric(table, diameter_key) + _check_obs_numeric(table, _DIAMETER_OBS_KEY) + area_vals = meas["area"].to_numpy() if centroids: - _write_obsm_region(table, mask, obsm_key, meas[["x", "y"]].to_numpy()) + _write_region(table, mask, _CENTROID_OBSM_KEY, meas[["x", "y"]].to_numpy(), obsm=True) if area: - _write_obs_region(table, mask, area_key, meas["area"].to_numpy()) + _write_region(table, mask, _AREA_OBS_KEY, area_vals, obsm=False) if diameter: - _write_obs_region(table, mask, diameter_key, 2.0 * np.sqrt(meas["area"].to_numpy() / np.pi)) + _write_region(table, mask, _DIAMETER_OBS_KEY, 2.0 * np.sqrt(area_vals / np.pi), obsm=False) def _measurable_elements(sdata: SpatialData) -> list[str]: @@ -4701,29 +4681,26 @@ def measure_obs( centroids: bool = True, area: bool = True, diameter: bool = True, - obsm_key: str = _CENTROID_OBSM_KEY, - area_key: str = "area", - diameter_key: str = "equivalent_diameter", inplace: bool = True, ) -> SpatialData | None: """Measure per-cell centroids, area and equivalent diameter into an element's annotating table. Computes one centroid, area and equivalent diameter per instance of a shapes or 2D-labels element and writes them, squidpy-style, into the annotating :class:`~anndata.AnnData` table: - centroids go to ``table.obsm[obsm_key]`` (an ``(n_obs, 2)`` array, the canonical - ``obsm["spatial"]``), area and diameter to ``table.obs``. Values are stored in the element's + centroids go to ``table.obsm["spatial"]`` (an ``(n_obs, 2)`` array, the squidpy convention), + area and diameter to ``table.obs["area"]`` and ``table.obs["equivalent_diameter"]``. Values are stored in the element's *intrinsic* pixel coordinates/units (which align directly with the element's own raster/geometry). Labels area is the pixel count; shapes area is ``geometry.area`` (``pi*r**2`` for circles); equivalent diameter is ``2 * sqrt(area / pi)``. Persisting them once lets later renders (and downstream tools such as squidpy) reuse them instead of recomputing. - Centroids already present for an element's rows (e.g. a reader-provided ``obsm[obsm_key]``) are + Centroids already present for an element's rows (e.g. a reader-provided ``obsm["spatial"]``) are **not** overwritten — a warning is emitted and that element's centroid write is skipped (remove - the key or use a different ``obsm_key`` to recompute); ``area``/``diameter`` overwrite our own - columns. Instances annotated in the table but absent from the element are written as NaN with a - warning. Per-cell measurements need a table to write into, so an element without an annotating - table cannot be measured (this raises). The label path never densifies the raster — it streams - it block by block with memory O(n_labels), scaling to Xenium-size masks. + the key to recompute); ``area``/``diameter`` overwrite our own columns. Instances annotated in + the table but absent from the element are written as NaN with a warning. Per-cell measurements + need a table to write into, so an element without an annotating table cannot be measured (this + raises). The label path never densifies the raster — it streams it block by block with memory + O(n_labels), scaling to Xenium-size masks. Parameters ---------- @@ -4736,11 +4713,8 @@ def measure_obs( Name of the annotating table to write into. If ``None``, it is inferred from the element's annotators (an error is raised when there are zero or several). centroids, area, diameter - Which measurements to compute/write. At least one must be ``True``. - obsm_key - ``obsm`` key for the centroids (default ``"spatial"``, the squidpy convention). - area_key, diameter_key - ``obs`` column names for area and equivalent diameter. + Which measurements to compute/write. At least one must be ``True``. They are written to + ``obsm["spatial"]``, ``obs["area"]`` and ``obs["equivalent_diameter"]`` respectively. inplace If ``True`` (default), mutate ``sdata``'s table in place and return ``None``. If ``False``, operate on a deep copy and return the modified ``SpatialData``. @@ -4762,15 +4736,6 @@ def measure_obs( else: names = [element] for name in names: - _measure_into_table( - target, - name, - _resolve_measure_table(target, name, table_name), - centroids=centroids, - area=area, - diameter=diameter, - obsm_key=obsm_key, - area_key=area_key, - diameter_key=diameter_key, - ) + table = _resolve_measure_table(target, name, table_name) + _measure_into_table(target, name, table, centroids=centroids, area=area, diameter=diameter) return None if inplace else target From 9fb078d3adfa65c011df50c5d557ee5cae2a2612 Mon Sep 17 00:00:00 2001 From: anon Date: Tue, 9 Jun 2026 22:09:12 +0200 Subject: [PATCH 8/8] Inline two single-use helpers in measure_obs (11 -> 9) _region_mask_and_keys (2 trivial lines) folds into _measure_into_table; _measurable_elements becomes a comprehension in measure_obs's element=None branch. The remaining 9 helpers are each reused or encapsulate non-obvious logic (the streaming aggregator, the writer, the no-clobber/dtype guards, table resolution with its error messages). --- src/spatialdata_plot/pl/utils.py | 33 +++++++++++--------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/src/spatialdata_plot/pl/utils.py b/src/spatialdata_plot/pl/utils.py index 7de7edc6..b6eab503 100644 --- a/src/spatialdata_plot/pl/utils.py +++ b/src/spatialdata_plot/pl/utils.py @@ -4535,13 +4535,6 @@ def _compute_element_measurements(sdata: SpatialData, element_name: str) -> pd.D ) -def _region_mask_and_keys(table: AnnData, element_name: str) -> tuple[str, ArrayLike]: - """Return ``(instance_key, mask)`` for the rows of ``table`` that annotate ``element_name``.""" - _, region_key, instance_key = get_table_keys(table) - mask = (table.obs[region_key].astype(str) == str(element_name)).to_numpy() - return instance_key, mask - - def _valid_spatial_obsm(arr: ArrayLike, n_obs: int) -> bool: """Whether ``arr`` is a usable ``obsm["spatial"]``: a 2D ``(n_obs, 2)`` coordinate grid.""" return bool(arr.ndim == 2 and arr.shape == (n_obs, 2)) @@ -4599,7 +4592,8 @@ def _measure_into_table( the first write, so a bad column never leaves the table half-written. """ table = sdata.tables[table_name] - instance_key, mask = _region_mask_and_keys(table, element_name) + _, region_key, instance_key = get_table_keys(table) + mask = (table.obs[region_key].astype(str) == str(element_name)).to_numpy() if not mask.any(): raise ValueError(f"Table {table_name!r} does not annotate element {element_name!r} (no matching rows).") @@ -4642,17 +4636,6 @@ def _measure_into_table( _write_region(table, mask, _DIAMETER_OBS_KEY, 2.0 * np.sqrt(area_vals / np.pi), obsm=False) -def _measurable_elements(sdata: SpatialData) -> list[str]: - """Shapes / 2D-labels elements with exactly one annotating table (where measurements persist).""" - names: list[str] = [] - for name in list(sdata.shapes.keys()) + list(sdata.labels.keys()): - if len(get_element_annotators(sdata, name)) != 1: - continue - if get_model(sdata[name]) in (ShapesModel, Labels2DModel): - names.append(name) - return names - - def _resolve_measure_table(sdata: SpatialData, element_name: str, table_name: str | None) -> str: """Resolve the single annotating table for ``element_name`` (where measurements are written).""" if table_name is not None: @@ -4688,8 +4671,9 @@ def measure_obs( Computes one centroid, area and equivalent diameter per instance of a shapes or 2D-labels element and writes them, squidpy-style, into the annotating :class:`~anndata.AnnData` table: centroids go to ``table.obsm["spatial"]`` (an ``(n_obs, 2)`` array, the squidpy convention), - area and diameter to ``table.obs["area"]`` and ``table.obs["equivalent_diameter"]``. Values are stored in the element's - *intrinsic* pixel coordinates/units (which align directly with the element's own raster/geometry). + area and diameter to ``table.obs["area"]`` and ``table.obs["equivalent_diameter"]``. Values are + stored in the element's *intrinsic* pixel coordinates/units (which align directly with the + element's own raster/geometry). Labels area is the pixel count; shapes area is ``geometry.area`` (``pi*r**2`` for circles); equivalent diameter is ``2 * sqrt(area / pi)``. Persisting them once lets later renders (and downstream tools such as squidpy) reuse them instead of recomputing. @@ -4727,7 +4711,12 @@ def measure_obs( raise ValueError("Nothing to measure: set at least one of `centroids`, `area`, `diameter` to True.") target = sdata if inplace else sd_deepcopy(sdata) if element is None: - names = _measurable_elements(target) + # measure every shapes / 2D-labels element that has exactly one annotating table + names = [ + n + for n in list(target.shapes) + list(target.labels) + if get_model(target[n]) in (ShapesModel, Labels2DModel) and len(get_element_annotators(target, n)) == 1 + ] if not names: raise ValueError( "No shapes/2D-labels element with a single annotating table was found; nothing to "