Skip to content

Commit 9eee2ae

Browse files
Merge pull request #112 from scverse/xenium-cells.zarr.zip
Support for `cells.zarr.zip` for Xenium
2 parents f0d1f45 + a2e49fc commit 9eee2ae

File tree

3 files changed

+279
-20
lines changed

3 files changed

+279
-20
lines changed

src/spatialdata_io/_constants/_constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,11 @@ class XeniumKeys(ModeEnum):
116116
# specs keys
117117
ANALYSIS_SW_VERSION = "analysis_sw_version"
118118

119+
# zarr file with labels file and cell summary keys
120+
CELLS_ZARR = "cells.zarr.zip"
121+
NUCLEUS_COUNT = "nucleus_count"
122+
Z_LEVEL = "z_level"
123+
119124

120125
@unique
121126
class VisiumKeys(ModeEnum):

src/spatialdata_io/readers/xenium.py

Lines changed: 240 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import logging
55
import os
66
import re
7+
import tempfile
78
import warnings
9+
import zipfile
810
from collections.abc import Mapping
911
from pathlib import Path
1012
from types import MappingProxyType
@@ -16,6 +18,7 @@
1618
import pandas as pd
1719
import pyarrow.parquet as pq
1820
import tifffile
21+
import zarr
1922
from anndata import AnnData
2023
from dask.dataframe import read_parquet
2124
from dask_image.imread import imread
@@ -26,8 +29,15 @@
2629
from shapely import Polygon
2730
from spatial_image import SpatialImage
2831
from spatialdata import SpatialData
32+
from spatialdata._core.query.relational_query import _get_unique_label_values_as_index
2933
from spatialdata._types import ArrayLike
30-
from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel
34+
from spatialdata.models import (
35+
Image2DModel,
36+
Labels2DModel,
37+
PointsModel,
38+
ShapesModel,
39+
TableModel,
40+
)
3141
from spatialdata.transformations.transformations import Affine, Identity, Scale
3242

3343
from spatialdata_io._constants._constants import XeniumKeys
@@ -46,11 +56,14 @@ def xenium(
4656
cells_as_circles: bool = True,
4757
cell_boundaries: bool = True,
4858
nucleus_boundaries: bool = True,
59+
cell_labels: bool = True,
60+
nucleus_labels: bool = True,
4961
transcripts: bool = True,
5062
morphology_mip: bool = True,
5163
morphology_focus: bool = True,
5264
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
5365
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
66+
labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
5467
) -> SpatialData:
5568
"""
5669
Read a *10X Genomics Xenium* dataset into a SpatialData object.
@@ -77,11 +90,20 @@ def xenium(
7790
n_jobs
7891
Number of jobs to use for parallel processing.
7992
cells_as_circles
80-
Whether to read cells also as circles. Useful for performant visualization.
93+
Whether to read cells also as circles. Useful for performant visualization. The radii of the nuclei,
94+
not the ones of cells, will be used; using the radii of cells would make the visualization too cluttered
95+
(the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
96+
corresponding circles would show considerable overlap).
8197
cell_boundaries
8298
Whether to read cell boundaries (polygons).
8399
nucleus_boundaries
84100
Whether to read nucleus boundaries (polygons).
101+
cell_labels
102+
Whether to read cell labels (raster). The polygonal version of the cell labels are simplified
103+
for visualization purposes, and using the raster version is recommended for analysis.
104+
nucleus_labels
105+
Whether to read nucleus labels (raster). The polygonal version of the nucleus labels are simplified
106+
for visualization purposes, and using the raster version is recommended for analysis.
85107
transcripts
86108
Whether to read transcripts.
87109
morphology_mip
@@ -92,22 +114,25 @@ def xenium(
92114
Keyword arguments to pass to the image reader.
93115
image_models_kwargs
94116
Keyword arguments to pass to the image models.
117+
labels_models_kwargs
118+
Keyword arguments to pass to the labels models.
95119
96120
Returns
97121
-------
98122
:class:`spatialdata.SpatialData`
99123
"""
124+
image_models_kwargs = dict(image_models_kwargs)
100125
if "chunks" not in image_models_kwargs:
101-
if isinstance(image_models_kwargs, MappingProxyType):
102-
image_models_kwargs = {}
103-
assert isinstance(image_models_kwargs, dict)
104126
image_models_kwargs["chunks"] = (1, 4096, 4096)
105127
if "scale_factors" not in image_models_kwargs:
106-
if isinstance(image_models_kwargs, MappingProxyType):
107-
image_models_kwargs = {}
108-
assert isinstance(image_models_kwargs, dict)
109128
image_models_kwargs["scale_factors"] = [2, 2, 2, 2]
110129

130+
labels_models_kwargs = dict(labels_models_kwargs)
131+
if "chunks" not in labels_models_kwargs:
132+
labels_models_kwargs["chunks"] = (4096, 4096)
133+
if "scale_factors" not in labels_models_kwargs:
134+
labels_models_kwargs["scale_factors"] = [2, 2, 2, 2]
135+
111136
path = Path(path)
112137
with open(path / XeniumKeys.XENIUM_SPECS) as f:
113138
specs = json.load(f)
@@ -121,7 +146,61 @@ def xenium(
121146
table, circles = return_values
122147
else:
123148
table = return_values
149+
150+
if version >= packaging.version.parse("2.0.0"):
151+
cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
152+
if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
153+
warnings.warn(
154+
'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
155+
" table. This could be due to trying to read a new version that is not supported yet. Please "
156+
"report this issue.",
157+
UserWarning,
158+
stacklevel=2,
159+
)
160+
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
161+
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
162+
124163
polygons = {}
164+
labels = {}
165+
tables = {}
166+
points = {}
167+
images = {}
168+
169+
# From the public release notes here:
170+
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
171+
# we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
172+
# This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
173+
# nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
174+
# labels.
175+
if nucleus_labels:
176+
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
177+
path,
178+
XeniumKeys.CELLS_ZARR,
179+
specs,
180+
mask_index=0,
181+
labels_name="nucleus_labels",
182+
labels_models_kwargs=labels_models_kwargs,
183+
)
184+
if cell_labels:
185+
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
186+
path,
187+
XeniumKeys.CELLS_ZARR,
188+
specs,
189+
mask_index=1,
190+
labels_name="cell_labels",
191+
labels_models_kwargs=labels_models_kwargs,
192+
)
193+
if cell_labels_indices_mapping is not None:
194+
if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
195+
warnings.warn(
196+
"The cell_id column in the cell_labels_table does not match the cell_id column derived from the cell "
197+
"labels data. This could be due to trying to read a new version that is not supported yet. Please "
198+
"report this issue.",
199+
UserWarning,
200+
stacklevel=2,
201+
)
202+
else:
203+
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
125204

126205
if nucleus_boundaries:
127206
polygons["nucleus_boundaries"] = _get_polygons(
@@ -141,11 +220,9 @@ def xenium(
141220
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
142221
)
143222

144-
points = {}
145223
if transcripts:
146224
points["transcripts"] = _get_points(path, specs)
147225

148-
images = {}
149226
if version < packaging.version.parse("2.0.0"):
150227
if morphology_mip:
151228
images["morphology_mip"] = _get_images(
@@ -217,10 +294,12 @@ def filter(self, record: logging.LogRecord) -> bool:
217294
del image_models_kwargs["c_coords"]
218295
logger.removeFilter(IgnoreSpecificMessage())
219296

297+
tables["table"] = table
298+
299+
elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons}
220300
if cells_as_circles:
221-
sdata = SpatialData(images=images, shapes=polygons | {specs["region"]: circles}, points=points, table=table)
222-
else:
223-
sdata = SpatialData(images=images, shapes=polygons, points=points, table=table)
301+
elements_dict["shapes"][specs["region"]] = circles
302+
sdata = SpatialData(**elements_dict)
224303

225304
# find and add additional aligned images
226305
aligned_images = _add_aligned_images(path, imread_kwargs, image_models_kwargs)
@@ -265,13 +344,116 @@ def _poly(arr: ArrayLike) -> Polygon:
265344
if not np.unique(geo_df.index).size == len(geo_df):
266345
warnings.warn(
267346
"Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
268-
"time being please consider merging non-unique polygons into single multi-polygons.",
347+
"time being please consider merging polygons with non-unique indices into single multi-polygons.",
348+
UserWarning,
269349
stacklevel=2,
270350
)
271351
scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
272352
return ShapesModel.parse(geo_df, transformations={"global": scale})
273353

274354

355+
def _get_labels_and_indices_mapping(
356+
path: Path,
357+
file: str,
358+
specs: dict[str, Any],
359+
mask_index: int,
360+
labels_name: str,
361+
labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
362+
) -> tuple[GeoDataFrame, pd.DataFrame | None]:
363+
if mask_index not in [0, 1]:
364+
raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")
365+
366+
with tempfile.TemporaryDirectory() as tmpdir:
367+
zip_file = path / XeniumKeys.CELLS_ZARR
368+
with zipfile.ZipFile(zip_file, "r") as zip_ref:
369+
zip_ref.extractall(tmpdir)
370+
371+
with zarr.open(str(tmpdir), mode="r") as z:
372+
# get the labels
373+
masks = z["masks"][f"{mask_index}"][...]
374+
labels = Labels2DModel.parse(
375+
masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs
376+
)
377+
378+
# build the matching table
379+
version = _parse_version_of_xenium_analyzer(specs)
380+
if mask_index == 0:
381+
# nuclei currently not supported
382+
return labels, None
383+
if version is not None and version < packaging.version.parse("1.3.0"):
384+
# supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not
385+
# supported in versions < 1.3.0
386+
return labels, None
387+
388+
cell_id, dataset_suffix = z["cell_id"][...].T
389+
cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
390+
391+
# this information will probably be available in the `label_id` column for version > 2.0.0 (see public
392+
# release notes mentioned above)
393+
real_label_index = _get_unique_label_values_as_index(labels).values
394+
395+
# background removal
396+
if real_label_index[0] == 0:
397+
real_label_index = real_label_index[1:]
398+
399+
if version < packaging.version.parse("2.0.0"):
400+
expected_label_index = z["seg_mask_value"][...]
401+
402+
if not np.array_equal(expected_label_index, real_label_index):
403+
raise ValueError(
404+
"The label indices from the labels differ from the ones from the input data. Please report "
405+
f"this issue. Real label indices: {real_label_index}, expected label indices: "
406+
f"{expected_label_index}."
407+
)
408+
else:
409+
labels_positional_indices = z["polygon_sets"][mask_index]["cell_index"][...]
410+
if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
411+
raise ValueError(
412+
"The positional indices of the labels do not match the expected range. Please report this "
413+
"issue."
414+
)
415+
416+
# labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
417+
indices_mapping = pd.DataFrame(
418+
{
419+
"region": labels_name,
420+
"cell_id": cell_id_str,
421+
"label_index": real_label_index.astype(np.int64),
422+
}
423+
)
424+
return labels, indices_mapping
425+
426+
427+
@inject_docs(xx=XeniumKeys)
428+
def _get_cells_metadata_table_from_zarr(
429+
path: Path,
430+
file: str,
431+
specs: dict[str, Any],
432+
) -> AnnData:
433+
"""
434+
Read cells metadata from ``{xx.CELLS_ZARR}``.
435+
436+
Read the cells summary table, which contains the z_level information for versions < 2.0.0, and also the
437+
nucleus_count for versions >= 2.0.0.
438+
"""
439+
# for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
440+
with tempfile.TemporaryDirectory() as tmpdir:
441+
zip_file = path / XeniumKeys.CELLS_ZARR
442+
with zipfile.ZipFile(zip_file, "r") as zip_ref:
443+
zip_ref.extractall(tmpdir)
444+
445+
with zarr.open(str(tmpdir), mode="r") as z:
446+
x = z["cell_summary"][...]
447+
column_names = z["cell_summary"].attrs["column_names"]
448+
df = pd.DataFrame(x, columns=column_names)
449+
cell_id_prefix = z["cell_id"][:, 0]
450+
dataset_suffix = z["cell_id"][:, 1]
451+
452+
cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
453+
df[XeniumKeys.CELL_ID] = cell_id_str
454+
return df
455+
456+
275457
def _get_points(path: Path, specs: dict[str, Any]) -> Table:
276458
table = read_parquet(path / XeniumKeys.TRANSCRIPTS_FILE)
277459
table["feature_name"] = table["feature_name"].apply(
@@ -325,10 +507,10 @@ def _get_images(
325507
) -> SpatialImage | MultiscaleSpatialImage:
326508
image = imread(path / file, **imread_kwargs)
327509
if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs["c_coords"]:
328-
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will not
329-
# be merged soon.
330-
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA, let's
331-
# add a dummy channel as a temporary workaround.
510+
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will
511+
# not be merged soon.
512+
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA,
513+
# let's add a dummy channel as a temporary workaround.
332514
image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
333515
return Image2DModel.parse(
334516
image, transformations={"global": Identity()}, dims=("c", "y", "x"), **image_models_kwargs
@@ -399,7 +581,6 @@ def xenium_aligned_image(
399581
# In fact, it could be that the len(image.shape) == 4 has actually dimes (1, x, y, c) and not (1, y, x, c). This is
400582
# not a problem because the transformation is constructed to be consistent, but if is the case, the data orientation
401583
# would be transposed compared to the original image, not ideal.
402-
# print(image.shape)
403584
if len(image.shape) == 4:
404585
assert image.shape[0] == 1
405586
assert image.shape[-1] == 3
@@ -466,7 +647,11 @@ def _parse_version_of_xenium_analyzer(
466647
# Input: xenium-2.0.0.6-35-ga7e17149a
467648
# Output: 2.0.0.6-35
468649

469-
warning_message = f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\nThe reader will continue assuming the latest version of the Xenium Analyzer."
650+
warning_message = (
651+
f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for "
652+
"experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\n"
653+
"The reader will continue assuming the latest version of the Xenium Analyzer."
654+
)
470655

471656
if result is None:
472657
if not hide_warning:
@@ -480,3 +665,38 @@ def _parse_version_of_xenium_analyzer(
480665
if not hide_warning:
481666
warnings.warn(warning_message, stacklevel=2)
482667
return None
668+
669+
670+
def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
671+
# explained here:
672+
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
673+
# convert to hex, remove the 0x prefix
674+
cell_id_prefix_hex = [hex(x)[2:] for x in cell_id_prefix]
675+
676+
# shift the hex values
677+
hex_shift = {str(i): chr(ord("a") + i) for i in range(10)} | {
678+
chr(ord("a") + i): chr(ord("a") + 10 + i) for i in range(6)
679+
}
680+
cell_id_prefix_hex_shifted = ["".join([hex_shift[c] for c in x]) for x in cell_id_prefix_hex]
681+
682+
# merge the prefix and the suffix
683+
cell_id_str = [str(x[0]).rjust(8, "a") + f"-{x[1]}" for x in zip(cell_id_prefix_hex_shifted, dataset_suffix)]
684+
685+
return np.array(cell_id_str)
686+
687+
688+
def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
689+
# parse the string into the prefix and suffix
690+
cell_id_prefix_str, dataset_suffix = zip(*[x.split("-") for x in cell_id_str])
691+
dataset_suffix_int = [int(x) for x in dataset_suffix]
692+
693+
# reverse the shifted hex conversion
694+
hex_unshift = {chr(ord("a") + i): str(i) for i in range(10)} | {
695+
chr(ord("a") + 10 + i): chr(ord("a") + i) for i in range(6)
696+
}
697+
cell_id_prefix_hex = ["".join([hex_unshift[c] for c in x]) for x in cell_id_prefix_str]
698+
699+
# Convert hex (no need to add the 0x prefix)
700+
cell_id_prefix = [int(x, 16) for x in cell_id_prefix_hex]
701+
702+
return np.array(cell_id_prefix, dtype=np.uint32), np.array(dataset_suffix_int)

0 commit comments

Comments
 (0)