44import logging
55import os
66import re
7+ import tempfile
78import warnings
9+ import zipfile
810from collections .abc import Mapping
911from pathlib import Path
1012from types import MappingProxyType
1618import pandas as pd
1719import pyarrow .parquet as pq
1820import tifffile
21+ import zarr
1922from anndata import AnnData
2023from dask .dataframe import read_parquet
2124from dask_image .imread import imread
2629from shapely import Polygon
2730from spatial_image import SpatialImage
2831from spatialdata import SpatialData
32+ from spatialdata ._core .query .relational_query import _get_unique_label_values_as_index
2933from spatialdata ._types import ArrayLike
30- from spatialdata .models import Image2DModel , PointsModel , ShapesModel , TableModel
34+ from spatialdata .models import (
35+ Image2DModel ,
36+ Labels2DModel ,
37+ PointsModel ,
38+ ShapesModel ,
39+ TableModel ,
40+ )
3141from spatialdata .transformations .transformations import Affine , Identity , Scale
3242
3343from spatialdata_io ._constants ._constants import XeniumKeys
@@ -46,11 +56,14 @@ def xenium(
4656 cells_as_circles : bool = True ,
4757 cell_boundaries : bool = True ,
4858 nucleus_boundaries : bool = True ,
59+ cell_labels : bool = True ,
60+ nucleus_labels : bool = True ,
4961 transcripts : bool = True ,
5062 morphology_mip : bool = True ,
5163 morphology_focus : bool = True ,
5264 imread_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
5365 image_models_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
66+ labels_models_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
5467) -> SpatialData :
5568 """
5669 Read a *10X Genomics Xenium* dataset into a SpatialData object.
@@ -77,11 +90,20 @@ def xenium(
7790 n_jobs
7891 Number of jobs to use for parallel processing.
7992 cells_as_circles
80- Whether to read cells also as circles. Useful for performant visualization.
93+ Whether to read cells also as circles. Useful for performant visualization. The radii of the nuclei,
94+ not the ones of cells, will be used; using the radii of cells would make the visualization too cluttered
95+ (the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
96+ corresponding circles would show considerable overlap).
8197 cell_boundaries
8298 Whether to read cell boundaries (polygons).
8399 nucleus_boundaries
84100 Whether to read nucleus boundaries (polygons).
101+ cell_labels
102+ Whether to read cell labels (raster). The polygonal version of the cell labels are simplified
103+ for visualization purposes, and using the raster version is recommended for analysis.
104+ nucleus_labels
105+ Whether to read nucleus labels (raster). The polygonal version of the nucleus labels are simplified
106+ for visualization purposes, and using the raster version is recommended for analysis.
85107 transcripts
86108 Whether to read transcripts.
87109 morphology_mip
@@ -92,22 +114,25 @@ def xenium(
92114 Keyword arguments to pass to the image reader.
93115 image_models_kwargs
94116 Keyword arguments to pass to the image models.
117+ labels_models_kwargs
118+ Keyword arguments to pass to the labels models.
95119
96120 Returns
97121 -------
98122 :class:`spatialdata.SpatialData`
99123 """
124+ image_models_kwargs = dict (image_models_kwargs )
100125 if "chunks" not in image_models_kwargs :
101- if isinstance (image_models_kwargs , MappingProxyType ):
102- image_models_kwargs = {}
103- assert isinstance (image_models_kwargs , dict )
104126 image_models_kwargs ["chunks" ] = (1 , 4096 , 4096 )
105127 if "scale_factors" not in image_models_kwargs :
106- if isinstance (image_models_kwargs , MappingProxyType ):
107- image_models_kwargs = {}
108- assert isinstance (image_models_kwargs , dict )
109128 image_models_kwargs ["scale_factors" ] = [2 , 2 , 2 , 2 ]
110129
130+ labels_models_kwargs = dict (labels_models_kwargs )
131+ if "chunks" not in labels_models_kwargs :
132+ labels_models_kwargs ["chunks" ] = (4096 , 4096 )
133+ if "scale_factors" not in labels_models_kwargs :
134+ labels_models_kwargs ["scale_factors" ] = [2 , 2 , 2 , 2 ]
135+
111136 path = Path (path )
112137 with open (path / XeniumKeys .XENIUM_SPECS ) as f :
113138 specs = json .load (f )
@@ -121,7 +146,61 @@ def xenium(
121146 table , circles = return_values
122147 else :
123148 table = return_values
149+
150+ if version >= packaging .version .parse ("2.0.0" ):
151+ cell_summary_table = _get_cells_metadata_table_from_zarr (path , XeniumKeys .CELLS_ZARR , specs )
152+ if not cell_summary_table [XeniumKeys .CELL_ID ].equals (table .obs [XeniumKeys .CELL_ID ]):
153+ warnings .warn (
154+ 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
155+ " table. This could be due to trying to read a new version that is not supported yet. Please "
156+ "report this issue." ,
157+ UserWarning ,
158+ stacklevel = 2 ,
159+ )
160+ table .obs [XeniumKeys .Z_LEVEL ] = cell_summary_table [XeniumKeys .Z_LEVEL ]
161+ table .obs [XeniumKeys .NUCLEUS_COUNT ] = cell_summary_table [XeniumKeys .NUCLEUS_COUNT ]
162+
124163 polygons = {}
164+ labels = {}
165+ tables = {}
166+ points = {}
167+ images = {}
168+
169+ # From the public release notes here:
170+ # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
171+ # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
172+ # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
173+ # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
174+ # labels.
175+ if nucleus_labels :
176+ labels ["nucleus_labels" ], _ = _get_labels_and_indices_mapping (
177+ path ,
178+ XeniumKeys .CELLS_ZARR ,
179+ specs ,
180+ mask_index = 0 ,
181+ labels_name = "nucleus_labels" ,
182+ labels_models_kwargs = labels_models_kwargs ,
183+ )
184+ if cell_labels :
185+ labels ["cell_labels" ], cell_labels_indices_mapping = _get_labels_and_indices_mapping (
186+ path ,
187+ XeniumKeys .CELLS_ZARR ,
188+ specs ,
189+ mask_index = 1 ,
190+ labels_name = "cell_labels" ,
191+ labels_models_kwargs = labels_models_kwargs ,
192+ )
193+ if cell_labels_indices_mapping is not None :
194+ if not pd .DataFrame .equals (cell_labels_indices_mapping ["cell_id" ], table .obs [str (XeniumKeys .CELL_ID )]):
195+ warnings .warn (
196+ "The cell_id column in the cell_labels_table does not match the cell_id column derived from the cell "
197+ "labels data. This could be due to trying to read a new version that is not supported yet. Please "
198+ "report this issue." ,
199+ UserWarning ,
200+ stacklevel = 2 ,
201+ )
202+ else :
203+ table .obs ["cell_labels" ] = cell_labels_indices_mapping ["label_index" ]
125204
126205 if nucleus_boundaries :
127206 polygons ["nucleus_boundaries" ] = _get_polygons (
@@ -141,11 +220,9 @@ def xenium(
141220 idx = table .obs [str (XeniumKeys .CELL_ID )].copy (),
142221 )
143222
144- points = {}
145223 if transcripts :
146224 points ["transcripts" ] = _get_points (path , specs )
147225
148- images = {}
149226 if version < packaging .version .parse ("2.0.0" ):
150227 if morphology_mip :
151228 images ["morphology_mip" ] = _get_images (
@@ -217,10 +294,12 @@ def filter(self, record: logging.LogRecord) -> bool:
217294 del image_models_kwargs ["c_coords" ]
218295 logger .removeFilter (IgnoreSpecificMessage ())
219296
297+ tables ["table" ] = table
298+
299+ elements_dict = {"images" : images , "labels" : labels , "points" : points , "tables" : tables , "shapes" : polygons }
220300 if cells_as_circles :
221- sdata = SpatialData (images = images , shapes = polygons | {specs ["region" ]: circles }, points = points , table = table )
222- else :
223- sdata = SpatialData (images = images , shapes = polygons , points = points , table = table )
301+ elements_dict ["shapes" ][specs ["region" ]] = circles
302+ sdata = SpatialData (** elements_dict )
224303
225304 # find and add additional aligned images
226305 aligned_images = _add_aligned_images (path , imread_kwargs , image_models_kwargs )
@@ -265,13 +344,116 @@ def _poly(arr: ArrayLike) -> Polygon:
265344 if not np .unique (geo_df .index ).size == len (geo_df ):
266345 warnings .warn (
267346 "Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
268- "time being please consider merging non-unique polygons into single multi-polygons." ,
347+ "time being please consider merging polygons with non-unique indices into single multi-polygons." ,
348+ UserWarning ,
269349 stacklevel = 2 ,
270350 )
271351 scale = Scale ([1.0 / specs ["pixel_size" ], 1.0 / specs ["pixel_size" ]], axes = ("x" , "y" ))
272352 return ShapesModel .parse (geo_df , transformations = {"global" : scale })
273353
274354
355+ def _get_labels_and_indices_mapping (
356+ path : Path ,
357+ file : str ,
358+ specs : dict [str , Any ],
359+ mask_index : int ,
360+ labels_name : str ,
361+ labels_models_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
362+ ) -> tuple [GeoDataFrame , pd .DataFrame | None ]:
363+ if mask_index not in [0 , 1 ]:
364+ raise ValueError (f"mask_index must be 0 or 1, found { mask_index } ." )
365+
366+ with tempfile .TemporaryDirectory () as tmpdir :
367+ zip_file = path / XeniumKeys .CELLS_ZARR
368+ with zipfile .ZipFile (zip_file , "r" ) as zip_ref :
369+ zip_ref .extractall (tmpdir )
370+
371+ with zarr .open (str (tmpdir ), mode = "r" ) as z :
372+ # get the labels
373+ masks = z ["masks" ][f"{ mask_index } " ][...]
374+ labels = Labels2DModel .parse (
375+ masks , dims = ("y" , "x" ), transformations = {"global" : Identity ()}, ** labels_models_kwargs
376+ )
377+
378+ # build the matching table
379+ version = _parse_version_of_xenium_analyzer (specs )
380+ if mask_index == 0 :
381+ # nuclei currently not supported
382+ return labels , None
383+ if version is not None and version < packaging .version .parse ("1.3.0" ):
384+ # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not
385+ # supported in versions < 1.3.0
386+ return labels , None
387+
388+ cell_id , dataset_suffix = z ["cell_id" ][...].T
389+ cell_id_str = cell_id_str_from_prefix_suffix_uint32 (cell_id , dataset_suffix )
390+
391+ # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
392+ # release notes mentioned above)
393+ real_label_index = _get_unique_label_values_as_index (labels ).values
394+
395+ # background removal
396+ if real_label_index [0 ] == 0 :
397+ real_label_index = real_label_index [1 :]
398+
399+ if version < packaging .version .parse ("2.0.0" ):
400+ expected_label_index = z ["seg_mask_value" ][...]
401+
402+ if not np .array_equal (expected_label_index , real_label_index ):
403+ raise ValueError (
404+ "The label indices from the labels differ from the ones from the input data. Please report "
405+ f"this issue. Real label indices: { real_label_index } , expected label indices: "
406+ f"{ expected_label_index } ."
407+ )
408+ else :
409+ labels_positional_indices = z ["polygon_sets" ][mask_index ]["cell_index" ][...]
410+ if not np .array_equal (labels_positional_indices , np .arange (len (labels_positional_indices ))):
411+ raise ValueError (
412+ "The positional indices of the labels do not match the expected range. Please report this "
413+ "issue."
414+ )
415+
416+ # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
417+ indices_mapping = pd .DataFrame (
418+ {
419+ "region" : labels_name ,
420+ "cell_id" : cell_id_str ,
421+ "label_index" : real_label_index .astype (np .int64 ),
422+ }
423+ )
424+ return labels , indices_mapping
425+
426+
427+ @inject_docs (xx = XeniumKeys )
428+ def _get_cells_metadata_table_from_zarr (
429+ path : Path ,
430+ file : str ,
431+ specs : dict [str , Any ],
432+ ) -> AnnData :
433+ """
434+ Read cells metadata from ``{xx.CELLS_ZARR}``.
435+
436+ Read the cells summary table, which contains the z_level information for versions < 2.0.0, and also the
437+ nucleus_count for versions >= 2.0.0.
438+ """
439+ # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
440+ with tempfile .TemporaryDirectory () as tmpdir :
441+ zip_file = path / XeniumKeys .CELLS_ZARR
442+ with zipfile .ZipFile (zip_file , "r" ) as zip_ref :
443+ zip_ref .extractall (tmpdir )
444+
445+ with zarr .open (str (tmpdir ), mode = "r" ) as z :
446+ x = z ["cell_summary" ][...]
447+ column_names = z ["cell_summary" ].attrs ["column_names" ]
448+ df = pd .DataFrame (x , columns = column_names )
449+ cell_id_prefix = z ["cell_id" ][:, 0 ]
450+ dataset_suffix = z ["cell_id" ][:, 1 ]
451+
452+ cell_id_str = cell_id_str_from_prefix_suffix_uint32 (cell_id_prefix , dataset_suffix )
453+ df [XeniumKeys .CELL_ID ] = cell_id_str
454+ return df
455+
456+
275457def _get_points (path : Path , specs : dict [str , Any ]) -> Table :
276458 table = read_parquet (path / XeniumKeys .TRANSCRIPTS_FILE )
277459 table ["feature_name" ] = table ["feature_name" ].apply (
@@ -325,10 +507,10 @@ def _get_images(
325507) -> SpatialImage | MultiscaleSpatialImage :
326508 image = imread (path / file , ** imread_kwargs )
327509 if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs ["c_coords" ]:
328- # Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will not
329- # be merged soon.
330- # Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA, let's
331- # add a dummy channel as a temporary workaround.
510+ # Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will
511+ # not be merged soon.
512+ # Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA,
513+ # let's add a dummy channel as a temporary workaround.
332514 image = da .concatenate ([image , da .zeros_like (image [0 :1 ])], axis = 0 )
333515 return Image2DModel .parse (
334516 image , transformations = {"global" : Identity ()}, dims = ("c" , "y" , "x" ), ** image_models_kwargs
@@ -399,7 +581,6 @@ def xenium_aligned_image(
399581 # In fact, it could be that the len(image.shape) == 4 has actually dimes (1, x, y, c) and not (1, y, x, c). This is
400582 # not a problem because the transformation is constructed to be consistent, but if is the case, the data orientation
401583 # would be transposed compared to the original image, not ideal.
402- # print(image.shape)
403584 if len (image .shape ) == 4 :
404585 assert image .shape [0 ] == 1
405586 assert image .shape [- 1 ] == 3
@@ -466,7 +647,11 @@ def _parse_version_of_xenium_analyzer(
466647 # Input: xenium-2.0.0.6-35-ga7e17149a
467648 # Output: 2.0.0.6-35
468649
469- warning_message = f"Could not parse the version of the Xenium Analyzer from the string: { string } . This may happen for experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\n The reader will continue assuming the latest version of the Xenium Analyzer."
650+ warning_message = (
651+ f"Could not parse the version of the Xenium Analyzer from the string: { string } . This may happen for "
652+ "experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\n "
653+ "The reader will continue assuming the latest version of the Xenium Analyzer."
654+ )
470655
471656 if result is None :
472657 if not hide_warning :
@@ -480,3 +665,38 @@ def _parse_version_of_xenium_analyzer(
480665 if not hide_warning :
481666 warnings .warn (warning_message , stacklevel = 2 )
482667 return None
668+
669+
670+ def cell_id_str_from_prefix_suffix_uint32 (cell_id_prefix : ArrayLike , dataset_suffix : ArrayLike ) -> ArrayLike :
671+ # explained here:
672+ # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
673+ # convert to hex, remove the 0x prefix
674+ cell_id_prefix_hex = [hex (x )[2 :] for x in cell_id_prefix ]
675+
676+ # shift the hex values
677+ hex_shift = {str (i ): chr (ord ("a" ) + i ) for i in range (10 )} | {
678+ chr (ord ("a" ) + i ): chr (ord ("a" ) + 10 + i ) for i in range (6 )
679+ }
680+ cell_id_prefix_hex_shifted = ["" .join ([hex_shift [c ] for c in x ]) for x in cell_id_prefix_hex ]
681+
682+ # merge the prefix and the suffix
683+ cell_id_str = [str (x [0 ]).rjust (8 , "a" ) + f"-{ x [1 ]} " for x in zip (cell_id_prefix_hex_shifted , dataset_suffix )]
684+
685+ return np .array (cell_id_str )
686+
687+
688+ def prefix_suffix_uint32_from_cell_id_str (cell_id_str : ArrayLike ) -> tuple [ArrayLike , ArrayLike ]:
689+ # parse the string into the prefix and suffix
690+ cell_id_prefix_str , dataset_suffix = zip (* [x .split ("-" ) for x in cell_id_str ])
691+ dataset_suffix_int = [int (x ) for x in dataset_suffix ]
692+
693+ # reverse the shifted hex conversion
694+ hex_unshift = {chr (ord ("a" ) + i ): str (i ) for i in range (10 )} | {
695+ chr (ord ("a" ) + 10 + i ): chr (ord ("a" ) + i ) for i in range (6 )
696+ }
697+ cell_id_prefix_hex = ["" .join ([hex_unshift [c ] for c in x ]) for x in cell_id_prefix_str ]
698+
699+ # Convert hex (no need to add the 0x prefix)
700+ cell_id_prefix = [int (x , 16 ) for x in cell_id_prefix_hex ]
701+
702+ return np .array (cell_id_prefix , dtype = np .uint32 ), np .array (dataset_suffix_int )
0 commit comments