from __future__ import annotations

import json
+import logging
+import os
+import re
+import warnings
from collections.abc import Mapping
from pathlib import Path
from types import MappingProxyType
from typing import Any, Optional

+import dask.array as da
import numpy as np
+import packaging.version
import pandas as pd
import pyarrow.parquet as pq
+import tifffile
from anndata import AnnData
from dask.dataframe import read_parquet
from dask_image.imread import imread
@@ -37,6 +44,7 @@ def xenium(
    path: str | Path,
    n_jobs: int = 1,
    cells_as_circles: bool = True,
+    cell_boundaries: bool = True,
    nucleus_boundaries: bool = True,
    transcripts: bool = True,
    morphology_mip: bool = True,
@@ -70,8 +78,10 @@ def xenium(
        Number of jobs to use for parallel processing.
    cells_as_circles
        Whether to read cells also as circles. Useful for performant visualization.
+    cell_boundaries
+        Whether to read cell boundaries (polygons).
    nucleus_boundaries
-        Whether to read nucleus boundaries.
+        Whether to read nucleus boundaries (polygons).
    transcripts
        Whether to read transcripts.
    morphology_mip
@@ -101,8 +111,11 @@ def xenium(
    path = Path(path)
    with open(path / XeniumKeys.XENIUM_SPECS) as f:
        specs = json.load(f)
+    # to trigger the warning if the version cannot be parsed
+    version = _parse_version_of_xenium_analyzer(specs, hide_warning=False)

    specs["region"] = "cell_circles" if cells_as_circles else "cell_boundaries"
+
    return_values = _get_tables_and_circles(path, cells_as_circles, specs)
    if cells_as_circles:
        table, circles = return_values
@@ -119,33 +132,91 @@ def xenium(
            idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
        )

-    polygons["cell_boundaries"] = _get_polygons(
-        path,
-        XeniumKeys.CELL_BOUNDARIES_FILE,
-        specs,
-        n_jobs,
-        idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
-    )
+    if cell_boundaries:
+        polygons["cell_boundaries"] = _get_polygons(
+            path,
+            XeniumKeys.CELL_BOUNDARIES_FILE,
+            specs,
+            n_jobs,
+            idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
+        )

    points = {}
    if transcripts:
        points["transcripts"] = _get_points(path, specs)

    images = {}
-    if morphology_mip:
-        images["morphology_mip"] = _get_images(
-            path,
-            XeniumKeys.MORPHOLOGY_MIP_FILE,
-            imread_kwargs,
-            image_models_kwargs,
-        )
-    if morphology_focus:
-        images["morphology_focus"] = _get_images(
-            path,
-            XeniumKeys.MORPHOLOGY_FOCUS_FILE,
-            imread_kwargs,
-            image_models_kwargs,
-        )
+    if version < packaging.version.parse("2.0.0"):
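+        # analyzer < 2.0.0 outputs ship single-file morphology images at the dataset root; for >= 2.0.0 the reader
+        # instead parses the morphology_focus/ directory with one ome.tif per channel (see the else branch below)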
+        if morphology_mip:
+            images["morphology_mip"] = _get_images(
+                path,
+                XeniumKeys.MORPHOLOGY_MIP_FILE,
+                imread_kwargs,
+                image_models_kwargs,
+            )
+        if morphology_focus:
+            images["morphology_focus"] = _get_images(
+                path,
+                XeniumKeys.MORPHOLOGY_FOCUS_FILE,
+                imread_kwargs,
+                image_models_kwargs,
+            )
+    else:
+        if morphology_focus:
+            morphology_focus_dir = path / XeniumKeys.MORPHOLOGY_FOCUS_DIR
+            files = {f for f in os.listdir(morphology_focus_dir) if f.endswith(".ome.tif")}
+            if len(files) not in [1, 4]:
+                raise ValueError(
+                    "Expected 1 (no segmentation kit) or 4 (segmentation kit) files in the morphology focus directory, "
+                    f"found {len(files)}: {files}"
+                )
+            if files != {XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.value.format(i) for i in range(len(files))}:
+                raise ValueError(
+                    "Expected files in the morphology focus directory to be named as "
+                    f"{XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.value.format(0)} to "
+                    f"{XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.value.format(len(files) - 1)}, found {files}"
+                )
+            # the 'dummy' channel is a temporary workaround, see _get_images() for more details
+            if len(files) == 1:
+                channel_names = {
+                    0: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_0.value,
+                }
+            else:
+                channel_names = {
+                    0: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_0.value,
+                    1: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_1.value,
+                    2: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_2.value,
+                    3: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_3.value,
+                    4: "dummy",
+                }
+            # this reads scale 0 for all the 1 or 4 channels (the other files are parsed automatically)
+            # dask_image.imread will call tifffile.imread, which will give a warning saying that reading multi-file
+            # pyramids is not supported; since we are reading the full scale image and reconstructing the pyramid, we
+            # can ignore this
+
+            class IgnoreSpecificMessage(logging.Filter):
+                def filter(self, record: logging.LogRecord) -> bool:
+                    # ignore the specific log message mentioned above
+                    if "OME series cannot read multi-file pyramids" in record.getMessage():
+                        return False
+                    return True
+
+            logger = tifffile.logger()
+            logger.addFilter(IgnoreSpecificMessage())
+            image_models_kwargs = dict(image_models_kwargs)
+            assert (
+                "c_coords" not in image_models_kwargs
+            ), "The channel names for the morphology focus images are handled internally"
+            image_models_kwargs["c_coords"] = list(channel_names.values())
+            images["morphology_focus"] = _get_images(
+                morphology_focus_dir,
+                XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.format(0),  # type: ignore[str-format]
+                imread_kwargs,
+                image_models_kwargs,
+            )
+            del image_models_kwargs["c_coords"]
+            logger.removeFilter(IgnoreSpecificMessage())
+
    if cells_as_circles:
        sdata = SpatialData(images=images, shapes=polygons | {specs["region"]: circles}, points=points, table=table)
    else:
@@ -159,6 +230,12 @@ def xenium(
    return sdata


+def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
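+    # cell ids may be stored as bytes in the parquet files, e.g. an id like b"aaabbcc-1" (illustrative) decodes to "aaabbcc-1"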
+    if isinstance(cell_id_column.iloc[0], bytes):
+        return cell_id_column.apply(lambda x: x.decode("utf-8"))
+    return cell_id_column
+
+
def _get_polygons(
    path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: Optional[ArrayLike] = None
) -> GeoDataFrame:
@@ -168,13 +245,29 @@ def _poly(arr: ArrayLike) -> Polygon:
    # seems to be faster than pd.read_parquet
    df = pq.read_table(path / file).to_pandas()

+    group_by = df.groupby(XeniumKeys.CELL_ID)
+    index = pd.Series(group_by.indices.keys())
+    index = _decode_cell_id_column(index)
    out = Parallel(n_jobs=n_jobs)(
        delayed(_poly)(i.to_numpy())
-        for _, i in df.groupby(XeniumKeys.CELL_ID)[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]]
+        for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]]
    )
    geo_df = GeoDataFrame({"geometry": out})
-    if idx is not None:
+    version = _parse_version_of_xenium_analyzer(specs)
+    if version is not None and version < packaging.version.parse("2.0.0"):
+        assert idx is not None
+        assert len(idx) == len(geo_df)
+        assert np.unique(geo_df.index).size == len(geo_df)
+        assert index.equals(idx)
        geo_df.index = idx
+    else:
+        geo_df.index = index
+        if not np.unique(geo_df.index).size == len(geo_df):
+            warnings.warn(
+                "Found non-unique polygon indices; this will be addressed in a future version of the reader. For the "
+                "time being, please consider merging non-unique polygons into single multi-polygons.",
+                stacklevel=2,
+            )
    scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
    return ShapesModel.parse(geo_df, transformations={"global": scale})

@@ -208,8 +301,7 @@ def _get_tables_and_circles(
    adata.obs = metadata
    adata.obs["region"] = specs["region"]
    adata.obs["region"] = adata.obs["region"].astype("category")
-    if isinstance(adata.obs[XeniumKeys.CELL_ID].iloc[0], bytes):
-        adata.obs[XeniumKeys.CELL_ID] = adata.obs[XeniumKeys.CELL_ID].apply(lambda x: x.decode("utf-8"))
+    adata.obs[XeniumKeys.CELL_ID] = _decode_cell_id_column(adata.obs[XeniumKeys.CELL_ID])
    table = TableModel.parse(adata, region=specs["region"], region_key="region", instance_key=str(XeniumKeys.CELL_ID))
    if cells_as_circles:
        transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
@@ -232,6 +324,12 @@ def _get_images(
    image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
) -> SpatialImage | MultiscaleSpatialImage:
    image = imread(path / file, **imread_kwargs)
+    if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs["c_coords"]:
+        # napari currently interprets 4-channel images as RGB; a series of PRs to fix this is almost ready, but they
+        # will not be merged soon.
+        # Since the new data from the Xenium Analyzer version 2.0.0 gives 4-channel images that are not RGBA, we add
+        # a dummy channel as a temporary workaround.
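+        # image has shape (c, y, x); da.zeros_like(image[0:1]) is a single all-zero channel, so the result has c + 1 channels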
+        image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
    return Image2DModel.parse(
        image, transformations={"global": Identity()}, dims=("c", "y", "x"), **image_models_kwargs
    )
@@ -293,21 +391,27 @@ def xenium_aligned_image(
    assert image_path.exists(), f"File {image_path} does not exist."
    image = imread(image_path, **imread_kwargs)

-    # Depending on the version of pipeline that was used, some images have shape (1, y, x, 3) and others (3, y, x)
+    # Depending on the version of the pipeline that was used, some images have shape (1, y, x, 3) and others
+    # (3, y, x) or (4, y, x).
    # since y and x are always different from 1, let's differentiate between the two cases here, independently of the
    # pipeline version.
    # Note that a more robust approach is to look at the xml metadata in the ome.tif; we should use this in a future PR.
    # In fact, it could be that the len(image.shape) == 4 case actually has dims (1, x, y, c) and not (1, y, x, c). This is
    # not a problem because the transformation is constructed to be consistent, but if that is the case, the data
    # orientation would be transposed compared to the original image, which is not ideal.
    if len(image.shape) == 4:
        assert image.shape[0] == 1
        assert image.shape[-1] == 3
        image = image.squeeze(0)
        dims = ("y", "x", "c")
    else:
        assert len(image.shape) == 3
-        assert image.shape[0] == 3
+        assert image.shape[0] in [3, 4]
+        if image.shape[0] == 4:
+            # as explained before in _get_images(), we need to add a dummy channel until we support 4-channel images
+            # as non-RGBA images in napari
+            image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
        dims = ("c", "y", "x")

    if alignment_file is None:
@@ -348,3 +452,31 @@ def xenium_explorer_selection(path: str | Path, pixel_size: float = 0.2125) -> P
    """
    df = pd.read_csv(path, skiprows=2)
    return Polygon(df.values / pixel_size)
+
+
+def _parse_version_of_xenium_analyzer(
+    specs: dict[str, Any],
+    hide_warning: bool = True,
+) -> packaging.version.Version | None:
+    string = specs[XeniumKeys.ANALYSIS_SW_VERSION]
+    pattern = r"^xenium-(\d+\.\d+\.\d+(\.\d+-\d+)?)"
+
+    result = re.search(pattern, string)
+    # Example:
+    # Input: xenium-2.0.0.6-35-ga7e17149a
+    # Output: 2.0.0.6-35
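+    # (a release-style string such as "xenium-1.7.0.1" would yield "1.7.0")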
+
+    warning_message = f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for experimental versions of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\nThe reader will continue assuming the latest version of the Xenium Analyzer."
+
+    if result is None:
+        if not hide_warning:
+            warnings.warn(warning_message, stacklevel=2)
+        return None
+
+    group = result.groups()[0]
+    try:
+        return packaging.version.parse(group)
+    except packaging.version.InvalidVersion:
+        if not hide_warning:
+            warnings.warn(warning_message, stacklevel=2)
+        return None