Skip to content

Commit f0d1f45

Browse files
Merge pull request #111 from scverse/xenium-2.0.0
Support for xenium analyzer 2.0.0
2 parents f8cf0b4 + 901f764 commit f0d1f45

File tree

2 files changed

+172
-28
lines changed

2 files changed

+172
-28
lines changed

src/spatialdata_io/_constants/_constants.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,17 @@ class XeniumKeys(ModeEnum):
9393
CELL_NUCLEUS_AREA = "nucleus_area"
9494

9595
# morphology images
96+
# before version 2.0.0
9697
MORPHOLOGY_MIP_FILE = "morphology_mip.ome.tif"
9798
MORPHOLOGY_FOCUS_FILE = "morphology_focus.ome.tif"
99+
# from version 2.0.0
100+
MORPHOLOGY_FOCUS_DIR = "morphology_focus"
101+
MORPHOLOGY_FOCUS_CHANNEL_IMAGE = "morphology_focus_{:04}.ome.tif"
102+
# from analysis_summary.html > Image QC of https://www.10xgenomics.com/datasets/preview-data-ffpe-human-lung-cancer-with-xenium-multimodal-cell-segmentation-1-standard
103+
MORPHOLOGY_FOCUS_CHANNEL_0 = "DAPI" # nuclear
104+
MORPHOLOGY_FOCUS_CHANNEL_1 = "ATP1A1/CD45/E-Cadherin" # boundary
105+
MORPHOLOGY_FOCUS_CHANNEL_2 = "18S" # interior - RNA
106+
MORPHOLOGY_FOCUS_CHANNEL_3 = "AlphaSMA/Vimentin" # interior - protein
98107

99108
# post-xenium images
100109
ALIGNED_IF_IMAGE_SUFFIX = "if_image.ome.tif"
@@ -104,6 +113,9 @@ class XeniumKeys(ModeEnum):
104113
ALIGNMENT_FILE_SUFFIX_TO_REMOVE = ".ome.tif"
105114
ALIGNMENT_FILE_SUFFIX_TO_ADD = "alignment.csv"
106115

116+
# specs keys
117+
ANALYSIS_SW_VERSION = "analysis_sw_version"
118+
107119

108120
@unique
109121
class VisiumKeys(ModeEnum):

src/spatialdata_io/readers/xenium.py

Lines changed: 160 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
from __future__ import annotations
22

33
import json
4+
import logging
5+
import os
6+
import re
7+
import warnings
48
from collections.abc import Mapping
59
from pathlib import Path
610
from types import MappingProxyType
711
from typing import Any, Optional
812

13+
import dask.array as da
914
import numpy as np
15+
import packaging.version
1016
import pandas as pd
1117
import pyarrow.parquet as pq
18+
import tifffile
1219
from anndata import AnnData
1320
from dask.dataframe import read_parquet
1421
from dask_image.imread import imread
@@ -37,6 +44,7 @@ def xenium(
3744
path: str | Path,
3845
n_jobs: int = 1,
3946
cells_as_circles: bool = True,
47+
cell_boundaries: bool = True,
4048
nucleus_boundaries: bool = True,
4149
transcripts: bool = True,
4250
morphology_mip: bool = True,
@@ -70,8 +78,10 @@ def xenium(
7078
Number of jobs to use for parallel processing.
7179
cells_as_circles
7280
Whether to read cells also as circles. Useful for performant visualization.
81+
cell_boundaries
82+
Whether to read cell boundaries (polygons).
7383
nucleus_boundaries
74-
Whether to read nucleus boundaries.
84+
Whether to read nucleus boundaries (polygons).
7585
transcripts
7686
Whether to read transcripts.
7787
morphology_mip
@@ -101,8 +111,11 @@ def xenium(
101111
path = Path(path)
102112
with open(path / XeniumKeys.XENIUM_SPECS) as f:
103113
specs = json.load(f)
114+
# to trigger the warning if the version cannot be parsed
115+
version = _parse_version_of_xenium_analyzer(specs, hide_warning=False)
104116

105117
specs["region"] = "cell_circles" if cells_as_circles else "cell_boundaries"
118+
106119
return_values = _get_tables_and_circles(path, cells_as_circles, specs)
107120
if cells_as_circles:
108121
table, circles = return_values
@@ -119,33 +132,91 @@ def xenium(
119132
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
120133
)
121134

122-
polygons["cell_boundaries"] = _get_polygons(
123-
path,
124-
XeniumKeys.CELL_BOUNDARIES_FILE,
125-
specs,
126-
n_jobs,
127-
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
128-
)
135+
if cell_boundaries:
136+
polygons["cell_boundaries"] = _get_polygons(
137+
path,
138+
XeniumKeys.CELL_BOUNDARIES_FILE,
139+
specs,
140+
n_jobs,
141+
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
142+
)
129143

130144
points = {}
131145
if transcripts:
132146
points["transcripts"] = _get_points(path, specs)
133147

134148
images = {}
135-
if morphology_mip:
136-
images["morphology_mip"] = _get_images(
137-
path,
138-
XeniumKeys.MORPHOLOGY_MIP_FILE,
139-
imread_kwargs,
140-
image_models_kwargs,
141-
)
142-
if morphology_focus:
143-
images["morphology_focus"] = _get_images(
144-
path,
145-
XeniumKeys.MORPHOLOGY_FOCUS_FILE,
146-
imread_kwargs,
147-
image_models_kwargs,
148-
)
149+
if version < packaging.version.parse("2.0.0"):
150+
if morphology_mip:
151+
images["morphology_mip"] = _get_images(
152+
path,
153+
XeniumKeys.MORPHOLOGY_MIP_FILE,
154+
imread_kwargs,
155+
image_models_kwargs,
156+
)
157+
if morphology_focus:
158+
images["morphology_focus"] = _get_images(
159+
path,
160+
XeniumKeys.MORPHOLOGY_FOCUS_FILE,
161+
imread_kwargs,
162+
image_models_kwargs,
163+
)
164+
else:
165+
if morphology_focus:
166+
morphology_focus_dir = path / XeniumKeys.MORPHOLOGY_FOCUS_DIR
167+
files = {f for f in os.listdir(morphology_focus_dir) if f.endswith(".ome.tif")}
168+
if len(files) not in [1, 4]:
169+
raise ValueError(
170+
"Expected 1 (no segmentation kit) or 4 (segmentation kit) files in the morphology focus directory, "
171+
f"found {len(files)}: {files}"
172+
)
173+
if files != {XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.value.format(i) for i in range(len(files))}:
174+
raise ValueError(
175+
"Expected files in the morphology focus directory to be named as "
176+
f"{XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.value.format(0)} to "
177+
f"{XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.value.format(len(files) - 1)}, found {files}"
178+
)
179+
# the 'dummy' channel is a temporary workaround, see _get_images() for more details
180+
if len(files) == 1:
181+
channel_names = {
182+
0: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_0.value,
183+
}
184+
else:
185+
channel_names = {
186+
0: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_0.value,
187+
1: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_1.value,
188+
2: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_2.value,
189+
3: XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_3.value,
190+
4: "dummy",
191+
}
192+
# this reads the scale 0 for all the 1 or 4 channels (the other files are parsed automatically)
193+
# dask.image.imread will call tifffile.imread which will give a warning saying that reading multi-file
194+
# pyramids is not supported; since we are reading the full scale image and reconstructing the pyramid, we
195+
# can ignore this
196+
197+
class IgnoreSpecificMessage(logging.Filter):
198+
def filter(self, record: logging.LogRecord) -> bool:
199+
# Ignore specific log message
200+
if "OME series cannot read multi-file pyramids" in record.getMessage():
201+
return False
202+
return True
203+
204+
logger = tifffile.logger()
205+
logger.addFilter(IgnoreSpecificMessage())
206+
image_models_kwargs = dict(image_models_kwargs)
207+
assert (
208+
"c_coords" not in image_models_kwargs
209+
), "The channel names for the morphology focus images are handled internally"
210+
image_models_kwargs["c_coords"] = list(channel_names.values())
211+
images["morphology_focus"] = _get_images(
212+
morphology_focus_dir,
213+
XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.format(0), # type: ignore[str-format]
214+
imread_kwargs,
215+
image_models_kwargs,
216+
)
217+
del image_models_kwargs["c_coords"]
218+
logger.removeFilter(IgnoreSpecificMessage())
219+
149220
if cells_as_circles:
150221
sdata = SpatialData(images=images, shapes=polygons | {specs["region"]: circles}, points=points, table=table)
151222
else:
@@ -159,6 +230,12 @@ def xenium(
159230
return sdata
160231

161232

233+
def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
234+
if isinstance(cell_id_column.iloc[0], bytes):
235+
return cell_id_column.apply(lambda x: x.decode("utf-8"))
236+
return cell_id_column
237+
238+
162239
def _get_polygons(
163240
path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: Optional[ArrayLike] = None
164241
) -> GeoDataFrame:
@@ -168,13 +245,29 @@ def _poly(arr: ArrayLike) -> Polygon:
168245
# seems to be faster than pd.read_parquet
169246
df = pq.read_table(path / file).to_pandas()
170247

248+
group_by = df.groupby(XeniumKeys.CELL_ID)
249+
index = pd.Series(group_by.indices.keys())
250+
index = _decode_cell_id_column(index)
171251
out = Parallel(n_jobs=n_jobs)(
172252
delayed(_poly)(i.to_numpy())
173-
for _, i in df.groupby(XeniumKeys.CELL_ID)[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]]
253+
for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]]
174254
)
175255
geo_df = GeoDataFrame({"geometry": out})
176-
if idx is not None:
256+
version = _parse_version_of_xenium_analyzer(specs)
257+
if version is not None and version < packaging.version.parse("2.0.0"):
258+
assert idx is not None
259+
assert len(idx) == len(geo_df)
260+
assert np.unique(geo_df.index).size == len(geo_df)
261+
assert index.equals(idx)
177262
geo_df.index = idx
263+
else:
264+
geo_df.index = index
265+
if not np.unique(geo_df.index).size == len(geo_df):
266+
warnings.warn(
267+
"Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
268+
"time being please consider merging non-unique polygons into single multi-polygons.",
269+
stacklevel=2,
270+
)
178271
scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
179272
return ShapesModel.parse(geo_df, transformations={"global": scale})
180273

@@ -208,8 +301,7 @@ def _get_tables_and_circles(
208301
adata.obs = metadata
209302
adata.obs["region"] = specs["region"]
210303
adata.obs["region"] = adata.obs["region"].astype("category")
211-
if isinstance(adata.obs[XeniumKeys.CELL_ID].iloc[0], bytes):
212-
adata.obs[XeniumKeys.CELL_ID] = adata.obs[XeniumKeys.CELL_ID].apply(lambda x: x.decode("utf-8"))
304+
adata.obs[XeniumKeys.CELL_ID] = _decode_cell_id_column(adata.obs[XeniumKeys.CELL_ID])
213305
table = TableModel.parse(adata, region=specs["region"], region_key="region", instance_key=str(XeniumKeys.CELL_ID))
214306
if cells_as_circles:
215307
transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
@@ -232,6 +324,12 @@ def _get_images(
232324
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
233325
) -> SpatialImage | MultiscaleSpatialImage:
234326
image = imread(path / file, **imread_kwargs)
327+
if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs["c_coords"]:
328+
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will not
329+
# be merged soon.
330+
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA, let's
331+
# add a dummy channel as a temporary workaround.
332+
image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
235333
return Image2DModel.parse(
236334
image, transformations={"global": Identity()}, dims=("c", "y", "x"), **image_models_kwargs
237335
)
@@ -293,21 +391,27 @@ def xenium_aligned_image(
293391
assert image_path.exists(), f"File {image_path} does not exist."
294392
image = imread(image_path, **imread_kwargs)
295393

296-
# Depending on the version of pipeline that was used, some images have shape (1, y, x, 3) and others (3, y, x)
394+
# Depending on the version of pipeline that was used, some images have shape (1, y, x, 3) and others (3, y, x) or
395+
# (4, y, x).
297396
# since y and x are always different from 1, let's differentiate from the two cases here, independently of the
298397
# pipeline version.
299398
# Note that a more robust approach is to look at the xml metadata in the ome.tif; we should use this in a future PR.
300399
# In fact, it could be that the len(image.shape) == 4 has actually dimes (1, x, y, c) and not (1, y, x, c). This is
301400
# not a problem because the transformation is constructed to be consistent, but if is the case, the data orientation
302401
# would be transposed compared to the original image, not ideal.
402+
# print(image.shape)
303403
if len(image.shape) == 4:
304404
assert image.shape[0] == 1
305405
assert image.shape[-1] == 3
306406
image = image.squeeze(0)
307407
dims = ("y", "x", "c")
308408
else:
309409
assert len(image.shape) == 3
310-
assert image.shape[0] == 3
410+
assert image.shape[0] in [3, 4]
411+
if image.shape[0] == 4:
412+
# as explained before in _get_images(), we need to add a dummy channel until we support 4-channel images as
413+
# non-RGBA images in napari
414+
image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
311415
dims = ("c", "y", "x")
312416

313417
if alignment_file is None:
@@ -348,3 +452,31 @@ def xenium_explorer_selection(path: str | Path, pixel_size: float = 0.2125) -> P
348452
"""
349453
df = pd.read_csv(path, skiprows=2)
350454
return Polygon(df.values / pixel_size)
455+
456+
457+
def _parse_version_of_xenium_analyzer(
458+
specs: dict[str, Any],
459+
hide_warning: bool = True,
460+
) -> packaging.version.Version | None:
461+
string = specs[XeniumKeys.ANALYSIS_SW_VERSION]
462+
pattern = r"^xenium-(\d+\.\d+\.\d+(\.\d+-\d+)?)"
463+
464+
result = re.search(pattern, string)
465+
# Example
466+
# Input: xenium-2.0.0.6-35-ga7e17149a
467+
# Output: 2.0.0.6-35
468+
469+
warning_message = f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\nThe reader will continue assuming the latest version of the Xenium Analyzer."
470+
471+
if result is None:
472+
if not hide_warning:
473+
warnings.warn(warning_message, stacklevel=2)
474+
return None
475+
476+
group = result.groups()[0]
477+
try:
478+
return packaging.version.parse(group)
479+
except packaging.version.InvalidVersion:
480+
if not hide_warning:
481+
warnings.warn(warning_message, stacklevel=2)
482+
return None

0 commit comments

Comments
 (0)