Integrate fsspec to enable accessing WFDB files from cloud URIs (#523)

briangow · web-flow · commit 33109806d373 · 2025-04-02T11:27:28.000-04:00
As mentioned in #517, we want to be able to read WFDB files from within cloud environments using WFDB-Python. This PR enables using the `fsspec` library (https://filesystem-spec.readthedocs.io/en/latest/) to read WFDB files from cloud URIs. It replaces the standard Python `open` with `fsspec.open`. It also adds logic to differentiate between loading a file from a cloud URI and loading it from a PhysioNet database. In the initial commit, access has only been added for `rdheader`. We can expand this across all relevant WFDB functions once the approach has been agreed upon. I've tested this with a local `.hea` file, a file read from a PhysioNet database (using `pn_dir`), and a file from a Datastore in the Azure AI / ML Studio.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -38,18 +38,20 @@ jobs:
       - name: Check style
         run: uv run --extra dev black --check --diff .
 
-  test-deb10-i386:
-    name: Python 3.7 on Debian 10 i386
+  test-deb11-i386:
+    name: Python 3.7 on Debian 11 i386
     runs-on: ubuntu-latest
-    container: i386/debian:10
+    container: i386/debian:11
     steps:
       - name: Install dependencies
         run: |
           apt-get update
           apt-get install -y --no-install-recommends \
+                  python3-fsspec \
                   python3-matplotlib \
                   python3-numpy \
                   python3-pandas \
+                  python3-pip \
                   python3-requests \
                   python3-scipy \
                   python3-soundfile \
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
     "soundfile >= 0.10.0",
     "matplotlib >= 3.2.2",
     "requests >= 2.8.1",
+    "fsspec >= 2023.10.0",
+    "aiohttp >= 3.10.11",
 ]
 dynamic = ["version"]
 
diff --git a/wfdb/io/_coreio.py b/wfdb/io/_coreio.py
@@ -1,9 +1,15 @@
 import posixpath
 
+import fsspec
+
 from wfdb.io import _url
 from wfdb.io.download import config
 
 
+# Cloud protocols
+CLOUD_PROTOCOLS = ["az://", "azureml://", "s3://", "gs://"]
+
+
 def _open_file(
     pn_dir,
     file_name,
@@ -26,10 +32,11 @@ def _open_file(
     ----------
     pn_dir : str or None
         The PhysioNet database directory where the file is stored, or None
-        if file_name is a local path.
+        if file_name is a local or cloud path.
     file_name : str
-        The name of the file, either as a local filesystem path (if
-        `pn_dir` is None) or a URL path (if `pn_dir` is a string.)
+        The name of the file, either as a local filesystem path or cloud
+        URL (if `pn_dir` is None) or a PhysioNet URL path
+        (if `pn_dir` is a string.)
     mode : str, optional
         The standard I/O mode for the file ("r" by default).  If `pn_dir`
         is not None, this must be "r", "rt", or "rb".
@@ -47,7 +54,7 @@ def _open_file(
 
     """
     if pn_dir is None:
-        return open(
+        return fsspec.open(
             file_name,
             mode,
             buffering=buffering,
@@ -56,6 +63,12 @@ def _open_file(
             newline=newline,
         )
     else:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+
         url = posixpath.join(config.db_index_url, pn_dir, file_name)
         return _url.openurl(
             url,
diff --git a/wfdb/io/_signal.py b/wfdb/io/_signal.py
@@ -1,11 +1,13 @@
 import math
 import os
+import posixpath
 import sys
 
+import fsspec
 import numpy as np
 
 from wfdb.io import download, _coreio, util
-
+from wfdb.io._coreio import CLOUD_PROTOCOLS
 
 MAX_I32 = 2147483647
 MIN_I32 = -2147483648
@@ -1643,10 +1645,10 @@ def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
         The name of the dat file.
     dir_name : str
         The full directory where the dat file(s) are located, if the dat
-        file(s) are local.
+        file(s) are local or in the cloud.
     pn_dir : str
         The PhysioNet directory where the dat file(s) are located, if
-        the dat file(s) are remote.
+        the dat file(s) are on a PhysioNet server.
     fmt : str
         The format of the dat file.
     start_byte : int
@@ -1686,15 +1688,22 @@ def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
         element_count = n_samp
         byte_count = n_samp * BYTES_PER_SAMPLE[fmt]
 
-    # Local dat file
+    # Local or cloud dat file
     if pn_dir is None:
-        with open(os.path.join(dir_name, file_name), "rb") as fp:
+        with fsspec.open(os.path.join(dir_name, file_name), "rb") as fp:
             fp.seek(start_byte)
-            sig_data = np.fromfile(
+            sig_data = util.fromfile(
                 fp, dtype=np.dtype(DATA_LOAD_TYPES[fmt]), count=element_count
             )
-    # Stream dat file from Physionet
+
+    # Stream dat file from PhysioNet
     else:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+
         dtype_in = np.dtype(DATA_LOAD_TYPES[fmt])
         sig_data = download._stream_dat(
             file_name, pn_dir, byte_count, start_byte, dtype_in
@@ -1840,8 +1849,9 @@ def _rd_compressed_file(
     file_name : str
         The name of the signal file.
     dir_name : str
-        The full directory where the signal file is located, if local.
-        This argument is ignored if `pn_dir` is not None.
+        The full directory where the signal file is located, if this
+        is a local or cloud path. This argument is ignored if `pn_dir`
+        is not None.
     pn_dir : str or None
         The PhysioNet database directory where the signal file is located.
     fmt : str
@@ -2585,10 +2595,10 @@ def _infer_sig_len(
         The byte offset of the dat file.  None is equivalent to zero.
     dir_name : str
         The full directory where the dat file(s) are located, if the dat
-        file(s) are local.
+        file(s) are local or on the cloud.
     pn_dir : str, optional
         The PhysioNet directory where the dat file(s) are located, if
-        the dat file(s) are remote.
+        the dat file(s) are on a PhysioNet server.
 
     Returns
     -------
@@ -2600,13 +2610,29 @@ def _infer_sig_len(
     sig_len * tsamps_per_frame * bytes_per_sample == file_size
 
     """
-    if pn_dir is None:
-        file_size = os.path.getsize(os.path.join(dir_name, file_name))
-    else:
+    from wfdb.io.record import CLOUD_PROTOCOLS
+
+    # If this is a cloud path, use posixpath to construct the path and fsspec to open file
+    if any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
+        with fsspec.open(posixpath.join(dir_name, file_name), mode="rb") as f:
+            file_size = f.seek(0, os.SEEK_END)
+
+    # If the PhysioNet database path is provided, construct the download path using the database version
+    elif pn_dir is not None:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+
         file_size = download._remote_file_size(
             file_name=file_name, pn_dir=pn_dir
         )
 
+    # If it isn't a cloud path or a PhysioNet path, we treat as a local file
+    else:
+        file_size = os.path.getsize(os.path.join(dir_name, file_name))
+
     if byte_offset is None:
         byte_offset = 0
     data_size = file_size - byte_offset
diff --git a/wfdb/io/annotation.py b/wfdb/io/annotation.py
@@ -1,4 +1,5 @@
 import copy
+import fsspec
 import numpy as np
 import os
 import pandas as pd
@@ -9,6 +10,8 @@
 from wfdb.io import download
 from wfdb.io import _header
 from wfdb.io import record
+from wfdb.io import util
+from wfdb.io._coreio import CLOUD_PROTOCOLS
 
 
 class Annotation(object):
@@ -1892,7 +1895,7 @@ def rdann(
     ----------
     record_name : str
         The record name of the WFDB annotation file. ie. for file '100.atr',
-        record_name='100'.
+        record_name='100'. The path to the file can be a cloud URL.
     extension : str
         The annotatator extension of the annotation file. ie. for  file
         '100.atr', extension='atr'.
@@ -1936,11 +1939,17 @@ def rdann(
     >>> ann = wfdb.rdann('sample-data/100', 'atr', sampto=300000)
 
     """
-    if (pn_dir is not None) and ("." not in pn_dir):
-        dir_list = pn_dir.split("/")
-        pn_dir = posixpath.join(
-            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
-        )
+    if pn_dir is not None:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+        if "." not in pn_dir:
+            dir_list = pn_dir.split("/")
+            pn_dir = posixpath.join(
+                dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
+            )
 
     return_label_elements = check_read_inputs(
         sampfrom, sampto, return_label_elements
@@ -2071,7 +2080,7 @@ def load_byte_pairs(record_name, extension, pn_dir):
     ----------
     record_name : str
         The record name of the WFDB annotation file. ie. for file '100.atr',
-        record_name='100'.
+        record_name='100'. The path to the file can be a cloud URL.
     extension : str
         The annotatator extension of the annotation file. ie. for  file
         '100.atr', extension='atr'.
@@ -2086,10 +2095,11 @@ def load_byte_pairs(record_name, extension, pn_dir):
         The input filestream converted to an Nx2 array of unsigned bytes.
 
     """
-    # local file
+    # local or cloud file
     if pn_dir is None:
-        with open(record_name + "." + extension, "rb") as f:
-            filebytes = np.fromfile(f, "<u1").reshape([-1, 2])
+        with fsspec.open(record_name + "." + extension, "rb") as f:
+            filebytes = util.fromfile(f, "<u1").reshape([-1, 2])
+
     # PhysioNet file
     else:
         filebytes = download._stream_annotation(
diff --git a/wfdb/io/record.py b/wfdb/io/record.py
@@ -4,6 +4,7 @@
 import os
 import re
 
+import fsspec
 import numpy as np
 import pandas as pd
 
@@ -13,6 +14,7 @@
 from wfdb.io import download
 from wfdb.io import header
 from wfdb.io import util
+from wfdb.io._coreio import CLOUD_PROTOCOLS
 
 
 # -------------- WFDB Signal Calibration and Classification ---------- #
@@ -1824,27 +1826,39 @@ def rdheader(record_name, pn_dir=None, rd_segments=False):
 
     """
     dir_name, base_record_name = os.path.split(record_name)
-    dir_name = os.path.abspath(dir_name)
+    file_name = f"{base_record_name}.hea"
 
-    # Construct the download path using the database version
-    if (pn_dir is not None) and ("." not in pn_dir):
-        dir_list = pn_dir.split("/")
-        pn_dir = posixpath.join(
-            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
-        )
+    # If this is a cloud path, use posixpath to construct the path and fsspec to open file
+    if any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
+        with fsspec.open(posixpath.join(dir_name, file_name), mode="r") as f:
+            header_content = f.read()
 
-    # Read the local or remote header file.
-    file_name = f"{base_record_name}.hea"
-    if pn_dir is None:
-        with open(
+    # If the PhysioNet database path is provided, construct the download path using the database version
+    elif pn_dir is not None:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+
+        if "." not in pn_dir:
+            dir_list = pn_dir.split("/")
+            pn_dir = posixpath.join(
+                dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
+            )
+
+        header_content = download._stream_header(file_name, pn_dir)
+
+    # If it isn't a cloud path or a PhysioNet path, we treat as a local file
+    else:
+        dir_name = os.path.abspath(dir_name)
+        with fsspec.open(
             os.path.join(dir_name, file_name),
             "r",
             encoding="ascii",
             errors="ignore",
         ) as f:
             header_content = f.read()
-    else:
-        header_content = download._stream_header(file_name, pn_dir)
 
     # Separate comment and non-comment lines
     header_lines, comment_lines = header.parse_header_content(header_content)
@@ -2017,14 +2031,22 @@ def rdrecord(
 
     """
     dir_name, base_record_name = os.path.split(record_name)
-    dir_name = os.path.abspath(dir_name)
+    # Update the dir_name using abspath unless it is a cloud path
+    if not any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
+        dir_name = os.path.abspath(dir_name)
 
     # Read the header fields
-    if (pn_dir is not None) and ("." not in pn_dir):
-        dir_list = pn_dir.split("/")
-        pn_dir = posixpath.join(
-            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
-        )
+    if pn_dir is not None:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+        if "." not in pn_dir:
+            dir_list = pn_dir.split("/")
+            pn_dir = posixpath.join(
+                dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
+            )
 
     record = rdheader(record_name, pn_dir=pn_dir, rd_segments=False)
 
@@ -2308,11 +2330,17 @@ def rdsamp(
                                       channels=[1,3])
 
     """
-    if (pn_dir is not None) and ("." not in pn_dir):
-        dir_list = pn_dir.split("/")
-        pn_dir = posixpath.join(
-            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
-        )
+    if pn_dir is not None:
+        # check to make sure a cloud path isn't being passed under pn_dir
+        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            raise ValueError(
+                "Cloud paths should be passed under record_name, not under pn_dir"
+            )
+        if "." not in pn_dir:
+            dir_list = pn_dir.split("/")
+            pn_dir = posixpath.join(
+                dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
+            )
 
     record = rdrecord(
         record_name=record_name,
diff --git a/wfdb/io/util.py b/wfdb/io/util.py

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,8 @@ dependencies = [`
`16`	`16`	`"soundfile >= 0.10.0",`
`17`	`17`	`"matplotlib >= 3.2.2",`
`18`	`18`	`"requests >= 2.8.1",`
	`19`	`+ "fsspec >= 2023.10.0",`
	`20`	`+ "aiohttp >= 3.10.11",`
`19`	`21`	`]`
`20`	`22`	`dynamic = ["version"]`
`21`	`23`