Add configuration for DESPIAD project (#572)

* Add initial anonymisation config for ct * Add initial anonymisation config for pet * Add config for despiad * anonymnise all resources before notifying the export api this is because resources sharing the same StudyInstanceUID in Orthanc Raw will be combined into a single resource in Orthanc Anon. Previously, we would try to export each resource after anonymisation, but this meant other resources sharing the same StudyInstanceUID were not exported. * remove despaid.yaml from project config * generate label based on patient id and study count in xnat project * Use pseudo-anonymised StudyInstanceUID for xnat experiment label * Fix XNAT destination * remove changes related to grouping resources before notifying export api it's been addressed in another PR * remove duplicated tags * Add series_number_filters and allowed_manufacturers parameters to pixl project config * clarify docstring of _import_study_from_raw * Add min_instances_per_series parameter to project config In orthanc anon plugin, skip series that have few than min_instances_per_series instances * Keep study date and patient dob for despiad * Changes after reviewing the PET data for DESPIAD (#592) * Add Radiopharmaceutical Start DateTime to pet.yaml * remove blank lines from ct.yaml * remove tab from config file * filter series number by manufacturer also default to allowing no manufacturers * Add allowed_manufacturers for all test configs * Count number of instances skipped due to series having too few instances * move get_series_to_skip to dcmd * Add philips and carestream as allowed manufacturers for test project * Update description of project config in readme * Check _should_exclude_manufacurer before _should_exclude_series in case the manufacturer doesn't exist * filter out instance if manufacturer tag is missing * allow all manufacturers for existing projects * Add tests for PixlConfig.is_manufacturer_allowed and PixlConfig.is_series_number_excluded * Add more tests for _should_exclude_series * Add tests for test_should_exclude_manufacturer * Add tests for get_series_to_skip * Don't allow all manufacturers in the template config * Set min_instances to 2 for despiad * Only allow manufacturer GE MEDICAL SYSTEMS for DESPIAD * Keep Number of Time Slices attribute for PET * set 'pydicom.config.convert_wrong_length_to_UN = True' in dcmd * Add series filters to despiad config * Add series number and description filers for despiad * Use ints for series numbers to exclude * Update default config to exclude series with mip in their description * Add ^company as an allowed manufacturer when testing anonymisation * Use integers for series_number in tests * set min_instances_per_series to 2 by default * Set min_instances_per_series to 1 for testing --------- Co-authored-by: davecash75 <[email protected]>
SAFEHR-data · Feb 6, 2025 · 28db049 · 28db049
1 parent 4af6ed0
commit 28db049
Show file tree

Hide file tree

Showing 23 changed files with 799 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -147,6 +147,13 @@ The configuration file defines:
 
 - Project name: the `<project-slug>` name of the Project
 - The DICOM dataset modalities to retain (e.g. `["DX", "CR"]` for X-Ray studies)
+- The minimum number of instances required by a series (defaults to 2). Can be set higher than 1 to filter out
+  series with a single screenshot containing patient identifiable data
+- A list of series description filters (e.g. `['loc', 'pos']`). Series with descriptions matching any of these
+  filters will be skipped
+- A list of allowed manufacturers. By default, no manufacturers are allowed. For each manufacturer:
+  - A regex to identify the allowed manufacturer (e.g. `^philips`)
+  - A list of series numbers to exclude for the given manufacturer (e.g. `[3, 4]`)
 - The [anonymisation operations](/pixl_dcmd/README.md#tag-scheme-anonymisation) to be applied to the DICOM tags,
   by providing a file path to one or multiple YAML files.
   We currently allow two types of files:

diff --git a/cli/src/pixl_cli/_config.py b/cli/src/pixl_cli/_config.py
@@ -35,7 +35,7 @@
         "password": config("PIXL_DB_PASSWORD"),
         "database": config("PIXL_DB_NAME"),
     },
-}  # type: dict
+}
 
 
 class APIConfig:

diff --git a/orthanc/orthanc-anon/plugin/pixl.py b/orthanc/orthanc-anon/plugin/pixl.py
@@ -45,6 +45,7 @@
 from pixl_dcmd.dicom_helpers import get_study_info
 from pixl_dcmd.main import (
     anonymise_dicom_and_update_db,
+    get_series_to_skip,
     parse_validation_results,
     write_dataset_to_bytes,
 )
@@ -331,6 +332,7 @@ def _anonymise_study_instances(
     Return a list of the bytes of anonymised instances, and the anonymised StudyInstanceUID.
     """
     config = load_project_config(project_name)
+    series_to_skip = get_series_to_skip(zipped_study, config.min_instances_per_series)
     anonymised_instances_bytes = []
     skipped_instance_counts = defaultdict(int)
     dicom_validation_errors = {}
@@ -339,6 +341,17 @@ def _anonymise_study_instances(
         with zipped_study.open(file_info) as file:
             logger.debug("Reading file {}", file)
             dataset = dcmread(file)
+
+            if dataset.SeriesInstanceUID in series_to_skip:
+                logger.debug(
+                    "Skipping series {} for study {} due to too few instances",
+                    dataset.SeriesInstanceUID,
+                    study_info,
+                )
+                key = "DICOM instance discarded as series has too few instances"
+                skipped_instance_counts[key] += 1
+                continue
+
             try:
                 anonymised_instance, instance_validation_errors = _anonymise_dicom_instance(
                     dataset, config

diff --git a/pixl_core/src/core/project_config/pixl_config_model.py b/pixl_core/src/core/project_config/pixl_config_model.py
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import re
 from enum import Enum
 from pathlib import Path
 from typing import Any, Optional
@@ -61,6 +62,16 @@ class _Project(BaseModel):
     modalities: list[str]
 
 
+class Manufacturer(BaseModel):
+    """
+    An allowed manufacturer for a project.
+    Also defines which series numbers to exclude for this manufacturer.
+    """
+
+    regex: str = "no manufacturers allowed ^"
+    exclude_series_numbers: list[int] = []
+
+
 class TagOperationFiles(BaseModel):
     """Tag operations files for a project. At least a base file is required."""
 
@@ -133,20 +144,66 @@ class PixlConfig(BaseModel):
     """Project-specific configuration for Pixl."""
 
     project: _Project
-    series_filters: Optional[list[str]] = None
+    min_instances_per_series: Optional[int] = 2
+    series_filters: Optional[list[str]] = []  # pydantic makes a deep copy of the empty default list
+    allowed_manufacturers: list[Manufacturer] = [Manufacturer()]
     tag_operation_files: TagOperationFiles
     destination: _Destination
 
-    def is_series_excluded(self, series_description: str) -> bool:
+    def is_series_description_excluded(self, series_description: str | None) -> bool:
         """
-        Return whether this config excludes the series with the given description
+        Return whether this config excludes the series with the given description.
+
+        Do a simple case-insensitive substring check - this data is ultimately typed by a human, and
+        different image sources may have different conventions for case conversion.
+
         :param series_description: the series description to test
         :returns: True if it should be excluded, False if not
         """
-        if self.series_filters is None or series_description is None:
+        if not self.series_filters or series_description is None:
             return False
-        # Do a simple case-insensitive substring check - this data is ultimately typed by a human,
-        # and different image sources may have different conventions for case conversion.
+
         return any(
             series_description.upper().find(filt.upper()) != -1 for filt in self.series_filters
         )
+
+    def is_series_number_excluded(self, manufacturer: str, series_number: str | None) -> bool:
+        """
+        Return whether this config excludes the series with the given number for the given
+        manufacturer.
+
+        :param manufacturer: the manufacturer to test
+        :param series_number: the series number to test
+        :returns: True if it should be excluded, False if not
+        """
+        if not self.is_manufacturer_allowed(manufacturer) or series_number is None:
+            return True
+
+        exclude_series_numbers = self._get_manufacturer(manufacturer).exclude_series_numbers
+        return series_number in exclude_series_numbers
+
+    def is_manufacturer_allowed(self, manufacturer: str) -> bool:
+        """
+        Check whether the manufacturer is in the allow-list.
+
+        :param manufacturer: name of the manufacturer
+        :returns: True is the manufacturer is allowed, False if not
+        """
+        for manufacturer_config in self.allowed_manufacturers:
+            if re.search(rf"{manufacturer_config.regex}", manufacturer, flags=re.IGNORECASE):
+                return True
+        return False
+
+    def _get_manufacturer(self, manufacturer: str) -> Manufacturer:
+        """
+        Get the manufacturer configuration for the given manufacturer.
+
+        :param manufacturer: name of the manufacturer
+        :returns: Manufacturer configuration
+        :raises: ValueError: if the manufacturer is not allowed
+        """
+        for manufacturer_config in self.allowed_manufacturers:
+            if re.search(rf"{manufacturer_config.regex}", manufacturer, flags=re.IGNORECASE):
+                return manufacturer_config
+        msg = f"Manufacturer {manufacturer} is not allowed by project {self.project.name}"
+        raise ValueError(msg)
diff --git a/pixl_core/tests/project_config/test_project_config.py b/pixl_core/tests/project_config/test_project_config.py
@@ -181,4 +181,43 @@ def test_series_filtering(base_yaml_data, series_filters, test_series_desc, expe
     if series_filters is not None:
         base_yaml_data["series_filters"] = series_filters
     cfg = PixlConfig.model_validate(base_yaml_data)
-    assert cfg.is_series_excluded(test_series_desc) == expect_exclude
+    assert cfg.is_series_description_excluded(test_series_desc) == expect_exclude
+
+
+@pytest.mark.parametrize(
+    ("regex", "manufacturer", "allowed"),
+    [
+        ("^allowed", "allowed", True),
+        ("allowed", "not-allowed", False),
+        (None, "allowed", False),
+    ],
+)
+def test_manufacturer_regex_filtering(base_yaml_data, regex, manufacturer, allowed):
+    """Check the allowed manufacturers regex works."""
+    if regex is not None:
+        base_yaml_data["allowed_manufacturers"] = [{"regex": "^allowed"}]
+    cfg = PixlConfig.model_validate(base_yaml_data)
+    assert cfg.is_manufacturer_allowed(manufacturer) == allowed
+
+
+@pytest.mark.parametrize(
+    ("manufacturer", "series_number", "expect_exclude"),
+    [
+        ("allowed", 2, True),
+        ("allowed", 4, False),
+        ("allowed", None, True),
+        ("not-allowed", 4, True),
+    ],
+)
+def test_manufacturer_series_number_filterings(
+    base_yaml_data, manufacturer, series_number, expect_exclude
+):
+    """Check the series number are correctly excluded."""
+    base_yaml_data["allowed_manufacturers"] = [
+        {"regex": "^allowed", "exclude_series_numbers": [1, 2, 3]}
+    ]
+    cfg = PixlConfig.model_validate(base_yaml_data)
+    assert (
+        cfg.is_series_number_excluded(manufacturer=manufacturer, series_number=series_number)
+        == expect_exclude
+    )
diff --git a/pixl_dcmd/src/pixl_dcmd/main.py b/pixl_dcmd/src/pixl_dcmd/main.py
@@ -16,6 +16,7 @@
 import typing
 from functools import lru_cache
 from io import BytesIO
+from zipfile import ZipFile
 
 import requests
 from core.exceptions import PixlSkipInstanceError
@@ -26,7 +27,8 @@
     anonymize_dataset,
 )
 from loguru import logger
-from pydicom import DataElement, Dataset, dcmwrite
+from pydicom import DataElement, Dataset, dcmread, dcmwrite
+import pydicom
 
 from core.project_config.pixl_config_model import PixlConfig
 from pixl_dcmd._database import (
@@ -43,6 +45,10 @@
     from pixl_dcmd.dicom_helpers import StudyInfo
 
 
+# See: https://github.com/pydicom/pydicom/issues/2170
+pydicom.config.convert_wrong_length_to_UN = True
+
+
 def write_dataset_to_bytes(dataset: Dataset) -> bytes:
     """
     Write pydicom DICOM dataset to byte array
@@ -56,14 +62,72 @@ def write_dataset_to_bytes(dataset: Dataset) -> bytes:
         return buffer.read()
 
 
+def get_series_to_skip(zipped_study: ZipFile, min_instances: int) -> set[str]:
+    """
+    Determine which series to skip based on the number of instances in the series.
+
+    If a series has fewer instances than `min_instances`, add it to a set of series to skip.
+
+    Args:
+        zipped_study: ZipFile containing the study
+        min_instances: Minimum number of instances required to include a series
+
+    """
+    if min_instances <= 1:
+        return set()
+
+    series_instances = {}
+    for file_info in zipped_study.infolist():
+        with zipped_study.open(file_info) as file:
+            logger.debug("Reading file {}", file)
+            dataset = dcmread(file)
+            if dataset.SeriesInstanceUID not in series_instances:
+                series_instances[dataset.SeriesInstanceUID] = 1
+                continue
+            series_instances[dataset.SeriesInstanceUID] += 1
+
+    return {
+        series for series, count in series_instances.items() if count < min_instances
+    }
+
+
 def _should_exclude_series(dataset: Dataset, cfg: PixlConfig) -> bool:
+    """
+    Check whether the dataset series should be exlucded based on its description
+    and number.
+    """
     series_description = dataset.get("SeriesDescription")
-    if cfg.is_series_excluded(series_description):
+    if cfg.is_series_description_excluded(series_description):
         logger.debug("FILTERING OUT series description: {}", series_description)
         return True
+
+    manufacturer = dataset.get("Manufacturer")
+    series_number = dataset.get("SeriesNumber")
+    if cfg.is_series_number_excluded(
+        manufacturer=manufacturer, series_number=series_number
+    ):
+        logger.debug(
+            "FILTERING OUT series number: {} for manufacturer: {}",
+            series_number,
+            manufacturer,
+        )
+        return True
+
     return False
 
 
+def _should_exclude_manufacturer(dataset: Dataset, cfg: PixlConfig) -> bool:
+    manufacturer = dataset.get("Manufacturer")
+    if manufacturer is None:
+        logger.debug("FILTERING out as manufacturer tag is missing")
+        return True
+
+    should_exclude = not cfg.is_manufacturer_allowed(manufacturer=manufacturer)
+    if should_exclude:
+        logger.debug("FILTERING out manufacturer: {}", manufacturer)
+    return should_exclude
+
+
 def anonymise_dicom_and_update_db(
     dataset: Dataset,
     *,
@@ -125,9 +189,12 @@ def anonymise_dicom(
     )
 
     # Do before anonymisation in case someone decides to delete the
-    # Series Description tag as part of anonymisation.
+    # Series Description or Manufacturer tags as part of anonymisation.
+    if _should_exclude_manufacturer(dataset, config):
+        msg = "DICOM instance discarded due to its manufacturer"
+        raise PixlSkipInstanceError(msg)
     if _should_exclude_series(dataset, config):
-        msg = "DICOM instance discarded due to its series description"
+        msg = "DICOM instance discarded due to its series description or number"
         raise PixlSkipInstanceError(msg)
     if dataset.Modality not in config.project.modalities:
         msg = f"Dropping DICOM Modality: {dataset.Modality}"