Skip to content

Commit

Permalink
Add configuration for DESPIAD project (#572)
Browse files Browse the repository at this point in the history
* Add initial anonymisation config for ct

* Add initial anonymisation config for pet

* Add config for despiad

* anonymnise all resources before notifying the export api

this is because resources sharing the same StudyInstanceUID in Orthanc Raw will be combined into a single resource in Orthanc Anon.
Previously, we would try to export each resource after anonymisation, but this meant other resources sharing the same StudyInstanceUID were not exported.

* remove despaid.yaml from project config

* generate label based on patient id and study count in xnat project

* Use pseudo-anonymised StudyInstanceUID for xnat experiment label

* Fix XNAT destination

* remove changes related to grouping resources before notifying export api

it's been addressed in another PR

* remove duplicated tags

* Add series_number_filters and allowed_manufacturers parameters to pixl project config

* clarify docstring of _import_study_from_raw

* Add min_instances_per_series parameter to project config

In orthanc anon plugin, skip series that have few than min_instances_per_series instances

* Keep study date and patient dob for despiad

* Changes after reviewing the PET data for DESPIAD (#592)

* Add Radiopharmaceutical Start DateTime to pet.yaml

* remove blank lines from ct.yaml

* remove tab from config file

* filter series number by manufacturer

also default to allowing no manufacturers

* Add allowed_manufacturers for all test configs

* Count number of instances skipped due to series having too few instances

* move get_series_to_skip to dcmd

* Add philips and carestream as allowed manufacturers for test project

* Update description of project config in readme

* Check _should_exclude_manufacurer before _should_exclude_series

in case the manufacturer doesn't exist

* filter out instance if manufacturer tag is missing

* allow all manufacturers for existing projects

* Add tests for PixlConfig.is_manufacturer_allowed and PixlConfig.is_series_number_excluded

* Add more tests for _should_exclude_series

* Add tests for test_should_exclude_manufacturer

* Add tests for get_series_to_skip

* Don't allow all manufacturers in the template config

* Set min_instances to 2 for despiad

* Only allow manufacturer GE MEDICAL SYSTEMS for DESPIAD

* Keep Number of Time Slices attribute for PET

* set 'pydicom.config.convert_wrong_length_to_UN = True' in dcmd

* Add series filters to despiad config

* Add series number and description filers for despiad

* Use ints for series numbers to exclude

* Update default config to exclude series with mip in their description

* Add ^company as an allowed manufacturer when testing anonymisation

* Use integers for series_number in tests

* set min_instances_per_series to 2 by default

* Set min_instances_per_series to 1 for testing

---------

Co-authored-by: davecash75 <[email protected]>
  • Loading branch information
p-j-smith and davecash75 authored Feb 6, 2025
1 parent 4af6ed0 commit 28db049
Show file tree
Hide file tree
Showing 23 changed files with 799 additions and 45 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,13 @@ The configuration file defines:

- Project name: the `<project-slug>` name of the Project
- The DICOM dataset modalities to retain (e.g. `["DX", "CR"]` for X-Ray studies)
- The minimum number of instances required by a series (defaults to 2). Can be set higher than 1 to filter out
series with a single screenshot containing patient identifiable data
- A list of series description filters (e.g. `['loc', 'pos']`). Series with descriptions matching any of these
filters will be skipped
- A list of allowed manufacturers. By default, no manufacturers are allowed. For each manufacturer:
- A regex to identify the allowed manufacturer (e.g. `^philips`)
- A list of series numbers to exclude for the given manufacturer (e.g. `[3, 4]`)
- The [anonymisation operations](/pixl_dcmd/README.md#tag-scheme-anonymisation) to be applied to the DICOM tags,
by providing a file path to one or multiple YAML files.
We currently allow two types of files:
Expand Down
2 changes: 1 addition & 1 deletion cli/src/pixl_cli/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"password": config("PIXL_DB_PASSWORD"),
"database": config("PIXL_DB_NAME"),
},
} # type: dict
}


class APIConfig:
Expand Down
13 changes: 13 additions & 0 deletions orthanc/orthanc-anon/plugin/pixl.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from pixl_dcmd.dicom_helpers import get_study_info
from pixl_dcmd.main import (
anonymise_dicom_and_update_db,
get_series_to_skip,
parse_validation_results,
write_dataset_to_bytes,
)
Expand Down Expand Up @@ -331,6 +332,7 @@ def _anonymise_study_instances(
Return a list of the bytes of anonymised instances, and the anonymised StudyInstanceUID.
"""
config = load_project_config(project_name)
series_to_skip = get_series_to_skip(zipped_study, config.min_instances_per_series)
anonymised_instances_bytes = []
skipped_instance_counts = defaultdict(int)
dicom_validation_errors = {}
Expand All @@ -339,6 +341,17 @@ def _anonymise_study_instances(
with zipped_study.open(file_info) as file:
logger.debug("Reading file {}", file)
dataset = dcmread(file)

if dataset.SeriesInstanceUID in series_to_skip:
logger.debug(
"Skipping series {} for study {} due to too few instances",
dataset.SeriesInstanceUID,
study_info,
)
key = "DICOM instance discarded as series has too few instances"
skipped_instance_counts[key] += 1
continue

try:
anonymised_instance, instance_validation_errors = _anonymise_dicom_instance(
dataset, config
Expand Down
69 changes: 63 additions & 6 deletions pixl_core/src/core/project_config/pixl_config_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from __future__ import annotations

import re
from enum import Enum
from pathlib import Path
from typing import Any, Optional
Expand Down Expand Up @@ -61,6 +62,16 @@ class _Project(BaseModel):
modalities: list[str]


class Manufacturer(BaseModel):
"""
An allowed manufacturer for a project.
Also defines which series numbers to exclude for this manufacturer.
"""

regex: str = "no manufacturers allowed ^"
exclude_series_numbers: list[int] = []


class TagOperationFiles(BaseModel):
"""Tag operations files for a project. At least a base file is required."""

Expand Down Expand Up @@ -133,20 +144,66 @@ class PixlConfig(BaseModel):
"""Project-specific configuration for Pixl."""

project: _Project
series_filters: Optional[list[str]] = None
min_instances_per_series: Optional[int] = 2
series_filters: Optional[list[str]] = [] # pydantic makes a deep copy of the empty default list
allowed_manufacturers: list[Manufacturer] = [Manufacturer()]
tag_operation_files: TagOperationFiles
destination: _Destination

def is_series_excluded(self, series_description: str) -> bool:
def is_series_description_excluded(self, series_description: str | None) -> bool:
"""
Return whether this config excludes the series with the given description
Return whether this config excludes the series with the given description.
Do a simple case-insensitive substring check - this data is ultimately typed by a human, and
different image sources may have different conventions for case conversion.
:param series_description: the series description to test
:returns: True if it should be excluded, False if not
"""
if self.series_filters is None or series_description is None:
if not self.series_filters or series_description is None:
return False
# Do a simple case-insensitive substring check - this data is ultimately typed by a human,
# and different image sources may have different conventions for case conversion.

return any(
series_description.upper().find(filt.upper()) != -1 for filt in self.series_filters
)

def is_series_number_excluded(self, manufacturer: str, series_number: str | None) -> bool:
"""
Return whether this config excludes the series with the given number for the given
manufacturer.
:param manufacturer: the manufacturer to test
:param series_number: the series number to test
:returns: True if it should be excluded, False if not
"""
if not self.is_manufacturer_allowed(manufacturer) or series_number is None:
return True

exclude_series_numbers = self._get_manufacturer(manufacturer).exclude_series_numbers
return series_number in exclude_series_numbers

def is_manufacturer_allowed(self, manufacturer: str) -> bool:
"""
Check whether the manufacturer is in the allow-list.
:param manufacturer: name of the manufacturer
:returns: True is the manufacturer is allowed, False if not
"""
for manufacturer_config in self.allowed_manufacturers:
if re.search(rf"{manufacturer_config.regex}", manufacturer, flags=re.IGNORECASE):
return True
return False

def _get_manufacturer(self, manufacturer: str) -> Manufacturer:
"""
Get the manufacturer configuration for the given manufacturer.
:param manufacturer: name of the manufacturer
:returns: Manufacturer configuration
:raises: ValueError: if the manufacturer is not allowed
"""
for manufacturer_config in self.allowed_manufacturers:
if re.search(rf"{manufacturer_config.regex}", manufacturer, flags=re.IGNORECASE):
return manufacturer_config
msg = f"Manufacturer {manufacturer} is not allowed by project {self.project.name}"
raise ValueError(msg)
41 changes: 40 additions & 1 deletion pixl_core/tests/project_config/test_project_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,4 +181,43 @@ def test_series_filtering(base_yaml_data, series_filters, test_series_desc, expe
if series_filters is not None:
base_yaml_data["series_filters"] = series_filters
cfg = PixlConfig.model_validate(base_yaml_data)
assert cfg.is_series_excluded(test_series_desc) == expect_exclude
assert cfg.is_series_description_excluded(test_series_desc) == expect_exclude


@pytest.mark.parametrize(
("regex", "manufacturer", "allowed"),
[
("^allowed", "allowed", True),
("allowed", "not-allowed", False),
(None, "allowed", False),
],
)
def test_manufacturer_regex_filtering(base_yaml_data, regex, manufacturer, allowed):
"""Check the allowed manufacturers regex works."""
if regex is not None:
base_yaml_data["allowed_manufacturers"] = [{"regex": "^allowed"}]
cfg = PixlConfig.model_validate(base_yaml_data)
assert cfg.is_manufacturer_allowed(manufacturer) == allowed


@pytest.mark.parametrize(
("manufacturer", "series_number", "expect_exclude"),
[
("allowed", 2, True),
("allowed", 4, False),
("allowed", None, True),
("not-allowed", 4, True),
],
)
def test_manufacturer_series_number_filterings(
base_yaml_data, manufacturer, series_number, expect_exclude
):
"""Check the series number are correctly excluded."""
base_yaml_data["allowed_manufacturers"] = [
{"regex": "^allowed", "exclude_series_numbers": [1, 2, 3]}
]
cfg = PixlConfig.model_validate(base_yaml_data)
assert (
cfg.is_series_number_excluded(manufacturer=manufacturer, series_number=series_number)
== expect_exclude
)
75 changes: 71 additions & 4 deletions pixl_dcmd/src/pixl_dcmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import typing
from functools import lru_cache
from io import BytesIO
from zipfile import ZipFile

import requests
from core.exceptions import PixlSkipInstanceError
Expand All @@ -26,7 +27,8 @@
anonymize_dataset,
)
from loguru import logger
from pydicom import DataElement, Dataset, dcmwrite
from pydicom import DataElement, Dataset, dcmread, dcmwrite
import pydicom

from core.project_config.pixl_config_model import PixlConfig
from pixl_dcmd._database import (
Expand All @@ -43,6 +45,10 @@
from pixl_dcmd.dicom_helpers import StudyInfo


# See: https://github.com/pydicom/pydicom/issues/2170
pydicom.config.convert_wrong_length_to_UN = True


def write_dataset_to_bytes(dataset: Dataset) -> bytes:
"""
Write pydicom DICOM dataset to byte array
Expand All @@ -56,14 +62,72 @@ def write_dataset_to_bytes(dataset: Dataset) -> bytes:
return buffer.read()


def get_series_to_skip(zipped_study: ZipFile, min_instances: int) -> set[str]:
"""
Determine which series to skip based on the number of instances in the series.
If a series has fewer instances than `min_instances`, add it to a set of series to skip.
Args:
zipped_study: ZipFile containing the study
min_instances: Minimum number of instances required to include a series
"""
if min_instances <= 1:
return set()

series_instances = {}
for file_info in zipped_study.infolist():
with zipped_study.open(file_info) as file:
logger.debug("Reading file {}", file)
dataset = dcmread(file)
if dataset.SeriesInstanceUID not in series_instances:
series_instances[dataset.SeriesInstanceUID] = 1
continue
series_instances[dataset.SeriesInstanceUID] += 1

return {
series for series, count in series_instances.items() if count < min_instances
}


def _should_exclude_series(dataset: Dataset, cfg: PixlConfig) -> bool:
"""
Check whether the dataset series should be exlucded based on its description
and number.
"""
series_description = dataset.get("SeriesDescription")
if cfg.is_series_excluded(series_description):
if cfg.is_series_description_excluded(series_description):
logger.debug("FILTERING OUT series description: {}", series_description)
return True

manufacturer = dataset.get("Manufacturer")
series_number = dataset.get("SeriesNumber")
if cfg.is_series_number_excluded(
manufacturer=manufacturer, series_number=series_number
):
logger.debug(
"FILTERING OUT series number: {} for manufacturer: {}",
series_number,
manufacturer,
)
return True

return False


def _should_exclude_manufacturer(dataset: Dataset, cfg: PixlConfig) -> bool:
manufacturer = dataset.get("Manufacturer")
if manufacturer is None:
logger.debug("FILTERING out as manufacturer tag is missing")
return True

should_exclude = not cfg.is_manufacturer_allowed(manufacturer=manufacturer)
if should_exclude:
logger.debug("FILTERING out manufacturer: {}", manufacturer)
return should_exclude


def anonymise_dicom_and_update_db(
dataset: Dataset,
*,
Expand Down Expand Up @@ -125,9 +189,12 @@ def anonymise_dicom(
)

# Do before anonymisation in case someone decides to delete the
# Series Description tag as part of anonymisation.
# Series Description or Manufacturer tags as part of anonymisation.
if _should_exclude_manufacturer(dataset, config):
msg = "DICOM instance discarded due to its manufacturer"
raise PixlSkipInstanceError(msg)
if _should_exclude_series(dataset, config):
msg = "DICOM instance discarded due to its series description"
msg = "DICOM instance discarded due to its series description or number"
raise PixlSkipInstanceError(msg)
if dataset.Modality not in config.project.modalities:
msg = f"Dropping DICOM Modality: {dataset.Modality}"
Expand Down
Loading

0 comments on commit 28db049

Please sign in to comment.