Skip to content

Commit 0c012f5

Browse files
committed
fix: add a new tnm regex pattern by default.
1 parent e113c26 commit 0c012f5

File tree

5 files changed

+178
-41
lines changed

5 files changed

+178
-41
lines changed

edsnlp/pipes/ner/tnm/model.py

Lines changed: 57 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,21 @@ class Metastasis(TnmEnum):
7979

8080

8181
class TNM(pydantic.BaseModel):
82-
prefix: Optional[Prefix] = None
83-
tumour: Optional[Tumour] = None
84-
tumour_specification: Optional[Specification] = None
82+
tumour_prefix: Optional[str] = None
83+
tumour: Optional[str] = None
84+
tumour_specification: Optional[str] = None
8585
tumour_suffix: Optional[str] = None
86-
node: Optional[Node] = None
87-
node_specification: Optional[Specification] = None
86+
node_prefix: Optional[str] = None
87+
node: Optional[str] = None
88+
node_specification: Optional[str] = None
8889
node_suffix: Optional[str] = None
89-
metastasis: Optional[Metastasis] = None
90-
resection_completeness: Optional[int] = None
90+
metastasis_prefix: Optional[str] = None
91+
metastasis: Optional[str] = None
92+
metastasis_specification: Optional[str] = None
93+
pleura: Optional[str] = None
94+
resection: Optional[str] = None
95+
resection_specification: Optional[str] = None
96+
resection_loc: Optional[str] = None
9197
version: Optional[str] = None
9298
version_year: Optional[int] = None
9399

@@ -112,32 +118,43 @@ def validate_year(cls, v):
112118
def norm(self) -> str:
113119
norm = []
114120

115-
if self.prefix is not None:
116-
norm.append(str(self.prefix))
121+
if self.tumour_prefix:
122+
norm.append(f"{self.tumour_prefix or ''}")
117123

118-
if (
119-
(self.tumour is not None)
120-
| (self.tumour_specification is not None)
121-
| (self.tumour_suffix is not None)
122-
):
123-
norm.append(f"T{str(self.tumour or '')}")
124-
norm.append(f"{str(self.tumour_specification or '')}")
125-
norm.append(f"{str(self.tumour_suffix or '')}")
126-
127-
if (
128-
(self.node is not None)
129-
| (self.node_specification is not None)
130-
| (self.node_suffix is not None)
131-
):
132-
norm.append(f"N{str(self.node or '')}")
133-
norm.append(f"{str(self.node_specification or '')}")
134-
norm.append(f"{str(self.node_suffix or '')}")
124+
if self.tumour:
125+
norm.append(f"T{self.tumour}")
126+
if self.tumour_specification:
127+
norm.append(f"{self.tumour_specification or ''}")
128+
if self.tumour_suffix:
129+
norm.append(f"{self.tumour_suffix or ''}")
130+
131+
if self.node_prefix:
132+
norm.append(f"{self.node_prefix or ''}")
135133

136-
if self.metastasis is not None:
134+
if self.node:
135+
norm.append(f"N{self.node}")
136+
if self.node_specification:
137+
norm.append(f"{self.node_specification or ''}")
138+
if self.node_suffix:
139+
norm.append(f"{self.node_suffix or ''}")
140+
141+
if self.metastasis_prefix:
142+
norm.append(f"{self.metastasis_prefix or ''}")
143+
144+
if self.metastasis:
137145
norm.append(f"M{self.metastasis}")
146+
if self.metastasis_specification:
147+
norm.append(f"{self.metastasis_specification or ''}")
148+
149+
if self.pleura:
150+
norm.append(f"PL{self.pleura}")
138151

139-
if self.resection_completeness is not None:
140-
norm.append(f"R{self.resection_completeness}")
152+
if self.resection:
153+
norm.append(f"R{self.resection}")
154+
if self.resection_specification:
155+
norm.append(f"{self.resection_specification or ''}")
156+
if self.resection_loc:
157+
norm.append(f"{self.resection_loc or ''}")
141158

142159
if self.version is not None and self.version_year is not None:
143160
norm.append(f" ({self.version.upper()} {self.version_year})")
@@ -182,14 +199,21 @@ def dict(
182199
set_keys = set(d.keys())
183200
for k in set_keys.intersection(
184201
{
185-
"prefix",
202+
"tumour_prefix",
186203
"tumour",
187-
"node",
188-
"metastasis",
189204
"tumour_specification",
190-
"node_specification",
191205
"tumour_suffix",
206+
"node_prefix",
207+
"node",
208+
"node_specification",
192209
"node_suffix",
210+
"metastasis_prefix",
211+
"metastasis",
212+
"metastasis_specification",
213+
"pleura",
214+
"resection",
215+
"resection_specification",
216+
"resection_loc",
193217
}
194218
):
195219
v = d[k]

edsnlp/pipes/ner/tnm/patterns.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
1-
prefix_pattern = r"(?P<prefix>[cpPyraum]p?)"
2-
tumour_pattern = r"T\s?(?P<tumour>([0-4o]|is))?(?P<tumour_specification>[abcdx]|mi)?"
1+
prefix_pattern = r"(?P<tumour_prefix>[cpPyraum]p?)"
2+
tumour_pattern = (
3+
r"T\s?(?P<tumour>([0-4o]|is|[Xx]))?(?P<tumour_specification>[abcdx]|mi)?"
4+
)
35
tumour_pattern += r"(?:\((?P<tumour_suffix>[^()]{1,10})\))?"
46
node_pattern = r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}N\s?(?P<node>[0-3o]|x)"
57
node_pattern += (
68
r"(?P<node_specification>[abcdx]|mi)?(?:\((?P<node_suffix>[^()]{1,10})\))?)"
79
)
810

911
metastasis_pattern = (
10-
r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P<metastasis>([01o]|x))x?)" # noqa: E501
12+
r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P<metastasis>([01o]|x))x?)"
1113
)
12-
resection_completeness = r"(\s{,2}\/?\s{,2}R\s?(?P<resection_completeness>[012]))"
14+
resection_pattern = r"(\s{,2}\/?\s{,2}R\s?(?P<resection>[012]))"
1315

1416
version_pattern = (
1517
r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)"
@@ -23,6 +25,6 @@
2325
tnm_pattern += prefix_pattern + r"\s{,2}?" + f"({tumour_pattern})"
2426
tnm_pattern += r"(\s{,2}" + f"{node_pattern})?"
2527
tnm_pattern += r"(\s{,2}" + f"{metastasis_pattern})?"
26-
tnm_pattern += r"(\s{,2}" + f"{resection_completeness})?"
28+
tnm_pattern += r"(\s{,2}" + f"{resection_pattern})?"
2729
tnm_pattern += f"({spacer}{version_pattern})?"
2830
tnm_pattern = r"(?:\b|^)" + tnm_pattern + r"(?:\b|$)"
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
tumour_pattern = (
2+
r"(?P<tumour_prefix>[cpyramP]{1,2}\s?)?" # Optional tumour prefix
3+
r"T\s?" # 'T' followed by optional space
4+
r"(?P<tumour>([0-4]|is|[Xx]|[Oo]))" # Tumour size (required if 'T' is present)
5+
r"(?:\s?(?P<tumour_specification>[abcdx]|mi))?" # Optional tumour specification
6+
r"(?:\s?\((?P<tumour_suffix>[^()]{1,10})\))?" # Optional tumour suffix
7+
)
8+
9+
node_pattern = (
10+
r"(?P<node_prefix>[cpyraP]{1,2}\s?)?" # Optional node prefix
11+
r"N\s?" # 'N' followed by optional space
12+
r"(?P<node>[Xx01234\+]|[Oo])" # Node size/status (required if 'N' is present)
13+
r"(?:\s?(?P<node_specification>"
14+
r"[abcdx]|mi|sn|i[-,+]|mol[-,+]|\(mi\)|\(sn\)|"
15+
r"\(i[-,+]\)|\(mol[-,+]\)|\(\d+\s*/\s*\d+\)))?" # Optional specification
16+
r"(?:\s?\((?P<node_suffix>[^()]{1,10})\))?" # Optional suffix
17+
)
18+
19+
metastasis_pattern = (
20+
r"(?P<metastasis_prefix>[cpyraP]{1,2}\s?)?" # Optional metastasis prefix
21+
r"M\s?" # 'M' followed by optional space
22+
r"(?P<metastasis>[Xx0123\+]|[Oo])" # Metastasis status (required if 'M' is present)
23+
r"(?:\s?(?P<metastasis_specification>"
24+
r"[abcd]|i\+|mol\+|cy\+|\(i\+\)|\(mol\+\)|"
25+
r"\(cy\+\)|PUL|OSS|HEP|BRA|LYM|OTH|MAR|PLE|PER|ADR|SKI))?" # Optional specification
26+
)
27+
28+
pleura_pattern = (
29+
r"PL\s?(?P<pleura>([0123]|x))?" # Optional pleura status (for lung cancer)
30+
)
31+
32+
resection_pattern = (
33+
r"R\s?"
34+
r"(?P<resection>[Xx012]|[Oo])?" # Optional resection completeness
35+
r"(?:\s?(?P<resection_specification>is|cy\+|\(is\)|\(cy\+\)))?" # Optional specification
36+
r"(?:\s?(?P<resection_loc>(\((?P<r_loc>[a-z]+)\)[,;\s]*)*))?" # Optional localization with space
37+
)
38+
39+
version_pattern = (
40+
r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)" # TNM version
41+
r"\s+([éeE]ditions|[éeE]d\.?)?\s*"
42+
r"(?P<version_year>\d{4}|\d{2})\)?" # Year of the version
43+
)
44+
45+
TNM_space = r"(\s*[,\/]?\s*|\n)" # Allow space, comma, or slash as delimiters
46+
47+
# We need to exclude patterns like 'T1', 'T2' if they are not followed by node or
48+
# metastasis sections.
49+
50+
exclude_pattern = (
51+
r"(?!T\s*[0-4]\s*[.,\/](?!\s*"
52+
+ node_pattern
53+
+ "?"
54+
+ TNM_space
55+
+ "?"
56+
+ metastasis_pattern
57+
+ "?"
58+
+ "))"
59+
)
60+
61+
exclude_pattern = (
62+
r"(?!"
63+
r"(?:[cpyramP]{0,2}\s*)?" # Optional prefix like p, yp, PT
64+
r"T\s*"
65+
r"(?:[0-4]|is|[xXoO])" # T stage (includes is, x, o)
66+
r"(?:[abcdx]|mi)?" # Optional specification
67+
r"(?:\s*\([^()]{1,10}\))?" # Optional suffix
68+
r"(?:\s*[\s,\/\.\(\)]|$)" # <-- KEY ADDITION: allow end-of-string ($)
69+
r"(?!\s*"
70+
+ node_pattern + "?" + TNM_space + "?" + metastasis_pattern + "?"
71+
+ ")"
72+
+ ")"
73+
)
74+
75+
tnm_pattern_new = (
76+
r"(?:\b|^)"
77+
+ exclude_pattern
78+
+ r"(?:"
79+
+ r"(?P<T_component>"
80+
+ tumour_pattern
81+
+ ")"
82+
+ TNM_space
83+
+ "?"
84+
+ r"(?P<N_component>"
85+
+ node_pattern
86+
+ ")?"
87+
+ TNM_space
88+
+ "?"
89+
+ r"(?P<M_component>"
90+
+ metastasis_pattern
91+
+ ")?"
92+
+ TNM_space
93+
+ "?"
94+
+ r"(?P<PL_component>"
95+
+ pleura_pattern
96+
+ ")?"
97+
+ TNM_space
98+
+ "?"
99+
+ r"(?P<R_component>"
100+
+ resection_pattern
101+
+ ")?"
102+
+ TNM_space
103+
+ "?"
104+
+ r"(?P<V_component>"
105+
+ version_pattern
106+
+ ")?"
107+
+ r")"
108+
+ r"(?=[\s\(\)\.,;:/]|$)"
109+
#+ r"(?:\b|$|\n)"
110+
)

edsnlp/pipes/ner/tnm/tnm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from edsnlp.utils.typing import cast
1313

1414
from .model import TNM
15-
from .patterns import tnm_pattern
15+
from .patterns_new import tnm_pattern_new
1616

1717

1818
class TNMMatcher(BaseNERComponent):
@@ -75,7 +75,7 @@ def __init__(
7575
nlp: Optional[PipelineProtocol],
7676
name: str = "tnm",
7777
*,
78-
pattern: Optional[Union[List[str], str]] = tnm_pattern,
78+
pattern: Optional[Union[List[str], str]] = tnm_pattern_new,
7979
attr: str = "TEXT",
8080
label: str = "tnm",
8181
span_setter: SpanSetterArg = {"ents": True, "tnm": True},

tests/pipelines/ner/test_tnm.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from edsnlp.pipes.ner.tnm.patterns import tnm_pattern
12
from edsnlp.utils.examples import parse_example
23

34
examples = [
@@ -22,7 +23,7 @@
2223

2324

2425
def test_scores(blank_nlp):
25-
blank_nlp.add_pipe("eds.tnm")
26+
blank_nlp.add_pipe("eds.tnm", config=dict(pattern=tnm_pattern))
2627

2728
for example in examples:
2829
text, entities = parse_example(example=example)

0 commit comments

Comments
 (0)