Skip to content

Commit

Permalink
changing predefined recognizers to use the config file (#1393)
Browse files Browse the repository at this point in the history
  • Loading branch information
roeybc authored Jul 17, 2024
1 parent 56f0df2 commit c059131
Show file tree
Hide file tree
Showing 17 changed files with 669 additions and 458 deletions.
4 changes: 4 additions & 0 deletions docs/analyzer/analyzer_engine_provider.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ The configuration file contains the following parameters:
- `nlp_configuration`: Configuration given to the NLP engine which will detect the PIIs and extract features for the downstream logic.
- `recognizer_registry`: All the recognizers that will be used by the analyzer.

!!! note "Note"

`supported_languages` must be identical to the `supported_languages` field in the `recognizer_registry` configuration.

## Using multiple files

Create an `AnalyzerEngineProvider` using three different configuration files, one for each of the following components:
Expand Down
9 changes: 9 additions & 0 deletions docs/analyzer/recognizer_registry_provider.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,26 @@ print(results)

```yaml
global_regex_flags: 26

supported_languages:
- en

recognizers:
...
```

The configuration file consists of three parts:

- `global_regex_flags`: regex flags to be used in regex matching (see [regex flags](https://docs.python.org/3/library/re.html#flags)).
- `supported_languages`: a list of the languages that the registry supports.
- `recognizers`: a list of recognizers to be loaded by the recognizer registry. This list consists of two different types of recognizers:
- Predefined: a set of recognizer classes already defined in Presidio. This includes all recognizers defined in the codebase (along with user-defined recognizers) that inherit from `EntityRecognizer`.
- Custom: pattern recognizers that are created from the fields provided in the configuration file.

!!! note "Note"

`supported_languages` must be identical to the `supported_languages` field in the `analyzer_engine` configuration.

## Recognizer list

The recognizer list comprises both the predefined and custom recognizers, for example:
Expand Down
10 changes: 10 additions & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
from collections import Counter
from typing import List, Optional

import regex as re
Expand Down Expand Up @@ -79,6 +80,15 @@ def __init__(
)
registry = provider.create_recognizer_registry()
registry.add_nlp_recognizer(nlp_engine=self.nlp_engine)
else:
if Counter(registry.supported_languages) != Counter(
self.supported_languages
):
raise ValueError(
f"Misconfigured engine, supported languages have to be consistent"
f"registry.supported_languages: {registry.supported_languages}, "
f"analyzer_engine.supported_languages: {self.supported_languages}"
)

# added to support the previous interface
if not registry.recognizers:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,11 @@ def __init__(
nlp_engine_conf_file: Optional[Union[Path, str]] = None,
recognizer_registry_conf_file: Optional[Union[Path, str]] = None,
):
self.configuration = self._get_configuration(
conf_file=analyzer_engine_conf_file
)
self.configuration = self.get_configuration(conf_file=analyzer_engine_conf_file)
self.nlp_engine_conf_file = nlp_engine_conf_file
self.recognizer_registry_conf_file = recognizer_registry_conf_file

def _get_configuration(
def get_configuration(
self, conf_file: Optional[Union[Path, str]]
) -> Union[Dict[str, Any]]:
"""Retrieve the analyzer engine configuration from the provided file."""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
supported_languages:
- en
global_regex_flags: 26

recognizers:
# Recognizers listed here can either be loaded from the recognizers defined in code (type: predefined),
Expand Down Expand Up @@ -92,11 +93,21 @@ recognizers:
- en
type: predefined

- name: InPassportRecognizer
supported_languages:
- en
type: predefined

- name: EsNifRecognizer
supported_languages:
- es
type: predefined

- name: EsNieRecognizer
supported_languages:
- es
type: predefined

- name: ItDriverLicenseRecognizer
supported_languages:
- it
Expand Down Expand Up @@ -127,7 +138,6 @@ recognizers:
- pl
type: predefined


- name: CryptoRecognizer
type: predefined

Expand All @@ -150,4 +160,7 @@ recognizers:
type: predefined

- name: UrlRecognizer
type: predefined

- name: InVoterRecognizer
type: predefined
Original file line number Diff line number Diff line change
Expand Up @@ -14,42 +14,13 @@
TransformersNlpEngine,
)
from presidio_analyzer.predefined_recognizers import (
AuAbnRecognizer,
AuAcnRecognizer,
AuMedicareRecognizer,
AuTfnRecognizer,
CreditCardRecognizer,
CryptoRecognizer,
DateRecognizer,
EmailRecognizer,
EsNieRecognizer,
EsNifRecognizer,
IbanRecognizer,
InAadhaarRecognizer,
InPanRecognizer,
InPassportRecognizer,
InVehicleRegistrationRecognizer,
InVoterRecognizer,
IpRecognizer,
ItDriverLicenseRecognizer,
ItFiscalCodeRecognizer,
ItIdentityCardRecognizer,
ItPassportRecognizer,
ItVatCodeRecognizer,
MedicalLicenseRecognizer,
NhsRecognizer,
PhoneRecognizer,
PlPeselRecognizer,
SgFinRecognizer,
SpacyRecognizer,
StanzaRecognizer,
TransformersRecognizer,
UrlRecognizer,
UsBankRecognizer,
UsItinRecognizer,
UsLicenseRecognizer,
UsPassportRecognizer,
UsSsnRecognizer,
)
from presidio_analyzer.recognizer_registry.recognizers_loader_utils import (
RecognizerConfigurationLoader,
RecognizerListLoader,
)

logger = logging.getLogger("presidio-analyzer")
Expand Down Expand Up @@ -126,67 +97,17 @@ def load_predefined_recognizers(
:param nlp_engine: The NLP engine to use.
:return: None
"""
if not languages:
languages = ["en"]

recognizers_map = {
"en": [
UsBankRecognizer,
UsLicenseRecognizer,
UsItinRecognizer,
UsPassportRecognizer,
UsSsnRecognizer,
NhsRecognizer,
SgFinRecognizer,
AuAbnRecognizer,
AuAcnRecognizer,
AuTfnRecognizer,
AuMedicareRecognizer,
InPanRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
InVoterRecognizer,
InPassportRecognizer,
],
"es": [
EsNifRecognizer,
EsNieRecognizer,
],
"it": [
ItDriverLicenseRecognizer,
ItFiscalCodeRecognizer,
ItVatCodeRecognizer,
ItIdentityCardRecognizer,
ItPassportRecognizer,
],
"pl": [PlPeselRecognizer],
"ALL": [
CreditCardRecognizer,
CryptoRecognizer,
DateRecognizer,
EmailRecognizer,
IbanRecognizer,
IpRecognizer,
MedicalLicenseRecognizer,
PhoneRecognizer,
UrlRecognizer,
],
}
for lang in languages:
lang_recognizers = [
self.__instantiate_recognizer(
recognizer_class=rc, supported_language=lang
)
for rc in recognizers_map.get(lang, [])
]
self.recognizers.extend(lang_recognizers)
all_recognizers = [
self.__instantiate_recognizer(
recognizer_class=rc, supported_language=lang
)
for rc in recognizers_map.get("ALL", [])
]
self.recognizers.extend(all_recognizers)

registry_configuration = {"global_regex_flags": self.global_regex_flags}
if languages is not None:
registry_configuration["supported_languages"] = languages

configuration = RecognizerConfigurationLoader.get(
registry_configuration=registry_configuration
)
recognizers = RecognizerListLoader.get(**configuration)

self.recognizers.extend(recognizers)
self.add_nlp_recognizer(nlp_engine=nlp_engine)

@staticmethod
Expand Down
Loading

0 comments on commit c059131

Please sign in to comment.