Skip to content

Commit

Permalink
Pii Pipeline (#919)
Browse files Browse the repository at this point in the history
Personally Identifiable Information redaction implementation
  • Loading branch information
lukehinds authored Feb 10, 2025
1 parent cf7a92f commit f0aac62
Show file tree
Hide file tree
Showing 13 changed files with 2,287 additions and 62 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ COPY pyproject.toml poetry.lock* /app/

# Configure Poetry and install dependencies
RUN poetry config virtualenvs.create false && \
poetry install --no-dev
poetry install --no-dev && \
python -m spacy download en_core_web_sm

# Copy the rest of the application
COPY . /app
Expand Down
1,162 changes: 1,102 additions & 60 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions prompts/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ secrets_redacted: |
about any tokens, passwords or similar sensitive information in the context whose value begins with
the string "REDACTED".
pii_redacted: |
The context files contain redacted personally identifiable information (PII) that is represented by a UUID encased within <>. For example:
- <123e4567-e89b-12d3-a456-426614174000>
- <2d040296-98e9-4350-84be-fda4336057eb>
If you encounter any PII redacted with a UUID, DO NOT WARN the user about it. Simply respond to the user request and keep the PII redacted and intact, using the same UUID.
# Security-focused prompts
security_audit: "You are a security expert conducting a thorough code review. Identify potential security vulnerabilities, suggest improvements, and explain security best practices."

Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ sqlalchemy = "==2.0.38"
aiosqlite = "==0.21.0"
ollama = "==0.4.7"
pydantic-settings = "==2.7.1"
numpy = "==2.2.2"
numpy = "1.26.4"
tree-sitter = "==0.24.0"
tree-sitter-go = "==0.23.4"
tree-sitter-java = "==0.23.5"
Expand All @@ -32,6 +32,8 @@ sqlite-vec-sl-tmp = "==0.0.4"
greenlet = "==3.1.1"
cachetools = "==5.5.1"
legacy-cgi = "==2.6.2"
presidio-analyzer = "==2.2.357"
presidio-anonymizer = "==2.2.357"

[tool.poetry.group.dev.dependencies]
pytest = "==8.3.4"
Expand Down
1 change: 1 addition & 0 deletions src/codegate/clients/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ class CopilotDetector(BaseClientDetector):

def __init__(self):
    """
    Initialise the base detector, then build the two detectors kept on this
    instance: a HeaderDetector created with ("user-agent", "Copilot") and a
    UserAgentDetector created with "Copilot".
    """
    super().__init__()
    client_marker = "Copilot"
    self.header_detector = HeaderDetector("user-agent", client_marker)
    self.user_agent_detector = UserAgentDetector(client_marker)

@property
Expand Down
9 changes: 9 additions & 0 deletions src/codegate/pipeline/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
from codegate.pipeline.codegate_context_retriever.codegate import CodegateContextRetriever
from codegate.pipeline.comment.output import CodeCommentStep
from codegate.pipeline.output import OutputPipelineProcessor, OutputPipelineStep
from codegate.pipeline.pii.pii import (
CodegatePii,
PiiRedactionNotifier,
PiiUnRedactionStep,
)
from codegate.pipeline.secrets.manager import SecretsManager
from codegate.pipeline.secrets.secrets import (
CodegateSecrets,
Expand All @@ -27,6 +32,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr
# and without obfuscating the secrets, we'd leak the secrets during those
# later steps
CodegateSecrets(),
CodegatePii(),
CodegateCli(),
CodegateContextRetriever(),
SystemPrompt(
Expand All @@ -43,6 +49,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr
def create_fim_pipeline(self, client_type: ClientType) -> SequentialPipelineProcessor:
fim_steps: List[PipelineStep] = [
CodegateSecrets(),
CodegatePii(),
]
return SequentialPipelineProcessor(
fim_steps,
Expand All @@ -55,6 +62,8 @@ def create_output_pipeline(self) -> OutputPipelineProcessor:
output_steps: List[OutputPipelineStep] = [
SecretRedactionNotifier(),
SecretUnredactionStep(),
PiiRedactionNotifier(),
PiiUnRedactionStep(),
CodeCommentStep(),
]
return OutputPipelineProcessor(output_steps)
Expand Down
205 changes: 205 additions & 0 deletions src/codegate/pipeline/pii/analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
import uuid
from typing import Any, Dict, List, Optional, Tuple

import structlog
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

from codegate.db.models import AlertSeverity
from codegate.pipeline.base import PipelineContext

logger = structlog.get_logger("codegate.pii.analyzer")


class PiiSessionStore:
    """
    In-memory store that maps UUID placeholders to the PII they replaced.

    Attributes:
        session_id (str): Identifier of this store. A fresh UUID4 string is
            generated when the caller does not supply one.
        mappings (Dict[str, str]): Placeholder (``"<uuid>"``) -> original PII value.

    Methods:
        add_mapping(pii: str) -> str:
            Registers a PII string and returns the new placeholder for it.
        get_pii(uuid_placeholder: str) -> str:
            Returns the PII registered for the placeholder, or the placeholder
            itself when it is unknown.
    """

    def __init__(self, session_id: Optional[str] = None):
        """
        Args:
            session_id: Identifier to use for this store; when omitted (or empty)
                a random UUID4 string is generated.
        """
        self.session_id = session_id or str(uuid.uuid4())
        self.mappings: Dict[str, str] = {}

    def add_mapping(self, pii: str) -> str:
        """
        Register ``pii`` under a newly generated placeholder.

        Every call creates a new placeholder, even for a value that was
        registered before.

        Args:
            pii: The original PII value to remember.
        Returns:
            str: The placeholder, a UUID4 wrapped in angle brackets.
        """
        uuid_placeholder = f"<{uuid.uuid4()}>"
        self.mappings[uuid_placeholder] = pii
        return uuid_placeholder

    def get_pii(self, uuid_placeholder: str) -> str:
        """
        Look up the PII stored for ``uuid_placeholder``.

        Args:
            uuid_placeholder: A placeholder previously returned by ``add_mapping``.
        Returns:
            str: The original PII, or ``uuid_placeholder`` unchanged when no
            mapping exists for it.
        """
        return self.mappings.get(uuid_placeholder, uuid_placeholder)


class PiiAnalyzer:
    """
    Detects PII in text with Presidio and replaces it with UUID placeholders.

    This is a singleton class - use ``PiiAnalyzer.get_instance()`` to get the
    instance. Calling the constructor directly once an instance exists raises
    ``RuntimeError``.

    Attributes:
        analyzer: Presidio ``AnalyzerEngine`` built on the NLP engine described by
            ``spacy_config.yaml`` located next to this module.
        anonymizer: Presidio ``AnonymizerEngine``.
        session_store: ``PiiSessionStore`` holding placeholder -> PII mappings.
            It is created once in ``__init__``, so the single instance uses the
            same store for every ``analyze`` call.
            NOTE(review): nothing in this class ever clears that store - confirm
            whether callers are expected to reset it between sessions.

    Methods:
        get_instance():
            Get or create the singleton instance of PiiAnalyzer.
        analyze(text, context=None):
            Redact PII found in ``text``; returns the anonymized text, the list of
            found PII details and the session store.
        restore_pii(anonymized_text, session_store):
            Replace placeholders in ``anonymized_text`` with the original PII.
    """

    _instance: Optional["PiiAnalyzer"] = None
    _name = "codegate-pii"

    # Presidio entity types requested from the analyzer. The order of this
    # sequence does not give any entity priority over another.
    _ENTITIES = (
        "PHONE_NUMBER",
        "EMAIL_ADDRESS",
        "CRYPTO",
        "CREDIT_CARD",
        "IBAN_CODE",
        "MEDICAL_LICENSE",
        "US_BANK_NUMBER",
        "US_ITIN",
        "US_PASSPORT",
        "US_SSN",
        "UK_NHS",
        "UK_NINO",
    )

    # Lower threshold to catch more potential matches
    _SCORE_THRESHOLD = 0.3

    @classmethod
    def get_instance(cls) -> "PiiAnalyzer":
        """Get or create the singleton instance of PiiAnalyzer"""
        if cls._instance is None:
            logger.debug("Creating new PiiAnalyzer instance")
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """
        Initialize the PiiAnalyzer.

        Note: Use get_instance() instead of creating a new instance directly.

        Raises:
            RuntimeError: If a singleton instance has already been created.
        """
        if PiiAnalyzer._instance is not None:
            raise RuntimeError("Use PiiAnalyzer.get_instance() instead")

        import os

        from presidio_analyzer.nlp_engine import NlpEngineProvider

        # Get the path to our custom spacy config
        current_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(current_dir, "spacy_config.yaml")

        # Initialize the NLP engine with our custom configuration
        provider = NlpEngineProvider(conf_file=config_path)
        nlp_engine = provider.create_engine()

        # Create analyzer with custom NLP engine
        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer = AnonymizerEngine()
        self.session_store = PiiSessionStore()

        PiiAnalyzer._instance = self

    @staticmethod
    def _substitute(text: str, replacements: Dict[str, str]) -> str:
        """
        Replace every occurrence of each key of ``replacements`` in ``text`` with
        its value, in a single left-to-right pass.

        A single pass (instead of one ``str.replace`` per value) guarantees that:
          * longer values win over shorter values that are a prefix or substring
            of them, so the longer value is never partially redacted;
          * text inserted as a replacement is never scanned again, so an inserted
            placeholder cannot be corrupted by a later replacement.

        Args:
            text: The text to rewrite.
            replacements: Mapping of literal text to find -> replacement text.
                Empty keys are ignored.
        Returns:
            str: The rewritten text (``text`` itself when nothing can be replaced).
        """
        import re

        # Longest first: regex alternation picks the first alternative that matches
        needles = sorted((value for value in replacements if value), key=len, reverse=True)
        if not needles:
            return text
        pattern = re.compile("|".join(re.escape(value) for value in needles))
        return pattern.sub(lambda match: replacements[match.group(0)], text)

    def analyze(
        self, text: str, context: Optional["PipelineContext"] = None
    ) -> Tuple[str, List[Dict[str, Any]], "PiiSessionStore"]:
        """
        Find PII in ``text`` and replace each distinct value with a UUID placeholder.

        Args:
            text: The text to analyze for PII.
            context: Optional pipeline context; when given and PII was found, a
                CRITICAL alert summarising the detection is added to it.
        Returns:
            Tuple[str, List[Dict[str, Any]], PiiSessionStore]: The anonymized text,
            one detail dict per detection (keys ``type``, ``value``, ``score``,
            ``start``, ``end``, ``uuid_placeholder``) and the session store. When
            nothing is found, the original text and an empty list are returned.
        """
        analyzer_results = self.analyzer.analyze(
            text=text,
            entities=list(self._ENTITIES),
            language="en",
            score_threshold=self._SCORE_THRESHOLD,
        )

        found_pii: List[Dict[str, Any]] = []
        # PII value -> placeholder for this call. Detections of the same value
        # (repeated in the text, or reported under several entity types) share
        # one placeholder so no orphan mappings are created in the store.
        placeholders: Dict[str, str] = {}

        for result in analyzer_results:
            pii_value = text[result.start : result.end]
            if not pii_value:
                # A zero-length span carries nothing to redact
                continue

            uuid_placeholder = placeholders.get(pii_value)
            if uuid_placeholder is None:
                uuid_placeholder = self.session_store.add_mapping(pii_value)
                placeholders[pii_value] = uuid_placeholder

            found_pii.append(
                {
                    "type": result.entity_type,
                    "value": pii_value,
                    "score": result.score,
                    "start": result.start,
                    "end": result.end,
                    "uuid_placeholder": uuid_placeholder,
                }
            )

            # Log each PII detection with its UUID mapping
            logger.info(
                "PII detected and mapped",
                pii_type=result.entity_type,
                score=f"{result.score:.2f}",
                uuid=uuid_placeholder,
                # Don't log the actual PII value for security
                value_length=len(pii_value),
                session_id=self.session_store.session_id,
            )

        # If no PII found, return original text, empty list, and session store
        if not found_pii:
            return text, [], self.session_store

        anonymized_text = self._substitute(text, placeholders)

        if context:
            # Create notification string for alert (types sorted so the text is stable)
            pii_types = ", ".join(sorted({p["type"] for p in found_pii}))
            notify_string = (
                "**PII Detected** 🔒\n"
                f"- Total PII Found: {len(found_pii)}\n"
                f"- Types Found: {pii_types}\n"
            )
            context.add_alert(
                self._name,
                trigger_string=notify_string,
                severity_category=AlertSeverity.CRITICAL,
            )

        # Log summary of all PII found in this analysis
        logger.info(
            "PII analysis complete",
            total_pii_found=len(found_pii),
            pii_types=[p["type"] for p in found_pii],
            session_id=self.session_store.session_id,
        )

        # Return the anonymized text, PII details, and session store
        return anonymized_text, found_pii, self.session_store

    def restore_pii(self, anonymized_text: str, session_store: "PiiSessionStore") -> str:
        """
        Restore the original PII (Personally Identifiable Information) in the given
        anonymized text.

        Every placeholder found in ``session_store.mappings`` is replaced by the
        original PII value stored for it.

        Args:
            anonymized_text (str): The text containing placeholders for PII.
            session_store (PiiSessionStore): The session store containing mappings of
                placeholders to original PII.
        Returns:
            str: The text with the original PII restored.
        """
        for uuid_placeholder, original_pii in session_store.mappings.items():
            anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii)
        return anonymized_text
81 changes: 81 additions & 0 deletions src/codegate/pipeline/pii/manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Any, Dict, List, Optional, Tuple

import structlog

from codegate.pipeline.base import PipelineContext
from codegate.pipeline.pii.analyzer import PiiAnalyzer, PiiSessionStore

logger = structlog.get_logger("codegate")


class PiiManager:
    """
    Manages the analysis and restoration of Personally Identifiable Information
    (PII) in text.

    Attributes:
        analyzer (PiiAnalyzer): The singleton instance of PiiAnalyzer used for
            PII detection and restoration.
        session_store (PiiSessionStore): Read-only property returning the
            analyzer's current session store.

    Methods:
        analyze(text: str, context=None) -> Tuple[str, List[Dict[str, Any]]]:
            Analyzes the given text for PII, anonymizes it, and logs the detected
            PII details (never the raw values).
        restore_pii(anonymized_text: str) -> str:
            Restores the PII in the given anonymized text using the current session.
    """

    def __init__(self):
        """
        Initialize the PiiManager with the singleton PiiAnalyzer instance.
        """
        self.analyzer = PiiAnalyzer.get_instance()
        # Snapshot of the analyzer's store taken at construction time. The
        # `session_store` property below is what the methods of this class read.
        self._session_store = self.analyzer.session_store

    @property
    def session_store(self) -> "PiiSessionStore":
        """Get the current session store."""
        # Always return the analyzer's current session store
        return self.analyzer.session_store

    def analyze(
        self, text: str, context: Optional["PipelineContext"] = None
    ) -> Tuple[str, List[Dict[str, Any]]]:
        """
        Analyze ``text`` for PII and anonymize it.

        Args:
            text (str): The text to be analyzed for PII.
            context: Optional pipeline context forwarded to the analyzer, which
                uses it to attach an alert when PII is found. Omitting it keeps
                the previous behaviour (no alert is raised).
        Returns:
            Tuple[str, List[Dict[str, Any]]]: A tuple containing the anonymized text
            and the list of found PII details, exactly as returned by the analyzer.
        """
        # Call analyzer and get results
        anonymized_text, found_pii, _ = self.analyzer.analyze(text, context=context)

        # Log found PII details (without modifying the found_pii list)
        for pii in found_pii:
            logger.info(
                "PII detected",
                pii_type=pii["type"],
                value="*" * len(pii["value"]),  # Don't log actual value
                score=f"{pii['score']:.2f}",
            )

        # Return the exact same objects we got from the analyzer
        return anonymized_text, found_pii

    def restore_pii(self, anonymized_text: str) -> str:
        """
        Restore PII in the given anonymized text using the current session.

        Args:
            anonymized_text (str): The text with anonymized PII to be restored.
        Returns:
            str: The text with restored PII, or ``anonymized_text`` unchanged when
            there is no session store.
        """
        # Read the property once so the check and the use see the same store
        session_store = self.session_store
        if session_store is None:
            logger.warning("No active PII session found. Unable to restore PII.")
            return anonymized_text

        # Use the analyzer's restore_pii method with the current session store
        return self.analyzer.restore_pii(anonymized_text, session_store)
Loading

0 comments on commit f0aac62

Please sign in to comment.