Better secret scrubber #3832

Merged: 17 commits, Jan 22, 2025
10 changes: 10 additions & 0 deletions .vscode/launch.json
@@ -114,6 +114,16 @@
"justMyCode": true,
"python": "${workspaceFolder}/venv/bin/python"
},
{
"name": "Run SAMPLING scenario",
"type": "python",
"request": "launch",
"module": "pytest",
"args": ["-S", "SAMPLING", "-p", "no:warnings"],
"console": "integratedTerminal",
"justMyCode": true,
"python": "${workspaceFolder}/venv/bin/python"
},
{
"name": "Replay SAMPLING scenario",
"type": "python",
4 changes: 4 additions & 0 deletions conftest.py
@@ -1,6 +1,10 @@
# Unless explicitly stated otherwise all files in this repository are licensed under the Apache License Version 2.0.
# This product includes software developed at Datadog (https://www.datadoghq.com/).
# Copyright 2021 Datadog, Inc.

# keep this import at the top of the file
from utils.proxy import scrubber # noqa: F401

import json
import os
import time
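
Note: the scrubber import has to run before anything else in the test session opens a file for writing. As utils/proxy/scrubber.py (added below) shows, importing the module replaces builtins.open with an instrumented version, so any file object created before the import keeps the original, unscrubbed write method. A minimal sketch of the effect, assuming an environment variable whose name matches the scrubber's filter (the value and file name here are hypothetical):

import os

os.environ["DD_API_KEY"] = "hypothetical-secret-value"  # name matches the key/token/secret filter

from utils.proxy import scrubber  # noqa: F401  # patches builtins.open on import

with open("example.log", "w") as f:
    f.write("api key = hypothetical-secret-value")  # lands on disk as "api key = <redacted>"
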
98 changes: 98 additions & 0 deletions tests/test_the_test/test_scrubber.py
@@ -0,0 +1,98 @@
import json
import os
import subprocess
import pytest
from utils import scenarios
from utils.tools import logger


FILENAME = "tests/test_the_test/test_scrubber.py"

scrubbed_names = {
"DD_API_KEY": "secret_value_1",
"DD_API_KEY_2": "secret_value_C",
"DD_API_KEY_3": "secret_value_D",
"DD_APP_KEY": "secret_value_2",
"DD_APP_KEY_2": "secret_value_A",
"DD_APP_KEY_3": "secret_value_B",
"DD_APPLICATION_KEY": "secret_value_3",
"AWS_ACCESS_KEY_ID": "secret_value_4",
"AWS_SECRET_ACCESS_KEY": "secret_value_5",
"AWS_SESSION_TOKEN": "secret_value_6",
"AWS_SECURITY_TOKEN": "secret_value_7",
"SYSTEM_TESTS_AWS_ACCESS_KEY_ID": "secret_value_8",
"SYSTEM_TESTS_AWS_SECRET_ACCESS_KEY": "secret_value_9",
# Env variables loaded by SSI tests
"DD_API_KEY_ONBOARDING": "secret_value_onboarding_1",
"DD_APP_KEY_ONBOARDING": "secret_value_onboarding_2",
"GITHUB_TOKEN": "secret_value_onboarding_3",
"DOCKER_LOGIN": "secret_value_onboarding_4",
"DOCKER_LOGIN_PASS": "secret_value_onboarding_5",
}


@scenarios.test_the_test
def test_log_scrubber():
cmd = ["./run.sh", "MOCK_THE_TEST", FILENAME]
subprocess.run(cmd, env=scrubbed_names | os.environ, text=True, capture_output=True)

redacted_count = 0

for root, _, files in os.walk("logs_mock_the_test"):
for file in files:
file_path = os.path.join(root, file)

with open(file_path, "r", encoding="utf-8") as f:
data = f.read()

redacted_count += data.count("<redacted>")
for secret in scrubbed_names.values():
assert secret not in data, f"{secret} found in {file_path}"

# extra protection to make sure we redacted all secrets
assert redacted_count != 0, "No secrets were redacted"


@scenarios.mock_the_test
def test_leaks():
logger.info(os.environ)
print(os.environ)


@scenarios.test_the_test
@pytest.mark.parametrize("write_mode, read_mode, file_extension", [("w", "r", "txt"), ("wb", "rb", "bin")])
def test_file_writer_scrubber(write_mode, read_mode, file_extension):
secrets = []

for name, secret in scrubbed_names.items():
os.environ[name] = secret
secrets.append(bytearray(secret, "utf-8") if write_mode == "wb" else secret)

log_file = f"{scenarios.test_the_test.host_log_folder}/leak.{file_extension}"
with open(log_file, write_mode) as f:
for secret in secrets:
f.write(secret)
f.writelines([secret, secret])

with open(log_file, read_mode) as f:
data = f.read()

for secret in secrets:
assert secret not in data


@scenarios.test_the_test
def test_jsonweird():
secret = 123456789
os.environ["KEY_SCRUBBED"] = f"{secret}"

log_file = "logs_test_the_test/json_weird.json"
with open(log_file, "w") as f:
json.dump({"int": secret, "str": f"{secret}"}, f)

del os.environ["KEY_SCRUBBED"]

with open(log_file, "r") as f:
data = f.read()

assert f"{secret}" not in data
27 changes: 2 additions & 25 deletions utils/_context/containers.py
@@ -363,40 +363,17 @@ def collect_logs(self):
TAIL_LIMIT = 50 # noqa: N806
SEP = "=" * 30 # noqa: N806

keys = []
if os.environ.get("DD_API_KEY"):
keys.append(bytearray(os.environ["DD_API_KEY"], "utf-8"))
if os.environ.get("DD_APP_KEY"):
keys.append(bytearray(os.environ["DD_APP_KEY"], "utf-8"))
if os.environ.get("AWS_ACCESS_KEY_ID"):
keys.append(bytearray(os.environ["AWS_ACCESS_KEY_ID"], "utf-8"))
if os.environ.get("AWS_SECRET_ACCESS_KEY"):
keys.append(bytearray(os.environ["AWS_SECRET_ACCESS_KEY"], "utf-8"))
if os.environ.get("AWS_SESSION_TOKEN"):
keys.append(bytearray(os.environ["AWS_SESSION_TOKEN"], "utf-8"))
if os.environ.get("AWS_SECURITY_TOKEN"):
keys.append(bytearray(os.environ["AWS_SECURITY_TOKEN"], "utf-8"))

# set by CI runner
if os.environ.get("SYSTEM_TESTS_AWS_ACCESS_KEY_ID"):
keys.append(bytearray(os.environ["SYSTEM_TESTS_AWS_ACCESS_KEY_ID"], "utf-8"))
if os.environ.get("SYSTEM_TESTS_AWS_SECRET_ACCESS_KEY"):
keys.append(bytearray(os.environ["SYSTEM_TESTS_AWS_SECRET_ACCESS_KEY"], "utf-8"))

data = (
("stdout", self._container.logs(stdout=True, stderr=False)),
("stderr", self._container.logs(stdout=False, stderr=True)),
)
for output_name, raw_output in data:
filename = f"{self.log_folder_path}/{output_name}.log"
output = raw_output
for key in keys:
output = output.replace(key, b"<redacted>")
with open(filename, "wb") as f:
f.write(output)
f.write(raw_output)

if not self.healthy:
decoded_output = output.decode("utf-8")
decoded_output = raw_output.decode("utf-8")

logger.stdout(f"\n{SEP} {self.name} {output_name.upper()} last {TAIL_LIMIT} lines {SEP}")
logger.stdout(f"-> See {filename} for full logs")
53 changes: 4 additions & 49 deletions utils/proxy/core.py
@@ -1,3 +1,6 @@
# keep this import at the top of the file
import scrubber # noqa: F401

import asyncio
from collections import defaultdict
import json
@@ -22,20 +25,6 @@
messages_counts = defaultdict(int)


class CustomFormatter(logging.Formatter):
def __init__(self, keys: list[str], *args, **kwargs) -> None: # noqa: ANN002
super().__init__(*args, **kwargs)
self._keys = keys

def format(self, record):
result = super().format(record)

for key in self._keys:
result = result.replace(key, "{redacted-by-system-tests-proxy}")

return result


class ObjectDumpEncoder(json.JSONEncoder):
def default(self, o):
if isinstance(o, bytes):
@@ -45,19 +34,9 @@ def default(self, o):

class _RequestLogger:
def __init__(self) -> None:
self._keys = [
os.environ.get("DD_API_KEY"),
os.environ.get("DD_APPLICATION_KEY"),
os.environ.get("DD_APP_KEY"),
]

self._keys = [key for key in self._keys if key is not None]

handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
formatter = CustomFormatter(
fmt="%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s", datefmt="%H:%M:%S", keys=self._keys
)
formatter = logging.Formatter(fmt="%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s", datefmt="%H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
@@ -76,25 +55,6 @@ def __init__(self) -> None:
self.rc_api_sequential_commands = None
self.rc_api_runtime_ids_request_count = None

def _scrub(self, content):
if isinstance(content, str):
for key in self._keys:
content = content.replace(key, "{redacted-by-system-tests-proxy}")

return content

if isinstance(content, (list, set, tuple)):
return [self._scrub(item) for item in content]

if isinstance(content, dict):
return {key: self._scrub(value) for key, value in content.items()}

if isinstance(content, SIMPLE_TYPES):
return content

logger.error(f"Can't scrub type {type(content)}")
return "Content not properly deserialized by system-tests proxy. Please reach #apm-shared-testing on slack."

@staticmethod
def get_error_response(message):
logger.error(message)
@@ -250,11 +210,6 @@ def response(self, flow):
export_content_files_to=export_content_files_to,
)

try:
data = self._scrub(data)
except:
logger.exception("Fail to scrub data")

logger.info(f" => Saving data as {log_filename}")

with open(log_filename, "w", encoding="utf-8", opener=lambda path, flags: os.open(path, flags, 0o777)) as f:
62 changes: 62 additions & 0 deletions utils/proxy/scrubber.py
@@ -0,0 +1,62 @@
import builtins
import os
import re

_not_secrets = {
"AWS_VAULT_KEYCHAIN_NAME", # Name of macOS keychain to use => it's a name, not a key
"ONBOARDING_AWS_INFRA_KEY_PATH", # TODO : what is the content of this value ?
}

_name_filter = re.compile(r"key|token|secret|pass|docker_login", re.IGNORECASE)


def _get_secrets() -> set[str]:
    return {
        value.strip()
        for name, value in os.environ.items()
        if len(value.strip()) > 6 and name not in _not_secrets and _name_filter.search(name)
    }


def _instrument_write_methods_str(f, secrets: set[str]) -> None:
original_write = f.write

def write(data):
for secret in secrets:
data = data.replace(secret, "<redacted>")

original_write(data)

f.write = write


def _instrument_write_methods_bytes(f, secrets: set[str]) -> None:
original_write = f.write

def write(data):
for secret in secrets:
data = data.replace(secret.encode(), b"<redacted>")

original_write(data)

f.write = write


def _instrumented_open(file, mode="r", *args, **kwargs): # noqa: ANN002
f = _original_open(file, mode, *args, **kwargs)

# get list of secrets at each call, because environ may be updated
secrets = _get_secrets()

if ("w" in mode or "a" in mode) and len(secrets) > 0:
if "b" in mode:
_instrument_write_methods_bytes(f, secrets)
else:
_instrument_write_methods_str(f, secrets)

return f


_original_open = builtins.open
builtins.open = _instrumented_open
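
A usage sketch of the module above (the variable name, value, and file name are hypothetical): once it has been imported, any file opened in a write or append mode gets a wrapped write method, so detected secrets never reach disk.

import os

os.environ["MY_SERVICE_TOKEN"] = "tok-1234567890"  # "TOKEN" matches _name_filter, value is longer than 6 chars

with open("demo.txt", "w") as f:  # builtins.open is now _instrumented_open
    f.write("Authorization: tok-1234567890")  # written as "Authorization: <redacted>"

with open("demo.txt") as f:  # read mode is untouched; the stored content is already scrubbed
    assert "tok-1234567890" not in f.read()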