DataDog · gh-worker-dd-mergequeue-cf854d · May 6, 2026 · May 5, 2026 · chatgpt-codex-connector · May 5, 2026
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -897,6 +897,7 @@
 /test/regression/                             @DataDog/single-machine-performance
 /test/regression/ebpf                         @DataDog/single-machine-performance @DataDog/ebpf-platform
 /test/regression/cases/docker_containers*     @DataDog/single-machine-performance @DataDog/container-integrations
+/test/regression/cases/quality_gate_security_* @DataDog/single-machine-performance @DataDog/agent-security
 
 /tools/                                 @DataDog/agent-devx
 /tools/host-profiler/                   @DataDog/profiling-full-host

@@ -0,0 +1,40 @@
+# Quality Gate CWS - Idle
+
+## Overview
+
+This quality gate experiment measures the Datadog Agent's resource consumption
+with Workload Protection (CWS) enabled and nothing else: no custom policy and no
+lading-generated filesystem workload. It establishes the floor that every CWS
+customer pays before any tuning.
+
+**The only enabled functionality is [workload protection](https://docs.datadoghq.com/security/workload_protection/setup/agent/linux/).**
+
+## Owners
+
+- **Teams**: @team-k9-cws-agent
+- **Slack Channel**: [#security-and-compliance-agent](https://dd.enterprise.slack.com/archives/CTNVD37T3)
+
+## Scenario
+
+Models a host that has just enabled CWS with no further configuration:
+
+- No `runtime-security.d/default.policy` override — the agent runs with whatever
+  policies ship by default.
+- `generator: []` in lading — no application-generated filesystem events.
+
+The only events observed are background noise from default activity on the
+host, filtered through the shipped approvers.
+
+This is the baseline "turn it on and leave it alone" measurement. The sibling
+gates `quality_gate_security_no_fs_load` and `quality_gate_security_mean_fs_load`
+both layer the experiment's `default.policy` on top and isolate the effect of
+lading-generated filesystem load.
+
+## Enforcements
+
+- Memory usage is below a threshold
+- Average CPU usage is below a threshold
+
+## Other Links
+
+- [CWS Quality Gates Notebook](https://app.datadoghq.com/notebook/13998267/cws-quality-gate)
@@ -1,17 +1,8 @@
 auth_token_file_path: /tmp/agent-auth-token
 
+dd_url: http://127.0.0.1:9091
+
 # Disable cloud detection. This stops the Agent from poking around the
 # execution environment & network. This is particularly important if the target
 # has network access.
 cloud_provider_metadata: []
-
-logs_enabled: true
-
-dd_url: http://127.0.0.1:9091
-telemetry:
-  enabled: true
-  checks: '*'
-process_config:
-  process_dd_url: http://localhost:9093
-  process_collection:
-    enabled: false
@@ -0,0 +1,4 @@
+# Per https://docs.datadoghq.com/security/workload_protection/setup/agent/linux/
+# Only enable workload protection
+runtime_security_config:
+  enabled: true
@@ -0,0 +1,10 @@
+# Per https://docs.datadoghq.com/security/workload_protection/setup/agent/linux/
+# Only enable workload protection
+runtime_security_config:
+  enabled: true
+  # Activity dump is currently being reworked; when it is enabled, it causes a lot of kernel events.
+  # By disabling it, we get more predictable results.
+  activity_dump:
+    enabled: false
+remote_configuration:
+  enabled: false
@@ -0,0 +1,61 @@
+optimization_goal: memory
+erratic: false
+
+target:
+  name: datadog-agent
+  cpu_allotment: 4
+  # Set to roughly 20% higher than the upper_bound of the memory_usage check below
+  memory_allotment: 390 MiB
+
+  environment:
+    DD_API_KEY: a0000001
+    DD_HOSTNAME: smp-regression
+
+  profiling_environment:
+    # internal profiling
+    DD_INTERNAL_PROFILING_ENABLED: true
+    DD_SYSTEM_PROBE_INTERNAL_PROFILING_ENABLED: true
+    DD_APM_INTERNAL_PROFILING_ENABLED: true
+    # run all the time
+    DD_SYSTEM_PROBE_INTERNAL_PROFILING_PERIOD: 1m
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_PERIOD: 1m
+    DD_INTERNAL_PROFILING_PERIOD: 1m
+    DD_SYSTEM_PROBE_INTERNAL_PROFILING_CPU_DURATION: 1m
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_CPU_DURATION: 1m
+    DD_INTERNAL_PROFILING_CPU_DURATION: 1m
+    # destination
+    DD_INTERNAL_PROFILING_UNIX_SOCKET: /smp-host/apm.socket
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_UNIX_SOCKET: /smp-host/apm.socket
+    DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_UNIX_SOCKET: /smp-host/apm.socket
+    # tags
+    DD_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_security_idle
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_security_idle
+    DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_security_idle
+
+    DD_INTERNAL_PROFILING_BLOCK_PROFILE_RATE: 10000
+    DD_INTERNAL_PROFILING_DELTA_PROFILES: true
+    DD_INTERNAL_PROFILING_ENABLE_GOROUTINE_STACKTRACES: true
+    DD_INTERNAL_PROFILING_MUTEX_PROFILE_FRACTION: 10
+
+    # ddprof options
+    DD_PROFILING_EXECUTION_TRACE_ENABLED: true
+    DD_PROFILING_EXECUTION_TRACE_PERIOD: 1m
+    DD_PROFILING_WAIT_PROFILE: true
+
+checks:
+  - name: memory_usage
+    description: "Memory usage quality gate. This puts a bound on the total memory usage for CWS with no custom policy and no lading-generated filesystem load."
+    bounds:
+      series: total_pss_bytes
+      # When updating this, update the memory_allotment in the target section to 20% higher.
+      upper_bound: "330 MiB"
+
+  - name: cpu_usage
+    description: "CPU usage quality gate. This puts a bound on the total average collector millicore usage."
+    bounds:
+      series: avg(total_cpu_usage_millicores)
+      upper_bound: 40
+
+report_links:
+  - text: "bounds checks dashboard"
+    link: "https://app.datadoghq.com/dashboard/vz3-jd5-bdi?fromUser=true&refresh_mode=paused&tpl_var_experiment%5B0%5D={{ experiment }}&tpl_var_job_id%5B0%5D={{ job_id }}&view=spans&from_ts={{ start_time_ms }}&to_ts={{ end_time_ms }}&live=false"
@@ -0,0 +1,18 @@
+generator: []
+
+blackhole:
+  # The datadog blackhole impersonates Datadog's V2 metrics intake. The agent's
+  # datadog.yaml sets `dd_url` to this address, so every statsd-emitted agent
+  # metric -- including `datadog.runtime_security.*` from security-agent and
+  # system-probe -- flows through the agent's normal forwarder and is recorded
+  # into SMP at its original payload timestamp.
+  # This is the same code path the agent uses in production.
+  - datadog:
+      v2:
+        binding_addr: "127.0.0.1:9091"
+
+# target_metrics scrapes Prometheus/expvar endpoints on the target. CWS
+# runtime_security metrics are statsd-only and are not exposed on those
+# surfaces, so this is intentionally empty -- the datadog blackhole above
+# captures them.
+target_metrics: []
@@ -0,0 +1,56 @@
+# Quality Gate CWS - Mean FS Load
+
+## Overview
+
+This quality gate experiment tests the Datadog Agent's performance and resource
+consumption with Workload Protection enabled under a production-representative
+mean filesystem load. It validates that the agent can handle continuous file
+tree operations while staying within the defined memory and CPU bounds.
+
+**The only enabled functionality is [workload protection](https://docs.datadoghq.com/security/workload_protection/setup/agent/linux/).**
+
+## Owners
+
+- **Teams**: @team-k9-cws-agent
+- **Slack Channel**: [#security-and-compliance-agent](https://dd.enterprise.slack.com/archives/CTNVD37T3)
+
+## Scenario
+
+Models the per-host average filesystem event rate as observed in internal production data.
+The load generated produces file opens and renames with no explicit CWS rules triggering.
+
+A sibling gate, `quality_gate_security_no_fs_load`, uses the same `default.policy`
+but `generator: []` — it measures the same configuration with zero lading-generated
+filesystem events.
+
+## Enforcements
+
+- Memory usage is below a threshold
+- Average CPU usage is below a threshold
+
+## Additional Information
+
+The key metric that determines the load is `datadog.runtime_security.perf_buffer.events.write`. This represents the number of kernel events which are being seen.
+
+SMP runs emit an equivalent metric called `single_machine_performance.regression_detector.capture.datadog.runtime_security.perf_buffer.events.write`.
+
+`datadog.runtime_security.perf_buffer.events.write`
+→ Lading load
+→ SMP run
+→ `single_machine_performance.regression_detector.capture.datadog.runtime_security.perf_buffer.events.write` == `datadog.runtime_security.perf_buffer.events.write`
+
+The emitted metric from SMP should have a similar value to the production data we source.
+
+### Verifying the Experiment Configuration
+
+To check whether the lading config accurately models production, run:
+
+```
+/analyze-quality-gate-security-mean-fs-load
+```
+
+This compares three values: the lading-configured event rate, the SMP-captured metric, and the production per-host average for `perf_buffer.events.write`.
+
+## Other Links
+
+- [CWS Quality Gates Notebook](https://app.datadoghq.com/notebook/13998267/cws-quality-gate)
@@ -0,0 +1,8 @@
+auth_token_file_path: /tmp/agent-auth-token
+
+dd_url: http://127.0.0.1:9091
+
+# Disable cloud detection. This stops the Agent from poking around the
+# execution environment & network. This is particularly important if the target
+# has network access.
+cloud_provider_metadata: []
@@ -0,0 +1,4 @@
+rules:
+  - id: lading_open_monitor
+    expression: >-
+      open.file.path =~ "/lading-data/*" && open.flags & (O_CREAT | O_RDWR | O_WRONLY) > 0
@@ -0,0 +1,4 @@
+# Per https://docs.datadoghq.com/security/workload_protection/setup/agent/linux/
+# Only enable workload protection
+runtime_security_config:
+  enabled: true
@@ -0,0 +1,10 @@
+# Per https://docs.datadoghq.com/security/workload_protection/setup/agent/linux/
+# Only enable workload protection
+runtime_security_config:
+  enabled: true
+  # Activity dump is currently being reworked; when it is enabled, it causes a lot of kernel events.
+  # By disabling it, we get more predictable results from the generated load.
+  activity_dump:
+    enabled: false
+remote_configuration:
+  enabled: false
@@ -0,0 +1,61 @@
+optimization_goal: memory
+erratic: false
+
+target:
+  name: datadog-agent
+  cpu_allotment: 4
+  # Set to roughly 20% higher than the upper_bound of the memory_usage check below
+  memory_allotment: 380 MiB
+
+  environment:
+    DD_API_KEY: a0000001
+    DD_HOSTNAME: smp-regression
+
+  profiling_environment:
+    # internal profiling
+    DD_INTERNAL_PROFILING_ENABLED: true
+    DD_SYSTEM_PROBE_INTERNAL_PROFILING_ENABLED: true
+    DD_APM_INTERNAL_PROFILING_ENABLED: true
+    # run all the time
+    DD_SYSTEM_PROBE_INTERNAL_PROFILING_PERIOD: 1m
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_PERIOD: 1m
+    DD_INTERNAL_PROFILING_PERIOD: 1m
+    DD_SYSTEM_PROBE_INTERNAL_PROFILING_CPU_DURATION: 1m
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_CPU_DURATION: 1m
+    DD_INTERNAL_PROFILING_CPU_DURATION: 1m
+    # destination
+    DD_INTERNAL_PROFILING_UNIX_SOCKET: /smp-host/apm.socket
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_UNIX_SOCKET: /smp-host/apm.socket
+    DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_UNIX_SOCKET: /smp-host/apm.socket
+    # tags
+    DD_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_security_mean_fs_load
+    DD_SECURITY_AGENT_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_security_mean_fs_load
+    DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_security_mean_fs_load
+
+    DD_INTERNAL_PROFILING_BLOCK_PROFILE_RATE: 10000
+    DD_INTERNAL_PROFILING_DELTA_PROFILES: true
+    DD_INTERNAL_PROFILING_ENABLE_GOROUTINE_STACKTRACES: true
+    DD_INTERNAL_PROFILING_MUTEX_PROFILE_FRACTION: 10
+
+    # ddprof options
+    DD_PROFILING_EXECUTION_TRACE_ENABLED: true
+    DD_PROFILING_EXECUTION_TRACE_PERIOD: 1m
+    DD_PROFILING_WAIT_PROFILE: true
+
+checks:
+  - name: memory_usage
+    description: "Memory usage quality gate. This puts a bound on the total memory usage for CWS workloads."
+    bounds:
+      series: total_pss_bytes
+      # When updating this, update the memory_allotment in the target section to 20% higher.
+      upper_bound: "320 MiB"
+
+  - name: cpu_usage
+    description: "CPU usage quality gate. This puts a bound on the total average collector millicore usage."
+    bounds:
+      series: avg(total_cpu_usage_millicores)
+      upper_bound: 70
+
+report_links:
+  - text: "bounds checks dashboard"
+    link: "https://app.datadoghq.com/dashboard/vz3-jd5-bdi?fromUser=true&refresh_mode=paused&tpl_var_experiment%5B0%5D={{ experiment }}&tpl_var_job_id%5B0%5D={{ job_id }}&view=spans&from_ts={{ start_time_ms }}&to_ts={{ end_time_ms }}&live=false"