From 8f65f5c97d8075309acccfe16513e653bfb3d9e9 Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Wed, 10 Jun 2026 09:26:32 +0200
Subject: [PATCH] fix(server): strip volatile billing header on all cache paths (OpenAI + codex)

- Pure normalizer normalize_system_for_cache() in prompt_normalize.{h,cpp}: no IO, no globals
- OpenAI /v1/chat/completions + codex /v1/responses now call the pure fn before tokenize/hash
- DRY: anthropic path's inline strip collapsed to thin caller (single normalization path)
- 6 pure-function tests: strips billing header (Anthropic array + OpenAI msg0), idempotent across turn change, preserves legit content, handles leading-whitespace header, cache-key stable
---
 server/CMakeLists.txt                  |  4 +-
 server/src/server/http_server.cpp      | 38 +++++-----
 server/src/server/prompt_normalize.cpp | 82 +++++++++++++++++++++
 server/src/server/prompt_normalize.h   | 28 ++++++++
 server/test/test_server_unit.cpp       | 99 ++++++++++++++++++++++++++
 5 files changed, 228 insertions(+), 23 deletions(-)
 create mode 100644 server/src/server/prompt_normalize.cpp
 create mode 100644 server/src/server/prompt_normalize.h

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
index b6e5628b8..76c27c2a3 100644
--- a/server/CMakeLists.txt
+++ b/server/CMakeLists.txt
@@ -727,6 +727,7 @@ if(DFLASH27B_TESTS)
         src/server/server_main.cpp
         src/server/http_server.cpp
         src/server/model_card.cpp
+        src/server/prompt_normalize.cpp
     )
     target_include_directories(dflash_server PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
     if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
@@ -812,7 +813,8 @@ if(DFLASH27B_TESTS)
     add_executable(test_server_unit test/test_server_unit.cpp)
     target_sources(test_server_unit PRIVATE
         src/server/http_server.cpp
-        src/server/model_card.cpp)
+        src/server/model_card.cpp
+        src/server/prompt_normalize.cpp)
     target_include_directories(test_server_unit PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
     if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
         target_compile_definitions(test_server_unit PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
index b4c98eb70..3602e7d4a 100644
--- a/server/src/server/http_server.cpp
+++ b/server/src/server/http_server.cpp
@@ -5,6 +5,7 @@
 
 #include "http_server.h"
 #include "sse_emitter.h"
+#include "prompt_normalize.h"
 #include "tool_hint.h"
 
 #ifdef DFLASH_HAS_CURL
@@ -614,27 +615,10 @@ json build_props_body(const ServerConfig & config,
 // one helper guarantees token counting and generation can't drift.
 static void normalize_anthropic_system(const json & body, json & messages) {
     if (!body.contains("system")) return;
-    json sys_content = body["system"];
-    if (sys_content.is_array()) {
-        json filtered = json::array();
-        for (const auto & block : sys_content) {
-            if (block.is_object() && block.value("type", "") == "text") {
-                std::string text = block.value("text", "");
-                if (text.rfind("x-anthropic-billing-header:", 0) == 0) {
-                    continue; // skip Claude Code billing header block
-                }
-            }
-            filtered.push_back(block);
-        }
-        sys_content = std::move(filtered);
-    } else if (sys_content.is_string()) {
-        std::string s = sys_content.get();
-        if (s.rfind("x-anthropic-billing-header:", 0) == 0) {
-            sys_content = "";
-        }
-    }
-    if (!sys_content.empty()) {
-        json sys_msg = {{"role", "system"}, {"content", sys_content}};
+    // Header stripping lives in normalize_system_for_cache(); its string result becomes the system content.
+    std::string text = dflash::common::normalize_system_for_cache(body["system"]);
+    if (!text.empty()) {
+        json sys_msg = {{"role", "system"}, {"content", text}};
         messages.insert(messages.begin(), sys_msg);
     }
 }
@@ -1363,6 +1347,14 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
         req.format = ApiFormat::OPENAI_CHAT;
         req.response_id = generate_id("chatcmpl");
         req.messages = body["messages"];
+        // Strip volatile billing-header lines from messages[0] when it is a system message with string content.
+        if (req.messages.is_array() && !req.messages.empty()) {
+            auto & m0 = req.messages[0];
+            if (m0.is_object() && m0.value("role", "") == "system" &&
+                m0.contains("content") && m0["content"].is_string()) {
+                m0["content"] = dflash::common::normalize_system_for_cache(req.messages);
+            }
+        }
     } else if (hr.path == "/v1/messages/count_tokens") {
         req.format = ApiFormat::ANTHROPIC;
         req.response_id = generate_id("count");
@@ -1382,7 +1374,9 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             req.messages = body["input"];
         }
         if (body.contains("instructions")) {
-            json sys_msg = {{"role", "system"}, {"content", body["instructions"]}};
+            // Strip the billing header from codex instructions; a non-string, non-array value normalizes to "".
+            std::string inst = dflash::common::normalize_system_for_cache(body["instructions"]);
+            json sys_msg = {{"role", "system"}, {"content", inst}};
             if (req.messages.is_array()) {
                 req.messages.insert(req.messages.begin(), sys_msg);
             } else {
diff --git a/server/src/server/prompt_normalize.cpp b/server/src/server/prompt_normalize.cpp
new file mode 100644
index 000000000..368e4a294
--- /dev/null
+++ b/server/src/server/prompt_normalize.cpp
@@ -0,0 +1,82 @@
+// Prompt normalization — volatile-header stripping for stable cache keys.
+
+#include "prompt_normalize.h"
+#include <string_view>
+
+namespace dflash::common {
+
+static constexpr std::string_view kBillingHeader = "x-anthropic-billing-header:";
+
+// Returns true if `s`, after skipping leading whitespace (newlines included),
+// starts with kBillingHeader. Used to drop whole Anthropic content blocks.
+static bool is_billing_header_block(const std::string & s) {
+    auto pos = s.find_first_not_of(" \t\r\n");
+    if (pos == std::string::npos) return false;
+    return s.compare(pos, kBillingHeader.size(), kBillingHeader) == 0;
+}
+
+// Returns a copy of `s` without any line whose ltrimmed text starts with
+// kBillingHeader. A dropped line takes its trailing '\n' with it; every other
+// line (and its '\n', if it had one) is copied through unchanged.
+static std::string strip_billing_header_lines(const std::string & s) {
+    std::string out;
+    out.reserve(s.size());
+    std::string::size_type start = 0;
+    while (start <= s.size()) {
+        auto end = s.find('\n', start);
+        std::string_view line = (end == std::string::npos)
+            ? std::string_view(s).substr(start)
+            : std::string_view(s).substr(start, end - start);
+        // ltrim check
+        auto nws = line.find_first_not_of(" \t\r");
+        bool is_header = (nws != std::string_view::npos) &&
+            (line.substr(nws, kBillingHeader.size()) == kBillingHeader);
+        if (!is_header) {
+            out.append(line);
+            if (end != std::string::npos) out += '\n';
+        }
+        if (end == std::string::npos) break;
+        start = end + 1;
+    }
+    return out;
+}
+
+// Contract is documented in prompt_normalize.h. Inputs that carry no system
+// text (wrong role, missing content, other JSON types) yield "".
+std::string normalize_system_for_cache(const json & system_or_messages) {
+    if (system_or_messages.is_string()) {
+        return strip_billing_header_lines(system_or_messages.get<std::string>());
+    }
+    if (!system_or_messages.is_array() || system_or_messages.empty()) return "";
+    const auto & first = system_or_messages[0];
+    if (first.is_object() && first.contains("role")) {
+        // OpenAI messages array: strip billing-header lines from messages[0].
+        // find(), not const operator[]: the latter is UB when the key is absent.
+        const auto content = first.find("content");
+        if (first.value("role", "") != "system" || content == first.end()) return "";
+        if (content->is_string()) {
+            return strip_billing_header_lines(content->get<std::string>());
+        }
+        std::string out;
+        if (content->is_array()) {
+            for (const auto & block : *content) {
+                if (block.is_object() && block.value("type", "") == "text") {
+                    out += block.value("text", "");
+                }
+            }
+        }
+        return strip_billing_header_lines(out);
+    }
+
+    // Anthropic content-block array: skip billing-header blocks entirely.
+    std::string out;
+    for (const auto & block : system_or_messages) {
+        if (block.is_object() && block.value("type", "") == "text") {
+            std::string text = block.value("text", "");
+            if (!is_billing_header_block(text)) out += text;
+        }
+    }
+    return out;
+}
+
+} // namespace dflash::common
diff --git a/server/src/server/prompt_normalize.h b/server/src/server/prompt_normalize.h
new file mode 100644
index 000000000..99b6f1a3c
--- /dev/null
+++ b/server/src/server/prompt_normalize.h
@@ -0,0 +1,28 @@
+// Prompt normalization — volatile-header stripping for stable cache keys.
+//
+// Pure functions: no IO, no globals, no CUDA deps. Tested standalone.
+
+#pragma once
+
+#include <string>
+#include <nlohmann/json.hpp>
+
+namespace dflash::common {
+
+using json = nlohmann::json;
+
+// Normalize the effective system content so cache keys stay stable.
+//
+// Accepts either:
+//   - Anthropic-format: the `system` field from a /v1/messages body
+//     (string or array-of-content-blocks)
+//   - OpenAI-format: the full `messages` array from a /v1/chat/completions
+//     body (only messages[0] is inspected, and only when role=="system")
+//
+// Returns the system text with volatile claude-code headers (blocks or
+// lines starting with "x-anthropic-billing-header:") REMOVED, so requests
+// differing only in the header value normalize identically. Returns ""
+// when the input carries no system text (wrong role/type, no content).
+std::string normalize_system_for_cache(const json & system_or_messages);
+
+} // namespace dflash::common
diff --git a/server/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
index a551201c6..2e761439c 100644
--- a/server/test/test_server_unit.cpp
+++ b/server/test/test_server_unit.cpp
@@ -25,6 +25,7 @@
 #include "common/layer_split_backend.h"
 #include "common/layer_split_utils.h"
 #include "placement/draft_residency.h"
+#include "server/prompt_normalize.h"
 
 #include
 #include
@@ -3262,6 +3263,96 @@ static void test_generate_result_accept_rate_zero_when_no_spec_decode() {
     TEST_ASSERT(r.accept_rate == 0.0f);
 }
 
+// ═══════════════════════════════════════════════════════════════════════
+// normalize_system_for_cache — header-strip tests
+// ═══════════════════════════════════════════════════════════════════════
+
+static void test_normalize_strips_billing_header_anthropic_array() {
+    // Anthropic system-as-array: the header block is dropped, the real block is returned verbatim.
+    json system_blocks = json::array({
+        {{"type", "text"},
+         {"text", "x-anthropic-billing-header: session=abc123 turn=4 ts=1749430000"}},
+        {{"type", "text"},
+         {"text", "You are a helpful coding assistant."}}
+    });
+    std::string out = dflash::common::normalize_system_for_cache(system_blocks);
+    TEST_ASSERT(out.find("x-anthropic-billing-header:") == std::string::npos);
+    TEST_ASSERT(out == "You are a helpful coding assistant.");
+}
+
+static void test_normalize_strips_billing_header_openai_messages0() {
+    // OpenAI messages[0] system content: the header line and its '\n' are removed, the rest is untouched.
+    json messages = json::array({
+        {{"role", "system"},
+         {"content", "x-anthropic-billing-header: session=xyz789 turn=12 ts=1749431000\nYou are a code reviewer."}},
+        {{"role", "user"}, {"content", "Review this diff."}}
+    });
+    std::string out = dflash::common::normalize_system_for_cache(messages);
+    TEST_ASSERT(out.find("x-anthropic-billing-header:") == std::string::npos);
+    TEST_ASSERT(out == "You are a code reviewer.");
+}
+
+static void test_normalize_idempotent_across_changing_header() {
+    // Two OpenAI messages arrays identical except the header turn value.
+    // normalize_system_for_cache must return EQUAL strings for both.
+    json messages_turn4 = json::array({
+        {{"role", "system"},
+         {"content", "x-anthropic-billing-header: session=S1 turn=4 ts=1749430000\nYou help with Rust."}},
+        {{"role", "user"}, {"content", "What is a lifetime?"}}
+    });
+    json messages_turn5 = json::array({
+        {{"role", "system"},
+         {"content", "x-anthropic-billing-header: session=S1 turn=5 ts=1749430060\nYou help with Rust."}},
+        {{"role", "user"}, {"content", "What is a lifetime?"}}
+    });
+    std::string out4 = dflash::common::normalize_system_for_cache(messages_turn4);
+    std::string out5 = dflash::common::normalize_system_for_cache(messages_turn5);
+    TEST_ASSERT(out4 == out5);
+}
+
+static void test_normalize_preserves_legit_system_content() {
+    // A normal system prompt containing no billing header must pass through unchanged.
+    json messages = json::array({
+        {{"role", "system"},
+         {"content", "You are an expert in C++ performance optimization."}},
+        {{"role", "user"}, {"content", "Help me optimize this loop."}}
+    });
+    std::string out = dflash::common::normalize_system_for_cache(messages);
+    TEST_ASSERT(out == "You are an expert in C++ performance optimization.");
+}
+
+static void test_normalize_handles_leading_whitespace_header() {
+    // A header block with leading whitespace is still dropped; only the real block remains.
+    json system_blocks = json::array({
+        {{"type", "text"},
+         {"text", " x-anthropic-billing-header: session=W1 turn=1 ts=1749432000"}},
+        {{"type", "text"},
+         {"text", "Be concise."}}
+    });
+    std::string out = dflash::common::normalize_system_for_cache(system_blocks);
+    TEST_ASSERT(out.find("x-anthropic-billing-header:") == std::string::npos);
+    TEST_ASSERT(out == "Be concise.");
+}
+
+static void test_prefix_key_stable_across_header_change() {
+    // Two /v1/chat/completions-style messages arrays differing ONLY in the
+    // billing header value must normalize to the same, exactly-known string.
+    json messages_a = json::array({
+        {{"role", "system"},
+         {"content", "x-anthropic-billing-header: session=S2 turn=1 ts=1749440000\nYou are a senior engineer."}},
+        {{"role", "user"}, {"content", "What is RAII?"}}
+    });
+    json messages_b = json::array({
+        {{"role", "system"},
+         {"content", "x-anthropic-billing-header: session=S2 turn=7 ts=1749440420\nYou are a senior engineer."}},
+        {{"role", "user"}, {"content", "What is RAII?"}}
+    });
+    std::string norm_a = dflash::common::normalize_system_for_cache(messages_a);
+    std::string norm_b = dflash::common::normalize_system_for_cache(messages_b);
+    TEST_ASSERT(norm_a == norm_b);
+    TEST_ASSERT(norm_a == "You are a senior engineer.");
+}
+
 int main() {
     std::fprintf(stderr, "══════════════════════════════════════════\n");
     std::fprintf(stderr, " Server Unit Tests\n");
@@ -3482,6 +3573,14 @@ int main() {
     RUN_TEST(test_generate_result_accept_rate_in_usage_anthropic);
     RUN_TEST(test_generate_result_accept_rate_zero_when_no_spec_decode);
 
+    std::fprintf(stderr, "\n── normalize_system_for_cache ──\n");
+    RUN_TEST(test_normalize_strips_billing_header_anthropic_array);
+    RUN_TEST(test_normalize_strips_billing_header_openai_messages0);
+    RUN_TEST(test_normalize_idempotent_across_changing_header);
+    RUN_TEST(test_normalize_preserves_legit_system_content);
+    RUN_TEST(test_normalize_handles_leading_whitespace_header);
+    RUN_TEST(test_prefix_key_stable_across_header_change);
+
     std::fprintf(stderr, "\n══════════════════════════════════════════\n");
     std::fprintf(stderr, " Results: %d assertions, %d failures\n",
                  test_count, test_failures);