Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,7 @@ if(DFLASH27B_TESTS)
src/server/server_main.cpp
src/server/http_server.cpp
src/server/model_card.cpp
src/server/prompt_normalize.cpp
)
target_include_directories(dflash_server PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
Expand Down Expand Up @@ -812,7 +813,8 @@ if(DFLASH27B_TESTS)
add_executable(test_server_unit test/test_server_unit.cpp)
target_sources(test_server_unit PRIVATE
src/server/http_server.cpp
src/server/model_card.cpp)
src/server/model_card.cpp
src/server/prompt_normalize.cpp)
target_include_directories(test_server_unit PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
target_compile_definitions(test_server_unit PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
Expand Down
38 changes: 16 additions & 22 deletions server/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "http_server.h"
#include "sse_emitter.h"
#include "prompt_normalize.h"
#include "tool_hint.h"

#ifdef DFLASH_HAS_CURL
Expand Down Expand Up @@ -614,27 +615,10 @@ json build_props_body(const ServerConfig & config,
// one helper guarantees token counting and generation can't drift.
static void normalize_anthropic_system(const json & body, json & messages) {
if (!body.contains("system")) return;
json sys_content = body["system"];
if (sys_content.is_array()) {
json filtered = json::array();
for (const auto & block : sys_content) {
if (block.is_object() && block.value("type", "") == "text") {
std::string text = block.value("text", "");
if (text.rfind("x-anthropic-billing-header:", 0) == 0) {
continue; // skip Claude Code billing header block
}
}
filtered.push_back(block);
}
sys_content = std::move(filtered);
} else if (sys_content.is_string()) {
std::string s = sys_content.get<std::string>();
if (s.rfind("x-anthropic-billing-header:", 0) == 0) {
sys_content = "";
}
}
if (!sys_content.empty()) {
json sys_msg = {{"role", "system"}, {"content", sys_content}};
// Delegate strip to the pure fn; insert as system message.
std::string text = dflash::common::normalize_system_for_cache(body["system"]);
if (!text.empty()) {
json sys_msg = {{"role", "system"}, {"content", text}};
messages.insert(messages.begin(), sys_msg);
}
}
Expand Down Expand Up @@ -1363,6 +1347,14 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
req.format = ApiFormat::OPENAI_CHAT;
req.response_id = generate_id("chatcmpl");
req.messages = body["messages"];
// Strip volatile billing header from messages[0] (OpenAI system).
if (req.messages.is_array() && !req.messages.empty()) {
auto & m0 = req.messages[0];
if (m0.is_object() && m0.value("role", "") == "system" &&
m0.contains("content") && m0["content"].is_string()) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: OpenAI system header normalization is skipped when messages[0].content is an array, leaving a cache-miss path unnormalized.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At server/src/server/http_server.cpp, line 1346:

<comment>OpenAI system header normalization is skipped when `messages[0].content` is an array, leaving a cache-miss path unnormalized.</comment>

<file context>
@@ -1355,6 +1339,14 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
+            if (req.messages.is_array() && !req.messages.empty()) {
+                auto & m0 = req.messages[0];
+                if (m0.is_object() && m0.value("role", "") == "system" &&
+                    m0.contains("content") && m0["content"].is_string()) {
+                    m0["content"] = dflash::common::normalize_system_for_cache(req.messages);
+                }
</file context>
Suggested change
m0.contains("content") && m0["content"].is_string()) {
m0.contains("content") && (m0["content"].is_string() || m0["content"].is_array())) {

m0["content"] = dflash::common::normalize_system_for_cache(req.messages);
}
}
} else if (hr.path == "/v1/messages/count_tokens") {
req.format = ApiFormat::ANTHROPIC;
req.response_id = generate_id("count");
Expand All @@ -1382,7 +1374,9 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
req.messages = body["input"];
}
if (body.contains("instructions")) {
json sys_msg = {{"role", "system"}, {"content", body["instructions"]}};
// Strip billing header from codex instructions before hashing.
std::string inst = dflash::common::normalize_system_for_cache(body["instructions"]);
json sys_msg = {{"role", "system"}, {"content", inst}};
if (req.messages.is_array()) {
req.messages.insert(req.messages.begin(), sys_msg);
} else {
Expand Down
82 changes: 82 additions & 0 deletions server/src/server/prompt_normalize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Prompt normalization — volatile-header stripping for stable cache keys.

#include "prompt_normalize.h"
#include <algorithm>

namespace dflash::common {

static constexpr std::string_view kBillingHeader = "x-anthropic-billing-header:";

// Shared predicate: does `text` begin with kBillingHeader once leading
// spaces, tabs, CRs and LFs are skipped? An all-whitespace or empty input
// is never a header.
static bool begins_with_billing_header(std::string_view text) {
    const auto first_visible = text.find_first_not_of(" \t\r\n");
    if (first_visible == std::string_view::npos) {
        return false;
    }
    // substr clamps the length, so a too-short tail simply compares unequal.
    return text.substr(first_visible, kBillingHeader.size()) == kBillingHeader;
}

// Whole-block check: true if `s`, after skipping leading whitespace, starts
// with kBillingHeader.
static bool is_billing_header_block(const std::string & s) {
    return begins_with_billing_header(s);
}

// Line-wise filter: returns `s` with every '\n'-delimited line removed whose
// left-trimmed text starts with kBillingHeader. A removed line takes its
// terminating '\n' with it; all other lines (and their newlines) are copied
// through unchanged.
static std::string strip_billing_header_lines(const std::string & s) {
    const std::string_view input(s);
    std::string kept;
    kept.reserve(input.size());

    std::string_view::size_type cursor = 0;
    for (;;) {
        const auto newline = input.find('\n', cursor);
        const bool is_last_line = (newline == std::string_view::npos);
        const std::string_view line = is_last_line
            ? input.substr(cursor)
            : input.substr(cursor, newline - cursor);

        // A single line holds no '\n', so the shared predicate trims exactly
        // the same characters a " \t\r" trim would.
        if (!begins_with_billing_header(line)) {
            kept.append(line);
            if (!is_last_line) {
                kept += '\n';
            }
        }

        if (is_last_line) {
            break;
        }
        cursor = newline + 1;
    }
    return kept;
}

std::string normalize_system_for_cache(const json & system_or_messages) {
if (system_or_messages.is_array()) {
if (system_or_messages.empty()) return "";
const auto & first = system_or_messages[0];
if (first.is_object() && first.contains("role")) {
// OpenAI messages array: strip billing-header lines from messages[0].
if (first.value("role", "") == "system") {
const auto & content = first["content"];

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: messages[0]["content"] is accessed without verifying the key exists, which can throw and fail request handling for malformed OpenAI payloads.

(Based on your team's feedback about guarding JSON string reads and key/type checks.) .

View Feedback

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At server/src/server/prompt_normalize.cpp, line 48:

<comment>`messages[0]["content"]` is accessed without verifying the key exists, which can throw and fail request handling for malformed OpenAI payloads.

(Based on your team's feedback about guarding JSON string reads and key/type checks.) .</comment>

<file context>
@@ -0,0 +1,82 @@
+        if (first.is_object() && first.contains("role")) {
+            // OpenAI messages array: strip billing-header lines from messages[0].
+            if (first.value("role", "") == "system") {
+                const auto & content = first["content"];
+                if (content.is_string()) {
+                    return strip_billing_header_lines(content.get<std::string>());
</file context>

if (content.is_string()) {
return strip_billing_header_lines(content.get<std::string>());
}
if (content.is_array()) {
std::string out;
for (const auto & block : content) {
if (block.is_object() && block.value("type", "") == "text") {
out += block.value("text", "");
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
}
}
return strip_billing_header_lines(out);
}
}
return "";
}
// Anthropic content-block array: skip billing-header blocks entirely.
std::string out;
for (const auto & block : system_or_messages) {
if (block.is_object() && block.value("type", "") == "text") {
std::string text = block.value("text", "");
if (!is_billing_header_block(text)) out += text;
}
}
return out;
}

if (system_or_messages.is_string()) {
return strip_billing_header_lines(system_or_messages.get<std::string>());
}

return "";
}

} // namespace dflash::common
28 changes: 28 additions & 0 deletions server/src/server/prompt_normalize.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Prompt normalization — volatile-header stripping for stable cache keys.
//
// Pure functions: no IO, no globals, no CUDA deps. Tested standalone.

#pragma once

#include <nlohmann/json.hpp>
#include <string>

namespace dflash::common {

// Shorthand used by the declaration below.
using json = nlohmann::json;

// Normalize the effective system/messages content for cache-key hashing.
//
// Accepts either:
// - Anthropic-format: the `system` field from a /v1/messages body
// (string or array-of-content-blocks)
// - OpenAI-format: the full `messages` array from a /v1/chat/completions
// body (the function inspects messages[0] when role=="system")
//
// How the input shape is told apart:
// - An array whose first element is an object with a "role" key is treated
// as an OpenAI messages array; any other array is treated as Anthropic
// content blocks.
//
// Per-shape behavior:
// - String: every line whose left-trimmed text starts with the header is
// removed; other lines are kept as-is.
// - OpenAI messages: only messages[0] is looked at. If its role is not
// "system" the result is "". String content is line-stripped as above;
// array content has the text of its {"type":"text"} blocks concatenated
// with no separator and is then line-stripped.
// - Anthropic blocks: a text block whose text (after leading whitespace)
// starts with the header is dropped whole; the remaining text blocks are
// concatenated with no separator. Non-text blocks contribute nothing.
// - Empty array or any other JSON type: "".
//
// Returns the normalized text string that represents the system content
// for the purposes of cache-key construction. Volatile claude-code headers
// (blocks or lines starting with "x-anthropic-billing-header:") are REMOVED
// so that two requests differing only in the header value hash identically.
std::string normalize_system_for_cache(const json & system_or_messages);

} // namespace dflash::common
99 changes: 99 additions & 0 deletions server/test/test_server_unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "common/layer_split_backend.h"
#include "common/layer_split_utils.h"
#include "placement/draft_residency.h"
#include "server/prompt_normalize.h"
#include <nlohmann/json.hpp>

#include <cmath>
Expand Down Expand Up @@ -3262,6 +3263,96 @@ static void test_generate_result_accept_rate_zero_when_no_spec_decode() {
TEST_ASSERT(r.accept_rate == 0.0f);
}

// ═══════════════════════════════════════════════════════════════════════
// normalize_system_for_cache — header-strip tests
// ═══════════════════════════════════════════════════════════════════════

static void test_normalize_strips_billing_header_anthropic_array() {
// Anthropic system-as-array: one billing-header block + one real block.
json system_blocks = json::array({
{{"type", "text"},
{"text", "x-anthropic-billing-header: session=abc123 turn=4 ts=1749430000"}},
{{"type", "text"},
{"text", "You are a helpful coding assistant."}}
});
std::string out = dflash::common::normalize_system_for_cache(system_blocks);
TEST_ASSERT(out.find("x-anthropic-billing-header:") == std::string::npos);
TEST_ASSERT(out.find("helpful coding assistant") != std::string::npos);
}

static void test_normalize_strips_billing_header_openai_messages0() {
// OpenAI messages[0] system containing the billing header in content.
json messages = json::array({
{{"role", "system"},
{"content", "x-anthropic-billing-header: session=xyz789 turn=12 ts=1749431000\nYou are a code reviewer."}},
{{"role", "user"}, {"content", "Review this diff."}}
});
std::string out = dflash::common::normalize_system_for_cache(messages);
TEST_ASSERT(out.find("x-anthropic-billing-header:") == std::string::npos);
TEST_ASSERT(out.find("code reviewer") != std::string::npos);
}

static void test_normalize_idempotent_across_changing_header() {
// Two OpenAI messages arrays identical except the header turn value.
// normalize_system_for_cache must return EQUAL strings for both.
json messages_turn4 = json::array({
{{"role", "system"},
{"content", "x-anthropic-billing-header: session=S1 turn=4 ts=1749430000\nYou help with Rust."}},
{{"role", "user"}, {"content", "What is a lifetime?"}}
});
json messages_turn5 = json::array({
{{"role", "system"},
{"content", "x-anthropic-billing-header: session=S1 turn=5 ts=1749430060\nYou help with Rust."}},
{{"role", "user"}, {"content", "What is a lifetime?"}}
});
std::string out4 = dflash::common::normalize_system_for_cache(messages_turn4);
std::string out5 = dflash::common::normalize_system_for_cache(messages_turn5);
TEST_ASSERT(out4 == out5);
}

static void test_normalize_preserves_legit_system_content() {
// A normal system prompt containing no billing header must pass through unchanged.
json messages = json::array({
{{"role", "system"},
{"content", "You are an expert in C++ performance optimization."}},
{{"role", "user"}, {"content", "Help me optimize this loop."}}
});
std::string out = dflash::common::normalize_system_for_cache(messages);
TEST_ASSERT(out == "You are an expert in C++ performance optimization.");
}

static void test_normalize_handles_leading_whitespace_header() {
// Header block with leading whitespace must still be stripped.
json system_blocks = json::array({
{{"type", "text"},
{"text", " x-anthropic-billing-header: session=W1 turn=1 ts=1749432000"}},
{{"type", "text"},
{"text", "Be concise."}}
});
std::string out = dflash::common::normalize_system_for_cache(system_blocks);
TEST_ASSERT(out.find("x-anthropic-billing-header:") == std::string::npos);
TEST_ASSERT(out.find("Be concise.") != std::string::npos);
}

static void test_prefix_key_stable_across_header_change() {
// Two /v1/chat/completions-style messages arrays differing ONLY in the
// billing header value must normalize to EQUAL strings.
json messages_a = json::array({
{{"role", "system"},
{"content", "x-anthropic-billing-header: session=S2 turn=1 ts=1749440000\nYou are a senior engineer."}},
{{"role", "user"}, {"content", "What is RAII?"}}
});
json messages_b = json::array({
{{"role", "system"},
{"content", "x-anthropic-billing-header: session=S2 turn=7 ts=1749440420\nYou are a senior engineer."}},
{{"role", "user"}, {"content", "What is RAII?"}}
});
std::string norm_a = dflash::common::normalize_system_for_cache(messages_a);
std::string norm_b = dflash::common::normalize_system_for_cache(messages_b);
TEST_ASSERT(norm_a == norm_b);
TEST_ASSERT(norm_a.find("senior engineer") != std::string::npos);
}

int main() {
std::fprintf(stderr, "══════════════════════════════════════════\n");
std::fprintf(stderr, " Server Unit Tests\n");
Expand Down Expand Up @@ -3482,6 +3573,14 @@ int main() {
RUN_TEST(test_generate_result_accept_rate_in_usage_anthropic);
RUN_TEST(test_generate_result_accept_rate_zero_when_no_spec_decode);

std::fprintf(stderr, "\n── normalize_system_for_cache ──\n");
RUN_TEST(test_normalize_strips_billing_header_anthropic_array);
RUN_TEST(test_normalize_strips_billing_header_openai_messages0);
RUN_TEST(test_normalize_idempotent_across_changing_header);
RUN_TEST(test_normalize_preserves_legit_system_content);
RUN_TEST(test_normalize_handles_leading_whitespace_header);
RUN_TEST(test_prefix_key_stable_across_header_change);

std::fprintf(stderr, "\n══════════════════════════════════════════\n");
std::fprintf(stderr, " Results: %d assertions, %d failures\n",
test_count, test_failures);
Expand Down
Loading