Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,11 @@ if(DFLASH27B_TESTS)
target_link_libraries(test_adaptive_keep_ratio PRIVATE dflash_common)
add_test(NAME adaptive_keep COMMAND test_adaptive_keep_ratio)
endif()
# Unit test for the derived-scalar shape check. Built only when the test source
# exists in this tree. The target is given include directories but no
# target_link_libraries() call in this block.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_derived_scalars.cpp")
add_executable(test_derived_scalars test/test_derived_scalars.cpp)
target_include_directories(test_derived_scalars PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
# Registered with CTest under the name "derived_scalars".
add_test(NAME derived_scalars COMMAND test_derived_scalars)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_bandit_integration.cpp")
add_executable(test_bandit_integration test/test_bandit_integration.cpp)
target_include_directories(test_bandit_integration PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
Expand Down
67 changes: 67 additions & 0 deletions server/src/common/derived_scalars.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Pure helper: verify that tensor-shape-derived scalars match GGUF-declared
// metadata. No IO; safe to call from any loader after weights are loaded.
//
// Returns true when derived == declared for all three dimensions.
// On mismatch fills `err` with a diagnostic and returns false.
//
// Callers must compute the *expected* values from their declared scalars:
// draft loader : expected_q_dim = n_head * head_dim
// expected_kv_dim = n_head_kv * head_dim
// qwen35 target : expected_q_dim = n_head * n_embd_head_k * 2 (Q+gate packed)
// expected_kv_dim = n_head_kv * n_embd_head_k
// Both loaders: expected_n_embd = n_embd (wq->ne[0] = input projection dim).
//
// Equivalent pattern for gemma4 lives inline in gemma4_backend.cpp (~line 1072)
// as a silent override rather than an assertion; kept separate intentionally.

#pragma once

#include <cstdint>
#include <cstdio>
#include <string>

namespace dflash::common {

// verify_derived_scalars
//   Compares three tensor dimensions against the values the caller computed
//   from its declared metadata. The checks run in the order q, kv, n_embd and
//   stop at the first mismatch.
//
//   wq_ne1         : weight_q->ne[1] (output dim of Q projection)
//   wk_ne1         : weight_k->ne[1] (output dim of K projection)
//   wq_ne0         : weight_q->ne[0] (input dim of Q projection == n_embd)
//   expected_q_dim : n_head * head_dim [* 2 for packed Q+gate]
//   expected_kv_dim: n_head_kv * head_dim
//   expected_n_embd: n_embd
//   layer_tag      : short string for the error message (e.g. "blk.0" or
//                    "blk.3"); a null pointer is reported as "(unknown layer)"
//   err            : overwritten with a diagnostic on mismatch; left untouched
//                    when every dimension matches
//
//   Returns true when all three dimensions match, false otherwise.
inline bool verify_derived_scalars(
        int64_t wq_ne1, int64_t wk_ne1, int64_t wq_ne0,
        int64_t expected_q_dim, int64_t expected_kv_dim, int64_t expected_n_embd,
        const char * layer_tag,
        std::string & err)
{
    // Passing a null pointer to a %s conversion is undefined behaviour, so
    // substitute a placeholder before formatting.
    const char * const tag = (layer_tag != nullptr) ? layer_tag : "(unknown layer)";

    // Compares one dimension. On mismatch, formats the diagnostic into `err`
    // and reports failure; on match, leaves `err` alone.
    const auto dims_agree = [&](int64_t actual, int64_t expected,
                                const char * actual_name,
                                const char * expected_name) -> bool {
        if (actual == expected) {
            return true;
        }
        char buf[256];
        std::snprintf(buf, sizeof(buf),
                      "GGUF shape mismatch: %s %s=%lld != %s=%lld",
                      tag, actual_name, static_cast<long long>(actual),
                      expected_name, static_cast<long long>(expected));
        err = buf;
        return false;
    };

    // && short-circuits, so only the first failing dimension is reported.
    return dims_agree(wq_ne1, expected_q_dim,  "attn_q.weight->ne[1]", "expected_q_dim")
        && dims_agree(wk_ne1, expected_kv_dim, "attn_k.weight->ne[1]", "expected_kv_dim")
        && dims_agree(wq_ne0, expected_n_embd, "attn_q.weight->ne[0]", "n_embd");
}

} // namespace dflash::common
36 changes: 36 additions & 0 deletions server/src/draft/draft_gguf_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
// blk.<i>.ffn_down.weight [hidden, intermediate] Q8_0 / F16

#include "internal.h"
#include "common/derived_scalars.h"

#include <cinttypes>
#include <cstdint>
Expand Down Expand Up @@ -349,6 +350,41 @@ bool load_draft_gguf(const std::string & path,

gguf_free(gctx);

// Structural defense: check that the projection sizes read from the loaded
// weight tensors agree with the products computed from the GGUF-declared
// scalars (n_head, n_head_kv, head_dim, n_embd). Individual scalars are not
// re-derived here; only the products are compared.
// All draft layers have wq/wk (no deltanet mix), so layer 0 suffices.
// wq: [n_embd, n_head*head_dim], ne[1]=n_head*head_dim, ne[0]=n_embd.
// wk: [n_embd, n_head_kv*head_dim], ne[1]=n_head_kv*head_dim.
{
// NOTE(review): layers[0] is indexed and wq/wk are dereferenced below without
// a check in this block — presumably earlier loading code guarantees at least
// one layer with both tensors bound; confirm.
const DraftLayer & L0 = out.layers[0];
// Widen to int64_t before multiplying so the products are computed in 64 bits.
const int64_t exp_q_dim = (int64_t)out.n_head * out.head_dim;
const int64_t exp_kv_dim = (int64_t)out.n_head_kv * out.head_dim;
const int64_t exp_n_embd = (int64_t)out.n_embd;
std::string err;
if (!dflash::common::verify_derived_scalars(
L0.wq->ne[1], L0.wk->ne[1], L0.wq->ne[0],
exp_q_dim, exp_kv_dim, exp_n_embd,
"blk.0", err)) {
// Surface the helper's diagnostic through the loader's error channel.
set_last_error(err);
return false;
}
// fc: [n_target_layers*n_embd, n_embd] — ne[0] = n_target_layers*n_embd.
// The fc check is skipped entirely when n_target_layers is not positive.
if (out.n_target_layers > 0) {
// NOTE(review): out.fc is dereferenced without a null check here —
// presumably a missing fc tensor is rejected earlier; confirm.
const int64_t derived_fc_in = out.fc->ne[0];
const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd;
if (derived_fc_in != expected_fc_in) {
char buf[256];
std::snprintf(buf, sizeof(buf),
"GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld "
"!= n_target_layers*n_embd=%d*%d=%lld",
(long long)derived_fc_in,
out.n_target_layers, out.n_embd, (long long)expected_fc_in);
set_last_error(buf);
return false;
}
}
}

char summary[192];
std::snprintf(summary, sizeof(summary),
"draft GGUF loaded: %" PRId64 " tensors, %.2f GiB on GPU",
Expand Down
26 changes: 26 additions & 0 deletions server/src/qwen35/gguf_target_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
// tensor's bytes from the mmap'd file.

#include "internal.h"
#include "common/derived_scalars.h"
#include "common/layer_split_utils.h"

#include <cinttypes>
Expand Down Expand Up @@ -738,6 +739,31 @@ bool load_target_gguf_partial(const std::string & path,

gguf_free(gctx);

// Structural defense: check that the projection sizes read from the loaded
// weight tensors agree with the products computed from the GGUF-declared
// scalars (n_head, n_head_kv, n_embd_head_k, n_embd). Individual scalars are
// not re-derived here; only the products are compared.
// Uses the first full-attention layer; deltanet layers don't carry wq/wk.
// wq packs Q+gate: ne[1] = n_head * n_embd_head_k * 2.
// wk: ne[1] = n_head_kv * n_embd_head_k. wq: ne[0] = n_embd.
{
// NOTE(review): fa_il is negative when full_attention_interval is 0 or less,
// and it is not range-checked against the layer count in this block —
// presumably both are validated earlier; confirm.
const int fa_il = out.full_attention_interval - 1;
const TargetLayer & fa = out.layers[(size_t)fa_il];
// The check is skipped (not failed) when either tensor pointer is null —
// presumably the case when a partial load did not include this layer; verify.
if (fa.wq && fa.wk) {
// Widen to int64_t before multiplying so the products are computed in 64 bits.
const int64_t exp_q_dim = (int64_t)out.n_head * out.n_embd_head_k * 2;
const int64_t exp_kv_dim = (int64_t)out.n_head_kv * out.n_embd_head_k;
const int64_t exp_n_embd = (int64_t)out.n_embd;
// Tag names the layer actually inspected so the diagnostic points at it.
char tag[16];
std::snprintf(tag, sizeof(tag), "blk.%d", fa_il);
std::string err;
if (!dflash::common::verify_derived_scalars(
fa.wq->ne[1], fa.wk->ne[1], fa.wq->ne[0],
exp_q_dim, exp_kv_dim, exp_n_embd,
tag, err)) {
// Surface the helper's diagnostic through the loader's error channel.
set_last_error(err);
return false;
}
}
}

if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) {
set_last_error("token_embd.weight not found or invalid type");
return false;
Expand Down
132 changes: 132 additions & 0 deletions server/test/test_derived_scalars.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Unit tests for dflash::common::verify_derived_scalars — no GPU, no model files.
//
// Build: cmake --build build --target test_derived_scalars -j
// Run: cd build && ctest -R derived_scalars --output-on-failure

#include "common/derived_scalars.h"

#include <cstdio>
#include <string>

using namespace dflash::common;

// ─── Minimal test framework ────────────────────────────────────────────────────

// Running totals across the whole executable: test_count is the number of
// TEST_ASSERT evaluations, test_failures the number that evaluated false.
static int test_failures = 0;
static int test_count = 0;

// TEST_ASSERT(expr): counts the assertion and, when expr is false, counts a
// failure and prints file, line and the expression text to stderr. It does not
// return or abort, so later assertions in the same test still run.
#define TEST_ASSERT(expr) do { \
test_count++; \
if (!(expr)) { \
test_failures++; \
std::fprintf(stderr, " FAIL: %s:%d: %s\n", __FILE__, __LINE__, #expr); \
} \
} while (0)

// RUN_TEST(fn): prints the test name, calls fn(), then prints " ok" when the
// failure counter did not move during the call, or just a newline when it did.
#define RUN_TEST(fn) do { \
std::fprintf(stderr, " %s ...", #fn); \
int before = test_failures; \
fn(); \
if (test_failures == before) std::fprintf(stderr, " ok\n"); \
else std::fprintf(stderr, "\n"); \
} while (0)

// ─── Tests ─────────────────────────────────────────────────────────────────────

// Happy path: each derived dimension equals its expected counterpart, so the
// check succeeds and the error string stays empty.
static void match_returns_true() {
    const int64_t q_dim  = 4096;
    const int64_t kv_dim = 512;
    const int64_t n_embd = 5120;

    std::string err;
    const bool ok = verify_derived_scalars(
        q_dim, kv_dim, n_embd,   // derived from tensor shapes
        q_dim, kv_dim, n_embd,   // expected from declared metadata
        "blk.0", err);

    TEST_ASSERT(ok);
    TEST_ASSERT(err.empty());
}

// wq_ne1 != expected_q_dim: returns false, and the diagnostic names the Q
// projection's output dimension (so a different check firing by mistake would
// be caught).
static void mismatch_q_dim_returns_false() {
    std::string err;
    bool ok = verify_derived_scalars(
        /*wq_ne1*/ 4096 + 1, /*wk_ne1*/ 512, /*wq_ne0*/ 5120,
        /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120,
        "blk.0", err);
    TEST_ASSERT(!ok);
    TEST_ASSERT(!err.empty());
    TEST_ASSERT(err.find("attn_q.weight->ne[1]") != std::string::npos);
}

// wk_ne1 != expected_kv_dim: returns false, and the diagnostic names the K
// projection's output dimension (so a different check firing by mistake would
// be caught).
static void mismatch_kv_dim_returns_false() {
    std::string err;
    bool ok = verify_derived_scalars(
        /*wq_ne1*/ 4096, /*wk_ne1*/ 512 + 1, /*wq_ne0*/ 5120,
        /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120,
        "blk.0", err);
    TEST_ASSERT(!ok);
    TEST_ASSERT(!err.empty());
    TEST_ASSERT(err.find("attn_k.weight->ne[1]") != std::string::npos);
}

// wq_ne0 != expected_n_embd: returns false, and the diagnostic names the Q
// projection's input dimension (so a different check firing by mistake would
// be caught).
static void mismatch_n_embd_returns_false() {
    std::string err;
    bool ok = verify_derived_scalars(
        /*wq_ne1*/ 4096, /*wk_ne1*/ 512, /*wq_ne0*/ 5120 + 64,
        /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120,
        "blk.0", err);
    TEST_ASSERT(!ok);
    TEST_ASSERT(!err.empty());
    TEST_ASSERT(err.find("attn_q.weight->ne[0]") != std::string::npos);
}

// Draft-model shaped inputs: 32 heads and 8 KV heads of width 128, n_embd 5120.
// The expected products (32*128 = 4096, 8*128 = 1024) are computed the way the
// draft loader computes them and must equal the literal tensor dims.
static void draft_dims_match() {
    const int64_t n_head    = 32;
    const int64_t n_head_kv = 8;
    const int64_t head_dim  = 128;

    std::string err;
    const bool ok = verify_derived_scalars(
        4096, 1024, 5120,
        n_head * head_dim, n_head_kv * head_dim, 5120,
        "blk.0", err);

    TEST_ASSERT(ok);
    TEST_ASSERT(err.empty());
}

// qwen35-target shaped inputs: 24 heads and 4 KV heads of width 256.
// Q and gate are packed together, so the expected Q dim carries a factor of 2:
//   24 * 256 * 2 = 12288;  the expected KV dim is 4 * 256 = 1024.
static void qwen35_target_dims_match() {
    const int64_t n_head        = 24;
    const int64_t n_head_kv     = 4;
    const int64_t n_embd_head_k = 256;

    const int64_t exp_q_dim  = n_head * n_embd_head_k * 2;
    const int64_t exp_kv_dim = n_head_kv * n_embd_head_k;

    std::string err;
    const bool ok = verify_derived_scalars(
        /*wq_ne1*/ 12288, /*wk_ne1*/ 1024, /*wq_ne0*/ 5120,
        exp_q_dim, exp_kv_dim, /*exp_n_embd*/ 5120,
        "blk.3", err);

    TEST_ASSERT(ok);
    TEST_ASSERT(err.empty());
}

// Error message must contain the layer tag for easy diagnosis.
// The call is first confirmed to have failed, so the tag lookup is known to be
// inspecting a real diagnostic.
static void err_contains_layer_tag() {
    std::string err;
    bool ok = verify_derived_scalars(
        4097, 1024, 5120,
        4096, 1024, 5120,
        "blk.15", err);
    TEST_ASSERT(!ok);
    TEST_ASSERT(err.find("blk.15") != std::string::npos);
}

// ─── main ──────────────────────────────────────────────────────────────────────

// Runs every test case in declaration order, prints a summary line to stderr,
// and exits with 0 only when no assertion failed.
int main() {
    std::fputs("=== test_derived_scalars ===\n", stderr);

    RUN_TEST(match_returns_true);
    RUN_TEST(mismatch_q_dim_returns_false);
    RUN_TEST(mismatch_kv_dim_returns_false);
    RUN_TEST(mismatch_n_embd_returns_false);
    RUN_TEST(draft_dims_match);
    RUN_TEST(qwen35_target_dims_match);
    RUN_TEST(err_contains_layer_tag);

    std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures);

    if (test_failures != 0) {
        return 1;
    }
    return 0;
}
Loading