Skip to content
Draft
70 changes: 70 additions & 0 deletions .github/workflows/test-cpu-variants.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
name: Test CPU Variants

on:
  workflow_dispatch:
    inputs:
      operation:
        description: 'Operation to test (e.g., MUL_MAT or full spec)'
        required: false
        default: 'MUL_MAT'
        type: string
      variant:
        description: 'CPU variant to test (leave empty to list available variants)'
        required: false
        default: ''
        type: string

jobs:
  test-cpu-variant-sve:
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Clone
        uses: actions/checkout@v4

      - name: Dependencies
        run: |
          sudo apt-get update
          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
          sudo apt-get update
          sudo apt-get install build-essential gcc-14 g++-14
          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
          gcc --version

      - name: Build with CPU reference backend
        run: |
          cmake -B build -S . \
            -DGGML_CPU_REF_BACKEND=ON \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_CPU_REPACK=ON \
            -DGGML_NATIVE=OFF \
            -DGGML_BACKEND_DL=ON \
            -DGGML_BLAS=OFF \
            -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=Release

          cmake --build build -j8

      - name: List available CPU variants
        run: |
          echo "Available CPU variants:"
          ./build/bin/test-backend-ops cpu-variants --list

      - name: Test CPU variant
        if: ${{ inputs.variant != '' }}
        # Pass workflow inputs through env vars instead of interpolating
        # ${{ ... }} directly into the script, to avoid shell injection via
        # crafted input values.
        env:
          VARIANT: ${{ inputs.variant }}
          OPERATION: ${{ inputs.operation }}
        run: |
          echo "Testing variant: $VARIANT"
          echo "Operation: $OPERATION"
          ./build/bin/test-backend-ops cpu-variants \
            --variant "$VARIANT" \
            -o "$OPERATION"

      - name: Instructions
        if: ${{ inputs.variant == '' }}
        run: |
          echo "=========================================="
          echo "No variant specified - only listed available variants above"
          echo "To test a specific variant, re-run this workflow with:"
          echo "  - variant: one of the variants listed above"
          echo "  - operation: your operation string (default: MUL_MAT)"
          echo "=========================================="
9 changes: 6 additions & 3 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,9 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

# extra artifacts
# extra build products; defaults follow GGML_STANDALONE so they are only
# built when ggml is the top-level project
option(GGML_BUILD_TESTS      "ggml: build tests"    ${GGML_STANDALONE})
option(GGML_CPU_REF_BACKEND  "ggml: build reference CPU backend for testing" OFF)
option(GGML_BUILD_EXAMPLES   "ggml: build examples" ${GGML_STANDALONE})

#
# dependencies
Expand Down Expand Up @@ -283,7 +284,9 @@ add_subdirectory(src)

if (GGML_BUILD_TESTS)
    enable_testing()
    # The tests directory may be absent when ggml is vendored into another
    # project, so guard the add_subdirectory call.
    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tests")
        add_subdirectory(tests)
    endif ()
endif ()

if (GGML_BUILD_EXAMPLES)
Expand Down
3 changes: 3 additions & 0 deletions ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,9 @@ extern "C" {
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
// Load and register every discoverable variant of the named backend
GGML_API void ggml_backend_load_all_variants(const char * name);
// Load and register a single named variant of the named backend
GGML_API void ggml_backend_load_variant(const char * name, const char * variant);

//
// Backend scheduler
Expand Down
1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ extern "C" {
//

// x86
GGML_BACKEND_API int ggml_cpu_has_sse2 (void);
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
Expand Down
27 changes: 27 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,33 @@ ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)

if (GGML_CPU_REF_BACKEND)
    # The reference backend is loaded at runtime next to the other CPU
    # variants, which is only possible with dynamic backend loading.
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_CPU_REF_BACKEND requires GGML_BACKEND_DL")
    endif()

    # Build a plain reference variant: disable every acceleration path so the
    # resulting backend is independent of SIMD extensions and vendor libraries.
    set(GGML_SYSTEM_ARCH "cpu-ref")
    set(GGML_LLAMAFILE OFF)
    set(GGML_CPU_HBM OFF)
    set(GGML_OPENMP OFF)
    set(GGML_CPU_KLEIDIAI OFF)
    set(GGML_CPU_REPACK OFF)
    set(GGML_ACCELERATE OFF)

    ggml_add_cpu_backend_variant(ref)

    # On ARM the compiler predefines feature macros regardless of our CMake
    # settings; undefine them so no NEON/SVE code paths are compiled in.
    # NOTE(review): this should probably key off the detected GGML_SYSTEM_ARCH
    # (captured before it is overwritten above) rather than
    # CMAKE_SYSTEM_PROCESSOR -- TODO confirm with maintainers.
    if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM|AARCH64")
        target_compile_options(ggml-cpu-ref PRIVATE
            -U__ARM_NEON
            -U__ARM_FEATURE_FMA
            -U__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            -U__ARM_FEATURE_DOTPROD
            -U__ARM_FEATURE_MATMUL_INT8
            -U__ARM_FEATURE_SVE
        )
    endif()

    target_compile_definitions(ggml PRIVATE GGML_USE_CPU_REF)
endif()

foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
Expand Down
72 changes: 72 additions & 0 deletions ggml/src/ggml-backend-reg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,4 +605,76 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
if (backend_path) {
ggml_backend_load(backend_path);
}
#ifdef GGML_USE_CPU_REF
ggml_backend_load_best("cpu-ref", silent, dir_path);
#endif
}

void ggml_backend_load_all_variants(const char * name) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
const fs::path name_path = fs::u8path(name);
const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
const fs::path file_extension = backend_filename_extension();

std::vector<fs::path> search_paths;
#ifdef GGML_BACKEND_DIR
search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
#endif
// default search paths: executable directory, current directory
search_paths.push_back(get_executable_path());
search_paths.push_back(fs::current_path());

for (const auto & search_path : search_paths) {
if (!fs::exists(search_path)) {
GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
continue;
}
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
auto filename = entry.path().filename();
auto ext = entry.path().extension();
if (filename.native().find(file_prefix.native()) == 0 && ext == file_extension) {
fs::path path = search_path / filename;
ggml_backend_reg_t backend = get_reg().load_backend(path, false);
if (backend == nullptr) {
GGML_LOG_ERROR("%s: failed to load backend variant %s\n", __func__, path_str(entry.path()).c_str());
}

}
}
}
}
}

// Load and register a single variant of the named backend, i.e. the first
// file named [lib]ggml-<name>-<variant>.[so|dll|dylib] found in the backend
// search paths. Stops at the first successful load.
void ggml_backend_load_variant(const char * name, const char * variant) {
    const fs::path name_path       = fs::u8path(name);
    const fs::path variant_path    = fs::u8path(variant);
    const fs::path file_prefix     = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
    const fs::path target_filename = file_prefix.native() + variant_path.native() + backend_filename_extension().native();

    // search order: install dir (if configured), executable dir, cwd
    std::vector<fs::path> search_paths;
#ifdef GGML_BACKEND_DIR
    search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
#endif
    search_paths.push_back(get_executable_path());
    search_paths.push_back(fs::current_path());

    for (const auto & search_path : search_paths) {
        if (!fs::exists(search_path)) {
            GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
            continue;
        }

        fs::path full_path = search_path / target_filename;
        if (fs::exists(full_path) && fs::is_regular_file(full_path)) {
            ggml_backend_reg_t backend = get_reg().load_backend(full_path, false);
            if (backend == nullptr) {
                GGML_LOG_ERROR("%s: failed to load backend variant %s\n", __func__, path_str(full_path).c_str());
            } else {
                return;
            }
        }
    }

    // fix: previously this returned silently when no matching file existed in
    // any search path, leaving the caller with no indication of the failure
    GGML_LOG_ERROR("%s: backend variant %s-%s not found in any search path\n", __func__, name, variant);
}
6 changes: 6 additions & 0 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)

# Embed the variant name so each CPU backend build can report a distinct
# name at runtime (e.g. "CPU-ref" vs plain "CPU").
# Fix: compare against the empty string explicitly -- the bare if(tag_name)
# truthiness probe would treat a tag named "0"/"OFF"/"NOTFOUND" as unset.
if (NOT "${tag_name}" STREQUAL "")
    target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_CPU_VARIANT_NAME="CPU-${tag_name}")
else()
    target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_CPU_VARIANT_NAME="CPU")
endif()

if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -3443,6 +3443,14 @@ int ggml_cpu_has_llamafile(void) {
#endif
}

// Returns 1 if this build was compiled with SSE2 support, 0 otherwise.
int ggml_cpu_has_sse2(void) {
    // MSVC never defines __SSE2__: on x64 SSE2 is always available (_M_X64),
    // and on 32-bit x86 /arch:SSE2 or higher sets _M_IX86_FP >= 2.
#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_sse3(void) {
#if defined(__SSE3__)
return 1;
Expand Down
7 changes: 5 additions & 2 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ struct ggml_backend_cpu_context {
};

static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
    // Report the build-time variant name (e.g. "CPU-ref") instead of a
    // hard-coded "CPU" so loaded variants are distinguishable.
    // (Removed stale pre-change `return "CPU";` duplicated by the diff view.)
    return GGML_CPU_VARIANT_NAME;

    GGML_UNUSED(backend);
}
Expand Down Expand Up @@ -337,7 +337,7 @@ struct ggml_backend_cpu_device_context {
};

static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
    // Same as the backend name: expose the build-time variant name.
    // (Removed stale pre-change `return "CPU";` duplicated by the diff view.)
    return GGML_CPU_VARIANT_NAME;

    GGML_UNUSED(dev);
}
Expand Down Expand Up @@ -516,6 +516,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
ggml_cpu_init();

std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse2()) {
features.push_back({ "SSE2", "1" });
}
if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" });
}
Expand Down
39 changes: 38 additions & 1 deletion ggml/src/ggml-cpu/repack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1869,8 +1869,45 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return nullptr;
}

// Returns true when the repack path can handle this op: a MUL_MAT (2-D
// weights) or MUL_MAT_ID (3-D weights) whose weight tensor has a repack
// trait, whose activations are 2-D F32, and whose activation buffer (if any)
// is host-resident.
static bool supports_tensor(const struct ggml_tensor * op) {
    int required_src0_dims;
    switch (op->op) {
        case GGML_OP_MUL_MAT:    required_src0_dims = 2; break;
        case GGML_OP_MUL_MAT_ID: required_src0_dims = 3; break;
        default:                 return false;
    }

    if (!op->src[0]->buffer ||
        ggml_n_dims(op->src[0]) != required_src0_dims ||
        ggml_n_dims(op->src[1]) != 2 ||
        !ggml_repack_get_optimal_repack_type(op->src[0])) {
        return false;
    }

    // activations allocated in a non-host buffer cannot be repacked
    if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
        return false;
    }

    return op->src[1]->type == GGML_TYPE_F32;
}

static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
if (tensor->op == GGML_OP_NONE) {
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
tensor->buffer = buffer;
}

if (supports_tensor(tensor)) {
tensor->src[0]->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor->src[0]));
tensor->src[0]->buffer = buffer;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understand the logic for this change - would need some clarification.

Copy link
Member Author

@danbev danbev Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In test-backend-ops the CPU ref backend has it tensors copied (the order is switched, see comment for details) to the backend variant. Then in ggml_backend_graph_copy, which has been copied into test-backend-ops, will call ggml_backend_buffer_init_tensor which is how we end up in this function when running the tests.

Before this change the passed-in tensor would always get its extra tensor set, either to null or to a trait if the tensor's type is supported and the underlying backend also supports it.

There are test in test-backend-ops that use broadcasting, for example setting nr to either [2, 1] or [1, 2], and when using repack this causes these specific tests to fail because repack does not support broadcasting as far as I can tell. supports_tensor checks the dimensions of src[1] that is 2 in addition to some other checks to avoids this situation.

The setting of the buffer is really only required for the tests. When we copy the CPU ref backend's tensors they have the cpu buffer as their backend buffer, but we need this to be the repack buffer, so that repack's get_tensor_traits works and returns the trait. Otherwise the function ggml_cpu_extra_compute_forward called in ggml_compute_forward will not forward the call to repack.

I hope I've been able to explain the motivation for this but please let me know if that is not the case.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is definitely not ok, you can't change the buffer of a tensor during init_tensor, and you absolutely cannot change the buffer of a source tensor.
test-backend-ops needs to be modified to use the extra buffer types in the way they are intended - that is, only allocating the weights on the extra buffer type, and everything else on the default buffer type. This will likely mean either creating a new type of test, or making deep changes to the way tests are defined. Hacking the backend code to make it work is not ok.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed this logic from the repack.cpp. I've just moved the code to test-backend-ops for now to enable the test to pass so I have a something that works. And I'll take another stab at this tomorrow.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've made another attempt at this in 8c3eb6c. Is this more in line with what you had in mind @slaren?
Let me know if I can rebase this PR to make reviewing easier.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really understand what you are trying to do here. I think it is better to just do the CPU variant testing for now, and take more time to think about how to test the extra buffer types.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry about my late reply on this. Yeah that sounds good, I'll remove the repack testing and try to take another approach later 👍


GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
Expand Down
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ if (NOT LLAMA_SANITIZE_ADDRESS)
endif()
llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)
target_sources(test-backend-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src/ggml.c)
target_compile_definitions(test-backend-ops PRIVATE GGML_BUILD GGML_VERSION=\"${GGML_VERSION}\" GGML_COMMIT=\"${GGML_COMMIT}\")
target_include_directories(test-backend-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")
Expand Down
Loading
Loading