Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 47 additions & 102 deletions dflash/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,59 +44,6 @@ endif()
# the spec_prefill demo (target_gen path uses standard quant pairs).
# User-facing switch (default ON). Its value is mirrored into the
# GGML_CUDA_FA_ALL_QUANTS cache entry on the next line.
option(DFLASH27B_FA_ALL_QUANTS "Compile ggml-cuda fattn kernels for all KV-quant pairs" ON)
# FORCE overwrites any existing GGML_CUDA_FA_ALL_QUANTS cache value on every
# configure, so DFLASH27B_FA_ALL_QUANTS is the only knob that takes effect.
# NOTE(review): presumably read by the ggml subtree added via add_subdirectory — confirm.
set(GGML_CUDA_FA_ALL_QUANTS ${DFLASH27B_FA_ALL_QUANTS} CACHE BOOL "" FORCE)

# Resolve the CUDA architecture list up-front so downstream logic (notably
# the consumer-Blackwell ggml workaround below) can inspect the actual
# arches nvcc will compile for. The dflash27b target itself is created
# later; its CUDA_ARCHITECTURES property is applied via
# set_target_properties once the target exists.
#
# Turing (75) and Ampere (86) always; Blackwell consumer (120) and Thor
# (110 on CUDA 13+) added when nvcc supports them. DGX Spark /
# GB10 is compute capability 12.1 (121), added at CUDA 12.9+.
# Populate _dflash27b_archs.
#   - DFLASH27B_USER_CUDA_ARCHITECTURES, when it evaluates true, is used verbatim.
#   - Otherwise start from 75;86 and append 120 / 110 / 121 (in that order)
#     depending on the nvcc version reported by CMAKE_CUDA_COMPILER_VERSION.
if(NOT DFLASH27B_USER_CUDA_ARCHITECTURES)
  set(_dflash27b_archs 75 86)

  # 120 requires nvcc >= 12.8.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
    list(APPEND _dflash27b_archs 120)
  endif()

  # 110 requires nvcc >= 13.0.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
    list(APPEND _dflash27b_archs 110)
  endif()

  # 121 requires nvcc >= 12.9.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
    list(APPEND _dflash27b_archs 121)
  endif()
else()
  set(_dflash27b_archs "${DFLASH27B_USER_CUDA_ARCHITECTURES}")
endif()

# Consumer Blackwell workaround: skip sm_12x→sm_12xa replacement and FP4
# mmq kernels that can trigger illegal-instruction faults on consumer chips.
# By default, auto-enable when the resolved CUDA arch list includes a 12x
# entry. Set DFLASH27B_USE_BLACKWELL_CONSUMER_FIX=ON to force this behavior
# explicitly (for cross-compiles or custom arch lists).
# Manual override for the consumer-Blackwell workaround. Default OFF; when ON
# the arch-list inspection below is skipped and the workaround is enabled.
option(DFLASH27B_USE_BLACKWELL_CONSUMER_FIX
  "Enable ggml consumer-Blackwell workaround (skip sm_12x→sm_12xa, exclude FP4 mmq kernels)"
  OFF)

if(DFLASH27B_USE_BLACKWELL_CONSUMER_FIX)
  # Explicit request: no need to look at the arch list.
  set(_dflash_is_consumer_blackwell ON)
elseif(NOT DEFINED _dflash_is_consumer_blackwell)
  # Auto-detect from the project's own arch list (_dflash27b_archs) rather
  # than CMAKE_CUDA_ARCHITECTURES.
  set(_dflash_is_consumer_blackwell OFF)
  foreach(_arch IN LISTS _dflash27b_archs)
    # Strip every non-digit character (e.g. a trailing "a" or "-real") so
    # only the numeric part of the entry is matched.
    string(REGEX REPLACE "[^0-9]" "" _dflash_arch_num "${_arch}")
    if(_dflash_arch_num MATCHES "^12[0-9]$")
      # One 12x entry is enough; stop scanning.
      set(_dflash_is_consumer_blackwell ON)
      break()
    endif()
  endforeach()
endif()

# Publish the decision to the cache.
# NOTE(review): this block only ever forces the entry ON; nothing here resets
# it to OFF on a later configure with a different arch list — confirm intended.
if(_dflash_is_consumer_blackwell)
  set(GGML_CUDA_BLACKWELL_CONSUMER ON
      CACHE BOOL "Skip sm_12X→sm_12Xa for consumer Blackwell (no FP4)" FORCE)
endif()
# Use only the ggml subtree of llama.cpp (skip libllama).
# EXCLUDE_FROM_ALL keeps the subdirectory's targets out of the default "all"
# build; they are built only when a target in this project depends on them.
add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)

Expand All @@ -119,12 +66,8 @@ add_library(dflash27b STATIC
src/flashprefill_q8.cpp
src/kv_cache.cpp
src/kv_quant.cpp
src/f16_convert.cu
src/delta_net_chunked.cpp
# Laguna-XS.2 (Poolside) target arch
src/laguna_target_loader.cpp
src/laguna_target_graph.cpp
src/laguna_daemon.cpp
src/sampler.cpp
)
# FlashPrefill custom CUDA kernels need BF16 WMMA (sm_80+). On Turing (sm_75)
# the drafter uses ggml's flash_attn_ext instead. Guard added after SM check.
Expand All @@ -136,8 +79,23 @@ if(NOT DEFINED DFLASH27B_ENABLE_BSA)
set(DFLASH27B_ENABLE_BSA ON)
endif()

# Apply the arch list resolved above (before add_subdirectory, so the
# consumer-Blackwell workaround can inspect it) to the dflash27b target.
# Turing (75) and Ampere (86) always; Blackwell consumer (120) and Thor
# (110 on CUDA 13+) added when nvcc supports them. DGX Spark /
# GB10 is compute capability 12.1 (121), added at CUDA 12.9+.
# Resolve _dflash27b_archs and attach it to the dflash27b target.
# A truthy DFLASH27B_USER_CUDA_ARCHITECTURES is taken as-is; otherwise the
# list starts at 75;86 and grows with the nvcc version.
if(NOT DFLASH27B_USER_CUDA_ARCHITECTURES)
  set(_dflash27b_archs 75 86)

  # 120 requires nvcc >= 12.8.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
    list(APPEND _dflash27b_archs 120)
  endif()

  # 110 requires nvcc >= 13.0.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
    list(APPEND _dflash27b_archs 110)
  endif()

  # 121 requires nvcc >= 12.9.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
    list(APPEND _dflash27b_archs 121)
  endif()
else()
  set(_dflash27b_archs "${DFLASH27B_USER_CUDA_ARCHITECTURES}")
endif()

# The value is quoted so the whole ;-separated list lands in one property.
set_property(TARGET dflash27b PROPERTY CUDA_ARCHITECTURES "${_dflash27b_archs}")

# Extract the minimum SM from the arch list so safetensors_draft.cpp can decide
Expand Down Expand Up @@ -201,12 +159,6 @@ if(DFLASH27B_ENABLE_BSA)
${CMAKE_CURRENT_SOURCE_DIR}/deps/Block-Sparse-Attention/csrc/block_sparse_attn/src)
target_compile_options(dflash27b PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
target_compile_definitions(dflash27b PRIVATE FLASHATTENTION_DISABLE_DROPOUT FLASH_NAMESPACE=flash DFLASH27B_HAVE_BSA=1)
# MSVC's <cmath> hides POSIX M_* macros (M_LOG2E etc.) unless _USE_MATH_DEFINES
# is set before any cmath include. BSA's softmax.h relies on M_LOG2E; define
# globally on the target so it precedes every TU's first <cmath> include.
if(WIN32)
target_compile_definitions(dflash27b PRIVATE _USE_MATH_DEFINES)
endif()
endif()

target_link_libraries(dflash27b
Expand Down Expand Up @@ -238,7 +190,11 @@ endif()

option(DFLASH27B_TESTS "Build numerics tests" ON)
if(DFLASH27B_TESTS)
if(_dflash27b_min_sm GREATER_EQUAL 80 AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp")
# FlashPrefill kernels are only compiled into dflash27b on sm_80+
# (see DFLASH27B_HAVE_FLASHPREFILL guard above). On legacy arches the
# test would fail to link because the kernel symbols are absent.
if(_dflash27b_min_sm GREATER_EQUAL 80
AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp")
add_executable(test_flashprefill_kernels test/test_flashprefill_kernels.cpp)
set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash27b_archs}")
target_link_libraries(test_flashprefill_kernels PRIVATE dflash27b CUDA::cudart)
Expand All @@ -248,6 +204,11 @@ if(DFLASH27B_TESTS)
target_include_directories(test_kv_quant PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_kv_quant PRIVATE dflash27b)
endif()
# Optional test: only configured when its source file is present in test/.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_swa_mask_contract.cpp")
  add_executable(test_draft_swa_mask_contract
    test/test_draft_swa_mask_contract.cpp)
  # Headers are taken from src/.
  target_include_directories(test_draft_swa_mask_contract
    PRIVATE
      ${CMAKE_CURRENT_SOURCE_DIR}/src)
  target_link_libraries(test_draft_swa_mask_contract
    PRIVATE
      dflash27b
      ggml
      ggml-cuda)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp")
add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp)
target_link_libraries(test_draft_vs_reference PRIVATE dflash27b)
Expand Down Expand Up @@ -282,36 +243,6 @@ if(DFLASH27B_TESTS)
target_include_directories(smoke_load_target PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(smoke_load_target PRIVATE dflash27b ggml ggml-cuda)
endif()
# Laguna smoke tests, benchmarks and daemon test. Every entry follows the same
# recipe: if test/<name>.cpp exists, build an executable called <name> from it,
# add src/ to its include path, and link it against dflash27b, ggml and
# ggml-cuda. Order matches the original stanza order.
foreach(_laguna_exe IN ITEMS
    smoke_load_target_laguna
    smoke_laguna_forward
    bench_laguna_ttft
    bench_laguna_pflash
    bench_laguna_generate
    test_laguna_daemon)
  if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/${_laguna_exe}.cpp")
    add_executable(${_laguna_exe} test/${_laguna_exe}.cpp)
    target_include_directories(${_laguna_exe} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
    target_link_libraries(${_laguna_exe} PRIVATE dflash27b ggml ggml-cuda)
  endif()
endforeach()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp")
add_executable(smoke_target_forward test/smoke_target_forward.cpp)
target_include_directories(smoke_target_forward PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
Expand All @@ -323,18 +254,32 @@ if(DFLASH27B_TESTS)
target_link_libraries(test_generate PRIVATE dflash27b ggml ggml-cuda)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_dflash.cpp")
set(_dflash_test_dflash_libs dflash27b ggml ggml-cuda CUDA::cudart)
add_executable(test_dflash test/test_dflash.cpp)
target_include_directories(test_dflash PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_dflash PRIVATE dflash27b ggml ggml-cuda)
target_link_libraries(test_dflash PRIVATE ${_dflash_test_dflash_libs})
# test_dflash uses cudaMemcpyAsync / cudaMemcpy2DAsync directly for the
# --fast-rollback path (per-step SSM intermediate state commit). Needs
# the CUDA runtime on its own link line.
find_package(CUDAToolkit REQUIRED)
target_link_libraries(test_dflash PRIVATE CUDA::cudart)
# OpenMP for parallel CPU top-K extraction in the ddtree path.
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
option(DFLASH27B_TEST_DFLASH_OPENMP "Enable OpenMP for test_dflash CPU top-K extraction" OFF)
if(DFLASH27B_TEST_DFLASH_OPENMP)
# OpenMP for parallel CPU top-K extraction in the ddtree path.
find_package(OpenMP REQUIRED COMPONENTS CXX)
target_link_libraries(test_dflash PRIVATE OpenMP::OpenMP_CXX)
endif()
if(WIN32)
option(DFLASH27B_BUILD_TEST_DFLASH_LINKCHECK
"Build an alternate-output test_dflash binary so Windows relinks still work while test_dflash.exe is held by a live daemon"
ON)
if(DFLASH27B_BUILD_TEST_DFLASH_LINKCHECK)
add_executable(test_dflash_linkcheck test/test_dflash.cpp)
target_include_directories(test_dflash_linkcheck PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_dflash_linkcheck PRIVATE ${_dflash_test_dflash_libs})
if(DFLASH27B_TEST_DFLASH_OPENMP)
target_link_libraries(test_dflash_linkcheck PRIVATE OpenMP::OpenMP_CXX)
endif()
endif()
endif()
endif()
endif()