Example using enzyme-auto-sparsity #69

Draft
wants to merge 9 commits into
base: develop
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/workflows/spack_default_build.yaml
@@ -98,6 +98,7 @@ jobs:
# Minimal Build(s) - GHCR mirror speeds these up a lot!
spack_spec:
- gridkit@develop +enzyme
^[email protected]

name: Build gridkit with Spack
steps:
2 changes: 1 addition & 1 deletion buildsystem/spack/spack
16 changes: 15 additions & 1 deletion cmake/FindEnzyme.cmake
@@ -88,6 +88,20 @@ find_library(ENZYME_LLVM_PLUGIN_LIBRARY
REQUIRED)
message(STATUS "Enzyme LLVM plugin library: ${ENZYME_LLVM_PLUGIN_LIBRARY}")

find_library(ENZYME_CLANG_PLUGIN_LIBRARY
NAMES
ClangEnzyme-${Enzyme_LLVM_VERSION_MAJOR}.so
ClangEnzyme-${Enzyme_LLVM_VERSION_MAJOR}.dylib
ClangEnzyme-${Enzyme_LLVM_VERSION_MAJOR}.dll
PATHS
${ENZYME_DIR}
ENV LD_LIBRARY_PATH
ENV DYLD_LIBRARY_PATH
PATH_SUFFIXES
lib64 lib
REQUIRED)
message(STATUS "Enzyme Clang plugin library: ${ENZYME_CLANG_PLUGIN_LIBRARY}")

find_program(GRIDKIT_LLVM_LINK llvm-link
PATHS ${Enzyme_LLVM_BINARY_DIR}
REQUIRED)
@@ -136,7 +150,7 @@ macro(enzyme_add_executable)
add_custom_command(
DEPENDS ${PHASE0}
OUTPUT ${PHASE1}
COMMAND ${CMAKE_CXX_COMPILER} -flto -c ${PHASE0} ${INCLUDE_COMPILER_LIST} -O2 -fno-vectorize -ffast-math -fno-unroll-loops -o ${PHASE1}
COMMAND ${CMAKE_CXX_COMPILER} -flto -c ${PHASE0} ${INCLUDE_COMPILER_LIST} -O2 -fno-vectorize -ffast-math -fno-unroll-loops -fpass-plugin=${ENZYME_CLANG_PLUGIN_LIBRARY} -Xclang -load -Xclang ${ENZYME_CLANG_PLUGIN_LIBRARY} -mllvm -enable-load-pre=0 -mllvm -enzyme-auto-sparsity=1 -o ${PHASE1}
COMMENT "Compiling ${SRC} to object file for target ${enzyme_add_executable_NAME}"
)
set(OBJS "${OBJS} ${PHASE1}")
7 changes: 7 additions & 0 deletions examples/Enzyme/Standalone/CMakeLists.txt
@@ -9,5 +9,12 @@ enzyme_add_executable(
LINK_LIBRARIES GRIDKIT::DenseMatrix
)

enzyme_add_executable(
NAME EnzymeStandaloneSparseCheck
SOURCES EnzymeSparse.cpp
LINK_LIBRARIES GRIDKIT::SparseMatrix
)

add_test(NAME "EnzymeStandaloneScalarCheck" COMMAND ${CMAKE_CURRENT_BINARY_DIR}/EnzymeStandaloneScalarCheck)
add_test(NAME "EnzymeStandaloneVectorCheck" COMMAND ${CMAKE_CURRENT_BINARY_DIR}/EnzymeStandaloneVectorCheck)
add_test(NAME "EnzymeStandaloneSparseCheck" COMMAND ${CMAKE_CURRENT_BINARY_DIR}/EnzymeStandaloneSparseCheck)
206 changes: 206 additions & 0 deletions examples/Enzyme/Standalone/EnzymeSparse.cpp
@@ -0,0 +1,206 @@
#include <assert.h>
#include <cmath>
#include <limits>
#include <stdio.h>
#include <vector>

#include <LinearAlgebra/SparseMatrix/COO_Matrix.hpp>

/**
* @brief Standalone example that computes the sparse Jacobian of a vector-valued function
* by automatic differentiation via Enzyme.
*
* TODO: Modify the sparse storage provided to Enzyme to directly operate on std::vector and COO_Matrix
* TODO: Convert this into a unit test.
*/

using SparseMatrix = COO_Matrix<double, size_t>;
extern int enzyme_dup;
extern int enzyme_const;
extern int enzyme_dupnoneed;

template <typename T, typename... Tys>
extern T __enzyme_fwddiff(void*, Tys...) noexcept;

template <typename T, typename... Tys>
extern T __enzyme_todense(Tys...) noexcept;

// Sparse storage for Enzyme
template <typename T>
struct Triple
{
size_t row;
size_t col;
T val;
Triple(Triple&&) = default;

Triple(size_t row, size_t col, T val)
: row(row),
col(col),
val(val)
{
}
};

__attribute__((enzyme_sparse_accumulate)) static void inner_storeflt(int64_t row, int64_t col, float val, std::vector<Triple<float>>& triplets)
{
triplets.emplace_back(row, col, val);
}

__attribute__((enzyme_sparse_accumulate)) static void inner_storedbl(int64_t row, int64_t col, double val, std::vector<Triple<double>>& triplets)
{
triplets.emplace_back(row, col, val);
}

template <typename T>
__attribute__((always_inline)) static void sparse_store(T val, int64_t idx, size_t i, std::vector<Triple<T>>& triplets)
{
if (val == 0.0)

Review thread on the val == 0.0 check above:

Collaborator: I worry that this is still computing 0s and looping over them at runtime, rather than computing sparsity at compile time.

@nkoukpaizan (collaborator, PR author), Mar 7, 2025: IIRC from when @wsmoses showed us the LLVM IR for a simpler example, it showed the structural zeros being optimized out. I've messed with it since (https://fwd.gymni.ch/U2yB6g), but I don't understand LLVM well enough to point it out. It's also the case that non-structural zeros are stored at runtime, which would not happen if the values were only checked at run time. In this example, I purposefully chose the first value of the input to be 0 to check that the 0 derivative gets returned.

@alexander-novo (collaborator), Mar 7, 2025: I tried removing the debug statements, and I can see that the structural zeroes are removed from consideration, which is nice. Check around line 170 in the output: you can see that it calculates the derivative input[i] * i * 2.0 and then returns that value with the index (i, i). It looks like this is due to the -enzyme-auto-sparsity=1 flag? This flag seems a bit volatile. For example, I can get this example (which is closer to what the models in GridKit actually look like) to work with -enzyme-auto-sparsity=0, but then it does runtime checks for non-structural zeroes (it still automatically elides the structural zeroes, though). Changing the flag to -enzyme-auto-sparsity=1 causes Enzyme to fail to compile. The flag seems to have a fairly fragile dependence on functions that take a size parameter and loop up to it, so we can reintroduce the N variable, which makes it compile, but then there are many runtime checks on the N variable. This may be worth submitting an issue to Enzyme about, since it seems like a fairly reasonable example.

Collaborator: It looks like it's because of loop unrolling. If you disable loop unrolling, it works fine. Good thing to know for the future!

(See the sketch after this function for a concrete illustration of structural vs. non-structural zeros.)


return;
idx /= sizeof(T);
if constexpr (sizeof(T) == 4)
inner_storeflt(i, idx, val, triplets);
else
inner_storedbl(i, idx, val, triplets);
}
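
To make the distinction discussed in the review thread above concrete, here is a minimal, standalone sketch (not part of this diff) for the f used in this example, f(x)_i = x_i * x_i. Its Jacobian is diagonal: the off-diagonal entries are structural zeros, which the reviewers observe Enzyme can elide at compile time, while the diagonal entry 2 * x[0] is zero only because this example's input happens to have x[0] == 0; whether such a value-dependent zero is kept or dropped is exactly what the discussion above is about.

// Purely illustrative, hand-written dense Jacobian of f(x)_i = x_i * x_i for the
// input used in main() below (x = 0, 1, 2, 3, 4). Off-diagonal entries are
// structural zeros (known from the form of f); J(0,0) = 2 * x[0] = 0 is a
// non-structural zero that depends on the input values.
#include <cstdio>
#include <vector>

int main()
{
    std::vector<double> x{0.0, 1.0, 2.0, 3.0, 4.0};
    for (size_t i = 0; i < x.size(); ++i)
    {
        for (size_t j = 0; j < x.size(); ++j)
        {
            if (i != j)
                continue; // structural zero: never depends on the values of x
            std::printf("J(%zu,%zu) = %g\n", i, j, 2.0 * x[i]); // zero only when x[i] == 0
        }
    }
    return 0;
}
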

template <typename T>
__attribute__((always_inline)) static T sparse_load(int64_t idx, size_t i, std::vector<Triple<T>>& triplets)
{
return 0.0;
}

template <typename T>
__attribute__((always_inline)) static void ident_store(T, int64_t idx, size_t i)
{
assert(0 && "should never load");
}

Review thread on ident_store above:

@shakedregev (collaborator), Mar 7, 2025: What does this function do, other than assert that it should never be called?

Collaborator: I think this is a "dummy" function required for calling __enzyme_todense, but on an output variable, so it should never be read from.
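
As a rough mental model for the answer above, the following standalone sketch (not part of this diff, and not Enzyme's actual implementation) captures the callback contract that the __enzyme_todense pointers in this file appear to follow: reading element k calls the load callback with a byte offset k * sizeof(T) plus the bound extra arguments, and writing calls the store callback the same way, which is why ident_load and sparse_store divide the incoming index by sizeof(T). CallbackArray, unit_load, and never_store are hypothetical names introduced only for this illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper (not an Enzyme type): models the load/store callback
// contract behind __enzyme_todense<T*>(load, store, i). The d_output pointer in
// jac_f threads an extra &triplets argument through to sparse_load/sparse_store
// in the same way.
template <typename T>
struct CallbackArray
{
    T (*load)(int64_t byte_idx, size_t i);
    void (*store)(T val, int64_t byte_idx, size_t i);
    size_t i; // extra argument bound when the pointer is created (the seed index here)

    T read(size_t k) const { return load(static_cast<int64_t>(k * sizeof(T)), i); }
    void write(size_t k, T val) const { store(val, static_cast<int64_t>(k * sizeof(T)), i); }
};

// Stand-ins with the same shape as ident_load/ident_store above.
static double unit_load(int64_t idx, size_t i)
{
    return static_cast<double>(idx / static_cast<int64_t>(sizeof(double)) == static_cast<int64_t>(i));
}
static void never_store(double, int64_t, size_t) { assert(0 && "read-only pointer"); }

int main()
{
    CallbackArray<double> seed{unit_load, never_store, 2};
    for (size_t k = 0; k < 5; ++k)
        std::printf("seed[%zu] = %g\n", k, seed.read(k)); // prints 0 0 1 0 0 (unit vector at index i = 2)
    return 0;
}
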

template <typename T>
__attribute__((always_inline)) static T ident_load(int64_t idx, size_t i)
{
idx /= sizeof(T);
return (T) (idx == i);
}

// Vector-valued function to differentiate
template <typename T>
__attribute__((always_inline)) static void f(size_t N, T* input, T* output)
{
for (size_t i = 0; i < N; i++)
{
output[i] = input[i] * input[i];
}
}

// Reference Jacobian
template <typename T>
void jac_f_ref(std::vector<T> x, std::vector<T> y, SparseMatrix& jac)
{
std::vector<size_t> ctemp{};
std::vector<size_t> rtemp{};
std::vector<T> valtemp{};
for (int idy = 0; idy < y.size(); ++idy)
{
for (int idx = 0; idx < x.size(); ++idx)
{
if (idx == idy)
{
rtemp.push_back(idx);
ctemp.push_back(idy);
valtemp.push_back(2.0 * x[idx]);
}
}
}
jac.setValues(rtemp, ctemp, valtemp);
}

// Function that computes the Jacobian via automatic differentiation
template <typename T>
__attribute__((noinline)) void jac_f(size_t N, T* input, SparseMatrix& jac)
{
std::vector<Triple<T>> triplets;
// One forward-mode sweep per input direction: `output` reads as the i-th unit
// vector (via ident_load) and is passed as the tangent of `input`; the derivative
// written to d_output is accumulated as (row, col, val) triplets via sparse_store.
for (size_t i = 0; i < N; i++)
{
T* output = __enzyme_todense<T*>((void*) ident_load<T>, (void*) ident_store<T>, i);
T* d_output = __enzyme_todense<T*>((void*) sparse_load<T>, (void*) sparse_store<T>, i, &triplets);

__enzyme_fwddiff<void>((void*) f<T>,
enzyme_const,
N,              // N is constant (not differentiated)
enzyme_dup,
input,
output,         // shadow of input: the i-th unit tangent vector
enzyme_dupnoneed,
(T*) 0x1,       // primal output is not needed, only its derivative
d_output);      // derivative entries are routed to sparse_store -> triplets
}

std::vector<size_t> ctemp{};
std::vector<size_t> rtemp{};
std::vector<T> valtemp{};
for (auto& tup : triplets)
{
rtemp.push_back(tup.row);
ctemp.push_back(tup.col);
valtemp.push_back(tup.val);
}
jac.setValues(rtemp, ctemp, valtemp);
}

// Compare two sparse matrices
void check(SparseMatrix matrix_1, SparseMatrix matrix_2, int& fail)
{
std::tuple<std::vector<size_t>&, std::vector<size_t>&, std::vector<double>&> entries_1 = matrix_1.getEntries();
const auto [rcord_1, ccord_1, vals_1] = entries_1;
std::tuple<std::vector<size_t>&, std::vector<size_t>&, std::vector<double>&> entries_2 = matrix_2.getEntries();
const auto [rcord_2, ccord_2, vals_2] = entries_2;
for (int ind = 0; ind < vals_1.size(); ++ind)
{
if (rcord_1[ind] != rcord_2[ind])
fail++;
if (ccord_1[ind] != ccord_2[ind])
fail++;
if (std::abs(vals_1[ind] - vals_2[ind]) > std::numeric_limits<double>::epsilon())
fail++;
}
}

int main()
{
// Vector and matrix declarations
size_t N = 5;
std::vector<double> x(N);
std::vector<double> sq(N);
SparseMatrix dsq = SparseMatrix(N, N);
SparseMatrix dsq_ref = SparseMatrix(N, N);

// Input initialization
double val = 0.0;
for (int i = 0; i < N; ++i)
{
x[i] = val;
val += 1.0;
}

// Function evaluation
f(x.size(), x.data(), sq.data());

// Reference Jacobian
jac_f_ref(x, sq, dsq_ref);

// Enzyme Jacobian
jac_f<double>(N, x.data(), dsq);

// Check
int fail = 0;
check(dsq, dsq_ref, fail);
bool verbose = true;
if (verbose)
{
dsq.printMatrix("Autodiff Jacobian");
dsq_ref.printMatrix("Reference Jacobian");
}

return fail;
}