From b9dcb3531dc4c1f10f501096084665756bbc031f Mon Sep 17 00:00:00 2001 From: koparasy Date: Fri, 19 Dec 2025 15:30:12 -0800 Subject: [PATCH] Add blackscholes without AMS support in repo --- examples/simple/CMakeLists.txt | 46 +++ examples/simple/blackscholes/CMakeLists.txt | 88 ++++++ examples/simple/blackscholes/constants.hpp | 12 + examples/simple/blackscholes/cpu_compute.cpp | 179 ++++++++++++ examples/simple/blackscholes/driver.cpp | 189 +++++++++++++ examples/simple/blackscholes/gpu_compute.cpp | 260 +++++++++++++++++ examples/simple/include/device_traits.hpp | 278 +++++++++++++++++++ examples/simple/include/profile.hpp | 24 ++ examples/simple/inputs/CMakeLists.txt | 2 + examples/simple/inputs/ams_inputs.cmake | 97 +++++++ 10 files changed, 1175 insertions(+) create mode 100644 examples/simple/CMakeLists.txt create mode 100644 examples/simple/blackscholes/CMakeLists.txt create mode 100644 examples/simple/blackscholes/constants.hpp create mode 100644 examples/simple/blackscholes/cpu_compute.cpp create mode 100644 examples/simple/blackscholes/driver.cpp create mode 100644 examples/simple/blackscholes/gpu_compute.cpp create mode 100644 examples/simple/include/device_traits.hpp create mode 100644 examples/simple/include/profile.hpp create mode 100644 examples/simple/inputs/CMakeLists.txt create mode 100644 examples/simple/inputs/ams_inputs.cmake diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt new file mode 100644 index 00000000..16541e1c --- /dev/null +++ b/examples/simple/CMakeLists.txt @@ -0,0 +1,46 @@ +# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +cmake_minimum_required(VERSION 3.24) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(AMSExamples LANGUAGES CXX) + + +option(WITH_AMS_EXAMPLE_CUDA "Option to enable CUDA" Off) +option(WITH_AMS_EXAMPLE_HIP "Option to enable HIP" Off) +option(WITH_AMS_DOWNLOAD_DATA "Download benchmark datasets at configure time" OFF) + +if(WITH_AMS_EXAMPLE_CUDA AND WITH_AMS_EXAMPLE_HIP) + message(FATAL_ERROR + "WITH_AMS_EXAMPLE_CUDA=${WITH_AMS_EXAMPLE_CUDA} and " + "WITH_AMS_EXAMPLE_HIP=${WITH_AMS_EXAMPLE_HIP} are mutually exclusive." + ) +endif() + + +set(AMS_INPUT_DIR "${CMAKE_BINARY_DIR}/inputs") +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if (WITH_AMS_EXAMPLE_CUDA) +message(STATUS "HERE") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + enable_language(CUDA) + find_package(CUDAToolkit) +elseif (WITH_AMS_EXAMPLE_HIP) + enable_language(HIP) + find_package(hip REQUIRED) +endif() + +if (WITH_AMS_DOWNLOAD_DATA) +enable_testing() +add_subdirectory(inputs) +endif() + + +add_subdirectory(blackscholes) diff --git a/examples/simple/blackscholes/CMakeLists.txt b/examples/simple/blackscholes/CMakeLists.txt new file mode 100644 index 00000000..72f25ed0 --- /dev/null +++ b/examples/simple/blackscholes/CMakeLists.txt @@ -0,0 +1,88 @@ + +add_executable(blck_cpu driver.cpp cpu_compute.cpp) + +if (WITH_AMS_EXAMPLE_HIP) + add_executable(blck_hip driver.cpp gpu_compute.cpp) + set_source_files_properties(gpu_compute.cpp PROPERTIES LANGUAGE HIP) + target_link_libraries(blck_hip PRIVATE hip::device hip::host) + target_include_directories(blck_hip + PRIVATE + ${PROJECT_SOURCE_DIR}/include + ) + target_compile_definitions(blck_hip PRIVATE AMS_EXAMPLE_ENABLE_HIP) +elseif(WITH_AMS_EXAMPLE_CUDA) + add_executable(blck_cuda driver.cpp gpu_compute.cpp) + set_source_files_properties(gpu_compute.cpp PROPERTIES LANGUAGE CUDA) + target_link_libraries(blck_cuda PRIVATE CUDA::cudart) + target_include_directories(blck_cuda + PRIVATE + ${PROJECT_SOURCE_DIR}/include + ) + target_compile_definitions(blck_cuda PRIVATE AMS_EXAMPLE_ENABLE_CUDA) +endif() + + +target_include_directories(blck_cpu + PRIVATE + ${PROJECT_SOURCE_DIR}/include +) + + +if (WITH_AMS_DOWNLOAD_DATA) +set(BS_ZST "${AMS_INPUT_DIR}/blackscholes_input.bin.zst") +set(BS_BIN "${AMS_INPUT_DIR}/blackscholes_input.bin") + + ams_download_dataset( + NAME blackscholes_input_zst + URL "https://zenodo.org/records/17992861/files/random_input.bin.zst?download=1" + SHA256 "b895a9e5206ed60e5fd63529ceeb5e63d823149de4559f578fcab58cb2908ee4" + OUT_FILE "${BS_ZST}" + ) + +# Always define the build-time decompression target (it will run when needed) +ams_add_zstd_decompress_target( + NAME inputs_blackscholes + ZST_FILE "${BS_ZST}" + OUT_FILE "${BS_BIN}" +) + +add_dependencies(blck_cpu inputs_blackscholes) +add_test( + NAME BLACKSCHOLES::CPU + COMMAND blck_cpu + ${BS_BIN} +) +set_tests_properties(BLACKSCHOLES::CPU + PROPERTIES + LABELS "benchmark;blackscholes;cpu;no-ams" + TIMEOUT 600 +) + +if (WITH_AMS_EXAMPLE_HIP) +add_dependencies(blck_hip inputs_blackscholes) +add_test( + NAME BLACKSCHOLES::CPU + COMMAND blck_hip + ${BS_BIN} +) +set_tests_properties(BLACKSCHOLES::GPU + PROPERTIES + LABELS "benchmark;blackscholes;gpu;no-ams" + TIMEOUT 600 +) +elseif(WITH_AMS_EXAMPLE_CUDA) +add_dependencies(blck_cpu inputs_blackscholes) +add_test( + NAME BLACKSCHOLES::GPU + COMMAND blck_cuda + ${BS_BIN} +) +set_tests_properties(BLACKSCHOLES::CPU + PROPERTIES + LABELS "benchmark;blackscholes;gpu;no-ams" + TIMEOUT 600 +) +endif() + + +endif() diff --git a/examples/simple/blackscholes/constants.hpp b/examples/simple/blackscholes/constants.hpp new file mode 100644 index 00000000..97669caf --- /dev/null +++ b/examples/simple/blackscholes/constants.hpp @@ -0,0 +1,12 @@ +#define fptype double + +const fptype inv_sqrt_2xPI = 0.39894228040143270286f; +const fptype zero = 0.0; +const fptype half = 0.5; +const fptype const1 = 0.2316419; +const fptype one = 1.0; +const fptype const2 = 0.319381530; +const fptype const3 = 0.356563782; +const fptype const4 = 1.781477937; +const fptype const5 = 1.821255978; +const fptype const6 = 1.330274429; diff --git a/examples/simple/blackscholes/cpu_compute.cpp b/examples/simple/blackscholes/cpu_compute.cpp new file mode 100644 index 00000000..8301cde1 --- /dev/null +++ b/examples/simple/blackscholes/cpu_compute.cpp @@ -0,0 +1,179 @@ +#include + +#include "constants.hpp" + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// Cumulative Normal Distribution Function +// See Hull, Section 11.8, P.243-244 +// + +fptype CNDF(fptype InputX) +{ + int sign; + + fptype OutputX; + fptype xInput; + fptype xNPrimeofX; + fptype expValues; + fptype xK2; + fptype xK2_2, xK2_3; + fptype xK2_4, xK2_5; + fptype xLocal, xLocal_1; + fptype xLocal_2, xLocal_3; + fptype temp; + + // Check for negative value of InputX + if (InputX < zero) { + InputX = -InputX; + sign = 1; + } else + sign = 0; + + xInput = InputX; + + // Compute NPrimeX term common to both four & six decimal accuracy calcs + temp = -half * InputX * InputX; + + expValues = exp(temp); + + xNPrimeofX = expValues; + xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI; + + xK2 = const1 * xInput; + xK2 = one + xK2; + xK2 = one / xK2; + xK2_2 = xK2 * xK2; + xK2_3 = xK2_2 * xK2; + xK2_4 = xK2_3 * xK2; + xK2_5 = xK2_4 * xK2; + + xLocal_1 = xK2 * const2; + xLocal_2 = xK2_2 * (-const3); + xLocal_3 = xK2_3 * const4; + xLocal_2 = xLocal_2 + xLocal_3; + xLocal_3 = xK2_4 * (-const5); + xLocal_2 = xLocal_2 + xLocal_3; + xLocal_3 = xK2_5 * const6; + xLocal_2 = xLocal_2 + xLocal_3; + + xLocal_1 = xLocal_2 + xLocal_1; + xLocal = xLocal_1 * xNPrimeofX; + xLocal = one - xLocal; + + OutputX = xLocal; + + if (sign) { + OutputX = one - OutputX; + } + + return OutputX; +} + +fptype BlkSchlsEqEuroNoDiv(fptype sptprice, + fptype strike, + fptype rate, + fptype volatility, + fptype time, + int otype, + float timet) +{ + fptype OptionPrice; + + // local private working variables for the calculation + fptype xStockPrice; + fptype xStrikePrice; + fptype xRiskFreeRate; + fptype xVolatility; + + fptype xTime; + fptype xSqrtTime; + + fptype logValues; + fptype xLogTerm; + fptype xD1; + fptype xD2; + fptype xPowerTerm; + fptype xDen; + fptype d1; + fptype d2; + fptype FutureValueX; + fptype NofXd1; + fptype NofXd2; + fptype NegNofXd1; + fptype NegNofXd2; + fptype temp; + + xStockPrice = sptprice; + xStrikePrice = strike; + xRiskFreeRate = rate; + xVolatility = volatility; + + xTime = time; + xSqrtTime = sqrt(xTime); + + temp = sptprice / strike; + + logValues = log(sptprice / strike); + + xLogTerm = logValues; + + + xPowerTerm = xVolatility * xVolatility; + xPowerTerm = xPowerTerm * half; + + xD1 = xRiskFreeRate + xPowerTerm; + xD1 = xD1 * xTime; + xD1 = xD1 + xLogTerm; + + xDen = xVolatility * xSqrtTime; + xD1 = xD1 / xDen; + xD2 = xD1 - xDen; + + d1 = xD1; + d2 = xD2; + + //@APPROX LABEL("CNDF_1") IN(d1) OUT(NofXd1) APPROX_TECH(MEMO_IN|MEMO_OUT) + NofXd1 = CNDF(d1); + + //@APPROX LABEL("CNDF_2") IN(d2) OUT(NofXd2) APPROX_TECH(MEMO_IN|MEMO_OUT) + NofXd2 = CNDF(d2); + + temp = -(rate * time); + + FutureValueX = (exp(temp)); + + FutureValueX *= strike; + + if (otype == 0) { + OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2); + } else { + NegNofXd1 = (one - NofXd1); + NegNofXd2 = (one - NofXd2); + OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1); + } + + return OptionPrice; +} + + +int compute(fptype *sptprice, + fptype *strike, + fptype *rate, + fptype *volatility, + fptype *otime, + int *otype, + fptype *prices, + size_t numOptions) +{ + int i, j, k; + fptype priceDelta; + for (i = 0; i < numOptions; i++) { + prices[i] = BlkSchlsEqEuroNoDiv( + sptprice[i], strike[i], rate[i], volatility[i], otime[i], otype[i], 0); + } + + return 0; +} diff --git a/examples/simple/blackscholes/driver.cpp b/examples/simple/blackscholes/driver.cpp new file mode 100644 index 00000000..57abf426 --- /dev/null +++ b/examples/simple/blackscholes/driver.cpp @@ -0,0 +1,189 @@ +// Copyright (c) 2007 Intel Corp. + +// Black-Scholes +// Analytical method for calculating European Options +// +// +// Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice +// Hall, John C. Hull, + +#include +#include +#include + +#include +#include +#include + +#include "constants.hpp" +#include "profile.hpp" + +#define DOUBLE 0 +#define FLOAT 1 +#define INT 2 + +int compute(fptype *sptprice, + fptype *strike, + fptype *rate, + fptype *volatility, + fptype *otime, + int *otype, + fptype *prices, + size_t numOptions); + +void readData(FILE *fd, double **data, size_t *numElements) +{ + assert(fd && "File pointer is not valid\n"); + fread(numElements, sizeof(size_t), 1, fd); + size_t elements = *numElements; + double *ptr = (double *)malloc(sizeof(double) * elements); + assert(ptr && "Could Not allocate pointer\n"); + *data = ptr; + size_t i; + int type; + fread(&type, sizeof(int), 1, fd); + if (type == DOUBLE) { + fread(ptr, sizeof(double), elements, fd); + } else if (type == FLOAT) { + float *tmp = (float *)malloc(sizeof(float) * elements); + fread(tmp, sizeof(float), elements, fd); + for (i = 0; i < elements; i++) { + ptr[i] = (double)tmp[i]; + } + free(tmp); + } else if (type == INT) { + int *tmp = (int *)malloc(sizeof(int) * elements); + fread(tmp, sizeof(int), elements, fd); + for (i = 0; i < elements; i++) { + ptr[i] = (double)tmp[i]; + } + free(tmp); + } + return; +} + +void readData(FILE *fd, float **data, size_t *numElements) +{ + assert(fd && "File pointer is not valid\n"); + fread(numElements, sizeof(size_t), 1, fd); + size_t elements = *numElements; + + float *ptr = (float *)malloc(sizeof(float) * elements); + assert(ptr && "Could Not allocate pointer\n"); + *data = ptr; + + size_t i; + int type; + fread(&type, sizeof(int), 1, fd); + if (type == FLOAT) { + fread(ptr, sizeof(float), elements, fd); + } else if (type == DOUBLE) { + double *tmp = (double *)malloc(sizeof(double) * elements); + fread(tmp, sizeof(double), elements, fd); + for (i = 0; i < elements; i++) { + ptr[i] = (float)tmp[i]; + } + free(tmp); + } else if (type == INT) { + int *tmp = (int *)malloc(sizeof(int) * elements); + fread(tmp, sizeof(int), elements, fd); + for (i = 0; i < elements; i++) { + ptr[i] = (float)tmp[i]; + } + free(tmp); + } + return; +} + +void readData(FILE *fd, int **data, size_t *numElements) +{ + assert(fd && "File pointer is not valid\n"); + fread(numElements, sizeof(size_t), 1, fd); + size_t elements = *numElements; + + int *ptr = (int *)malloc(sizeof(int) * elements); + assert(ptr && "Could Not allocate pointer\n"); + *data = ptr; + + size_t i; + int type; + fread(&type, sizeof(int), 1, fd); + if (type == INT) { + fread(ptr, sizeof(int), elements, fd); + } else if (type == DOUBLE) { + double *tmp = (double *)malloc(sizeof(double) * elements); + fread(tmp, sizeof(double), elements, fd); + for (i = 0; i < elements; i++) { + ptr[i] = (int)tmp[i]; + } + free(tmp); + } else if (type == FLOAT) { + float *tmp = (float *)malloc(sizeof(float) * elements); + fread(tmp, sizeof(float), elements, fd); + for (i = 0; i < elements; i++) { + ptr[i] = (int)tmp[i]; + } + free(tmp); + } + return; +} + +int main(int argc, char **argv) +{ + + fptype *prices; + size_t numOptions; + + int *otype; + fptype *sptprice; + fptype *strike; + fptype *rate; + fptype *volatility; + fptype *otime; + + FILE *file; + int i; + int loopnum; + int rv; + + fflush(NULL); + if (argc != 2) { + printf("Usage:\n\t%s \n", argv[0]); + exit(1); + } + char *inputFile = argv[1]; + char *outputFile = argv[2]; + + //Read input data from file + file = fopen(inputFile, "rb"); + if (file == NULL) { + printf("ERROR: Unable to open file `%s'.\n", inputFile); + exit(1); + } +#define PAD 256 +#define LINESIZE 64 + readData(file, &otype, &numOptions); + readData(file, &sptprice, &numOptions); + readData(file, &strike, &numOptions); + readData(file, &rate, &numOptions); + readData(file, &volatility, &numOptions); + readData(file, &otime, &numOptions); + prices = (fptype *)malloc(sizeof(fptype) * numOptions); + + { + std::cout << "Total NumOptions Computed:" << numOptions << "\n"; + ScopedTimer t("compute", numOptions); + compute( + sptprice, strike, rate, volatility, otime, otype, prices, numOptions); + } + + free(sptprice); + free(strike); + free(rate); + free(volatility); + free(otime); + free(otype); + free(prices); + + return 0; +} diff --git a/examples/simple/blackscholes/gpu_compute.cpp b/examples/simple/blackscholes/gpu_compute.cpp new file mode 100644 index 00000000..2262034f --- /dev/null +++ b/examples/simple/blackscholes/gpu_compute.cpp @@ -0,0 +1,260 @@ +#include + +#include "constants.hpp" +#include "device_traits.hpp" + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// Cumulative Normal Distribution Function +// See Hull, Section 11.8, P.243-244 + +__device__ fptype CNDF(fptype InputX) +{ + int sign; + + fptype OutputX; + fptype xInput; + fptype xNPrimeofX; + fptype expValues; + fptype xK2; + fptype xK2_2, xK2_3; + fptype xK2_4, xK2_5; + fptype xLocal, xLocal_1; + fptype xLocal_2, xLocal_3; + fptype temp; + + // Check for negative value of InputX + if (InputX < zero) { + InputX = -InputX; + sign = 1; + } else + sign = 0; + + xInput = InputX; + + // Compute NPrimeX term common to both four & six decimal accuracy calcs + temp = -half * InputX * InputX; + + expValues = exp(temp); + + xNPrimeofX = expValues; + xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI; + + xK2 = const1 * xInput; + xK2 = one + xK2; + xK2 = one / xK2; + xK2_2 = xK2 * xK2; + xK2_3 = xK2_2 * xK2; + xK2_4 = xK2_3 * xK2; + xK2_5 = xK2_4 * xK2; + + xLocal_1 = xK2 * const2; + xLocal_2 = xK2_2 * (-const3); + xLocal_3 = xK2_3 * const4; + xLocal_2 = xLocal_2 + xLocal_3; + xLocal_3 = xK2_4 * (-const5); + xLocal_2 = xLocal_2 + xLocal_3; + xLocal_3 = xK2_5 * const6; + xLocal_2 = xLocal_2 + xLocal_3; + + xLocal_1 = xLocal_2 + xLocal_1; + xLocal = xLocal_1 * xNPrimeofX; + xLocal = one - xLocal; + + OutputX = xLocal; + + if (sign) { + OutputX = one - OutputX; + } + + return OutputX; +} + +__device__ fptype BlkSchlsEqEuroNoDiv(fptype sptprice, + fptype strike, + fptype rate, + fptype volatility, + fptype time, + int otype, + float timet) +{ + fptype OptionPrice; + + // local private working variables for the calculation + fptype xStockPrice; + fptype xStrikePrice; + fptype xRiskFreeRate; + fptype xVolatility; + + fptype xTime; + fptype xSqrtTime; + + fptype logValues; + fptype xLogTerm; + fptype xD1; + fptype xD2; + fptype xPowerTerm; + fptype xDen; + fptype d1; + fptype d2; + fptype FutureValueX; + fptype NofXd1; + fptype NofXd2; + fptype NegNofXd1; + fptype NegNofXd2; + fptype temp; + + xStockPrice = sptprice; + xStrikePrice = strike; + xRiskFreeRate = rate; + xVolatility = volatility; + + xTime = time; + xSqrtTime = sqrt(xTime); + + temp = sptprice / strike; + + logValues = log(sptprice / strike); + + xLogTerm = logValues; + + + xPowerTerm = xVolatility * xVolatility; + xPowerTerm = xPowerTerm * half; + + xD1 = xRiskFreeRate + xPowerTerm; + xD1 = xD1 * xTime; + xD1 = xD1 + xLogTerm; + + xDen = xVolatility * xSqrtTime; + xD1 = xD1 / xDen; + xD2 = xD1 - xDen; + + d1 = xD1; + d2 = xD2; + + //@APPROX LABEL("CNDF_1") IN(d1) OUT(NofXd1) APPROX_TECH(MEMO_IN|MEMO_OUT) + NofXd1 = CNDF(d1); + + //@APPROX LABEL("CNDF_2") IN(d2) OUT(NofXd2) APPROX_TECH(MEMO_IN|MEMO_OUT) + NofXd2 = CNDF(d2); + + temp = -(rate * time); + + FutureValueX = (exp(temp)); + + FutureValueX *= strike; + + if (otype == 0) { + OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2); + } else { + NegNofXd1 = (one - NofXd1); + NegNofXd2 = (one - NofXd2); + OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1); + } + + return OptionPrice; +} + +__global__ void gpu_run(fptype *sptprice, + fptype *strike, + fptype *rate, + fptype *volatility, + fptype *otime, + int *otype, + fptype *prices, + size_t numOptions) +{ + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid > numOptions) return; + prices[gid] = BlkSchlsEqEuroNoDiv(sptprice[gid], + strike[gid], + rate[gid], + volatility[gid], + otime[gid], + otype[gid], + 0); +} + + +int compute(fptype *sptprice, + fptype *strike, + fptype *rate, + fptype *volatility, + fptype *otime, + int *otype, + fptype *prices, + size_t numOptions) +{ + constexpr int BLOCK_SIZE = 256; + fptype *d_sptprice; + fptype *d_strike; + fptype *d_rate; + fptype *d_volatility; + fptype *d_otime; + int *d_otype; + fptype *d_prices; + + DEVICE_CHECK((Device::deviceMalloc(reinterpret_cast(&d_sptprice), + numOptions * sizeof(fptype)))); + DEVICE_CHECK(Device::deviceMalloc(reinterpret_cast(&d_strike), + numOptions * sizeof(fptype))); + DEVICE_CHECK(Device::deviceMalloc(reinterpret_cast(&d_rate), + numOptions * sizeof(fptype))); + DEVICE_CHECK(Device::deviceMalloc(reinterpret_cast(&d_volatility), + numOptions * sizeof(fptype))); + DEVICE_CHECK(Device::deviceMalloc(reinterpret_cast(&d_otime), + numOptions * sizeof(fptype))); + DEVICE_CHECK(Device::deviceMalloc(reinterpret_cast(&d_otype), + numOptions * sizeof(int))); + DEVICE_CHECK(Device::deviceMalloc(reinterpret_cast(&d_prices), + numOptions * sizeof(fptype))); + + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(d_sptprice), + sptprice, + numOptions * sizeof(fptype), + Device::memcpyHostToDeviceKind())); + + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(d_strike), + strike, + numOptions * sizeof(fptype), + Device::memcpyHostToDeviceKind())); + + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(d_rate), + rate, + numOptions * sizeof(fptype), + Device::memcpyHostToDeviceKind())); + + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(d_volatility), + volatility, + numOptions * sizeof(fptype), + Device::memcpyHostToDeviceKind())); + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(d_otime), + otime, + numOptions * sizeof(fptype), + Device::memcpyHostToDeviceKind())); + + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(d_otype), + otype, + numOptions * sizeof(int), + Device::memcpyHostToDeviceKind())); + + + int grid = (numOptions + BLOCK_SIZE - 1) / BLOCK_SIZE; + gpu_run<<>>(d_sptprice, + d_strike, + d_rate, + d_volatility, + d_otime, + d_otype, + d_prices, + numOptions); + DEVICE_CHECK(Device::deviceCopy(reinterpret_cast(prices), + d_prices, + numOptions * sizeof(fptype), + Device::memcpyDeviceToHostKind())); + + return 0; +} diff --git a/examples/simple/include/device_traits.hpp b/examples/simple/include/device_traits.hpp new file mode 100644 index 00000000..7e9b0045 --- /dev/null +++ b/examples/simple/include/device_traits.hpp @@ -0,0 +1,278 @@ +#pragma once +#include +#include + +enum DeviceVendors { HIP, CUDA }; + +template +struct DeviceTraits; + + +#ifdef AMS_EXAMPLE_ENABLE_HIP +#include +#include + +template <> +struct DeviceTraits { + using DeviceError_t = hipError_t; + using DeviceStream_t = hipStream_t; + using DevicePtr_t = hipDeviceptr_t; + using DeviceHandle_t = hipDevice_t; + using DeviceEvent_t = hipEvent_t; + static constexpr auto DeviceSuccess = hipSuccess; + static inline std::optional deviceErrorCheck( + hipError_t ErrorCode) + { + if (ErrorCode == hipSuccess) return std::nullopt; + return std::string(hipGetErrorString(ErrorCode)); + } + + static hipError_t deviceStreamSynchronize(hipStream_t Stream) + { + return hipStreamSynchronize(Stream); + } + + static hipError_t deviceMemset(void *DevPtr, int Value, size_t Bytes) + { + auto EC = hipMemset(DevPtr, Value, Bytes); + return EC; + } + + static hipError_t deviceMalloc(void **ptr, size_t size) + { + return hipMalloc(ptr, size); + } + + static hipError_t deviceFree(void *ptr) { return hipFree(ptr); } + + static hipError_t deviceCopy(void *Dest, + void *Src, + size_t SizeBytes, + hipMemcpyKind Kind) + { + return hipMemcpy(Dest, Src, SizeBytes, Kind); + } + + static hipError_t deviceSynchronize() { return hipDeviceSynchronize(); } + + + static hipError_t getDeviceCount(int &devCount) + { + return hipGetDeviceCount(&devCount); + } + + static hipError_t setDevice(int DeviceId) { return hipSetDevice(DeviceId); } + + static hipError_t getDevice(int &DeviceId) { return hipGetDevice(&DeviceId); } + + static constexpr hipMemcpyKind memcpyHostToDeviceKind() + { + return hipMemcpyHostToDevice; + } + + static constexpr hipMemcpyKind memcpyDeviceToHostKind() + { + return hipMemcpyDeviceToHost; + } + + static DeviceError_t deviceStreamCreate(DeviceStream_t *Stream) + { + return hipStreamCreate(Stream); + } + + static DeviceError_t deviceStreamDestroy(DeviceStream_t Stream) + { + return hipStreamDestroy(Stream); + } + + static DeviceError_t deviceEventCreate(DeviceEvent_t *event) + { + return hipEventCreate(event); + } + + static DeviceError_t deviceEventRecord(DeviceEvent_t event, + DeviceStream_t stream) + { + return hipEventRecord(event, stream); + } + + static DeviceError_t deviceEventDestroy(DeviceEvent_t event) + { + return hipEventDestroy(event); + } + + static DeviceError_t deviceEventSynchronize(DeviceEvent_t event) + { + return hipEventSynchronize(event); + } + + static DeviceError_t deviceEventElapsedTime(float *ms, + DeviceEvent_t start, + DeviceEvent_t stop) + { + return hipEventElapsedTime(ms, start, stop); + } + + static DeviceError_t deviceGetSymbolAddress(void **devPtr, const void *symbol) + { + return hipGetSymbolAddress(devPtr, symbol); + } +}; +#elif defined(AMS_EXAMPLE_ENABLE_CUDA) + +#include +#include + +template <> +struct DeviceTraits { + + using DeviceError_t = cudaError_t; + using DeviceDriverError_t = CUresult; + + using DeviceStream_t = cudaStream_t; + + + using DeviceHandle_t = CUdevice; + + using DeviceEvent_t = cudaEvent_t; + static constexpr auto DeviceSuccess = cudaSuccess; + static constexpr auto DeviceDriverSuccess = CUDA_SUCCESS; + + static inline std::optional deviceErrorCheck( + DeviceError_t ErrorCode) + { + if (ErrorCode == DeviceSuccess) return std::nullopt; + return std::string(cudaGetErrorString(ErrorCode)); + } + + static inline std::optional deviceErrorCheck( + DeviceDriverError_t ErrorCode) + { + if (ErrorCode == DeviceDriverSuccess) return std::nullopt; + + if (ErrorCode == CUDA_ERROR_DEINITIALIZED) { + return std::nullopt; + } + + const char *name = nullptr, *desc = nullptr; + cuGetErrorName(ErrorCode, &name); + cuGetErrorString(ErrorCode, &desc); + auto EC = std::string("Error:") + std::to_string(ErrorCode) + ":"; + if (name) EC += std::string(name); + if (desc) EC += std::string(" description:") + std::string(desc); + + return EC; + } + + static DeviceError_t deviceStreamSynchronize(DeviceStream_t Stream) + { + return cudaStreamSynchronize(Stream); + } + + static DeviceError_t deviceMemset(void *DevPtr, int Value, size_t Bytes) + { + auto EC = cudaMemset(DevPtr, Value, Bytes); + return EC; + } + + static DeviceError_t deviceMalloc(void **ptr, size_t size) + { + return cudaMalloc(ptr, size); + } + + static DeviceError_t deviceFree(void *ptr) { return cudaFree(ptr); } + + static DeviceError_t deviceCopy(void *Dest, + void *Src, + size_t SizeBytes, + cudaMemcpyKind Kind) + { + return cudaMemcpy(Dest, Src, SizeBytes, Kind); + } + + static DeviceError_t deviceSynchronize() { return cudaDeviceSynchronize(); } + + static cudaError_t getDeviceCount(int &devCount) + { + return cudaGetDeviceCount(&devCount); + } + + static cudaError_t setDevice(int DeviceId) { return cudaSetDevice(DeviceId); } + + static cudaError_t getDevice(int &DeviceId) + { + return cudaGetDevice(&DeviceId); + } + + static constexpr cudaMemcpyKind memcpyHostToDeviceKind() + { + return cudaMemcpyHostToDevice; + } + + static constexpr cudaMemcpyKind memcpyDeviceToHostKind() + { + return cudaMemcpyDeviceToHost; + } + + static DeviceError_t deviceStreamCreate(DeviceStream_t *Stream) + { + return cudaStreamCreate(Stream); + } + + static DeviceError_t deviceStreamDestroy(DeviceStream_t Stream) + { + return cudaStreamDestroy(Stream); + } + + static DeviceError_t deviceEventCreate(DeviceEvent_t *event) + { + return cudaEventCreate(event); + } + + static DeviceError_t deviceEventRecord(DeviceEvent_t event, + DeviceStream_t stream) + { + return cudaEventRecord(event, stream); + } + + static DeviceError_t deviceEventDestroy(DeviceEvent_t event) + { + return cudaEventDestroy(event); + } + + static DeviceError_t deviceEventSynchronize(DeviceEvent_t event) + { + return cudaEventSynchronize(event); + } + + static DeviceError_t deviceEventElapsedTime(float *ms, + DeviceEvent_t start, + DeviceEvent_t stop) + { + return cudaEventElapsedTime(ms, start, stop); + } + + static DeviceError_t deviceGetSymbolAddress(void **devPtr, const void *symbol) + { + return cudaGetSymbolAddress(devPtr, symbol); + } +}; +#endif + +#ifdef AMS_EXAMPLE_ENABLE_HIP +using Device = DeviceTraits; +#elif defined(AMS_EXAMPLE_ENABLE_CUDA) +using Device = DeviceTraits; +#endif + +template +inline void check(ErrT ec, const char *file, int line) +{ + if (auto msg = Traits::deviceErrorCheck(ec)) { + fprintf(stderr, "ERROR @ %s:%d -> %s\n", file, line, msg->c_str()); + std::abort(); + } +} + +#define DEVICE_CHECK(CALL) \ + check((CALL), __FILE__, __LINE__) diff --git a/examples/simple/include/profile.hpp b/examples/simple/include/profile.hpp new file mode 100644 index 00000000..8e4ca80d --- /dev/null +++ b/examples/simple/include/profile.hpp @@ -0,0 +1,24 @@ +#include +#include +#include + +template +struct ScopedTimer { + using clock = std::chrono::steady_clock; + std::string_view name; + clock::time_point start; + T elements; + + explicit ScopedTimer(std::string_view n, T elements) + : name(n), elements(elements), start(clock::now()) + { + } + ~ScopedTimer() + { + auto end = clock::now(); + auto ns = std::chrono::duration_cast(end - start) + .count(); + std::cerr << name << ": " << ns << " ns " << ns / elements << " Elements/ns" + << "\n"; + } +}; diff --git a/examples/simple/inputs/CMakeLists.txt b/examples/simple/inputs/CMakeLists.txt new file mode 100644 index 00000000..a845a674 --- /dev/null +++ b/examples/simple/inputs/CMakeLists.txt @@ -0,0 +1,2 @@ +include(ams_inputs.cmake) + diff --git a/examples/simple/inputs/ams_inputs.cmake b/examples/simple/inputs/ams_inputs.cmake new file mode 100644 index 00000000..f630f5b7 --- /dev/null +++ b/examples/simple/inputs/ams_inputs.cmake @@ -0,0 +1,97 @@ +include_guard(GLOBAL) + +function(ams_download_dataset) + set(options) + set(oneValueArgs NAME URL SHA256 OUT_FILE) + cmake_parse_arguments(ADS "${options}" "${oneValueArgs}" "" ${ARGN}) + + foreach(req NAME URL SHA256 OUT_FILE) + if(NOT ADS_${req}) + message(FATAL_ERROR "ams_download_dataset: missing required argument: ${req}") + endif() + endforeach() + + get_filename_component(_out_dir "${ADS_OUT_FILE}" DIRECTORY) + + if(EXISTS "${ADS_OUT_FILE}") + file(SHA256 "${ADS_OUT_FILE}" _have_sha256) + string(TOLOWER "${_have_sha256}" _have_sha256) + string(TOLOWER "${ADS_SHA256}" _want_sha256) + if(_have_sha256 STREQUAL _want_sha256) + message(STATUS "[inputs] ${ADS_NAME}: present, SHA256 OK -> ${ADS_OUT_FILE}") + return() + endif() + message(FATAL_ERROR + "[inputs] ${ADS_NAME}: SHA256 mismatch for existing file.\n" + " Path: ${ADS_OUT_FILE}\n" + " Have: ${_have_sha256}\n" + " Expect: ${_want_sha256}\n" + "Delete the file or fix the URL/SHA." + ) + endif() + + file(MAKE_DIRECTORY "${_out_dir}") + set(_tmp "${ADS_OUT_FILE}.part") + + message(STATUS "[inputs] ${ADS_NAME}: downloading -> ${ADS_OUT_FILE}") + file(DOWNLOAD + "${ADS_URL}" + "${_tmp}" + SHOW_PROGRESS + TLS_VERIFY ON + EXPECTED_HASH "SHA256=${ADS_SHA256}" + STATUS _dl_status + LOG _dl_log + ) + list(GET _dl_status 0 _dl_code) + list(GET _dl_status 1 _dl_msg) + if(NOT _dl_code EQUAL 0) + if(EXISTS "${_tmp}") + file(REMOVE "${_tmp}") + endif() + message(FATAL_ERROR "[inputs] ${ADS_NAME}: download failed (${_dl_code}): ${_dl_msg}\n${_dl_log}") + endif() + + file(RENAME "${_tmp}" "${ADS_OUT_FILE}") + message(STATUS "[inputs] ${ADS_NAME}: download complete (hash verified)") +endfunction() + +# ---- Decompression helper (build-time) ---- +function(ams_add_zstd_decompress_target) + set(options) + set(oneValueArgs NAME ZST_FILE OUT_FILE) + cmake_parse_arguments(DEC "${options}" "${oneValueArgs}" "" ${ARGN}) + + foreach(req NAME ZST_FILE OUT_FILE) + if(NOT DEC_${req}) + message(FATAL_ERROR "ams_add_zstd_decompress_target: missing required argument: ${req}") + endif() + endforeach() + + # Find zstd executable on PATH + find_program(AMS_ZSTD_EXECUTABLE zstd) + if(NOT AMS_ZSTD_EXECUTABLE) + message(FATAL_ERROR + "[inputs] zstd not found. Please install it (e.g., 'brew install zstd' on macOS) " + "or put 'zstd' on PATH." + ) + endif() + + get_filename_component(_out_dir "${DEC_OUT_FILE}" DIRECTORY) + file(MAKE_DIRECTORY "${_out_dir}") + + # Build-time rule: OUT_FILE depends on ZST_FILE + add_custom_command( + OUTPUT "${DEC_OUT_FILE}" + DEPENDS "${DEC_ZST_FILE}" + COMMAND ${CMAKE_COMMAND} -E make_directory "${_out_dir}" + # Write to temp then rename for atomicity + COMMAND "${AMS_ZSTD_EXECUTABLE}" -d -f --no-progress -o "${DEC_OUT_FILE}.part" "${DEC_ZST_FILE}" + COMMAND ${CMAKE_COMMAND} -E rename "${DEC_OUT_FILE}.part" "${DEC_OUT_FILE}" + COMMENT "[inputs] Decompressing ${DEC_ZST_FILE} -> ${DEC_OUT_FILE}" + VERBATIM + ) + + add_custom_target("${DEC_NAME}" DEPENDS "${DEC_OUT_FILE}") +endfunction() +