From d30faa38ffec0b2131d772437892a0262b430c5b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:58:13 -0400
Subject: [PATCH 1/8] Add tgamma nvrtc test

---
 test/test_gamma_nvrtc.cpp | 183 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 test/test_gamma_nvrtc.cpp

diff --git a/test/test_gamma_nvrtc.cpp b/test/test_gamma_nvrtc.cpp
new file mode 100644
index 0000000000..2d33430240
--- /dev/null
+++ b/test/test_gamma_nvrtc.cpp
@@ -0,0 +1,183 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+const char* cuda_kernel = R"(
+#include <boost/math/special_functions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float *in1, const float*, float *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::tgamma(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/math/boost-root/libs/math/include/"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
function"); + + int numElements = 5000; + float *h_in1, *h_in2, *h_out; + float *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float[numElements]; + h_in2 = new float[numElements]; + h_out = new float[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = boost::math::tgamma(h_in1[i]); + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From dae419023b6c03108eb16ee14335be928b9270ab Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:58:21 -0400
Subject: [PATCH 2/8] Add CI run

---
 .github/workflows/ci.yml | 58 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0bf82c16e0..ccc4d37466 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -669,3 +669,61 @@ jobs:
 #        run: |
 #          cd ../boost-root/__build__
 #          ctest --output-on-failure --no-tests=error
+  nvrtc-cmake-test:
+    strategy:
+      fail-fast: false
+
+    runs-on: gpu-runner-1
+
+    steps:
+      - uses: Jimver/cuda-toolkit@v0.2.16
+        id: cuda-toolkit
+        with:
+          cuda: '12.5.0'
+          method: 'network'
+
+      - name: Output CUDA information
+        run: |
+          echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc -V
+      - uses: actions/checkout@v4
+
+      - name: Install Packages
+        run: |
+          sudo apt-get install -y cmake make
+      - name: Setup Boost
+        run: |
+          echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
+          LIBRARY=${GITHUB_REPOSITORY#*/}
+          echo LIBRARY: $LIBRARY
+          echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
+          echo GITHUB_BASE_REF: $GITHUB_BASE_REF
+          echo GITHUB_REF: $GITHUB_REF
+          REF=${GITHUB_BASE_REF:-$GITHUB_REF}
+          REF=${REF#refs/heads/}
+          echo REF: $REF
+          BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true
+          echo BOOST_BRANCH: $BOOST_BRANCH
+          cd ..
+          git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
+          cd boost-root
+          mkdir -p libs/$LIBRARY
+          cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY
+          git submodule update --init tools/boostdep
+          python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
+      - name: Configure
+        run: |
+          cd ../boost-root
+          mkdir __build__ && cd __build__
+          cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_MATH_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_MATH_NVRTC_CI_RUN=1 ..
+          pwd
+      - name: Build tests
+        run: |
+          cd ../boost-root/__build__
+          cmake --build . --target tests -j $(nproc)
+      # We don't have the ability for runtime right now
+      #- name: Run tests
+      #  run: |
+      #    cd ../boost-root/__build__
+      #    ctest --output-on-failure --no-tests=error

From 88c36d3fefcd9aed278541fe8b2b77acb261a881 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:58:26 -0400
Subject: [PATCH 3/8] Add Jamfile

---
 test/nvrtc_jamfile | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 test/nvrtc_jamfile

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
new file mode 100644
index 0000000000..707780b7a7
--- /dev/null
+++ b/test/nvrtc_jamfile
@@ -0,0 +1,13 @@
+# Copyright 2024 Matt Borland
+# Distributed under the Boost Software License, Version 1.0.
+# https://www.boost.org/LICENSE_1_0.txt
+
+import testing ;
+import ../../config/checks/config : requires ;
+
+project : requirements
+  [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
+  ;
+
+# Special Functions
+run test_gamma_nvrtc.cpp ;

From 470e5934da54c7505faf30da03cc5c63023f44fb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:59:47 -0400
Subject: [PATCH 4/8] Update CML

---
 test/CMakeLists.txt | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 78d8fd4c34..22799e92ea 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -8,7 +8,7 @@ if(HAVE_BOOST_TEST)
 
 if (BOOST_MATH_ENABLE_CUDA)
 
-  message(STATUS "Building boost.cuda_math with CUDA")
+  message(STATUS "Building boost.math with CUDA")
 
   enable_language(CUDA)
   set(CMAKE_CUDA_EXTENSIONS OFF)
@@ -16,9 +16,26 @@ if(HAVE_BOOST_TEST)
   enable_testing()
 
   boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception Boost::unit_test_framework )
+
+elseif (BOOST_MATH_ENABLE_NVRTC)
+
+  message(STATUS "Building boost.math with NVRTC")
+
+  find_package(CUDA REQUIRED)
+
+  enable_testing()
+
+  set(CUDA_nvrtc_LIBRARY /usr/local/cuda/lib64/libnvrtc.so)
+
+  if (BOOST_MATH_NVRTC_CI_RUN)
+    boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_MATH_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
+  else ()
+    boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
+  endif()
+
 elseif (BOOST_MATH_ENABLE_SYCL)
 
-  message(STATUS "Building boost.cuda_math with SYCL")
+  message(STATUS "Building boost.math with SYCL")
 
   set(CMAKE_CXX_COMPILER "icpx")
   set(CMAKE_C_COMPILER "icx")
@@ -32,4 +49,4 @@ if(HAVE_BOOST_TEST)
 
   endif()
 
-endif()
\ No newline at end of file
+endif()

From e7f09ca02749ee31147d3335a94188a491752c74 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 14:00:39 -0400
Subject: [PATCH 5/8] Add tgamma support to NVRTC

---
 include/boost/math/special_functions/gamma.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index b1beeb766b..46e8236aa1 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -14,6 +14,22 @@
 #pragma once
 #endif
 
+#ifdef __CUDACC_RTC__
+
+namespace boost {
+namespace math {
+
+template <typename T>
+__host__ __device__ T tgamma(T x)
+{
+    return ::tgamma(x);
+}
+
+} // namespace math
+} // namespace boost
+
+#else
+
 #include
 #include
 #include
@@ -2280,4 +2296,6 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t
 #include
 #include
 
+#endif // __CUDACC_RTC__
+
 #endif // BOOST_MATH_SF_GAMMA_HPP

From 477d1c1777253e669bdf677f308a9e6df11f52fd Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 14:45:07 -0400
Subject: [PATCH 6/8] Fix runs on value

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ccc4d37466..ee72988440 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -673,7 +673,7 @@ jobs:
     strategy:
       fail-fast: false
 
-    runs-on: gpu-runner-1
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: Jimver/cuda-toolkit@v0.2.16

From 37db0179032d0820159a6f802f79c1c65f2e39fe Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 7 Aug 2024 07:28:13 -0400
Subject: [PATCH 7/8] Rearrange and add policy overload

---
 .../boost/math/special_functions/gamma.hpp | 33 ++++++++++---------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index 46e8236aa1..70b4ff155c 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -14,21 +14,7 @@
 #pragma once
 #endif
 
-#ifdef __CUDACC_RTC__
-
-namespace boost {
-namespace math {
-
-template <typename T>
-__host__ __device__ T tgamma(T x)
-{
-    return ::tgamma(x);
-}
-
-} // namespace math
-} // namespace boost
-
-#else
+#ifndef __CUDACC_RTC__
 
 #include
 #include
@@ -2296,6 +2282,23 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t
 #include
 #include
 
+#else
+
+namespace boost {
+namespace math {
+
+inline __host__ __device__ float tgamma(float x) { return ::tgammaf(x); }
+inline __host__ __device__ double tgamma(double x) { return ::tgamma(x); }
+
+template <typename T, typename Policy>
+inline __host__ __device__ T tgamma(T x, const Policy&)
+{
+    return boost::math::tgamma(x);
+}
+
+} // namespace math
+} // namespace boost
+
 #endif // __CUDACC_RTC__
 
 #endif // BOOST_MATH_SF_GAMMA_HPP

From 135208b3a642594b7caa3460e8b01441d21456fa Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 7 Aug 2024 07:29:38 -0400
Subject: [PATCH 8/8] Expand testing

---
 test/nvrtc_jamfile                            |   3 +-
 test/test_gamma_nvrtc_double.cpp              | 186 ++++++++++++++++++
 ...a_nvrtc.cpp => test_gamma_nvrtc_float.cpp} |  33 ++--
 3 files changed, 206 insertions(+), 16 deletions(-)
 create mode 100644 test/test_gamma_nvrtc_double.cpp
 rename test/{test_gamma_nvrtc.cpp => test_gamma_nvrtc_float.cpp} (84%)

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 707780b7a7..63d0f2af09 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -10,4 +10,5 @@ project : requirements
   ;
 
 # Special Functions
-run test_gamma_nvrtc.cpp ;
+run test_gamma_nvrtc_double.cpp ;
+run test_gamma_nvrtc_float.cpp ;

diff --git a/test/test_gamma_nvrtc_double.cpp b/test/test_gamma_nvrtc_double.cpp
new file mode 100644
index 0000000000..6b21bd04b7
--- /dev/null
+++ b/test/test_gamma_nvrtc_double.cpp
@@ -0,0 +1,186 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::tgamma(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = boost::math::tgamma(h_in1[i]);
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

diff --git a/test/test_gamma_nvrtc.cpp b/test/test_gamma_nvrtc_float.cpp
similarity index 84%
rename from test/test_gamma_nvrtc.cpp
rename to test/test_gamma_nvrtc_float.cpp
index 2d33430240..ce312f6916 100644
--- a/test/test_gamma_nvrtc.cpp
+++ b/test/test_gamma_nvrtc_float.cpp
@@ -18,10 +18,13 @@
 #include <cuda_runtime.h>
 #include <nvrtc.h>
 
+typedef float float_type;
+
 const char* cuda_kernel = R"(
+typedef float float_type;
 #include <boost/math/special_functions/gamma.hpp>
 extern "C" __global__
-void test_gamma_kernel(const float *in1, const float*, float *out, int numElements)
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     if (i < numElements)
@@ -113,36 +116,36 @@ int main()
         checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
 
         int numElements = 5000;
-        float *h_in1, *h_in2, *h_out;
-        float *d_in1, *d_in2, *d_out;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
 
         // Allocate memory on the host
-        h_in1 = new float[numElements];
-        h_in2 = new float[numElements];
-        h_out = new float[numElements];
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
 
         // Initialize input arrays
         std::mt19937_64 rng(42);
-        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
         for (int i = 0; i < numElements; ++i)
         {
-            h_in1[i] = static_cast<float>(dist(rng));
-            h_in2[i] = static_cast<float>(dist(rng));
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
         }
 
-        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float)), "Failed to allocate device memory for d_in1");
-        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float)), "Failed to allocate device memory for d_in2");
d_in2"); - checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float)), "Failed to allocate device memory for d_out"); + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); - checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); - checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); int blockSize = 256; int numBlocks = (numElements + blockSize - 1) / blockSize; void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); - checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); // Verify Result for (int i = 0; i < numElements; ++i)