From d30faa38ffec0b2131d772437892a0262b430c5b Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:58:13 -0400
Subject: [PATCH 1/8] Add tgamma nvrtc test

---
 test/test_gamma_nvrtc.cpp | 183 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 test/test_gamma_nvrtc.cpp

diff --git a/test/test_gamma_nvrtc.cpp b/test/test_gamma_nvrtc.cpp
new file mode 100644
index 0000000000..2d33430240
--- /dev/null
+++ b/test/test_gamma_nvrtc.cpp
@@ -0,0 +1,183 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+const char* cuda_kernel = R"(
+#include <boost/math/special_functions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float *in1, const float*, float *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::tgamma(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/math/boost-root/libs/math/include/"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
function"); + + int numElements = 5000; + float *h_in1, *h_in2, *h_out; + float *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float[numElements]; + h_in2 = new float[numElements]; + h_out = new float[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = boost::math::tgamma(h_in1[i]); + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From dae419023b6c03108eb16ee14335be928b9270ab Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:58:21 -0400
Subject: [PATCH 2/8] Add CI run

---
 .github/workflows/ci.yml | 58 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0bf82c16e0..ccc4d37466 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -669,3 +669,61 @@ jobs:
 #        run: |
 #          cd ../boost-root/__build__
 #          ctest --output-on-failure --no-tests=error
+  nvrtc-cmake-test:
+    strategy:
+      fail-fast: false
+
+    runs-on: gpu-runner-1
+
+    steps:
+      - uses: Jimver/cuda-toolkit@v0.2.16
+        id: cuda-toolkit
+        with:
+          cuda: '12.5.0'
+          method: 'network'
+
+      - name: Output CUDA information
+        run: |
+          echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc -V
+      - uses: actions/checkout@v4
+
+      - name: Install Packages
+        run: |
+          sudo apt-get install -y cmake make
+      - name: Setup Boost
+        run: |
+          echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
+          LIBRARY=${GITHUB_REPOSITORY#*/}
+          echo LIBRARY: $LIBRARY
+          echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
+          echo GITHUB_BASE_REF: $GITHUB_BASE_REF
+          echo GITHUB_REF: $GITHUB_REF
+          REF=${GITHUB_BASE_REF:-$GITHUB_REF}
+          REF=${REF#refs/heads/}
+          echo REF: $REF
+          BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true
+          echo BOOST_BRANCH: $BOOST_BRANCH
+          cd ..
+          git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
+          cd boost-root
+          mkdir -p libs/$LIBRARY
+          cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY
+          git submodule update --init tools/boostdep
+          python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
+      - name: Configure
+        run: |
+          cd ../boost-root
+          mkdir __build__ && cd __build__
+          cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_MATH_ENABLE_NVRTC=1 -DCMAKE_CUDA_ARCHITECTURES=70 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.5 -DBOOST_MATH_NVRTC_CI_RUN=1 ..
+          pwd
+      - name: Build tests
+        run: |
+          cd ../boost-root/__build__
+          cmake --build . --target tests -j $(nproc)
+      # We don't have the ability for runtime right now
+      #- name: Run tests
+      #  run: |
+      #    cd ../boost-root/__build__
+      #    ctest --output-on-failure --no-tests=error

From 88c36d3fefcd9aed278541fe8b2b77acb261a881 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:58:26 -0400
Subject: [PATCH 3/8] Add Jamfile

---
 test/nvrtc_jamfile | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 test/nvrtc_jamfile

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
new file mode 100644
index 0000000000..707780b7a7
--- /dev/null
+++ b/test/nvrtc_jamfile
@@ -0,0 +1,13 @@
+# Copyright 2024 Matt Borland
+# Distributed under the Boost Software License, Version 1.0.
+# https://www.boost.org/LICENSE_1_0.txt
+
+import testing ;
+import ../../config/checks/config : requires ;
+
+project : requirements
+  [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
+  ;
+
+# Special Functions
+run test_gamma_nvrtc.cpp ;

From 470e5934da54c7505faf30da03cc5c63023f44fb Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 13:59:47 -0400
Subject: [PATCH 4/8] Update CML

---
 test/CMakeLists.txt | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 78d8fd4c34..22799e92ea 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -8,7 +8,7 @@ if(HAVE_BOOST_TEST)
 
 if (BOOST_MATH_ENABLE_CUDA)
 
-  message(STATUS "Building boost.cuda_math with CUDA")
+  message(STATUS "Building boost.math with CUDA")
 
   enable_language(CUDA)
   set(CMAKE_CUDA_EXTENSIONS OFF)
@@ -16,9 +16,26 @@ if(HAVE_BOOST_TEST)
   enable_testing()
 
   boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception Boost::unit_test_framework )
+
+elseif (BOOST_MATH_ENABLE_NVRTC)
+
+  message(STATUS "Building boost.math with NVRTC")
+
+  find_package(CUDA REQUIRED)
+
+  enable_testing()
+
+  set(CUDA_nvrtc_LIBRARY /usr/local/cuda/lib64/libnvrtc.so)
+
+  if (BOOST_MATH_NVRTC_CI_RUN)
+    boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} COMPILE_DEFINITIONS BOOST_MATH_NVRTC_CI_RUN=1 INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
+  else ()
+    boost_test_jamfile(FILE nvrtc_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::static_assert Boost::throw_exception ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
+  endif()
+
 elseif (BOOST_MATH_ENABLE_SYCL)
 
-  message(STATUS "Building boost.cuda_math with SYCL")
+  message(STATUS "Building boost.math with SYCL")
 
   set(CMAKE_CXX_COMPILER "icpx")
   set(CMAKE_C_COMPILER "icx")
@@ -32,4 +49,4 @@ if(HAVE_BOOST_TEST)
 
   endif()
 
-endif()
\ No newline at end of file
+endif()

From e7f09ca02749ee31147d3335a94188a491752c74 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 14:00:39 -0400
Subject: [PATCH 5/8] Add tgamma support to NVRTC

---
 include/boost/math/special_functions/gamma.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index b1beeb766b..46e8236aa1 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -14,6 +14,22 @@
 #pragma once
 #endif
 
+#ifdef __CUDACC_RTC__
+
+namespace boost {
+namespace math {
+
+template <typename T>
+__host__ __device__ T tgamma(T x)
+{
+    return ::tgamma(x);
+}
+
+} // namespace math
+} // namespace boost
+
+#else
+
 #include
 #include
 #include
@@ -2280,4 +2296,6 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t
 #include
 #include
 
+#endif // __CUDACC_RTC__
+
 #endif // BOOST_MATH_SF_GAMMA_HPP

From 477d1c1777253e669bdf677f308a9e6df11f52fd Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 6 Aug 2024 14:45:07 -0400
Subject: [PATCH 6/8] Fix runs on value

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ccc4d37466..ee72988440 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -673,7 +673,7 @@ jobs:
     strategy:
       fail-fast: false
 
-    runs-on: gpu-runner-1
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: Jimver/cuda-toolkit@v0.2.16

From 37db0179032d0820159a6f802f79c1c65f2e39fe Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 7 Aug 2024 07:28:13 -0400
Subject: [PATCH 7/8] Rearrange and add policy overload

---
 .../boost/math/special_functions/gamma.hpp | 33 ++++++++++---------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp
index 46e8236aa1..70b4ff155c 100644
--- a/include/boost/math/special_functions/gamma.hpp
+++ b/include/boost/math/special_functions/gamma.hpp
@@ -14,21 +14,7 @@
 #pragma once
 #endif
 
-#ifdef __CUDACC_RTC__
-
-namespace boost {
-namespace math {
-
-template <typename T>
-__host__ __device__ T tgamma(T x)
-{
-    return ::tgamma(x);
-}
-
-} // namespace math
-} // namespace boost
-
-#else
+#ifndef __CUDACC_RTC__
 
 #include
 #include
@@ -2296,6 +2282,23 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t
 #include
 #include
 
+#else
+
+namespace boost {
+namespace math {
+
+inline __host__ __device__ float tgamma(float x) { return ::tgammaf(x); }
+inline __host__ __device__ double tgamma(double x) { return ::tgamma(x); }
+
+template <typename T, typename Policy>
+inline __host__ __device__ T tgamma(T x, const Policy&)
+{
+    return boost::math::tgamma(x);
+}
+
+} // namespace math
+} // namespace boost
+
 #endif // __CUDACC_RTC__
 
 #endif // BOOST_MATH_SF_GAMMA_HPP

From 135208b3a642594b7caa3460e8b01441d21456fa Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Wed, 7 Aug 2024 07:29:38 -0400
Subject: [PATCH 8/8] Expand testing

---
 test/nvrtc_jamfile                            |   3 +-
 test/test_gamma_nvrtc_double.cpp              | 186 ++++++++++++++++++
 ...a_nvrtc.cpp => test_gamma_nvrtc_float.cpp} |  33 ++--
 3 files changed, 206 insertions(+), 16 deletions(-)
 create mode 100644 test/test_gamma_nvrtc_double.cpp
 rename test/{test_gamma_nvrtc.cpp => test_gamma_nvrtc_float.cpp} (84%)

diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index 707780b7a7..63d0f2af09 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -10,4 +10,5 @@ project : requirements
   ;
 
 # Special Functions
-run test_gamma_nvrtc.cpp ;
+run test_gamma_nvrtc_double.cpp ;
+run test_gamma_nvrtc_float.cpp ;

diff --git a/test/test_gamma_nvrtc_double.cpp b/test/test_gamma_nvrtc_double.cpp
new file mode 100644
index 0000000000..6b21bd04b7
--- /dev/null
+++ b/test/test_gamma_nvrtc_double.cpp
@@ -0,0 +1,186 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::tgamma(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = boost::math::tgamma(h_in1[i]);
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

diff --git a/test/test_gamma_nvrtc.cpp b/test/test_gamma_nvrtc_float.cpp
similarity index 84%
rename from test/test_gamma_nvrtc.cpp
rename to test/test_gamma_nvrtc_float.cpp
index 2d33430240..ce312f6916 100644
--- a/test/test_gamma_nvrtc.cpp
+++ b/test/test_gamma_nvrtc_float.cpp
@@ -18,10 +18,13 @@
 #include <cuda_runtime.h>
 #include <nvrtc.h>
 
+typedef float float_type;
+
 const char* cuda_kernel = R"(
+typedef float float_type;
 #include <boost/math/special_functions/gamma.hpp>
 extern "C" __global__
-void test_gamma_kernel(const float *in1, const float*, float *out, int numElements)
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     if (i < numElements)
@@ -113,36 +116,36 @@ int main()
         checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
 
         int numElements = 5000;
-        float *h_in1, *h_in2, *h_out;
-        float *d_in1, *d_in2, *d_out;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
 
         // Allocate memory on the host
-        h_in1 = new float[numElements];
-        h_in2 = new float[numElements];
-        h_out = new float[numElements];
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
 
         // Initialize input arrays
         std::mt19937_64 rng(42);
-        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
         for (int i = 0; i < numElements; ++i)
         {
-            h_in1[i] = static_cast<float>(dist(rng));
-            h_in2[i] = static_cast<float>(dist(rng));
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
         }
 
-        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float)), "Failed to allocate device memory for d_in1");
-        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float)), "Failed to allocate device memory for d_in2");
d_in2"); - checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float)), "Failed to allocate device memory for d_out"); + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); - checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); - checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); int blockSize = 256; int numBlocks = (numElements + blockSize - 1) / blockSize; void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); - checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); // Verify Result for (int i = 0; i < numElements; ++i)