diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c2f48cd..be6123ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.12.0 + VERSION 0.13.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/README.md b/README.md index ddb93741..bc91416f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellowgreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.12.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.13.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2_%7C_aarch64-gray) @@ -239,12 +239,13 @@ cpack . -G [DEB|TXZ] Python Wheels -By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory used to build the release build and `X` and `Y` are Python major and minor versions. +By default, during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python version(s). The wheels are now output to the `build-rel/python3/repaired_wheels` folder (after being processed by the `auditwheel repair` command in the case of ManyLinux). The single generated Python wheel is compatible with all Python versions specified during the CMake build step. Here, `build-rel` is the build directory used for the release build. -The built wheels can be installed using pip. -For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems: +For PyPI compliance, the new Python wheels must be built within the ManyLinux 2014 Docker environment. The Docker images can be generated using the `docker/manylinux/docker_buildx.sh` script. These images ensure the wheels meet ManyLinux 2014 and PyPI standards. + +The built wheels can still be installed using `pip`.
For example, to install the Python wheel built for CUDA 12.x, Python 3.10 and 3.11 on Linux x86_64 systems: ```shell -pip install cvcuda_cu12--cp310-cp310-linux_x86_64.whl +pip install cvcuda_cu12--cp310.cp311-cp310.cp311-linux_x86_64.whl ``` ## Contributing diff --git a/cmake/BuildPython.cmake b/cmake/BuildPython.cmake index cab2e737..d42414f4 100644 --- a/cmake/BuildPython.cmake +++ b/cmake/BuildPython.cmake @@ -45,14 +45,14 @@ list(APPEND PYPROJ_COMMON_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} ) -# It need to overwrite the PYTHON_MODULE_EXTENSION to generate +# It needs to overwrite the PYTHON_MODULE_EXTENSION to generate # python module name with correct name when cross compiling # example: set(PYTHON_MODULE_EXTENSION .cpython-py38-aarch64-linux-gnu.so) if (CMAKE_CROSSCOMPILING) -list(APPEND PYPROJ_COMMON_ARGS - -DCUDAToolkit_ROOT=${CUDAToolkit_ROOT} - -DPYTHON_MODULE_EXTENSION=${PYTHON_MODULE_EXTENSION} -) + list(APPEND PYPROJ_COMMON_ARGS + -DCUDAToolkit_ROOT=${CUDAToolkit_ROOT} + -DPYTHON_MODULE_EXTENSION=${PYTHON_MODULE_EXTENSION} + ) endif() foreach(VER ${PYTHON_VERSIONS}) @@ -61,7 +61,7 @@ foreach(VER ${PYTHON_VERSIONS}) ExternalProject_Add(cvcuda_python${VER} PREFIX ${BASEDIR} SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/python - CMAKE_ARGS ${PYPROJ_COMMON_ARGS} -DPYTHON_VERSION=${VER} -DBUILD_ROOT=${CMAKE_BINARY_DIR} -DPYTHON_VERSION_SHORT=${VER} + CMAKE_ARGS ${PYPROJ_COMMON_ARGS} -DPYTHON_VERSION=${VER} BINARY_DIR ${BASEDIR}/build TMP_DIR ${BASEDIR}/tmp STAMP_DIR ${BASEDIR}/stamp @@ -72,7 +72,37 @@ foreach(VER ${PYTHON_VERSIONS}) endforeach() if(CMAKE_BUILD_TYPE STREQUAL "Release") - foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) - configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py.in" "${CMAKE_BINARY_DIR}/python${PYTHON_VERSION}/setup.py") + set(PACKAGE_LIB_DIR ${CMAKE_BINARY_DIR}/python3/lib) + + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python3) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python3/lib) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python3/cvcuda) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python3/cvcuda/_bindings) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python3/nvcv) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/python3/nvcv/_bindings) + + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py.in" "${CMAKE_BINARY_DIR}/python3/setup.py") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/__init__.py.in" "${CMAKE_BINARY_DIR}/python3/cvcuda/__init__.py") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/__init__.py.in" "${CMAKE_BINARY_DIR}/python3/nvcv/__init__.py") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/_load_binding.py.in" "${CMAKE_BINARY_DIR}/python3/cvcuda/_load_binding.py") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/_load_binding.py.in" "${CMAKE_BINARY_DIR}/python3/nvcv/_load_binding.py") + + add_custom_target(wheel ALL) + + foreach(VER ${PYTHON_VERSIONS}) + add_dependencies(wheel cvcuda_python${VER}) endforeach() + + add_custom_command( + TARGET wheel + COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:cvcuda> ${CMAKE_BINARY_DIR}/python3/lib + COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nvcv_types> ${CMAKE_BINARY_DIR}/python3/lib + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/python/cvcuda*.so ${CMAKE_BINARY_DIR}/python3/cvcuda/_bindings + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/python/nvcv*.so ${CMAKE_BINARY_DIR}/python3/nvcv/_bindings + ) + + add_custom_command( + TARGET wheel + COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/python/build_wheels.sh" "${CMAKE_BINARY_DIR}/python3" + ) endif() diff --git a/cmake/ConfigCompiler.cmake
b/cmake/ConfigCompiler.cmake index b75165e0..4031d008 100644 --- a/cmake/ConfigCompiler.cmake +++ b/cmake/ConfigCompiler.cmake @@ -81,10 +81,11 @@ if(BUILD_TESTS) set(candidate_compilers ${PUBLIC_API_COMPILERS}) else() # If not, by default, we'll try these. - set(candidate_compilers gcc-11 gcc-9 clang-11 clang-14) + set(candidate_compilers gcc-11 gcc-10 gcc-9 clang-11 clang-14) endif() unset(valid_compilers) + set(at_least_one_compiler_found OFF) foreach(comp ${candidate_compilers}) string(MAKE_C_IDENTIFIER "${comp}" comp_str) @@ -93,14 +94,20 @@ if(BUILD_TESTS) find_program(COMPILER_EXEC_${COMP_STR} ${comp}) if(COMPILER_EXEC_${COMP_STR}) list(APPEND valid_compilers ${comp}) + set(at_least_one_compiler_found ON) else() if(PUBLIC_API_COMPILERS) message(FATAL_ERROR "Compiler '${comp}' not found") - else() - message(WARNING "Compiler '${comp}' not found, skipping public API checks for it") endif() endif() endforeach() + + if(NOT at_least_one_compiler_found) + foreach(comp ${candidate_compilers}) + message(WARNING "Compiler '${comp}' not found, skipping public API checks for it") + endforeach() + endif() + set(PUBLIC_API_COMPILERS "${valid_compilers}") endif() diff --git a/cmake/ConfigPython.cmake b/cmake/ConfigPython.cmake index 83f65f8b..85f1cb23 100644 --- a/cmake/ConfigPython.cmake +++ b/cmake/ConfigPython.cmake @@ -81,6 +81,33 @@ file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/cvcuda_$> /etc/profile.d/python.sh && \ + echo "export PYBIN=${PYBIN}" >> /etc/profile.d/python.sh && \ + echo "export PYLIB=${PYLIB}" >> /etc/profile.d/python.sh && \ + echo "export PATH=\${PYTHONPATH}/bin:/opt/python/*/bin:\${PATH}" >> /etc/profile.d/python.sh && \ + echo "export LD_LIBRARY_PATH=/usr/local/lib:/opt/python/*/lib:\${PYLIB}:\${LD_LIBRARY_PATH}" >> /etc/profile.d/python.sh && \ + echo "export LIBRARY_PATH=/usr/local/lib:/opt/python/*/lib:\${PYLIB}:\${LIBRARY_PATH}" >> /etc/profile.d/python.sh && \ + chmod +x /etc/profile.d/python.sh + +# Install Python packages +RUN python3 -m pip install --no-cache-dir \ + breathe \ + cibuildwheel \ + clang==${CLANG_VERSION} \ + exhale \ + flake8 \ + future \ + graphviz \ + numpy \ + pre-commit \ + recommonmark \ + setuptools \ + sphinx_rtd_theme \ + sphinx==${SPHINX_VERSION} \ + twine \ + wheel + +# Update the dynamic linker run-time bindings +RUN ldconfig + +# extra deps +COPY --from=extra_deps / / diff --git a/docker/manylinux/Dockerfile.builder.deps b/docker/manylinux/Dockerfile.builder.deps new file mode 100644 index 00000000..1afe990c --- /dev/null +++ b/docker/manylinux/Dockerfile.builder.deps @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
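+
+# Builder image: layers the CUDA toolkit (copied from CUDA_IMAGE below) onto
+# the ManyLinux-based toolchain image, together with the libcuda.so* libraries
+# found in the CUDA image, so wheels can be compiled without a host GPU driver.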
+ +ARG FROM_IMAGE_NAME=quay.io/pypa/manylinux2014_x86_64 +ARG CUDA_IMAGE +ARG BUILDER_CUDA_EXTRA_DEPS=scratch + +FROM ${BUILDER_CUDA_EXTRA_DEPS} AS cuda_extra_deps +FROM ${CUDA_IMAGE} AS cuda + +# Find and copy libcuda.so* to /cuda_libs +RUN mkdir /cuda_libs && \ + find /usr -name 'libcuda.so*' -exec cp {} /cuda_libs/ \; + +FROM ${FROM_IMAGE_NAME} + +ENV PATH=/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} + +ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility,compat32 + +# Propagating the environment variable to profile.d +RUN echo "export NVIDIA_DRIVER_CAPABILITIES=video,compute,utility,compat32" >> /etc/profile.d/nvidia.sh && \ + echo "export PATH=/usr/local/cuda/bin:\${PATH}" >> /etc/profile.d/nvidia.sh && \ + echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:\${LD_LIBRARY_PATH}" >> /etc/profile.d/nvidia.sh && \ + chmod +x /etc/profile.d/nvidia.sh + +# CUDA +COPY --from=cuda /usr/local/cuda /usr/local/cuda + +# Copy libcuda.so* files +COPY --from=cuda /cuda_libs/* /usr/lib64/ + +# Test CUDA compiler +RUN nvcc --version + +# Ensure tmp is writable by all users recursively +RUN chmod -R a+rw /tmp + +RUN git clone https://github.com/google/googletest.git -b release-1.10.0 && \ + pushd googletest && \ + mkdir build && \ + pushd build && \ + cmake .. && \ + make -j$(nproc) && make install && \ + popd && popd && rm -rf googletest + +# Extra deps +COPY --from=cuda_extra_deps / / diff --git a/docker/manylinux/Dockerfile.cuda.centos7.deps b/docker/manylinux/Dockerfile.cuda.centos7.deps new file mode 100644 index 00000000..3cfa7efe --- /dev/null +++ b/docker/manylinux/Dockerfile.cuda.centos7.deps @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvidia/cuda:11.4.3-devel-centos7 +FROM ${FROM_IMAGE_NAME} AS cuda + +RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime diff --git a/docker/manylinux/Dockerfile.cuda.ubuntu20.04.deps b/docker/manylinux/Dockerfile.cuda.ubuntu20.04.deps new file mode 100644 index 00000000..413df417 --- /dev/null +++ b/docker/manylinux/Dockerfile.cuda.ubuntu20.04.deps @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
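+
+# Test-OS base image: CUDA devel image on Ubuntu 20.04 with the build/test
+# tooling (git-lfs, ninja, ccache, gtest/gmock, python3) used by the runners.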
+ +ARG FROM_IMAGE_NAME=nvidia/cuda:11.4.3-devel-ubuntu20.04 +FROM ${FROM_IMAGE_NAME} AS cuda + +ARG DEBIAN_FRONTEND=noninteractive + +# need to update and install in one go, or else installation might use +# stale data from server stored in docker cache, with packages that don't exist anymore. +RUN apt-get update && \ + apt-get install -y --no-install-recommends git git-lfs software-properties-common wget \ + && add-apt-repository ppa:ubuntu-toolchain-r/test \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + git git-lfs \ + ninja-build \ + ccache \ + libgtest-dev libgmock-dev \ + shellcheck \ + curl + +RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime + +# Allow using this image in systems without proper CUDA runtime/driver support. +# We'll be using this image only for building, don't need strict CUDA checks. +ENV NVIDIA_DISABLE_REQUIRE=true + +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3 python3-pip python3-pytest python3-dev doxygen \ + && rm -rf /var/lib/apt/lists/* + +# python3 is python3.8 in ubuntu20.04 +RUN python3 -m pip install pre-commit diff --git a/docker/manylinux/Dockerfile.cuda.ubuntu22.04.deps b/docker/manylinux/Dockerfile.cuda.ubuntu22.04.deps new file mode 100644 index 00000000..9d237378 --- /dev/null +++ b/docker/manylinux/Dockerfile.cuda.ubuntu22.04.deps @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=nvidia/cuda:11.4.3-devel-ubuntu22.04 +FROM ${FROM_IMAGE_NAME} AS cuda + +ARG DEBIAN_FRONTEND=noninteractive + +# need to update and install in one go, or else installation might use +# stale data from server stored in docker cache, with packages that don't exist anymore. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + git git-lfs \ + ninja-build \ + ccache \ + libgtest-dev libgmock-dev \ + pre-commit shellcheck \ + curl + +RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime diff --git a/docker/manylinux/Dockerfile.gcc.manylinux2014.deps b/docker/manylinux/Dockerfile.gcc.manylinux2014.deps new file mode 100644 index 00000000..06c2933b --- /dev/null +++ b/docker/manylinux/Dockerfile.gcc.manylinux2014.deps @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
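+
+# Toolchain image: manylinux2014 base with devtoolset GCC (and its sanitizer
+# runtimes) enabled as the default compiler via /etc/profile.d/gcc.sh.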
+ +ARG FROM_IMAGE_NAME=quay.io/pypa/manylinux2014_x86_64 +ARG BUILDER_EXTRA_DEPS=scratch + +FROM ${BUILDER_EXTRA_DEPS} AS extra_deps +FROM ${FROM_IMAGE_NAME} + +ARG GCC_VERSION=10 + +ENV GCC_VERSION=${GCC_VERSION} + +# Install EPEL and SCL repositories +RUN yum install -y epel-release && yum repolist + +# Install yum Dependencies +RUN yum install -y \ + wget nasm doxygen graphviz gettext xz openssl-devel openssl-static autogen zip dpkg \ + devtoolset-${GCC_VERSION} \ + devtoolset-${GCC_VERSION}-libasan-devel \ + devtoolset-${GCC_VERSION}-liblsan-devel \ + devtoolset-${GCC_VERSION}-libtsan-devel \ + devtoolset-${GCC_VERSION}-libubsan-devel && \ + yum clean all && rm -rf /var/cache/yum + +# Update PATH and LD_LIBRARY_PATH to use GCC from devtoolset +ENV PATH=/opt/rh/devtoolset-${GCC_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${GCC_VERSION}/root/usr/lib64:$LD_LIBRARY_PATH + +# Set the compiler environment variables +ENV CC=gcc +ENV CXX=g++ + +# Propagating the environment variables to profile.d +RUN echo "export CC=gcc" > /etc/profile.d/gcc.sh && \ + echo "export CXX=g++" >> /etc/profile.d/gcc.sh && \ + echo "source /opt/rh/devtoolset-${GCC_VERSION}/enable" >> /etc/profile.d/gcc.sh && \ + echo "alias gcc-${GCC_VERSION}=gcc" >> /etc/profile.d/gcc.sh && \ + echo "alias g++-${GCC_VERSION}=g++" >> /etc/profile.d/gcc.sh && \ + chmod +x /etc/profile.d/gcc.sh + +# Don't want the short-unicode version for Python 2.7 +RUN rm -f /opt/python/cp27-cp27m diff --git a/docker/manylinux/Dockerfile.runner.deps b/docker/manylinux/Dockerfile.runner.deps new file mode 100644 index 00000000..5271dbd4 --- /dev/null +++ b/docker/manylinux/Dockerfile.runner.deps @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG FROM_IMAGE_NAME=quay.io/pypa/manylinux2014_x86_64 +ARG BUILDER_CUDA_EXTRA_DEPS=scratch + +FROM ${BUILDER_CUDA_EXTRA_DEPS} AS cuda_extra_deps +FROM ${FROM_IMAGE_NAME} + +ENV PATH=/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} + +ENV NVIDIA_DRIVER_CAPABILITIES=video,compute,utility,compat32 + +# Propagating the environment variable to profile.d +RUN echo "export NVIDIA_DRIVER_CAPABILITIES=video,compute,utility,compat32" >> /etc/profile.d/nvidia.sh && \ + echo "export PATH=/usr/local/cuda/bin:\${PATH}" >> /etc/profile.d/nvidia.sh && \ + echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:\${LD_LIBRARY_PATH}" >> /etc/profile.d/nvidia.sh && \ + chmod +x /etc/profile.d/nvidia.sh + +# Extra deps +COPY --from=cuda_extra_deps / / diff --git a/docker/manylinux/ccache.conf b/docker/manylinux/ccache.conf new file mode 100644 index 00000000..1fb25208 --- /dev/null +++ b/docker/manylinux/ccache.conf @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +max_size = 20G +cache_dir = /cache/ccache diff --git a/docker/manylinux/config_external.sh b/docker/manylinux/config_external.sh new file mode 100755 index 00000000..f343a4c9 --- /dev/null +++ b/docker/manylinux/config_external.sh @@ -0,0 +1,43 @@ +#!/bin/bash -ex + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export DOCKER_BUILDKIT=${DOCKER_BUILDKIT:-1} + +export PLATFORM=${PLATFORM:-"linux/amd64"} +export ARCH=${ARCH:-"x86_64"} + +export REGISTRY_MANYLINUX_PREFIX=${REGISTRY_MANYLINUX_PREFIX:-"quay.io/pypa/"} +export REGISTRY_CUDA_PREFIX=${REGISTRY_CUDA_PREFIX:-"nvidia/"} +export REGISTRY_HOST_PREFIX=${REGISTRY_HOST_PREFIX:-""} + +export MANYLINUX_VERSION="2014" +export MANYLINUX_BASE_OS="centos7" +export MANYLINUX_IMAGE_TAG="2024.10.26-1" + +export GCC_VERSIONS=( + "10" +) + +export CUDA_VERSIONS=( + "11.7.1" + "12.2.0" +) + +export TEST_OS_VERSIONS=( + "ubuntu20.04" + "ubuntu22.04" +) diff --git a/docker/manylinux/docker_buildx.sh b/docker/manylinux/docker_buildx.sh new file mode 100755 index 00000000..1474d852 --- /dev/null +++ b/docker/manylinux/docker_buildx.sh @@ -0,0 +1,160 @@ +#!/bin/bash -ex + +# SPDX-License-Identifier: Apache-2.0 + +# Ensure failures are caught when commands are piped +set -o pipefail + +# Set default version if not provided +export VERSION="${VERSION:-1}" + +# Get the directory of the script +SCRIPT_DIR="$(readlink -f $(dirname "$0"))" + +# Move to the script directory +pushd "${SCRIPT_DIR}" >/dev/null + +# Source configuration files +if ! 
source "${SCRIPT_DIR}/config_internal.sh"; then + source "${SCRIPT_DIR}/config_external.sh" +fi + +# Initialize variables +BUILDER_NAME="cvcuda_builder" + +# Initialize buildx instance +docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1 || docker buildx create --name "${BUILDER_NAME}" +docker buildx use "${BUILDER_NAME}" +docker buildx inspect --bootstrap + +####### BASE IMAGES ####### + +# Build Manylinux images with different GCC versions +for GCC_VERSION in "${GCC_VERSIONS[@]}"; do + IMAGE_NAME="${REGISTRY_HOST_PREFIX}manylinux${MANYLINUX_VERSION}-${ARCH}.gcc${GCC_VERSION}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.gcc.manylinux${MANYLINUX_VERSION}.deps" + FROM_IMAGE_NAME="${REGISTRY_MANYLINUX_PREFIX}manylinux${MANYLINUX_VERSION}_${ARCH}:${MANYLINUX_IMAGE_TAG}" + + docker buildx build \ + --cache-to type=inline \ + --cache-from type=registry,ref="${IMAGE_NAME}" \ + -t "${IMAGE_NAME}" \ + -t "${IMAGE_NAME}:v${VERSION}" \ + -f "${DOCKERFILE}" \ + --build-arg "FROM_IMAGE_NAME=${FROM_IMAGE_NAME}" \ + --build-arg "GCC_VERSION=${GCC_VERSION}" \ + --platform "${PLATFORM}" \ + --provenance=false \ + --push \ + . +done + +# Build CUDA images on manylinux platform +for CUDA_VERSION in "${CUDA_VERSIONS[@]}"; do + IMAGE_NAME="${REGISTRY_HOST_PREFIX}cuda${CUDA_VERSION}-${MANYLINUX_BASE_OS}-${ARCH}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.cuda.${MANYLINUX_BASE_OS}.deps" + FROM_IMAGE_NAME="${REGISTRY_CUDA_PREFIX}cuda:${CUDA_VERSION}-devel-${MANYLINUX_BASE_OS}" + + docker buildx build \ + --cache-to type=inline \ + --cache-from type=registry,ref="${IMAGE_NAME}" \ + -t "${IMAGE_NAME}" \ + -t "${IMAGE_NAME}:v${VERSION}" \ + -f "${DOCKERFILE}" \ + --build-arg FROM_IMAGE_NAME="${FROM_IMAGE_NAME}" \ + --platform "${PLATFORM}" \ + --provenance=false \ + --push \ + . +done + +# Build CUDA images on test OS platforms +for CUDA_VERSION in "${CUDA_VERSIONS[@]}"; do + for OS_VERSION in "${TEST_OS_VERSIONS[@]}"; do + IMAGE_NAME="${REGISTRY_HOST_PREFIX}cuda${CUDA_VERSION}-${OS_VERSION}-${ARCH}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.cuda.${OS_VERSION}.deps" + FROM_IMAGE_NAME="${REGISTRY_CUDA_PREFIX}cuda:${CUDA_VERSION}-devel-${OS_VERSION}" + + docker buildx build \ + --cache-to type=inline \ + --cache-from type=registry,ref="${IMAGE_NAME}" \ + -t "${IMAGE_NAME}" \ + -t "${IMAGE_NAME}:v${VERSION}" \ + -f "${DOCKERFILE}" \ + --build-arg FROM_IMAGE_NAME="${FROM_IMAGE_NAME}" \ + --platform "${PLATFORM}" \ + --provenance=false \ + --push \ + . + done +done + +# Build base images for building dependencies +for GCC_VERSION in "${GCC_VERSIONS[@]}"; do + IMAGE_NAME="${REGISTRY_HOST_PREFIX}cvcuda_deps-${ARCH}.gcc${GCC_VERSION}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.build.manylinux${MANYLINUX_VERSION}.deps" + FROM_IMAGE_NAME="${REGISTRY_HOST_PREFIX}manylinux${MANYLINUX_VERSION}-${ARCH}.gcc${GCC_VERSION}" + + docker buildx build \ + --cache-to type=inline \ + --cache-from type=registry,ref="${IMAGE_NAME}" \ + -t "${IMAGE_NAME}" \ + -t "${IMAGE_NAME}:v${VERSION}" \ + -f "${DOCKERFILE}" \ + --build-arg FROM_IMAGE_NAME="${FROM_IMAGE_NAME}" \ + --build-arg ARCH="${ARCH}" \ + --platform "${PLATFORM}" \ + --provenance=false \ + --push \ + . 
+done + +####### BUILDER IMAGES ####### + +# Generate the builder image over CUDA and compiler versions +for CUDA_VERSION in "${CUDA_VERSIONS[@]}"; do + for GCC_VERSION in "${GCC_VERSIONS[@]}"; do + IMAGE_NAME="${REGISTRY_HOST_PREFIX}builder-cuda${CUDA_VERSION}-gcc${GCC_VERSION}-${ARCH}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.builder.deps" + FROM_IMAGE_NAME="${REGISTRY_HOST_PREFIX}cvcuda_deps-${ARCH}.gcc${GCC_VERSION}" + CUDA_IMAGE="${REGISTRY_HOST_PREFIX}cuda${CUDA_VERSION}-${MANYLINUX_BASE_OS}-${ARCH}" + + docker buildx build \ + --cache-to type=inline \ + --cache-from type=registry,ref="${IMAGE_NAME}" \ + -t "${IMAGE_NAME}" \ + -t "${IMAGE_NAME}:v${VERSION}" \ + -f "${DOCKERFILE}" \ + --build-arg FROM_IMAGE_NAME="${FROM_IMAGE_NAME}" \ + --build-arg CUDA_IMAGE="${CUDA_IMAGE}" \ + --platform "${PLATFORM}" \ + --provenance=false \ + --push \ + . + done +done + +####### RUNNER IMAGES ####### + +# Generate the runner image over CUDA and OS versions +for CUDA_VERSION in "${CUDA_VERSIONS[@]}"; do + for OS_VERSION in "${TEST_OS_VERSIONS[@]}"; do + IMAGE_NAME="${REGISTRY_HOST_PREFIX}runner-cuda${CUDA_VERSION}-${OS_VERSION}-${ARCH}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.runner.deps" + FROM_IMAGE_NAME="${REGISTRY_HOST_PREFIX}cuda${CUDA_VERSION}-${OS_VERSION}-${ARCH}" + + docker buildx build \ + --cache-to type=inline \ + --cache-from type=registry,ref="${IMAGE_NAME}" \ + -t "${IMAGE_NAME}" \ + -t "${IMAGE_NAME}:v${VERSION}" \ + -f "${DOCKERFILE}" \ + --build-arg FROM_IMAGE_NAME="${FROM_IMAGE_NAME}" \ + --platform "${PLATFORM}" \ + --provenance=false \ + --push \ + . + done +done + +popd >/dev/null diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 69d555b1..04f069b7 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -123,6 +123,7 @@ Copyright :maxdepth: 1 :hidden: + v0.13.0-beta v0.12.0-beta v0.11.0-beta v0.10.1-beta diff --git a/docs/sphinx/relnotes/v0.13.0-beta.rst b/docs/sphinx/relnotes/v0.13.0-beta.rst new file mode 100644 index 00000000..fdf98df4 --- /dev/null +++ b/docs/sphinx/relnotes/v0.13.0-beta.rst @@ -0,0 +1,65 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.13.0-beta: + +v0.13.0-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.13.0 includes ManyLinux 2014-compliant wheels alongside the following changes: + +* **New Features**: + + * Added Python wheel generation compliant with ManyLinux 2014 and PyPI standards. + + * Wheels for multiple Python versions are now unified into a single wheel file per CUDA version (a short loader sketch follows this list). + + * Included scripts to build two ManyLinux 2014 Docker images (CUDA 11, CUDA 12) for building, and four Ubuntu images (20.04 and 22.04 x CUDA 11, CUDA 12) for testing. + + * Python wheels must be built within the ManyLinux 2014 Docker images to guarantee ManyLinux 2014 compliance.
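+
+For illustration, a minimal sketch of how the unified wheel resolves the binding for the running interpreter, mirroring the loader added in ``python/_load_binding.py.in`` (the filename shown is illustrative):
+
+.. code-block:: python
+
+    import sys
+    import sysconfig
+
+    # The wheel ships one binding per supported interpreter, e.g.
+    # cvcuda/_bindings/cvcuda.cpython-310-x86_64-linux-gnu.so; the running
+    # interpreter's ABI tag (SOABI) selects the matching file at import time.
+    abi_tag = sysconfig.get_config_var("SOABI")
+    print(f"Python {sys.version_info.major}.{sys.version_info.minor} "
+          f"loads _bindings/cvcuda.{abi_tag}.so")
+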
+ +* **Bug Fixes**: + + * Upgraded pybind11 to version 2.13.6 for improved compatibility and functionality. + + * Resolved Python ABI compatibility issues reported in previous versions by upgrading pybind11. + + +Compatibility and Known Limitations +----------------------------------- + +For the full list, see the main README on `CV-CUDA GitHub <https://github.com/CVCUDA/CV-CUDA>`_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 <https://www.apache.org/licenses/LICENSE-2.0>`_ license. + +Resources +--------- + +1. `CV-CUDA GitHub <https://github.com/CVCUDA/CV-CUDA>`_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2f175f08..b6de39ae 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -39,10 +39,6 @@ string(REPLACE "." "" PYTHON_MODULE_NAME "${PYTHON_MODULE_NAME}") include(GNUInstallDirs) set(PYTHON_MODULE_FILENAME_LIST "" CACHE INTERNAL "") -if(CMAKE_BUILD_TYPE STREQUAL "Release") - add_custom_target(wheel ALL) -endif() - function(nvcv_python_add_module) cmake_parse_arguments(ARG "SHARED;MODULE" "TARGET;OUTPUT_NAME" "SOURCES" ${ARGV}) @@ -80,10 +76,6 @@ function(nvcv_python_add_module) set(PYTHON_MODULE_FILENAME_LIST "${PYTHON_MODULE_FILENAME_LIST};${prefix}${ARG_OUTPUT_NAME}${suffix}" CACHE INTERNAL "") - if(CMAKE_BUILD_TYPE STREQUAL "Release") - add_dependencies(wheel ${ARG_TARGET}) - endif() - install(TARGETS ${ARG_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/python COMPONENT ${PYTHON_MODULE_NAME} @@ -102,10 +94,3 @@ string(JOIN " " PYTHON_MODULE_FILENAME_LIST ${PYTHON_MODULE_FILENAME_LIST}) configure_file(cpack/debian_python_postinst.in cpack/postinst @ONLY) configure_file(cpack/debian_python_prerm.in cpack/prerm @ONLY) - -# Create Python wheel -if(CMAKE_BUILD_TYPE STREQUAL "Release") - add_custom_command( - TARGET wheel - COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/build_wheels.sh" "${BUILD_ROOT}" ${PYTHON_VERSION_SHORT} ) -endif() diff --git a/python/__init__.py.in b/python/__init__.py.in new file mode 100644 index 00000000..7a602de5 --- /dev/null +++ b/python/__init__.py.in @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
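+
+# Illustrative usage (an assumption based on the binding's exported symbols,
+# which include __version__, as exercised by _load_binding.py.in):
+#
+#   import cvcuda
+#   print(cvcuda.__version__)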
+ +import os + +from ._load_binding import load_binding as _load_binding + +# Dynamically load the appropriate binding +_binding = _load_binding( + __name__, + os.path.join(os.path.dirname(__file__), '_bindings') +) + +# Import all symbols from the binding into the top-level namespace +__all__ = dir(_binding) +globals().update({symbol: getattr(_binding, symbol) for symbol in __all__}) + +# Clean up internal variables to avoid exposing them in the package namespace +del _load_binding, _binding, os diff --git a/python/_load_binding.py.in b/python/_load_binding.py.in new file mode 100644 index 00000000..4001e1d2 --- /dev/null +++ b/python/_load_binding.py.in @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import importlib.util +import sysconfig +from functools import lru_cache + + +@lru_cache(maxsize=1) +def load_binding(module_name: str, bindings_dir: str): + """ + Dynamically selects the correct binding for the current Python version + """ + # Get the Python ABI tag + python_version = f"{sys.version_info.major}{sys.version_info.minor}" + abi_tag = sysconfig.get_config_var('SOABI') + + # Construct the expected filename + binding_so_filename = f'{module_name}.{abi_tag}.so' + + # Find the .so file in the package directory + binding_so_path = os.path.join(bindings_dir, binding_so_filename) + if not os.path.exists(binding_so_path): + raise ImportError( + f'Could not find the binding file for Python {python_version} at ' + f'{binding_so_path}. Make sure the package is installed.' + ) + + # Dynamically load the .so file + spec = importlib.util.spec_from_file_location(module_name, binding_so_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='Load a Python binding') + parser.add_argument('module_name', type=str, + help='The name of the module to load') + parser.add_argument('bindings_dir', type=str, + help='The directory containing the bindings') + args = parser.parse_args() + + binding = load_binding(args.module_name, args.bindings_dir) + + print(f'Loaded module: {binding}') + print(f' Binding version: {binding.__version__}') + print(f' Binding description: {binding.__doc__}') + print(f' Binding functions: {dir(binding)}') diff --git a/python/build_wheels.sh b/python/build_wheels.sh index ecc16209..2df64a50 100755 --- a/python/build_wheels.sh +++ b/python/build_wheels.sh @@ -1,6 +1,6 @@ #!/bin/bash -e -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,70 +15,114 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Creates the Python self contained wheels +if [ "$#" -ne 1 ]; then + echo "Usage: build_wheels.sh " + exit 1 +fi -# Usage: build_wheels.sh [build_artifacts_dir] [python_versions] -# Note: This script is automatically called by cmake/make. The proper way to -# build python wheels is to issue the command: -# -# Do not run this script outside of cmake. +PYTHON_BUILD_DIR=$(realpath "$1") +BUILD_DIR=$(dirname "${PYTHON_BUILD_DIR}") +WHEEL_DIR="${PYTHON_BUILD_DIR}/dist" +REPAIRED_WHEEL_DIR="${PYTHON_BUILD_DIR}/repaired_wheels" +WHEEL_BUILD_DIR="${PYTHON_BUILD_DIR}/build_wheel" +LIB_DIR="${PYTHON_BUILD_DIR}/cvcuda_cu${CUDA_VERSION_MAJOR}.libs" +SUPPORTED_PYTHONS=("38" "39" "310" "311" "312" "313") +PACKAGES=("cvcuda" "nvcv") + +detect_platform_tag() { + if [ -n "${AUDITWHEEL_PLAT}" ]; then + echo "${AUDITWHEEL_PLAT}" + else + echo "linux_$(uname -m)" + fi +} + +PLATFORM_TAG=$(detect_platform_tag) +echo "Detected Platform Tag: ${PLATFORM_TAG}" + +LIBRARIES=( + "libcvcuda.so" + "libnvcv_types.so" +) + +mkdir -p "${WHEEL_DIR}" "${REPAIRED_WHEEL_DIR}" "${WHEEL_BUILD_DIR}" "${LIB_DIR}" + +# Detect available Python bindings +AVAILABLE_PYTHONS=() +PYTHON_EXECUTABLES=() +for py_ver in "${SUPPORTED_PYTHONS[@]}"; do + py_exec="python3.${py_ver:1}" + if command -v "${py_exec}" &> /dev/null; then + if compgen -G "${PYTHON_BUILD_DIR}/cvcuda/_bindings/cvcuda.cpython-${py_ver}-*.so" > /dev/null; then + AVAILABLE_PYTHONS+=("cp${py_ver}") + PYTHON_EXECUTABLES+=("${py_exec}") + fi + fi +done +PYTHON_EXECUTABLE="${PYTHON_EXECUTABLES[0]}" -set -e # Stops this script if any one command fails. +# Print the available Python bindings +echo "Available Python Bindings: ${AVAILABLE_PYTHONS[*]}" -if [ "$#" -lt 2 ]; then - echo "Usage: build_wheels.sh [python_versions,...]" +if [ "${#AVAILABLE_PYTHONS[@]}" -eq 0 ]; then + echo "Error: No Python bindings detected." exit 1 fi -BUILD_DIR=$(realpath "$1"); shift -PY_VERSIONS=("$@") -LIB_DIR="${BUILD_DIR}/lib" - -echo "BUILD_DIR: $BUILD_DIR" -echo "Python Versions: ${PY_VERSIONS[*]}" - -for py_version in "${PY_VERSIONS[@]}" -do - py_version_flat="${py_version//./}" # Gets the non dotted version string - echo "Building Python wheels for: Python${py_version}" - - # Step 1: Create a directories to store all wheels related files for this python version - py_dir="${BUILD_DIR}/python${py_version}" - wheel_dir="${py_dir}/wheel" - mkdir -p "${wheel_dir}" - rm -rf ${wheel_dir:?}/* - mkdir -p "${wheel_dir}/cvcuda.libs" - - cd "${wheel_dir}" - - # Step 2: Copy necessary .so files under one directory - # We will copy the target of the linked file and not the symlink only. - # Also the new file-name of the .so will be the actual so-name present inside the header of the .so - # This can be retrieved by using patchelf. - # This allows us to copy .so files without knowing their versions and also making sure they still - # work after copying. - # Copy the core .so files first - for so_file_name in libcvcuda.so libnvcv_types.so - do - cp -L "${LIB_DIR}/${so_file_name}" \ - "${wheel_dir}/cvcuda.libs/`patchelf --print-soname "${LIB_DIR}/${so_file_name}"`" - done - - # Copy the bindings .so files + patch them in their rpath. - # This allows the bindings to find the core .so files in a directory named cvcuda.libs only. 
- for so_file_path in ${LIB_DIR}/python/*.cpython-${py_version_flat}*.so - do - so_file_name=$(basename ${so_file_path}) - cp -L "${so_file_path}" \ - "${wheel_dir}/" - - patchelf --force-rpath --set-rpath '$ORIGIN'/cvcuda.libs "${wheel_dir}/${so_file_name}" - done - - # Step 3: Copy the setup.py corresponding to current python version to our wheels directory. - cp "${py_dir}/setup.py" "${wheel_dir}" - - # Step 3: Create wheel - python${py_version} setup.py bdist_wheel --dist-dir="${wheel_dir}" +# Copy and patch shared libraries +echo "Copying and patching shared libraries..." +for lib in "${LIBRARIES[@]}"; do + src_path="${BUILD_DIR}/lib/${lib}" + if [ -f "${src_path}" ]; then + cp "${src_path}" "${LIB_DIR}/" + echo "Copied: ${src_path} -> ${LIB_DIR}/" + # Double quotes let CUDA_VERSION_MAJOR expand while \$ORIGIN stays literal for the loader. + patchelf --force-rpath --set-rpath "\$ORIGIN/../cvcuda_cu${CUDA_VERSION_MAJOR}.libs" "${LIB_DIR}/${lib}" + else + echo "Warning: Shared library ${src_path} not found. Skipping." + fi +done + +# Create wheel structure +ln -sf "${PYTHON_BUILD_DIR}/setup.py" "${WHEEL_BUILD_DIR}/" +ln -sf "${PYTHON_BUILD_DIR}/cvcuda" "${WHEEL_BUILD_DIR}/" +ln -sf "${PYTHON_BUILD_DIR}/nvcv" "${WHEEL_BUILD_DIR}/" +ln -sf "${LIB_DIR}" "${WHEEL_BUILD_DIR}/cvcuda_cu${CUDA_VERSION_MAJOR}.libs" + +# Build wheel +echo "Building wheel..." +pushd "${WHEEL_BUILD_DIR}" > /dev/null +${PYTHON_EXECUTABLE} -m build --wheel --outdir="${WHEEL_DIR}" || ${PYTHON_EXECUTABLE} setup.py bdist_wheel --dist-dir="${WHEEL_DIR}" + +# Modify the wheel's Python and ABI tags for detected versions +# Ensuring the tag is propagated to the wheel +${PYTHON_EXECUTABLE} -m pip install --upgrade wheel +python_tag=$(IFS=. ; echo "${AVAILABLE_PYTHONS[*]}") +for whl in "${WHEEL_DIR}"/*.whl; do + ${PYTHON_EXECUTABLE} -m wheel tags --remove \ + --python-tag "${python_tag}" \ + --abi-tag "${python_tag}" \ + --platform-tag "${PLATFORM_TAG}" \ + "${whl}" +done +popd > /dev/null +echo "Repairing wheel for compliance..." +${PYTHON_EXECUTABLE} -m pip install --upgrade auditwheel +for whl in "${WHEEL_DIR}"/*.whl; do + ${PYTHON_EXECUTABLE} -m auditwheel repair "${whl}" --plat "${PLATFORM_TAG}" -w "${REPAIRED_WHEEL_DIR}" + rm "${whl}" +done + +echo "Verifying wheel filenames..." +for repaired_whl in "${REPAIRED_WHEEL_DIR}"/*.whl; do + repaired_whl_name="$(basename "${repaired_whl}")" + echo "Wheel: ${repaired_whl_name}" + IFS='-' read -r dist_name version python_tag abi_tag platform_tag <<< "$(echo "${repaired_whl_name}" | sed 's/\.whl$//')" + echo " Distribution Name: ${dist_name}" + echo " Version: ${version}" + echo " Python Tag: ${python_tag}" + echo " ABI Tag: ${abi_tag}" + echo " Platform Tag: ${platform_tag}" +done + +echo "Repaired wheels are located in: ${REPAIRED_WHEEL_DIR}" diff --git a/python/setup.py.in b/python/setup.py.in index c22e9d0f..e1447c0c 100644 @@ -1,81 +1,107 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # -# Licensed under the Apache License, Version 2.0 (the "License"); +# Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# This is a Python setuptools setup script to generate Python wheels. -# It is in a template form with placeholder fields that looks like ${}. -# This script will be automatically invoked by cmake when Python bindings are built. -# Do not invoke this outside of cmake. - - -from setuptools import setup, Extension +import os +from collections import defaultdict +from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext class NoBuildExtension(build_ext): """ - Since CV-CUDA Python wheels are pure pre-compiled binary distribution at this point - without any Python or any other source code files and since the binaries are generated - by cmake system outside and without the knowledge of the setuptools, we must - create a dummy class to build an extension here with no source code in it and - no build steps in it to let setuptools create a platform library instead of a - pure library. Without any extensions in a setup tools project setuptools will - end up creating a purelib package. One can compile cmake/pybind11 code here - as an extension but since that part is handled outside of this file for now - we will simply create an empty extension and a corresponding build step that - actually does nothing but let setuptools know that this is a pure binary distribution. + Prevent setuptools from trying to build extensions since the actual + compilation is handled externally (e.g., via CMake). """ - def run(self): - return # Do nothing during build time. + pass + + +def find_shared_libraries(lib_dir): + """ + Locate the central shared libraries (libcvcuda.so, libnvcv_types.so) + and Python bindings (*.cpython-*.so) in the given directory. + + Args: + lib_dir (str): The directory to search for shared libraries. + + Returns: + dict: A dictionary containing shared libraries and bindings. + """ + shared_libraries = defaultdict(list) + + for root, _, files in os.walk(lib_dir): + for file in files: + if file.endswith('.so'): + file_path = os.path.relpath(os.path.join(root, file), + start=lib_dir) + + # Central shared libraries + if file.startswith('libcvcuda'): + shared_libraries['cvcuda'].append(file_path) + elif file.startswith('libnvcv_types'): + shared_libraries['nvcv'].append(file_path) + # Python bindings + elif file.startswith('cvcuda') and 'cpython' in file: + shared_libraries['cvcuda_bindings'].append(file_path) + elif file.startswith('nvcv') and 'cpython' in file: + shared_libraries['nvcv_bindings'].append(file_path) + + return shared_libraries -# Define our PyPI trove classifiers for this project. Many values here are -# placeholders which will be filled in by cmake when this is built. 
-pypi_trove_classifiers = [ - "Development Status :: 4 - Beta", - "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: ${CUDA_VERSION_MAJOR}", - "Operating System :: POSIX :: Linux", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: ${PYTHON_VERSION}", - "Programming Language :: Python :: Implementation :: CPython", -] +# Locate libraries and bindings in the specified directory +libs = find_shared_libraries(os.path.dirname(__file__)) +print(libs) -# Finally call the setup. setup( - name="cvcuda-cu${CUDA_VERSION_MAJOR}", - description="${CMAKE_PROJECT_DESCRIPTION}", - author="NVIDIA Corporation", - url="https://github.com/CVCUDA/CV-CUDA", - version="${CMAKE_PROJECT_VERSION}${PROJECT_VERSION_SUFFIX}", - packages=[""], # Must be empty to support current CV-CUDA style distribution - package_dir={"": "."}, + name='cvcuda-cu${CUDA_VERSION_MAJOR}', + version='${CMAKE_PROJECT_VERSION}${PROJECT_VERSION_SUFFIX}', + description='${CMAKE_PROJECT_DESCRIPTION}', + author='NVIDIA Corporation', + author_email='support@nvidia.com', + url='https://github.com/CVCUDA/CV-CUDA', + packages=find_packages(include=["cvcuda", "nvcv"]), + package_dir={ + 'cvcuda': 'cvcuda', + 'nvcv': 'nvcv', + }, package_data={ - "": ["*.so", "cvcuda.libs/*.*"] - }, # Includes the binding .so + core .so files + 'cvcuda': ['_bindings/*.*'], + 'nvcv': ['_bindings/*.*'], + 'cvcuda_cu${CUDA_VERSION_MAJOR}.libs': ['*.*'], + }, include_package_data=True, - install_requires=["numpy>=1.23.5"], - python_requires="==${PYTHON_VERSION}.*", + install_requires=['numpy>=1.23.5'], + python_requires='>=3.8, <3.14', zip_safe=False, - cmdclass={ - "build_ext": NoBuildExtension, # This allows us to make it a platlib. - }, + cmdclass={'build_ext': NoBuildExtension}, ext_modules=[ - Extension( - name="UnusedEmptyExtension", sources=[] - ), # This allows us to make it a platlib. 
+ Extension(name='UnusedEmptyExtension', sources=[]), + ], + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: GPU :: NVIDIA CUDA', + 'Environment :: GPU :: NVIDIA CUDA :: ${CUDA_VERSION_MAJOR}', + 'Operating System :: POSIX :: Linux', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', + 'Programming Language :: Python :: Implementation :: CPython', ], - classifiers=pypi_trove_classifiers, ) diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index 8a61def0..fbc4fa53 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -103,8 +103,8 @@ target_link_libraries(cvcuda_priv nvcv_util_sanitizer cvcuda_legacy CUDA::cudart_static - CUDA::cusolver - CUDA::cublas - CUDA::cublasLt + CUDA::cusolver_static + CUDA::cublas_static + CUDA::cublasLt_static -lrt ) diff --git a/src/nvcv/CMakeLists.txt b/src/nvcv/CMakeLists.txt index 2f2ae1ea..ac81ea9f 100644 --- a/src/nvcv/CMakeLists.txt +++ b/src/nvcv/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.20.1) project(nvcv LANGUAGES C CXX - VERSION 0.12.0 + VERSION 0.13.0 DESCRIPTION "NVCV is NVIDIA Computer Vision library" ) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e6d06d96..29317641 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -45,9 +45,12 @@ target_link_libraries(nvcv_test_main if(UNIX) file(TO_NATIVE_PATH "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/" TESTS_DRIVER_DIR) set(TESTS_DRIVER "${TESTS_DRIVER_DIR}/run_tests.sh") + set(WHEEL_TESTER "${TESTS_DRIVER_DIR}/test_wheels.sh") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run_tests.sh.in ${TESTS_DRIVER} @ONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/test_wheels.sh.in ${WHEEL_TESTER} + @ONLY) macro(nvcv_add_test TESTCMD TESTGROUP) get_filename_component(TESTNAME "${TESTCMD}" NAME) @@ -86,6 +89,9 @@ if(UNIX) install(PROGRAMS ${TESTS_DRIVER} TYPE BIN COMPONENT tests) + install(PROGRAMS ${WHEEL_TESTER} + TYPE BIN + COMPONENT tests) else() macro(nvcv_add_test) add_test(${ARGV}) diff --git a/tests/cvcuda/stressTest/cvcuda_cache_repro.py b/tests/cvcuda/stressTest/cvcuda_cache_repro.py new file mode 100644 index 00000000..8d46438d --- /dev/null +++ b/tests/cvcuda/stressTest/cvcuda_cache_repro.py @@ -0,0 +1,200 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
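+
+# Stress-reproduction script: drives cvcuda.resize / convertto / reformat
+# (and their *_into variants) from multiple worker threads with randomly
+# sized batches, exercising CV-CUDA's operator cache under concurrent load.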
+ +import numpy as np +import cvcuda +import torch +import random +import threading +import queue +import time +import gc + + +def preprocess(input, out_size): + frame_nhwc = cvcuda.as_tensor( + torch.as_tensor(input).to(device="cuda:0", non_blocking=True), + "NHWC", + ) + resized = cvcuda.resize( + frame_nhwc, + ( + frame_nhwc.shape[0], + out_size[1], + out_size[0], + frame_nhwc.shape[3], + ), + cvcuda.Interp.LINEAR, + ) + # Convert to floating point range 0-1. + normalized = cvcuda.convertto(resized, np.float32, scale=1 / 255) + # Convert it to NCHW layout and return it. + normalized = cvcuda.reformat(normalized, "NCHW") + return normalized + + +def preprocess_into(input, out_size): + torch.cuda.synchronize() + cvcuda_RGBtensor = cvcuda.as_tensor(input.cuda(), "NHWC") + torch.cuda.synchronize() + torch_RGBtensor_resized = torch.empty( + ( + cvcuda_RGBtensor.shape[0], + out_size[1], + out_size[0], + cvcuda_RGBtensor.shape[3], + ), + dtype=torch.uint8, + device="cuda:0", + ) + cvcuda_RGBtensor_resized = cvcuda.as_tensor( + torch_RGBtensor_resized.cuda(), + "NHWC", + ) + cvcuda.resize_into( + cvcuda_RGBtensor_resized, + cvcuda_RGBtensor, + cvcuda.Interp.LINEAR, + ) + + torch_nchw = torch.empty( + (input.shape[0], 3, out_size[1], out_size[0]), + dtype=torch.uint8, + device="cuda:0", + ) + cvcuda_nchw = cvcuda.as_tensor(torch_nchw.cuda(0), "NCHW") + cvcuda.reformat_into(cvcuda_nchw, cvcuda_RGBtensor_resized) + return torch_nchw + + +def generate_images(N, width=None, height=None, random_size=False): + if random_size: + w = random.randint(1, 10) + h = random.randint(1, 10) + else: + w = width + h = height + return torch.as_tensor(torch.rand(N, h, w, 3), dtype=torch.uint8) + + +def worker(device_id, task_queue, result_queue): + while True: + task = task_queue.get() + if task is None: + break + gradient_img_batch, image_size = task + result = preprocess(gradient_img_batch, image_size) + result_queue.put(result) + task_queue.task_done() + + +def worker_into(device_id, task_queue, result_queue): + while True: + task = task_queue.get() + if task is None: + break + gradient_img_batch, image_size = task + result = preprocess_into(gradient_img_batch, image_size) + result_queue.put(result) + task_queue.task_done() + + +def test_random_batch_size(): + device_id = 0 + num_threads = 10 + + task_queue = queue.Queue() + result_queue = queue.Queue() + + threads = [] + for i in range(num_threads): + t = threading.Thread(target=worker, args=(device_id, task_queue, result_queue)) + t.start() + threads.append(t) + + # Set the duration to run the function (in seconds) + duration = 10 + start_time = time.time() + + while time.time() - start_time < duration: + batch_size = random.randint(5, 10) + target_img_width = random.randint(110, 115) + target_img_height = random.randint(220, 230) + gradient_img_batch = generate_images(N=batch_size, random_size=True) + image_size = (target_img_width, target_img_height) + task_queue.put((gradient_img_batch, image_size)) + + # Signal the threads to stop + for _ in range(num_threads): + task_queue.put(None) + + for t in threads: + t.join() + + print("Random Batch Size test complete") + + +def test_random_batch_size_into(): + device_id = 0 + num_threads = 10 + + task_queue = queue.Queue() + result_queue = queue.Queue() + + threads = [] + for i in range(num_threads): + t = threading.Thread( + target=worker_into, args=(device_id, task_queue, result_queue) + ) + t.start() + threads.append(t) + + # Set the duration to run the function (in seconds) + duration = 10 + start_time = 
time.time() + + while time.time() - start_time < duration: + batch_size = random.randint(5, 10) + target_img_width = random.randint(110, 115) + target_img_height = random.randint(220, 230) + gradient_img_batch = generate_images(N=batch_size, random_size=True) + # print(gradient_img_batch.size()) + image_size = (target_img_width, target_img_height) + task_queue.put((gradient_img_batch, image_size)) + + # Signal the threads to stop + for _ in range(num_threads): + task_queue.put(None) + + for t in threads: + t.join() + + print("Into Random Batch Size test complete") + + +def main(): + test_random_batch_size_into() + collected = gc.collect() + print(f"Garbage collector: collected {collected} objects.") + time.sleep(1) + torch.cuda.empty_cache() + time.sleep(1) + test_random_batch_size() + collected = gc.collect() + print(f"Garbage collector: collected {collected} objects.") + + +if __name__ == "__main__": + main() diff --git a/tests/cvcuda/stressTest/stress_test_inference.py b/tests/cvcuda/stressTest/stress_test_inference.py new file mode 100644 index 00000000..c09c5555 --- /dev/null +++ b/tests/cvcuda/stressTest/stress_test_inference.py @@ -0,0 +1,542 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np +import cvcuda +import torch +import random +import nvcv + +import os +import sys +import urllib.request +import time + +import tensorrt as trt +import tensorflow as tf + +common_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "common", + "python", +) +assets_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "assets", +) +sys.path.insert(0, common_dir) + +from trt_utils import setup_tensort_bindings # noqa: E402 + +time_of_test_in_min = 15 +max_batch_size = 10 + +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + + +def preprocess(input, out_size): + frame_nhwc = cvcuda.as_tensor( + torch.as_tensor(input).to(device="cuda:0", non_blocking=True), + "NHWC", + ) + resized = cvcuda.resize( + frame_nhwc, + ( + frame_nhwc.shape[0], + out_size[1], + out_size[0], + frame_nhwc.shape[3], + ), + cvcuda.Interp.LINEAR, + ) + # Convert to floating point range 0-1. + normalized = cvcuda.convertto(resized, np.float32, scale=1 / 255) + # Convert it to NCHW layout and return it. 
+    normalized = cvcuda.reformat(normalized, "NCHW")
+    return normalized
+
+
+def preprocess_into(input, out_size):
+    torch.cuda.synchronize()
+    cvcuda_RGBtensor = cvcuda.as_tensor(input.cuda(), "NHWC")
+    torch.cuda.synchronize()
+    torch_RGBtensor_resized = torch.empty(
+        (
+            cvcuda_RGBtensor.shape[0],
+            out_size[1],
+            out_size[0],
+            cvcuda_RGBtensor.shape[3],
+        ),
+        dtype=torch.uint8,
+        device="cuda:0",
+    )
+    cvcuda_RGBtensor_resized = cvcuda.as_tensor(
+        torch_RGBtensor_resized.cuda(),
+        "NHWC",
+    )
+    cvcuda.resize_into(
+        cvcuda_RGBtensor_resized,
+        cvcuda_RGBtensor,
+        cvcuda.Interp.LINEAR,
+    )
+
+    torch_nchw = torch.empty(
+        (input.shape[0], 3, out_size[1], out_size[0]),
+        dtype=torch.uint8,
+        device="cuda:0",
+    )
+    cvcuda_nchw = cvcuda.as_tensor(torch_nchw.cuda(0), "NCHW")
+    cvcuda.reformat_into(cvcuda_nchw, cvcuda_RGBtensor_resized)
+    return torch_nchw
+
+
+def generate_images(N, width=None, height=None, random_size=False):
+    if random_size:
+        w = random.randint(100, 500)
+        h = random.randint(100, 500)
+    else:
+        w = width
+        h = height
+    # Scale to 0-255 before the cast; casting torch.rand output (range [0, 1))
+    # straight to uint8 would truncate every pixel to zero.
+    return (torch.rand(N, h, w, 3) * 255).to(dtype=torch.uint8)
+
+
+class ObjectDetectionTensorflow:
+    def __init__(
+        self,
+        output_dir,
+        batch_size,
+        image_size,
+        device_id,
+    ):
+        self.logger = logging.getLogger(__name__)
+        self.output_dir = output_dir
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.device_id = device_id
+
+        physical_devices = tf.config.list_physical_devices("GPU")
+        tf.config.experimental.set_memory_growth(physical_devices[self.device_id], True)
+
+        hdf5_model_path = os.path.join(output_dir, "resnet34_peoplenet.hdf5")
+
+        if not os.path.isfile(hdf5_model_path):
+            # We need to download the HDF5 model first from NGC.
+            model_url = (
+                "https://api.ngc.nvidia.com/v2/models/"
+                "org/nvidia/team/tao/peoplenet/trainable_unencrypted_v2.6/"
+                "files?redirect=true&path=model.hdf5"
+            )
+            self.logger.info("Downloading the PeopleNet model from NGC: %s" % model_url)
+            urllib.request.urlretrieve(model_url, hdf5_model_path)
+            self.logger.info("Download complete. Saved to: %s" % hdf5_model_path)
+
+        with tf.device("/GPU:%d" % self.device_id):
+            self.model = tf.keras.models.load_model(hdf5_model_path)
+            self.logger.info("TensorFlow PeopleNet model is loaded.")
+
+        self.logger.info("Using TensorFlow as the inference engine.")
+
+    def __call__(self, frame_nchw):
+
+        if isinstance(frame_nchw, torch.Tensor):
+            # We convert torch.Tensor to tf.Tensor by:
+            # torch.Tensor -> PyTorch flat tensor -> DLPack -> tf.Tensor -> un-flatten
+            frame_nchw_shape = frame_nchw.shape
+            frame_nchw = frame_nchw.flatten()
+            frame_nchw_tf = tf.experimental.dlpack.from_dlpack(frame_nchw.__dlpack__())
+            frame_nchw_tf = tf.reshape(frame_nchw_tf, frame_nchw_shape)
+
+        elif isinstance(frame_nchw, nvcv.Tensor):
+            # We convert nvcv.Tensor to tf.Tensor by:
+            # nvcv.Tensor -> PyTorch tensor -> PyTorch flat tensor -> DLPack
+            # -> tf.Tensor -> un-flatten
+            frame_nchw_pyt = torch.as_tensor(
+                frame_nchw.cuda(), device="cuda:%d" % self.device_id
+            )
+            frame_nchw_pyt = frame_nchw_pyt.flatten()
+            frame_nchw_tf = tf.experimental.dlpack.from_dlpack(
+                frame_nchw_pyt.__dlpack__()
+            )
+            frame_nchw_tf = tf.reshape(frame_nchw_tf, frame_nchw.shape)
+
+        elif isinstance(frame_nchw, np.ndarray):
+            frame_nchw_tf = tf.convert_to_tensor(frame_nchw)
+
+        else:
+            raise ValueError(
+                "Invalid type of input tensor for TensorFlow inference: %s"
+                % str(type(frame_nchw))
+            )
+
+        with tf.device("/GPU:%d" % self.device_id):
+            output_tensors = self.model(frame_nchw_tf)  # returns a tuple.
+
+        # Convert the output to PyTorch Tensors
+        boxes = torch.from_dlpack(tf.experimental.dlpack.to_dlpack(output_tensors[0]))
+        score = torch.from_dlpack(
+            tf.experimental.dlpack.to_dlpack(output_tensors[1])
+        )  # inference.tensorflow
+        return boxes, score
+
+
+class ObjectDetectionTensorRT:
+    def __init__(
+        self,
+        output_dir,
+        batch_size,
+        image_size,
+        device_id,
+    ):
+        self.logger = logging.getLogger(__name__)
+        self.output_dir = output_dir
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.device_id = device_id
+
+        # Download and prepare the models for the first use.
+        etlt_model_path = os.path.join(self.output_dir, "resnet34_peoplenet_int8.etlt")
+        trt_engine_file_path = os.path.join(
+            self.output_dir,
+            "resnet34_peoplenet_int8.%d.%d.%d.trtmodel"
+            % (
+                batch_size,
+                image_size[1],
+                image_size[0],
+            ),
+        )
+
+        # Check if we have a previously generated model.
+        if not os.path.isfile(trt_engine_file_path):
+            if not os.path.isfile(etlt_model_path):
+                # We need to download the ETLT model first from NGC.
+                model_url = (
+                    "https://api.ngc.nvidia.com/v2/models/"
+                    "nvidia/tao/peoplenet/versions/deployable_quantized_v2.6.1/"
+                    "files/resnet34_peoplenet_int8.etlt"
+                )
+                self.logger.info(
+                    "Downloading the PeopleNet model from NGC: %s" % model_url
+                )
+                urllib.request.urlretrieve(model_url, etlt_model_path)
+                self.logger.info("Download complete. Saved to: %s" % etlt_model_path)
+
+            # Convert the ETLT model to a TensorRT engine using the TAO Converter.
+            self.logger.info("Converting the PeopleNet model to TensorRT...")
+            if os.system(
+                "tao-converter -e %s -k tlt_encode -d 3,%d,%d -m %d -i nchw %s"
+                % (
+                    trt_engine_file_path,
+                    image_size[1],
+                    image_size[0],
+                    batch_size,
+                    etlt_model_path,
+                )
+            ):
+                raise Exception("Conversion failed.")
+            else:
+                self.logger.info(
+                    "Conversion complete. Saved to: %s" % trt_engine_file_path
+                )
+
+        # Once the TensorRT engine generation is all done, we load it.
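+        # Deserializing the plan file requires a TensorRT Runtime; ERROR-level
+        # logging keeps the long-running stress test quiet.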
+        trt_logger = trt.Logger(trt.Logger.ERROR)
+        with open(trt_engine_file_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
+            # Keeping this as a class variable because we want to be able to
+            # allocate the output tensors either on its first use or when the
+            # batch size changes.
+            self.trt_model = runtime.deserialize_cuda_engine(f.read())
+
+        # Create execution context.
+        self.model = self.trt_model.create_execution_context()
+
+        # We will allocate the output tensors and their bindings either when we
+        # use them for the first time or when the batch size changes.
+        self.output_tensors, self.output_idx = None, None
+
+        self.logger.info("Using TensorRT as the inference engine.")
+
+    def __call__(self, tensor):
+
+        # Grab the data directly from the pre-allocated tensor.
+        input_bindings = [tensor.cuda().__cuda_array_interface__["data"][0]]
+        output_bindings = []
+
+        actual_batch_size = tensor.shape[0]
+
+        # Need to allocate the output tensors
+        if not self.output_tensors or actual_batch_size != self.batch_size:
+            self.output_tensors, self.output_idx = setup_tensort_bindings(
+                self.trt_model,
+                actual_batch_size,
+                self.device_id,
+                self.logger,
+            )
+
+        for t in self.output_tensors:
+            output_bindings.append(t.data_ptr())
+        io_bindings = input_bindings + output_bindings
+
+        # Call inference for implicit batch
+        self.model.execute_async(
+            actual_batch_size,
+            bindings=io_bindings,
+            stream_handle=cvcuda.Stream.current.handle,
+        )
+
+        boxes = self.output_tensors[0]
+        score = self.output_tensors[1]  # inference.tensorrt
+        return boxes, score
+
+
+def test_random_image_size():
+    target_img_width = 960
+    target_img_height = 544
+    image_size = (target_img_width, target_img_height)
+    batch_size = 1
+    device_id = 0
+    backend = "tensorflow"
+    output_dir = ""
+    if backend == "tensorflow":
+        inference = ObjectDetectionTensorflow(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+
+    elif backend == "tensorrt":
+        inference = ObjectDetectionTensorRT(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+    else:
+        raise ValueError("Unknown backend: %s" % backend)
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess(gradient_img_batch, image_size)
+        bboxes, probabilities = inference(result)
+    print("Random Image Size Test Complete")
+
+
+def test_increasing_batch_size():
+    target_img_width = 960
+    target_img_height = 544
+    image_size = (target_img_width, target_img_height)
+    batch_size = 1
+    device_id = 0
+    backend = "tensorflow"
+    output_dir = ""
+    if backend == "tensorflow":
+        inference = ObjectDetectionTensorflow(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+
+    elif backend == "tensorrt":
+        inference = ObjectDetectionTensorRT(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+    else:
+        raise ValueError("Unknown backend: %s" % backend)
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration and batch_size < max_batch_size:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess(gradient_img_batch, image_size)
+        bboxes, probabilities = inference(result)
+        batch_size += 1
+    print("Increasing Batch Size Test Complete")
+
+
+def test_random_batch_size():
+    target_img_width = 960
+    target_img_height = 544
+    image_size = (target_img_width, target_img_height)
+    batch_size = 1
+    device_id = 0
+    backend = "tensorflow"
+    output_dir = ""
+    if backend == "tensorflow":
+        inference = ObjectDetectionTensorflow(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+
+    elif backend == "tensorrt":
+        inference = ObjectDetectionTensorRT(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+    else:
+        raise ValueError("Unknown backend: %s" % backend)
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(width=1080, height=1920, N=batch_size)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess(gradient_img_batch, image_size)
+        bboxes, probabilities = inference(result)
+        batch_size = random.randint(1, 80)
+    print("Random Batch Size Test Complete")
+
+
+def test_random_image_size_into():
+    target_img_width = 960
+    target_img_height = 544
+    image_size = (target_img_width, target_img_height)
+    batch_size = 1
+    device_id = 0
+    backend = "tensorflow"
+    output_dir = ""
+    if backend == "tensorflow":
+        inference = ObjectDetectionTensorflow(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+
+    elif backend == "tensorrt":
+        inference = ObjectDetectionTensorRT(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+    else:
+        raise ValueError("Unknown backend: %s" % backend)
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess_into(gradient_img_batch, image_size)
+        bboxes, probabilities = inference(result)
+    print("Into operator Random Image Size Test Complete")
+
+
+def test_increasing_batch_size_into():
+    target_img_width = 960
+    target_img_height = 544
+    image_size = (target_img_width, target_img_height)
+    batch_size = 1
+    device_id = 0
+    backend = "tensorflow"
+    output_dir = ""
+    if backend == "tensorflow":
+        inference = ObjectDetectionTensorflow(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+
+    elif backend == "tensorrt":
+        inference = ObjectDetectionTensorRT(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+    else:
+        raise ValueError("Unknown backend: %s" % backend)
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration and batch_size < max_batch_size:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess_into(gradient_img_batch, image_size)
+        bboxes, probabilities = inference(result)
+        batch_size += 1
+    print("Into Operator Increasing Batch Size Test Complete")
+
+
+def test_random_batch_size_into():
+    target_img_width = 960
+    target_img_height = 544
+    image_size = (target_img_width, target_img_height)
+    batch_size = 1
+    device_id = 0
+    backend = "tensorflow"
+    output_dir = ""
+    if backend == "tensorflow":
+        inference = ObjectDetectionTensorflow(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+
+    elif backend == "tensorrt":
+        inference = ObjectDetectionTensorRT(
+            output_dir,
+            batch_size,
+            image_size,
+            device_id,
+        )
+    else:
+        raise ValueError("Unknown backend: %s" % backend)
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess_into(gradient_img_batch, image_size)
+        bboxes, probabilities = inference(result)
+        batch_size = random.randint(1, 80)
+    print("Into Operator Random Batch Size Test Complete")
+
+
+def main():
+    print(torch.cuda.get_device_properties(0))
+    test_random_image_size()
+    test_random_batch_size()
+    test_random_image_size_into()
+    test_random_batch_size_into()
+
+    # test_increasing_batch_size()
+    # test_increasing_batch_size_into()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/cvcuda/stressTest/stress_test_mt_prep.py b/tests/cvcuda/stressTest/stress_test_mt_prep.py
new file mode 100644
index 00000000..32bffb05
--- /dev/null
+++ b/tests/cvcuda/stressTest/stress_test_mt_prep.py
@@ -0,0 +1,267 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cvcuda
+import torch
+import random
+import threading
+import queue
+import time
+
+
+def preprocess(input, out_size):
+    frame_nhwc = cvcuda.as_tensor(
+        torch.as_tensor(input).to(device="cuda:0", non_blocking=True),
+        "NHWC",
+    )
+    resized = cvcuda.resize(
+        frame_nhwc,
+        (
+            frame_nhwc.shape[0],
+            out_size[1],
+            out_size[0],
+            frame_nhwc.shape[3],
+        ),
+        cvcuda.Interp.LINEAR,
+    )
+    # Convert to floating point range 0-1.
+    normalized = cvcuda.convertto(resized, np.float32, scale=1 / 255)
+    # Convert it to NCHW layout and return it.
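+    # cvcuda operators are asynchronous: resize, convertto and reformat are
+    # enqueued on the current CUDA stream rather than executed eagerly.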
+    normalized = cvcuda.reformat(normalized, "NCHW")
+    return normalized
+
+
+def preprocess_into(input, out_size):
+    torch.cuda.synchronize()
+    cvcuda_RGBtensor = cvcuda.as_tensor(input.cuda(), "NHWC")
+    torch.cuda.synchronize()
+    torch_RGBtensor_resized = torch.empty(
+        (
+            cvcuda_RGBtensor.shape[0],
+            out_size[1],
+            out_size[0],
+            cvcuda_RGBtensor.shape[3],
+        ),
+        dtype=torch.uint8,
+        device="cuda:0",
+    )
+    cvcuda_RGBtensor_resized = cvcuda.as_tensor(
+        torch_RGBtensor_resized.cuda(),
+        "NHWC",
+    )
+    cvcuda.resize_into(
+        cvcuda_RGBtensor_resized,
+        cvcuda_RGBtensor,
+        cvcuda.Interp.LINEAR,
+    )
+
+    torch_nchw = torch.empty(
+        (input.shape[0], 3, out_size[1], out_size[0]),
+        dtype=torch.uint8,
+        device="cuda:0",
+    )
+    cvcuda_nchw = cvcuda.as_tensor(torch_nchw.cuda(0), "NCHW")
+    cvcuda.reformat_into(cvcuda_nchw, cvcuda_RGBtensor_resized)
+    return torch_nchw
+
+
+def generate_images(N, width=None, height=None, random_size=False):
+    if random_size:
+        w = random.randint(1, 10)
+        h = random.randint(1, 10)
+    else:
+        w = width
+        h = height
+    # Scale to 0-255 before the cast; casting torch.rand output (range [0, 1))
+    # straight to uint8 would truncate every pixel to zero.
+    return (torch.rand(N, h, w, 3) * 255).to(dtype=torch.uint8)
+
+
+def worker(device_id, task_queue, result_queue):
+    while True:
+        task = task_queue.get()
+        if task is None:
+            break
+        gradient_img_batch, image_size = task
+        result = preprocess(gradient_img_batch, image_size)
+        result_queue.put(result)
+        task_queue.task_done()
+
+
+def worker_into(device_id, task_queue, result_queue):
+    while True:
+        task = task_queue.get()
+        if task is None:
+            break
+        gradient_img_batch, image_size = task
+        result = preprocess_into(gradient_img_batch, image_size)
+        result_queue.put(result)
+        task_queue.task_done()
+
+
+def test_random_image_size():
+    device_id = 0
+    num_threads = 15
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+
+    threads = []
+    for i in range(num_threads):
+        t = threading.Thread(target=worker, args=(device_id, task_queue, result_queue))
+        t.start()
+        threads.append(t)
+
+    # Set the duration to run the function (in seconds)
+    duration = 10
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        batch_size = 10
+        target_img_width = random.randint(220, 230)
+        target_img_height = random.randint(220, 230)
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        task_queue.put((gradient_img_batch, image_size))
+
+    # Signal the threads to stop
+    for _ in range(num_threads):
+        task_queue.put(None)
+
+    for t in threads:
+        t.join()
+
+    print("Random Output Image Size test complete")
+
+
+def test_random_batch_size():
+    device_id = 0
+    num_threads = 10
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+
+    threads = []
+    for i in range(num_threads):
+        t = threading.Thread(target=worker, args=(device_id, task_queue, result_queue))
+        t.start()
+        threads.append(t)
+
+    # Set the duration to run the function (in seconds)
+    duration = 10
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        batch_size = random.randint(5, 10)
+        target_img_width = random.randint(110, 115)
+        target_img_height = random.randint(220, 230)
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        task_queue.put((gradient_img_batch, image_size))
+
+    # Signal the threads to stop
+    for _ in range(num_threads):
+        task_queue.put(None)
+
+    for t in threads:
+        t.join()
+
+    print("Random Batch Size test complete")
+
+
+def test_random_image_size_into():
+    device_id = 0
+    num_threads = 15
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+
+    threads = []
+    for i in range(num_threads):
+        t = threading.Thread(
+            target=worker_into, args=(device_id, task_queue, result_queue)
+        )
+        t.start()
+        threads.append(t)
+
+    # Set the duration to run the function (in seconds)
+    duration = 10
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        batch_size = 10
+        target_img_width = random.randint(220, 230)
+        target_img_height = random.randint(220, 230)
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        task_queue.put((gradient_img_batch, image_size))
+
+    # Signal the threads to stop
+    for _ in range(num_threads):
+        task_queue.put(None)
+
+    for t in threads:
+        t.join()
+
+    print("Into Random Output Image Size test complete")
+
+
+def test_random_batch_size_into():
+    device_id = 0
+    num_threads = 10
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+
+    threads = []
+    for i in range(num_threads):
+        t = threading.Thread(
+            target=worker_into, args=(device_id, task_queue, result_queue)
+        )
+        t.start()
+        threads.append(t)
+
+    # Set the duration to run the function (in seconds)
+    duration = 10
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        batch_size = random.randint(5, 10)
+        target_img_width = random.randint(110, 115)
+        target_img_height = random.randint(220, 230)
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        task_queue.put((gradient_img_batch, image_size))
+
+    # Signal the threads to stop
+    for _ in range(num_threads):
+        task_queue.put(None)
+
+    for t in threads:
+        t.join()
+
+    print("Into Random Batch Size test complete")
+
+
+def main():
+    # test_random_image_size()
+    test_random_batch_size_into()
+    time.sleep(10)
+    test_random_batch_size()
+    # test_random_image_size_into()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/cvcuda/stressTest/stress_test_preprocess.py b/tests/cvcuda/stressTest/stress_test_preprocess.py
new file mode 100644
index 00000000..510c1534
--- /dev/null
+++ b/tests/cvcuda/stressTest/stress_test_preprocess.py
@@ -0,0 +1,197 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cvcuda
+import torch
+import random
+import time
+
+time_of_test_in_min = 0.1
+
+
+def preprocess(input, out_size):
+    frame_nhwc = cvcuda.as_tensor(
+        torch.as_tensor(input).to(device="cuda:0", non_blocking=True),
+        "NHWC",
+    )
+    resized = cvcuda.resize(
+        frame_nhwc,
+        (
+            frame_nhwc.shape[0],
+            out_size[1],
+            out_size[0],
+            frame_nhwc.shape[3],
+        ),
+        cvcuda.Interp.LINEAR,
+    )
+    # Convert to floating point range 0-1.
+    normalized = cvcuda.convertto(resized, np.float32, scale=1 / 255)
+    # Convert it to NCHW layout and return it.
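+    # Unlike the *_into variants below, each call here allocates fresh output
+    # tensors; that allocation churn is exactly what these tests stress.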
+    normalized = cvcuda.reformat(normalized, "NCHW")
+    return normalized
+
+
+def preprocess_into(input, out_size):
+    cvcuda_RGBtensor = cvcuda.as_tensor(input.cuda(), "NHWC")
+
+    torch_RGBtensor_resized = torch.empty(
+        (
+            cvcuda_RGBtensor.shape[0],
+            out_size[1],
+            out_size[0],
+            cvcuda_RGBtensor.shape[3],
+        ),
+        dtype=torch.uint8,
+        device="cuda:0",
+    )
+    cvcuda_RGBtensor_resized = cvcuda.as_tensor(
+        torch_RGBtensor_resized.cuda(),
+        "NHWC",
+    )
+    cvcuda.resize_into(
+        cvcuda_RGBtensor_resized,
+        cvcuda_RGBtensor,
+        cvcuda.Interp.LINEAR,
+    )
+
+    torch_nchw = torch.empty(
+        (input.shape[0], 3, out_size[1], out_size[0]),
+        dtype=torch.uint8,
+        device="cuda:0",
+    )
+    cvcuda_nchw = cvcuda.as_tensor(torch_nchw.cuda(0), "NCHW")
+    cvcuda.reformat_into(cvcuda_nchw, cvcuda_RGBtensor_resized)
+    return torch_nchw
+
+
+def generate_images(N, width=None, height=None, random_size=False):
+    if random_size:
+        w = random.randint(100, 500)
+        h = random.randint(100, 500)
+    else:
+        w = width
+        h = height
+    # Scale to 0-255 before the cast; casting torch.rand output (range [0, 1))
+    # straight to uint8 would truncate every pixel to zero.
+    return (torch.rand(N, h, w, 3) * 255).to(dtype=torch.uint8)
+
+
+def test_random_image_size():
+    target_img_width = 224
+    target_img_height = 224
+    batch_size = 20
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess(gradient_img_batch, image_size)  # noqa: F841
+    print("Random Image Size Test Complete")
+
+
+def test_increasing_batch_size():
+    target_img_width = 224
+    target_img_height = 224
+    batch_size = 1
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess(gradient_img_batch, image_size)  # noqa: F841
+        batch_size += 1
+    print("Increasing Batch Size Test Complete")
+
+
+def test_random_batch_size():
+    target_img_width = 224
+    target_img_height = 224
+    batch_size = 1
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(width=1080, height=1920, N=batch_size)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess(gradient_img_batch, image_size)  # noqa: F841
+        batch_size = random.randint(1, 80)
+    print("Random Batch Size Test Complete")
+
+
+def test_random_image_size_into():
+    target_img_width = 224
+    target_img_height = 224
+    batch_size = 20
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess_into(gradient_img_batch, image_size)  # noqa: F841
+    print("Into operator Random Image Size Test Complete")
+
+
+def test_increasing_batch_size_into():
+    target_img_width = 224
+    target_img_height = 224
+    batch_size = 1
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess_into(gradient_img_batch, image_size)  # noqa: F841
+        batch_size += 1
+    print("Into Operator Increasing Batch Size Test Complete")
+
+
+def test_random_batch_size_into():
+    target_img_width = 224
+    target_img_height = 224
+    batch_size = 1
+
+    duration = time_of_test_in_min * 60  # test duration in seconds
+    start_time = time.time()
+
+    while time.time() - start_time < duration:
+        gradient_img_batch = generate_images(N=batch_size, random_size=True)
+        image_size = (target_img_width, target_img_height)
+        result = preprocess_into(gradient_img_batch, image_size)  # noqa: F841
+        batch_size = random.randint(1, 80)
+    print("Into Operator Random Batch Size Test Complete")
+
+
+def main():
+    print(torch.cuda.get_device_properties(0))
+    test_random_image_size()
+    test_random_batch_size()
+    test_random_image_size_into()
+    test_random_batch_size_into()
+
+    test_increasing_batch_size()
+    test_increasing_batch_size_into()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_wheels.sh.in b/tests/test_wheels.sh.in
new file mode 100755
index 00000000..e60690ec
--- /dev/null
+++ b/tests/test_wheels.sh.in
@@ -0,0 +1,124 @@
+#!/bin/bash -e
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Enable recursive globbing; nullglob makes unmatched patterns expand to
+# nothing so the empty-list check below can actually fire.
+shopt -s globstar nullglob
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 <wheels_dir>"
+    exit 1
+fi
+
+WHEEL_LIST=($1/**/cvcuda*.whl)
+PACKAGES=("cvcuda" "nvcv")
+
+# Check for GPU and driver
+if command -v nvidia-smi &> /dev/null; then
+    if ! nvidia-smi > /dev/null 2>&1; then
+        echo "Warning: No GPU detected or driver not working with nvidia-smi. Skipping wheel testing..."
+        exit 0
+    fi
+    echo "GPU detected and driver is working (via nvidia-smi)."
+elif command -v tegrastats &> /dev/null; then
+    if ! tegrastats > /dev/null 2>&1; then
+        echo "Warning: tegrastats could not verify GPU. Skipping wheel testing..."
+        exit 0
+    fi
+    echo "GPU detected and driver is working (via tegrastats)."
+else
+    echo "Warning: Neither nvidia-smi nor tegrastats found. Skipping wheel testing..."
+    exit 0
+fi
+
+# Check if there are any wheels to test
+if [ ${#WHEEL_LIST[@]} -eq 0 ]; then
+    echo "No wheels found in the specified directory."
+    exit 0
+fi
+
+# Extract compatible Python versions from the wheel filenames
+get_compatible_pythons() {
+    local wheel_file="$1"
+    local compatible_versions=()
+    python_tag=$(basename "${wheel_file}" | cut -d'-' -f3)
+    IFS='.' read -ra tags <<< "${python_tag}"
+
+    for tag in "${tags[@]}"; do
+        if [[ "${tag}" =~ cp(3[0-9]{1,2}) ]]; then
+            py_ver="${BASH_REMATCH[1]}"
+            compatible_versions+=("${py_ver}")
+        fi
+    done
+
+    echo "${compatible_versions[@]}"
+}
+
+# Test each wheel
+for whl in "${WHEEL_LIST[@]}"; do
+    echo "Testing wheel: $(basename "${whl}")"
+
+    # Determine compatible Python versions
+    compatible_pythons=($(get_compatible_pythons "${whl}"))
+    if [ "${#compatible_pythons[@]}" -eq 0 ]; then
+        echo "Error: No compatible Python versions found for $(basename "${whl}"). Skipping."
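+        # Without a cpXY tag there is no interpreter to test against.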
+ continue + fi + + for py_ver in "${compatible_pythons[@]}"; do + py_exec="python3.${py_ver:1}" + if ! command -v "${py_exec}" &> /dev/null; then + echo "Skipping Python ${py_ver}: ${py_exec} not found." + continue + fi + + echo "Testing with ${py_exec}..." + + # Create a temporary virtual environment + test_env_dir=$(mktemp -d) + ${py_exec} -m venv --without-pip "${test_env_dir}/test_env" + source "${test_env_dir}/test_env/bin/activate" + + # Manually install pip using get-pip.py + echo "Manually installing pip..." + curl -sS https://bootstrap.pypa.io/get-pip.py | ${py_exec} + + # Ensure pip is up to date + echo "Upgrading pip..." + ${py_exec} -m pip install --upgrade pip + + # Install and test the wheel + echo "Installing ${whl}..." + ${py_exec} -m pip install "${whl}" + + for package in "${PACKAGES[@]}"; do + echo "Testing import for package ${package}..." + if ! ${py_exec} -c "import ${package}" &> /dev/null; then + echo " Error: Failed to import ${package} with ${py_exec}." + deactivate + rm -rf "${test_env_dir}" + exit 1 + fi + echo " ${package} imported successfully with ${py_exec}." + done + + deactivate + rm -rf "${test_env_dir}" + echo "Testing with ${py_exec} completed successfully." + done +done + +echo "All wheels tested successfully."
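+
+# Example invocation (path is illustrative and depends on the build layout):
+#   ./test_wheels.sh build-rel/python3/repaired_wheels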