Skip to content

Commit

Permalink
Big performance update
Browse files Browse the repository at this point in the history
-Solved most non-strided shared memory conflicts with a new on-the-fly axis swapping technique
-Improved R2C/C2R performance. Will add odd sequence lengths in the next update
-Improved register management for large primes
-Added rocFFT benchmark and precision scripts
-Added new benchmarks: 1000 tests all 1D FP32 systems from 2 to 4096; 1001 tests all 1D FP64 systems from 2 to 4096; 1003 tests all 3D FP32 cube systems from 2 to 512 for VkFFT/cuFFT/rocFFT. Plots will be updated soon.
-Switched to dynamic shared memory allocation in CUDA and HIP backends
-Bug fixes to stride definitions and buffer layout management
  • Loading branch information
DTolm committed Mar 15, 2021
1 parent d0f27e3 commit 2eb95e3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,12 @@ if(build_VkFFT_rocFFT_benchmark)
else()
add_library(rocfft_scripts STATIC
rocfft_scripts/benchmark_rocFFT.cpp
rocfft_scripts/benchmark_rocFFT_2_4096.cpp
rocfft_scripts/benchmark_rocFFT_r2c.cpp
rocfft_scripts/benchmark_rocFFT_double.cpp
rocfft_scripts/benchmark_rocFFT_3d.cpp)
rocfft_scripts/benchmark_rocFFT_double_2_4096.cpp
rocfft_scripts/benchmark_rocFFT_3d.cpp
rocfft_scripts/benchmark_rocFFT_3d_2_512.cpp)
endif()
target_include_directories(rocfft_scripts PUBLIC "rocfft_scripts/")
target_link_libraries(rocfft_scripts PRIVATE hip::host roc::rocfft)
Expand Down
4 changes: 2 additions & 2 deletions vkFFT/vkFFT.h
Original file line number Diff line number Diff line change
Expand Up @@ -10905,7 +10905,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
if (inputLaunchConfiguration.fence == 0) return 1005;
app->configuration.fence = inputLaunchConfiguration.fence;

VkPhysicalDeviceProperties physicalDeviceProperties = { {0} };
VkPhysicalDeviceProperties physicalDeviceProperties = {0};
vkGetPhysicalDeviceProperties(app->configuration.physicalDevice[0], &physicalDeviceProperties);
if (inputLaunchConfiguration.isCompilerInitialized != 0) app->configuration.isCompilerInitialized = inputLaunchConfiguration.isCompilerInitialized;
if (!app->configuration.isCompilerInitialized)
Expand Down Expand Up @@ -11024,7 +11024,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
app->configuration.maxComputeWorkGroupSize[2] = value;
hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->configuration.device[0]);
app->configuration.sharedMemorySizeStatic = value;
hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlockOptin, app->configuration.device[0]);
//hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlockOptin, app->configuration.device[0]);
app->configuration.sharedMemorySize = (value > 65536) ? 65536 : value;
hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->configuration.device[0]);
app->configuration.warpSize = value;
Expand Down

0 comments on commit 2eb95e3

Please sign in to comment.