Register overutilization for arbitrary sequence sizes rework

-This update brings generalization of register overutilization technique for arbitrary sequences. With it enabled, bigger sequences can be done in one upload to the chip, effectively bringing shared memory size to 128KB or even 256KB. It trades occupancy for lower number of memory transfers and can be useful for high-end GPUs with high CU count. Works both for non-strided and strided axes. Tested for C2C and single upload mode, no convolutions (will be improved in the future). Enabled for power of 2 sequences by default (works for non-power of 2 sequences as well, but performance gains are not yet tested). -Switched FFT exponent sign to be conformant with FFTW. Added an option to disable normalization in inverse FFT (enabled by default).
DTolm · Jan 31, 2021 · ee16b4c · ee16b4c
1 parent 86c46fe
commit ee16b4c
Show file tree

Hide file tree

Showing 8 changed files with 2,358 additions and 2,638 deletions.
diff --git a/Vulkan_FFT.cpp b/Vulkan_FFT.cpp
diff --git a/cufft_scripts/benchmark_cuFFT.cu b/cufft_scripts/benchmark_cuFFT.cu
@@ -71,8 +71,8 @@ void launch_benchmark_cuFFT_single(bool file_output, FILE* output)
 			auto timeSubmit = std::chrono::steady_clock::now();
 			for (int i = 0; i < batch; i++) {
 
-				cufftExecC2C(planC2C, dataC, dataC, 1);
 				cufftExecC2C(planC2C, dataC, dataC, -1);
+				cufftExecC2C(planC2C, dataC, dataC, 1);
 			}
 			cudaDeviceSynchronize();
 			auto timeEnd = std::chrono::steady_clock::now();

diff --git a/cufft_scripts/benchmark_cuFFT_3d.cu b/cufft_scripts/benchmark_cuFFT_3d.cu
@@ -71,8 +71,8 @@ void launch_benchmark_cuFFT_single_3d(bool file_output, FILE* output)
 			auto timeSubmit = std::chrono::steady_clock::now();
 			for (int i = 0; i < batch; i++) {
 
-				cufftExecC2C(planC2C, dataC, dataC, 1);
 				cufftExecC2C(planC2C, dataC, dataC, -1);
+				cufftExecC2C(planC2C, dataC, dataC, 1);
 			}
 			cudaDeviceSynchronize();
 			auto timeEnd = std::chrono::steady_clock::now();

diff --git a/cufft_scripts/benchmark_cuFFT_double.cu b/cufft_scripts/benchmark_cuFFT_double.cu
@@ -71,8 +71,8 @@ void launch_benchmark_cuFFT_double(bool file_output, FILE* output)
 			auto timeSubmit = std::chrono::steady_clock::now();
 			for (int i = 0; i < batch; i++) {
 
-				cufftExecZ2Z(planZ2Z, dataC, dataC, 1);
 				cufftExecZ2Z(planZ2Z, dataC, dataC, -1);
+				cufftExecZ2Z(planZ2Z, dataC, dataC, 1);
 			}
 			cudaDeviceSynchronize();
 			auto timeEnd = std::chrono::steady_clock::now();

diff --git a/cufft_scripts/precision_cuFFT.cu b/cufft_scripts/precision_cuFFT.cu
@@ -34,7 +34,7 @@ void launch_precision_cuFFT_single(void* inputC, void* output_cuFFT, uint32_t* d
 		break;
 	}
 	for (int i = 0; i < 1; i++) {
-		cufftExecC2C(planC2C, dataC, dataC, 1);
+		cufftExecC2C(planC2C, dataC, dataC, -1);
 	}
 	cudaDeviceSynchronize();
 	cudaMemcpy(output_cuFFT, dataC, sizeof(cufftComplex) * dims[0] * dims[1] * dims[2], cudaMemcpyDeviceToHost);

diff --git a/cufft_scripts/precision_cuFFT_double.cu b/cufft_scripts/precision_cuFFT_double.cu
@@ -34,7 +34,7 @@ void launch_precision_cuFFT_double(void* inputC, void* output_cuFFT, uint32_t* d
 		break;
 	}
 	for (int i = 0; i < 1; i++) {
-		cufftExecZ2Z(planZ2Z, dataC, dataC, 1);
+		cufftExecZ2Z(planZ2Z, dataC, dataC, -1);
 	}
 	cudaDeviceSynchronize();
 	cudaMemcpy(output_cuFFT, dataC, sizeof(cufftDoubleComplex) * dims[0] * dims[1] * dims[2], cudaMemcpyDeviceToHost);

diff --git a/cufft_scripts/precision_cuFFT_half.cu b/cufft_scripts/precision_cuFFT_half.cu
@@ -50,7 +50,7 @@ void launch_precision_cuFFT_half(void* inputC, void* output_cuFFT, uint32_t* dim
 		NULL, 1, 1, CUDA_C_16F, 1, &ws, CUDA_C_16F);
 
 	for (int i = 0; i < 1; i++) {
-		res = cufftXtExec(planHalf, dataC, dataC, 1);
+		res = cufftXtExec(planHalf, dataC, dataC, -1);
 	}
 	cudaDeviceSynchronize();
 	cudaMemcpy(output_cuFFT, dataC, sizeof(half2) * dims[0] * dims[1] * dims[2], cudaMemcpyDeviceToHost);

diff --git a/vkFFT/vkFFT.h b/vkFFT/vkFFT.h