Skip to content

Commit

Permalink
Register overutilization for arbitrary sequence sizes rework
Browse files Browse the repository at this point in the history
-This update brings generalization of register overutilization technique for arbitrary sequences. With it enabled, bigger sequences can be done in one upload to the chip, effectively bringing shared memory size to 128KB or even 256KB. It trades occupancy for lower number of memory transfers and can be useful for high-end GPUs with high CU count. Works both for non-strided and strided axes. Tested for C2C and single upload mode, no convolutions (will be improved in the future). Enabled for power of 2 sequences by default (works for non-power of 2 sequences as well, but performance gains are not yet tested).
-Switched FFT exponent sign to be conformant with FFTW. Added an option to disable normalization in inverse FFT (enabled by default).
  • Loading branch information
DTolm committed Jan 31, 2021
1 parent 86c46fe commit ee16b4c
Show file tree
Hide file tree
Showing 8 changed files with 2,358 additions and 2,638 deletions.
176 changes: 119 additions & 57 deletions Vulkan_FFT.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cufft_scripts/benchmark_cuFFT.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ void launch_benchmark_cuFFT_single(bool file_output, FILE* output)
auto timeSubmit = std::chrono::steady_clock::now();
for (int i = 0; i < batch; i++) {

cufftExecC2C(planC2C, dataC, dataC, 1);
cufftExecC2C(planC2C, dataC, dataC, -1);
cufftExecC2C(planC2C, dataC, dataC, 1);
}
cudaDeviceSynchronize();
auto timeEnd = std::chrono::steady_clock::now();
Expand Down
2 changes: 1 addition & 1 deletion cufft_scripts/benchmark_cuFFT_3d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ void launch_benchmark_cuFFT_single_3d(bool file_output, FILE* output)
auto timeSubmit = std::chrono::steady_clock::now();
for (int i = 0; i < batch; i++) {

cufftExecC2C(planC2C, dataC, dataC, 1);
cufftExecC2C(planC2C, dataC, dataC, -1);
cufftExecC2C(planC2C, dataC, dataC, 1);
}
cudaDeviceSynchronize();
auto timeEnd = std::chrono::steady_clock::now();
Expand Down
2 changes: 1 addition & 1 deletion cufft_scripts/benchmark_cuFFT_double.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ void launch_benchmark_cuFFT_double(bool file_output, FILE* output)
auto timeSubmit = std::chrono::steady_clock::now();
for (int i = 0; i < batch; i++) {

cufftExecZ2Z(planZ2Z, dataC, dataC, 1);
cufftExecZ2Z(planZ2Z, dataC, dataC, -1);
cufftExecZ2Z(planZ2Z, dataC, dataC, 1);
}
cudaDeviceSynchronize();
auto timeEnd = std::chrono::steady_clock::now();
Expand Down
2 changes: 1 addition & 1 deletion cufft_scripts/precision_cuFFT.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ void launch_precision_cuFFT_single(void* inputC, void* output_cuFFT, uint32_t* d
break;
}
for (int i = 0; i < 1; i++) {
cufftExecC2C(planC2C, dataC, dataC, 1);
cufftExecC2C(planC2C, dataC, dataC, -1);
}
cudaDeviceSynchronize();
cudaMemcpy(output_cuFFT, dataC, sizeof(cufftComplex) * dims[0] * dims[1] * dims[2], cudaMemcpyDeviceToHost);
Expand Down
2 changes: 1 addition & 1 deletion cufft_scripts/precision_cuFFT_double.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ void launch_precision_cuFFT_double(void* inputC, void* output_cuFFT, uint32_t* d
break;
}
for (int i = 0; i < 1; i++) {
cufftExecZ2Z(planZ2Z, dataC, dataC, 1);
cufftExecZ2Z(planZ2Z, dataC, dataC, -1);
}
cudaDeviceSynchronize();
cudaMemcpy(output_cuFFT, dataC, sizeof(cufftDoubleComplex) * dims[0] * dims[1] * dims[2], cudaMemcpyDeviceToHost);
Expand Down
2 changes: 1 addition & 1 deletion cufft_scripts/precision_cuFFT_half.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ void launch_precision_cuFFT_half(void* inputC, void* output_cuFFT, uint32_t* dim
NULL, 1, 1, CUDA_C_16F, 1, &ws, CUDA_C_16F);

for (int i = 0; i < 1; i++) {
res = cufftXtExec(planHalf, dataC, dataC, 1);
res = cufftXtExec(planHalf, dataC, dataC, -1);
}
cudaDeviceSynchronize();
cudaMemcpy(output_cuFFT, dataC, sizeof(half2) * dims[0] * dims[1] * dims[2], cudaMemcpyDeviceToHost);
Expand Down
4,808 changes: 2,233 additions & 2,575 deletions vkFFT/vkFFT.h

Large diffs are not rendered by default.

0 comments on commit ee16b4c

Please sign in to comment.