From 99d723c5b02966afebc76c713c9c8502c778257b Mon Sep 17 00:00:00 2001
From: Tolmachev Dmitrii <dtolm@yandex.ru>
Date: Tue, 2 Mar 2021 17:33:20 +0100
Subject: [PATCH] Radix 11 and 13 support -VkFFT now supports sequences that
 are a multiple of 2s, 3s, 5s, 7s, 11s and 13s. Improved thread management
 -R2C optimization now produces output in the same padded layout as FFTW
 -Solved some of the shared memory bank conflicts for slightly better
 performance -It is now possible to select in which stream to run VkFFT in
 CUDA/HIP version

---
 README.md      |   11 +-
 Vulkan_FFT.cpp |   43 +-
 vkFFT/vkFFT.h  | 2865 +++++++++++++++++++++++++++---------------------
 3 files changed, 1634 insertions(+), 1285 deletions(-)

diff --git a/README.md b/README.md
index ac1e96e9..8cd1b1ab 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![Build Status](https://travis-ci.com/DTolm/VkFFT.svg?token=nMgUQeqx7PXMeCFaXqsb&branch=master)](https://travis-ci.com/github/DTolm/VkFFT)
 # VkFFT - Vulkan Fast Fourier Transform library
-VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform library for Vulkan projects. VkFFT aims to provide community with an open-source alternative to Nvidia's cuFFT library, while achieving better performance. VkFFT is written in C language.
+VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform library for Vulkan/CUDA/HIP projects. VkFFT aims to provide community with an open-source alternative to Nvidia's cuFFT library, while achieving better performance. VkFFT is written in C language and supports Vulkan, CUDA and HIP as backends.
 
 ## I am looking for a PhD position/job that may be interested in my set of skills. Contact me by email: <d.tolmachev@fz-juelich.de> | <dtolm96@gmail.com>
 
@@ -12,7 +12,7 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
   - 1D/2D/3D systems
   - Forward and inverse directions of FFT
   - Support for big FFT dimension sizes. Current limits in single and half precision: C2C - (2^32, 2^32, 2^32). C2R/R2C - (2^12, 2^32, 2^32). (will be increased later). Current limits in double precision: C2C - (2^32, 2^32, 2^32), C2R/R2C - (2^11, 2^32, 2^32) with no register overutilization.
-  - Radix-2/3/4/5/7/8 FFT. Sequences using radix 3, 5 and 7 have comparable performance to that of powers of 2
+  - Radix-2/3/4/5/7/8/11/13 FFT. Sequences using radix 3, 5, 7, 11 and 13 have comparable performance to that of powers of 2
   - Single, double and half precision support. Double precision uses CPU generated LUT tables. Half precision still does all computations in single and only uses half precision to store data.
   - All transformations are performed in-place with no performance loss. Out-of-place transforms are supported by selecting different input/output buffers.
   - No additional transposition uploads. Note: data can be reshuffled after the four step FFT algorithm with additional buffer (for big sequences). Doesn't matter for convolutions - they return to the input ordering (saves memory).
@@ -23,6 +23,7 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
   - Multiple feature/batch convolutions - one input, multiple kernels
   - Multiple input/output/temporary buffer split. Allows to use data split between different memory allocations and mitigate 4GB single allocation limit.
   - Works on Nvidia, AMD and Intel GPUs (tested on Nvidia RTX 3080, GTX 1660 Ti, AMD Radeon VII and Intel UHD 620)
+  - VkFFT supports Vulkan, CUDA and HIP as backend to cover wide range of APIs
   - Header-only library with Vulkan interface, which allows to append VkFFT directly to user's command buffer. Shaders are compiled once during the plan creation stage
 ## Future release plan
  - ##### Planned
@@ -32,8 +33,12 @@ VkFFT is an efficient GPU-accelerated multidimensional Fast Fourier Transform li
     - Multiple GPU job splitting
 
 ## Installation
+Vulkan version:
 Include the vkFFT.h file and glslang compiler. Sample CMakeLists.txt file configures project based on Vulkan_FFT.cpp file, which contains examples on how to use VkFFT to perform FFT, iFFT and convolution calculations, use zero padding, multiple feature/batch convolutions, C2C FFTs of big systems, R2C/C2R transforms, double precision FFTs, half precision FFTs.\
 For single and double precision, Vulkan 1.0 is required. For half precision, Vulkan 1.1 is required.
+
+CUDA/HIP:
+Include the vkFFT.h file and make sure your system has NVRTC/HIPRTC built. Only single/double precision for now.
 ## Command-line interface
 VkFFT has a command-line interface with the following set of commands:\
 -h: print help\
@@ -47,7 +52,7 @@ So, the command to launch single precision benchmark of VkFFT and cuFFT and save
 .\Vulkan_FFT.exe -d 0 -o output.txt -vkfft 0 -cufft 0\
 For double precision benchmark, replace -vkfft 0 -cufft 0 with -vkfft 1 -cufft 1. For half precision benchmark, replace -vkfft 0 -cufft 0 with -vkfft 2 -cufft 2.
 ## How to use VkFFT
-VkFFT.h is a library which can append FFT, iFFT or convolution calculation to the user defined command buffer. It operates on storage buffers allocated by user and doesn't require any additional memory by itself. All computations are fully based on Vulkan compute shaders with no CPU usage except for FFT planning. VkFFT creates and optimizes memory layout by itself and performs FFT with the best chosen parameters. For an example application, see Vulkan_FFT.cpp file, which has comments explaining the VkFFT configuration process.\
+VkFFT.h is a library which can append FFT, iFFT or convolution calculation to the user defined command buffer. It operates on storage buffers allocated by user and doesn't require any additional memory by itself (except for LUT tables, if they are enabled). All computations are fully based on Vulkan compute shaders with no CPU usage except for FFT planning. VkFFT creates and optimizes memory layout by itself and performs FFT with the best chosen parameters. For an example application, see Vulkan_FFT.cpp file, which has comments explaining the VkFFT configuration process.\
 VkFFT achieves striding by grouping nearby FFTs instead of transpositions.
 ![alt text](https://github.com/dtolm/VkFFT/blob/master/FFT_memory_layout.png?raw=true)
 ## Benchmark results in comparison to cuFFT
diff --git a/Vulkan_FFT.cpp b/Vulkan_FFT.cpp
index 010adf9a..12d167c3 100644
--- a/Vulkan_FFT.cpp
+++ b/Vulkan_FFT.cpp
@@ -493,7 +493,7 @@ VkResult transferDataToCPU(VkGPU* vkGPU, void* arr, VkBuffer* buffer, uint64_t b
 	return res;
 }
 #endif
-void performVulkanFFT(VkGPU* vkGPU, VkFFTApplication* app, uint32_t inverse, uint32_t batch) {
+void performVulkanFFT(VkGPU* vkGPU, VkFFTApplication* app, int inverse, uint32_t batch) {
 #if(VKFFT_BACKEND==0)
 	VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
 	commandBufferAllocateInfo.commandPool = vkGPU->commandPool;
@@ -2215,7 +2215,8 @@ uint32_t sample_9(VkGPU* vkGPU, uint32_t sample_id, bool file_output, FILE* outp
 	VkDeviceMemory bufferDeviceMemory = {};
 	allocateFFTBuffer(vkGPU, &inputBuffer, &inputBufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, inputBufferSize);
 	allocateFFTBuffer(vkGPU, &buffer, &bufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize);
-
+	convolution_configuration.inputBuffer = &inputBuffer;
+	convolution_configuration.buffer = &buffer;
 #elif(VKFFT_BACKEND==1)
 	cuFloatComplex* inputBuffer = 0;
 	cuFloatComplex* buffer = 0;
@@ -2246,7 +2247,7 @@ uint32_t sample_9(VkGPU* vkGPU, uint32_t sample_id, bool file_output, FILE* outp
 		for (uint32_t k = 0; k < convolution_configuration.size[2]; k++) {
 			for (uint32_t j = 0; j < convolution_configuration.size[1]; j++) {
 				for (uint32_t i = 0; i < convolution_configuration.size[0]; i++) {
-					buffer_input[i + j * convolution_configuration.size[0] + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]] = 1;
+					buffer_input[i + j * (convolution_configuration.size[0]+2) + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]] = 1;
 				}
 			}
 		}
@@ -2290,9 +2291,9 @@ uint32_t sample_9(VkGPU* vkGPU, uint32_t sample_id, bool file_output, FILE* outp
 				for (uint32_t j = 0; j < convolution_configuration.size[1]; j++) {
 					for (uint32_t i = 0; i < convolution_configuration.size[0]; i++) {
 						if (file_output)
-							fprintf(output, "%.6f ", buffer_output[i + j * convolution_configuration.size[0] + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.coordinateFeatures * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]]);
+							fprintf(output, "%.6f ", buffer_output[i + j * (convolution_configuration.size[0] + 2) + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.coordinateFeatures * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]]);
 
-						printf("%.6f ", buffer_output[i + j * convolution_configuration.size[0] + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.coordinateFeatures * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]]);
+						printf("%.6f ", buffer_output[i + j * (convolution_configuration.size[0]+2) + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.coordinateFeatures * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]]);
 					}
 					std::cout << "\n";
 				}
@@ -3196,29 +3197,33 @@ uint32_t sample_13(VkGPU* vkGPU, uint32_t sample_id, bool file_output, FILE* out
 uint32_t sample_14(VkGPU* vkGPU, uint32_t sample_id, bool file_output, FILE* output, uint32_t isCompilerInitialized) {
 	uint32_t res = 0;
 	if (file_output)
-		fprintf(output, "14 - VkFFT/FFTW C2C power 3/5/7 precision test in single precision\n");
-	printf("14 - VkFFT/FFTW C2C power 3/5/7 precision test in single precision\n");
+		fprintf(output, "14 - VkFFT/FFTW C2C power 3/5/7/11/13 precision test in single precision\n");
+	printf("14 - VkFFT/FFTW C2C power 3/5/7/11/13 precision test in single precision\n");
 
-	const int num_benchmark_samples = 145;
+	const int num_benchmark_samples = 200;
 	const int num_runs = 1;
 
-	uint32_t benchmark_dimensions[num_benchmark_samples][4] = { {3, 1, 1, 1},{5, 1, 1, 1},{6, 1, 1, 1},{7, 1, 1, 1},{9, 1, 1, 1},{10, 1, 1, 1},{12, 1, 1, 1},{14, 1, 1, 1},
-		{15, 1, 1, 1},{21, 1, 1, 1},{24, 1, 1, 1},{25, 1, 1, 1},{27, 1, 1, 1},{28, 1, 1, 1},{30, 1, 1, 1},{35, 1, 1, 1},{45, 1, 1, 1},{42, 1, 1, 1},{49, 1, 1, 1},{56, 1, 1, 1},{60, 1, 1, 1},{81, 1, 1, 1},
-		{125, 1, 1, 1},{243, 1, 1, 1},{343, 1, 1, 1},{625, 1, 1, 1},{720, 1, 1, 1},{1080, 1, 1, 1},{1400, 1, 1, 1},{1440, 1, 1, 1},{1920, 1, 1, 1},{2160, 1, 1, 1},{3024,1,1,1},{3500,1,1,1},
+	uint32_t benchmark_dimensions[num_benchmark_samples][4] = { {3, 1, 1, 1},{5, 1, 1, 1},{6, 1, 1, 1},{7, 1, 1, 1},{9, 1, 1, 1},{10, 1, 1, 1},{11, 1, 1, 1},{12, 1, 1, 1},{13, 1, 1, 1},{14, 1, 1, 1},
+		{15, 1, 1, 1},{21, 1, 1, 1},{22, 1, 1, 1},{24, 1, 1, 1},{25, 1, 1, 1},{26, 1, 1, 1},{27, 1, 1, 1},{28, 1, 1, 1},{30, 1, 1, 1},{33, 1, 1, 1},{35, 1, 1, 1},{39, 1, 1, 1},{45, 1, 1, 1},{42, 1, 1, 1},{44, 1, 1, 1},{49, 1, 1, 1},{52, 1, 1, 1},{55, 1, 1, 1},{56, 1, 1, 1},{60, 1, 1, 1},{65, 1, 1, 1},{66, 1, 1, 1},{81, 1, 1, 1},
+		{121, 1, 1, 1},{125, 1, 1, 1},{143, 1, 1, 1},{169, 1, 1, 1},{243, 1, 1, 1},{286, 1, 1, 1},{343, 1, 1, 1},{429, 1, 1, 1},{572, 1, 1, 1},{625, 1, 1, 1},{720, 1, 1, 1},{1080, 1, 1, 1},{1001, 1, 1, 1},{1287, 1, 1, 1},{1400, 1, 1, 1},{1440, 1, 1, 1},{1920, 1, 1, 1},{2160, 1, 1, 1},{3024,1,1,1},{3500,1,1,1},
 		{3840, 1, 1, 1},{4000 , 1, 1, 1},{4050, 1, 1, 1},{4320 , 1, 1, 1},{7000,1,1,1},{7680, 1, 1, 1},{9000, 1, 1, 1},{7680 * 5, 1, 1, 1},
 		{(uint32_t)pow(3,10), 1, 1, 1},{(uint32_t)pow(3,11), 1, 1, 1},{(uint32_t)pow(3,12), 1, 1, 1},{(uint32_t)pow(3,13), 1, 1, 1},{(uint32_t)pow(3,14), 1, 1, 1},{(uint32_t)pow(3,15), 1, 1, 1},
 		{(uint32_t)pow(5,5), 1, 1, 1},{(uint32_t)pow(5,6), 1, 1, 1},{(uint32_t)pow(5,7), 1, 1, 1},{(uint32_t)pow(5,8), 1, 1, 1},{(uint32_t)pow(5,9), 1, 1, 1},
 		{(uint32_t)pow(7,4), 1, 1, 1},{(uint32_t)pow(7,5), 1, 1, 1},{(uint32_t)pow(7,6), 1, 1, 1},{(uint32_t)pow(7,7), 1, 1, 1},{(uint32_t)pow(7,8), 1, 1, 1},
-		{8, 3, 1, 2},{8, 5, 1, 2},{8, 6, 1, 2},{8, 7, 1, 2},{8, 9, 1, 2},{8, 10, 1, 2},{8, 12, 1, 2},{8, 14, 1, 2},{8, 15, 1, 2},{8, 21, 1, 2},{8, 24, 1, 2},
-		{8, 25, 1, 2},{8, 27, 1, 2},{8, 28, 1, 2},{8, 30, 1, 2},{8, 35, 1, 2},{8, 45, 1, 2},{8, 49, 1, 2},{8, 56, 1, 2},{8, 60, 1, 2},{8, 81, 1, 2},{8, 125, 1, 2},{8, 243, 1, 2},{8, 343, 1, 2},
+		{(uint32_t)pow(11,3), 1, 1, 1},{(uint32_t)pow(11,4), 1, 1, 1},{(uint32_t)pow(11,5), 1, 1, 1},{(uint32_t)pow(11,6), 1, 1, 1},
+		{(uint32_t)pow(13,3), 1, 1, 1},{(uint32_t)pow(13,4), 1, 1, 1},{(uint32_t)pow(13,5), 1, 1, 1},{(uint32_t)pow(13,6), 1, 1, 1},
+		{8, 3, 1, 2},{8, 5, 1, 2},{8, 6, 1, 2},{8, 7, 1, 2},{8, 9, 1, 2},{8, 10, 1, 2},{8, 11, 1, 2},{8, 12, 1, 2},{8, 13, 1, 2},{8, 14, 1, 2},{8, 15, 1, 2},{8, 21, 1, 2},{8, 22, 1, 2},{8, 24, 1, 2},
+		{8, 25, 1, 2},{8, 26, 1, 2},{8, 27, 1, 2},{8, 28, 1, 2},{8, 30, 1, 2},{8, 33, 1, 2},{8, 35, 1, 2},{8, 39, 1, 2},{8, 44, 1, 2},{8, 45, 1, 2},{8, 49, 1, 2},{8, 52, 1, 2},{8, 56, 1, 2},{8, 60, 1, 2},{8, 66, 1, 2},{8, 81, 1, 2},{8, 125, 1, 2},{8, 243, 1, 2},{8, 343, 1, 2},
 		{8, 625, 1, 2},{8, 720, 1, 2},{8, 1080, 1, 2},{8, 1400, 1, 2},{8, 1440, 1, 2},{8, 1920, 1, 2},{8, 2160, 1, 2},{8, 3024, 1, 2},{8, 3500, 1, 2},
 		{8, 3840, 1, 2},{8, 4000, 1, 2},{8, 4050, 1, 2},{8, 4320, 1, 2},{8, 7000, 1, 2},{8, 7680, 1, 2},{8, 4050 * 3, 1, 2},{8, 7680 * 5, 1, 2}, {720, 480, 1, 2},{1280, 720, 1, 2},{1920, 1080, 1, 2}, {2560, 1440, 1, 2},{3840, 2160, 1, 2},{7680, 4320, 1, 2},
 		{8, (uint32_t)pow(3,10), 1, 2},	{8, (uint32_t)pow(3,11), 1, 2}, {8, (uint32_t)pow(3,12), 1, 2}, {8, (uint32_t)pow(3,13), 1, 2}, {8, (uint32_t)pow(3,14), 1, 2}, {8, (uint32_t)pow(3,15), 1, 2},
 		{8, (uint32_t)pow(5,5), 1, 2},	{8, (uint32_t)pow(5,6), 1, 2}, {8, (uint32_t)pow(5,7), 1, 2}, {8, (uint32_t)pow(5,8), 1, 2}, {8, (uint32_t)pow(5,9), 1, 2},
 		{8, (uint32_t)pow(7,4), 1, 2},{8, (uint32_t)pow(7,5), 1, 2},{8, (uint32_t)pow(7,6), 1, 2},{8, (uint32_t)pow(7,7), 1, 2},{8, (uint32_t)pow(7,8), 1, 2},
-		{3, 3, 3, 3},{5, 5, 5, 3},{6, 6, 6, 3},{7, 7, 7, 3},{9, 9, 9, 3},{10, 10, 10, 3},{12, 12, 12, 3},{14, 14, 14, 3},
-		{15, 15, 15, 3},{21, 21, 21, 3},{24, 24, 24, 3},{25, 25, 25, 3},{27, 27, 27, 3},{28, 28, 28, 3},{30, 30, 30, 3},{35, 35, 35, 3},{42, 42, 42, 3},{45, 45, 45, 3},{49, 49, 49, 3},{56, 56, 56, 3},{60, 60, 60, 3},{81, 81, 81, 3},
-		{125, 125, 125, 3},{243, 243, 243, 3}
+		{8, (uint32_t)pow(11,3), 1, 2},{8, (uint32_t)pow(11,4), 1, 2},{8, (uint32_t)pow(11,5), 1, 2},{8, (uint32_t)pow(11,6), 1, 2},
+		{8, (uint32_t)pow(13,3), 1, 2},{8, (uint32_t)pow(13,4), 1, 2},{8, (uint32_t)pow(13,5), 1, 2},{8, (uint32_t)pow(13,6), 1, 2},
+		{3, 3, 3, 3},{5, 5, 5, 3},{6, 6, 6, 3},{7, 7, 7, 3},{9, 9, 9, 3},{10, 10, 10, 3},{11, 11, 11, 3},{12, 12, 12, 3},{13, 13, 13, 3},{14, 14, 14, 3},
+		{15, 15, 15, 3},{21, 21, 21, 3},{22, 22, 22, 3},{24, 24, 24, 3},{25, 25, 25, 3},{26, 26, 26, 3},{27, 27, 27, 3},{28, 28, 28, 3},{30, 30, 30, 3},{33, 33, 33, 3},{35, 35, 35, 3},{39, 39, 39, 3},{42, 42, 42, 3},{44, 44, 44, 3},{45, 45, 45, 3},{49, 49, 49, 3},{52, 52, 52, 3},{56, 56, 56, 3},{60, 60, 60, 3},{81, 81, 81, 3},
+		{121, 121, 121, 3},{125, 125, 125, 3},{143, 143, 143, 3},{169, 169, 169, 3},{243, 243, 243, 3}
 	};
 
 	double benchmark_result = 0;//averaged result = sum(system_size/iteration_time)/num_benchmark_samples
@@ -3594,7 +3599,7 @@ int main(int argc, char* argv[])
 	if (findFlag(argv, argv + argc, "-h"))
 	{
 		//print help
-		printf("VkFFT v1.1.8 (16-02-2021). Author: Tolmachev Dmitrii\n");
+		printf("VkFFT v1.1.9 (02-03-2021). Author: Tolmachev Dmitrii\n");
 		printf("	-h: print help\n");
 #if (VKFFT_BACKEND==0)
 		printf("	-devices: print the list of available GPU devices\n");
@@ -3624,14 +3629,14 @@ int main(int argc, char* argv[])
 #if ((VKFFT_BACKEND==0)&&(VK_API_VERSION>10))
 		printf("		13 - VkFFT / cuFFT / FFTW C2C precision test in half precision\n");
 #endif
-		printf("		14 - VkFFT / FFTW C2C power 3 / 5 / 7 precision test in single precision\n");
+		printf("		14 - VkFFT / FFTW C2C power 3 / 5 / 7 / 11 / 13 precision test in single precision\n");
 #else
 		printf("		11 - VkFFT / FFTW C2C precision test in single precision\n");
 		printf("		12 - VkFFT / FFTW C2C precision test in double precision\n");
 #if ((VKFFT_BACKEND==0)&&(VK_API_VERSION>10))
 		printf("		13 - VkFFT / FFTW C2C precision test in half precision\n");
 #endif
-		printf("		14 - VkFFT / FFTW C2C power 3 / 5 / 7 precision test in single precision\n");
+		printf("		14 - VkFFT / FFTW C2C power 3 / 5 / 7 / 11 / 13 precision test in single precision\n");
 #endif
 #endif
 #ifdef USE_cuFFT
diff --git a/vkFFT/vkFFT.h b/vkFFT/vkFFT.h
index 2128b214..34f51dc4 100644
--- a/vkFFT/vkFFT.h
+++ b/vkFFT/vkFFT.h
@@ -49,10 +49,12 @@ extern "C" {
 #elif(VKFFT_BACKEND==1)
 		CUdevice* device;//pointer to CUDA device, obtained from cuDeviceGet
 		//CUcontext* context;//pointer to CUDA context, obtained from cuDeviceGet
+		cudaStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels
 		uint32_t num_streams;//try to submit CUDA kernels in multiple streams for asynchronous execution. Default 1
 #elif(VKFFT_BACKEND==2)
 		hipDevice_t* device;//pointer to HIP device, obtained from hipDeviceGet
 		//hipCtx_t* context;//pointer to HIP context, obtained from hipDeviceGet
+		hipStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels
 		uint32_t num_streams;//try to submit HIP kernels in multiple streams for asynchronous execution. Default 1
 #endif
 
@@ -94,6 +96,7 @@ extern "C" {
 		//optional: (default 0 if not stated otherwise)
 		uint32_t coalescedMemory;//in bits, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Gonna work regardles, but if specified by user correctly, the performance will be higher. 
 		uint32_t aimThreads;//aim at this many threads per block. Default 128
+		uint32_t numSharedBanks;//how many banks shared memory has. Default 32
 
 		uint32_t doublePrecision; //perform calculations in double precision (0 - off, 1 - on).
 		uint32_t halfPrecision; //perform calculations in half precision (0 - off, 1 - on)
@@ -153,12 +156,10 @@ extern "C" {
 		VkCommandBuffer* commandBuffer;//Filled at app execution
 		VkMemoryBarrier* memory_barrier;//Filled at app creation
 #elif(VKFFT_BACKEND==1)
-		cudaStream_t* stream;//Filled at app creation
 		cudaEvent_t* stream_event;//Filled at app creation
 		uint32_t streamCounter;//Filled at app creation
 		uint32_t streamID;//Filled at app creation
 #elif(VKFFT_BACKEND==2)
-		hipStream_t* stream;//Filled at app creation
 		hipEvent_t* stream_event;//Filled at app creation
 		uint32_t streamCounter;//Filled at app creation
 		uint32_t streamID;//Filled at app creation
@@ -181,6 +182,7 @@ extern "C" {
 		uint32_t axis_id;
 		uint32_t axis_upload_id;
 		uint32_t registers_per_thread;
+		uint32_t registers_per_thread_per_radix[14];
 		uint32_t min_registers_per_thread;
 		uint32_t readToRegisters;
 		uint32_t writeFromRegisters;
@@ -230,6 +232,13 @@ extern "C" {
 		uint32_t supportAxis;
 		uint32_t cacheShuffle;
 		uint32_t registerBoost;
+		uint32_t warpSize;
+		uint32_t numSharedBanks;
+		uint32_t resolveBankConflictFirstStages;
+		uint32_t sharedStrideBankConflictFirstStages;
+		uint32_t sharedStrideReadWriteConflict;
+		uint32_t maxSharedStride;
+
 		char** regIDs;
 		char* disableThreadsStart;
 		char* disableThreadsEnd;
@@ -257,7 +266,7 @@ extern "C" {
 		char temp[10];
 		char w[10];
 		char iw[10];
-		char locID[7][20];
+		char locID[13][40];
 		char* output;
 		uint32_t currentLen;
 	} VkFFTSpecializationConstantsLayout;
@@ -324,10 +333,7 @@ extern "C" {
 	typedef struct {
 		uint32_t numAxisUploads[3];
 		uint32_t axisSplit[3][4];
-		uint32_t numSupportAxisUploads[2];
-		uint32_t supportAxisSplit[2][4];
 		VkFFTAxis axes[3][4];
-		VkFFTAxis supportAxes[2][4];//Nx/2+1 for r2c/c2r
 	} VkFFTPlan;
 	typedef struct {
 		VkFFTConfiguration configuration;
@@ -444,6 +450,20 @@ extern "C" {
 	%s.y = %s.y * %s;\n", out, in_1, in_num, out, in_1, in_num);
 		VkAppendLine(sc, sc->tempStr);
 	};
+	static inline void VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* temp) {
+		if (strcmp(out, in_1)) {
+			sprintf(sc->tempStr, "\
+	%s.x = - %s.y * %s;\n\
+	%s.y = %s.x * %s;\n", out, in_1, in_num, out, in_1, in_num);
+		}
+		else {
+			sprintf(sc->tempStr, "\
+	%s.x = - %s.y * %s;\n\
+	%s.y = %s.x * %s;\n\
+	%s = %s;\n", temp, in_1, in_num, temp, in_1, in_num, out, temp);
+		}
+		VkAppendLine(sc, sc->tempStr);
+	};
 	static inline void VkDivComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) {
 		sprintf(sc->tempStr, "\
 	%s.x = %s.x / %s;\n\
@@ -495,6 +515,21 @@ extern "C" {
 	%s = %s / %s;\n", out, in_1, in_num);
 		VkAppendLine(sc, sc->tempStr);
 	};
+	static inline void VkPermute(VkFFTSpecializationConstantsLayout* sc, const uint32_t* permute, const uint32_t num_elem, const uint32_t type, char ** regIDs) {
+		char temp_ID[13][20];
+		if (type == 0) {
+			for (uint32_t i = 0; i < num_elem; i++)
+				sprintf(temp_ID[i], "%s", sc->locID[i]);
+			for (uint32_t i = 0; i < num_elem; i++)
+				sprintf(sc->locID[i], "%s", temp_ID[permute[i]]);
+		}
+		if (type == 1) {
+			for (uint32_t i = 0; i < num_elem; i++)
+				sprintf(temp_ID[i], "%s", regIDs[i]);
+			for (uint32_t i = 0; i < num_elem; i++)
+				sprintf(regIDs[i], "%s", temp_ID[permute[i]]);
+		}
+	};
 
 	static inline void appendVersion(char* output) {
 #if(VKFFT_BACKEND==0)
@@ -947,17 +982,17 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			char shiftZ[500] = "";
 			if (sc->size[2] > 1) {
 				if (sc->performWorkGroupShift[2])
-					sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %d", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, 2 * sc->inputStride[2]);
+					sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %d", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]);
 				else
-					sprintf(shiftZ, " + %s * %d", sc->gl_GlobalInvocationID_z, 2 * sc->inputStride[2]);
+					sprintf(shiftZ, " + %s * %d", sc->gl_GlobalInvocationID_z,  sc->inputStride[2]);
 			}
 			char shiftCoordinate[100] = "";
 			if (sc->numCoordinates * sc->matrixConvolution > 1) {
-				sprintf(shiftCoordinate, " + consts.coordinate * %d", 2 * sc->inputStride[3]);
+				sprintf(shiftCoordinate, " + consts.coordinate * %d", sc->inputStride[3]);
 			}
 			char shiftBatch[100] = "";
 			if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
-				sprintf(shiftBatch, " + consts.batchID * %d", 2 * sc->inputStride[4]);
+				sprintf(shiftBatch, " + consts.batchID * %d", sc->inputStride[4]);
 			}
 			sprintf(output + strlen(output), "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
 			break;
@@ -1152,17 +1187,17 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			char shiftZ[500] = "";
 			if (sc->size[2] > 1) {
 				if (sc->performWorkGroupShift[2])
-					sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %d", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, 2 * sc->outputStride[2]);
+					sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %d", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]);
 				else
-					sprintf(shiftZ, " + %s * %d", sc->gl_GlobalInvocationID_z, 2 * sc->outputStride[2]);
+					sprintf(shiftZ, " + %s * %d", sc->gl_GlobalInvocationID_z, sc->outputStride[2]);
 			}
 			char shiftCoordinate[100] = "";
 			if (sc->numCoordinates * sc->matrixConvolution > 1) {
-				sprintf(shiftCoordinate, " + consts.coordinate * %d", 2 * sc->outputStride[3]);
+				sprintf(shiftCoordinate, " + consts.coordinate * %d", sc->outputStride[3]);
 			}
 			char shiftBatch[100] = "";
 			if ((sc->numBatches > 1) || (sc->numKernels > 1)) {
-				sprintf(shiftBatch, " + consts.batchID * %d", 2 * sc->outputStride[4]);
+				sprintf(shiftBatch, " + consts.batchID * %d", sc->outputStride[4]);
 			}
 			sprintf(output + strlen(output), "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
 			break;
@@ -1580,27 +1615,29 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			else {
 				sprintf(output + strlen(output), "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType);
 			}*/
-			char* tf[4];
-			char* tf2[4];
-			char* tf2inv[4];
+			char* tf[8];
+
 			//VkAppendLine(sc, "	{\n");
-			for (uint32_t i = 0; i < 4; i++) {
+			for (uint32_t i = 0; i < 8; i++) {
 				tf[i] = (char*)malloc(sizeof(char) * 40);
-				tf2[i] = (char*)malloc(sizeof(char) * 40);
-				tf2inv[i] = (char*)malloc(sizeof(char) * 40);
+
 			}
 			sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending);
 			sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending);
 			sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending);
 			sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending);
-			sprintf(tf2[0], "0.44095855184409837868031445395900%s", LFending);
-			sprintf(tf2[1], "0.34087293062393136944265847887436%s", LFending);
-			sprintf(tf2[2], "-0.53396936033772524066165487965918%s", LFending);
-			sprintf(tf2[3], "0.87484229096165666561546458979137%s", LFending);
-			sprintf(tf2inv[0], "-0.44095855184409837868031445395900%s", LFending);
-			sprintf(tf2inv[1], "-0.34087293062393136944265847887436%s", LFending);
-			sprintf(tf2inv[2], "0.53396936033772524066165487965918%s", LFending);
-			sprintf(tf2inv[3], "-0.87484229096165666561546458979137%s", LFending);
+			if (stageAngle < 0) {
+				sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending);
+				sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending);
+				sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending);
+				sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending);
+			}
+			else {
+				sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending);
+				sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending);
+				sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending);
+				sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending);
+			}
 			/*for (uint32_t i = 0; i < 7; i++) {
 				sc->locID[i] = (char*)malloc(sizeof(char) * 40);
 				sprintf(sc->locID[i], "loc_%d", i);
@@ -1683,15 +1720,15 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 	temp%s = temp%s - temp%s;\n\
 	temp%s = temp%s - temp%s;\n\
 	temp%s = temp%s - temp%s;\n", regID[0], regID[1], regID[5], regID[2], regID[5], regID[3], regID[4], regID[3], regID[1]);
-			if (stageAngle < 0) {
+			
 				VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]);
 				VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]);
 				VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]);
 				VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]);
-				VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf2[0]);
-				VkMulComplexNumber(sc, regID[0], regID[0], tf2[1]);
-				VkMulComplexNumber(sc, regID[2], regID[2], tf2[2]);
-				VkMulComplexNumber(sc, regID[4], regID[4], tf2[3]);
+			VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]);
+			VkMulComplexNumber(sc, regID[0], regID[0], tf[5]);
+			VkMulComplexNumber(sc, regID[2], regID[2], tf[6]);
+			VkMulComplexNumber(sc, regID[4], regID[4], tf[7]);
 				//sprintf(output + strlen(output), "\
 	loc_1 *= -1.16666666666666651863693004997913;\n\
 	loc_2 *= 0.79015646852540022404554065360571;\n\
@@ -1701,26 +1738,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 	temp%s *= 0.34087293062393136944265847887436;\n\
 	temp%s *= -0.53396936033772524066165487965918;\n\
 	temp%s *= 0.87484229096165666561546458979137;\n", regID[0], regID[2], regID[4]);
-			}
-			else {
-				VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]);
-				VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]);
-				VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]);
-				VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]);
-				VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf2inv[0]);
-				VkMulComplexNumber(sc, regID[0], regID[0], tf2inv[1]);
-				VkMulComplexNumber(sc, regID[2], regID[2], tf2inv[2]);
-				VkMulComplexNumber(sc, regID[4], regID[4], tf2inv[3]);
-				//sprintf(output + strlen(output), "\
-	loc_1 *= -1.16666666666666651863693004997913;\n\
-	loc_2 *= 0.79015646852540022404554065360571;\n\
-	loc_3 *= 0.05585426728964774240049351305970;\n\
-	loc_4 *= 0.73430220123575240531721419756650;\n\
-	loc_5 *= -0.44095855184409837868031445395900;\n\
-	temp%s *= -0.34087293062393136944265847887436;\n\
-	temp%s *= 0.53396936033772524066165487965918;\n\
-	temp%s *= -0.87484229096165666561546458979137;\n", regID[0], regID[2], regID[4]);
-			}
+			
 			VkSubComplex(sc, regID[5], regID[4], regID[2]);
 			VkAddComplexInv(sc, regID[6], regID[4], regID[0]);
 			VkAddComplex(sc, regID[4], regID[0], regID[2]);
@@ -1776,10 +1794,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			/*for (uint32_t i = 0; i < 7; i++) {
 				free(sc->locID[i]);
 			}*/
-			for (uint32_t i = 0; i < 4; i++) {
+			for (uint32_t i = 0; i < 8; i++) {
 				free(tf[i]);
-				free(tf2[i]);
-				free(tf2inv[i]);
 			}
 			break;
 		}
@@ -1953,6 +1969,356 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 			break;
 		}
+		case 11: {
+			
+			char* tf[20];
+			//char* tf2[4];
+			//char* tf2inv[4];
+			//VkAppendLine(sc, "	{\n");
+			for (uint32_t i = 0; i < 20; i++) {
+				tf[i] = (char*)malloc(sizeof(char) * 40);
+				//tf2[i] = (char*)malloc(sizeof(char) * 40);
+				//tf2inv[i] = (char*)malloc(sizeof(char) * 40);
+			}
+			sprintf(tf[0], "-1.100000000000000%s", LFending);
+			
+			sprintf(tf[2], "0.253097611605959%s", LFending);
+			sprintf(tf[3], "-1.288200610773679%s", LFending);
+			sprintf(tf[4], "0.304632239669212%s", LFending);
+			sprintf(tf[5], "-0.391339615511917%s", LFending);
+			sprintf(tf[6], "-2.871022253392850%s", LFending);
+			sprintf(tf[7], "1.374907986616384%s", LFending);
+			sprintf(tf[8], "0.817178135341212%s", LFending);
+			sprintf(tf[9], "1.800746506445679%s", LFending);
+			sprintf(tf[10], "-0.859492973614498%s", LFending);
+
+			if (stageAngle < 0) {
+				sprintf(tf[1], "0.331662479035540%s", LFending);
+				sprintf(tf[11], "-2.373470454748280%s", LFending);
+				sprintf(tf[12], "-0.024836393087493%s", LFending);
+				sprintf(tf[13], "0.474017017512829%s", LFending);
+				sprintf(tf[14], "0.742183927770612%s", LFending);
+				sprintf(tf[15], "1.406473309094609%s", LFending);
+				sprintf(tf[16], "-1.191364552195948%s", LFending);
+				sprintf(tf[17], "0.708088885039503%s", LFending);
+				sprintf(tf[18], "0.258908260614168%s", LFending);
+				sprintf(tf[19], "-0.049929922194110%s", LFending);
+			}
+			else {
+				sprintf(tf[1], "-0.331662479035540%s", LFending);
+				sprintf(tf[11], "2.373470454748280%s", LFending);
+				sprintf(tf[12], "0.024836393087493%s", LFending);
+				sprintf(tf[13], "-0.474017017512829%s", LFending);
+				sprintf(tf[14], "-0.742183927770612%s", LFending);
+				sprintf(tf[15], "-1.406473309094609%s", LFending);
+				sprintf(tf[16], "1.191364552195948%s", LFending);
+				sprintf(tf[17], "-0.708088885039503%s", LFending);
+				sprintf(tf[18], "-0.258908260614168%s", LFending);
+				sprintf(tf[19], "0.049929922194110%s", LFending);
+			}
+			for (uint32_t i = radix - 1; i > 0; i--) {
+				if (i == radix - 1) {
+					if (sc->LUT) {
+						sprintf(output + strlen(output), "	%s = twiddleLUT[LUTId];\n", w);
+						if (!sc->inverse)
+							sprintf(output + strlen(output), "	%s.y = -%s.y;\n", w, w);
+					}
+					else {
+						if (!strcmp(floatType, "float")) {
+							sprintf(output + strlen(output), "	%s.x = %s(angle*%.17f);\n", w, cosDef, 2.0 * i / radix);
+							sprintf(output + strlen(output), "	%s.y = %s(angle*%.17f);\n", w, sinDef, 2.0 * i / radix);
+							//sprintf(output + strlen(output), "	w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
+						}
+						if (!strcmp(floatType, "double"))
+							sprintf(output + strlen(output), "	%s = sincos_20(angle*%.17f);\n", w, 2.0 * i / radix);
+					}
+				}
+				else {
+					if (sc->LUT) {
+						sprintf(output + strlen(output), "	%s = twiddleLUT[LUTId+%d];\n\n", w, (radix - 1 - i) * stageSize);
+						if (!sc->inverse)
+							sprintf(output + strlen(output), "	%s.y = -%s.y;\n", w, w);
+					}
+					else {
+						if (!strcmp(floatType, "float")) {
+							sprintf(output + strlen(output), "	%s.x = %s(angle*%.17f);\n", w, cosDef, 2.0 * i / radix);
+							sprintf(output + strlen(output), "	%s.y = %s(angle*%.17f);\n", w, sinDef, 2.0 * i / radix);
+							//sprintf(output + strlen(output), "	w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
+						}
+						if (!strcmp(floatType, "double"))
+							sprintf(output + strlen(output), "	%s = sincos_20(angle*%.17f);\n", w, 2.0 * i / radix);
+					}
+				}
+				VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
+				
+			}
+			VkMovComplex(sc, sc->locID[0], regID[0]);
+			uint32_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 };
+			VkPermute(sc, permute, 11, 0, 0);
+			for (uint32_t i = 0; i < 5; i++) {
+				VkAddComplex(sc, regID[i + 1], sc->locID[i+1], sc->locID[i+6]);
+				VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]);
+			}
+			VkMovComplex(sc, sc->locID[1], regID[1]);
+			for (uint32_t i = 0; i < 4; i++) {
+				VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 2]);
+				VkSubComplex(sc, sc->locID[i + 3], regID[i + 1], regID[5]);
+			}
+			VkMovComplex(sc, sc->locID[2], regID[6]);
+			for (uint32_t i = 0; i < 4; i++) {
+				VkAddComplex(sc, sc->locID[2], sc->locID[2], regID[i + 7]);
+				VkSubComplex(sc, sc->locID[i + 7], regID[i + 6], regID[10]);
+			}
+
+			VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
+			VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]);
+			VkMulComplexNumberImag(sc, regID[2], sc->locID[2], tf[1], sc->locID[0]);
+			for (uint32_t k = 0; k < 2; k++) {
+				VkAddComplex(sc, regID[k*4+3], sc->locID[k*4+3], sc->locID[k*4+5]);
+				VkAddComplex(sc, regID[k*4+4], sc->locID[k*4+4], sc->locID[k*4+6]);
+				VkAddComplex(sc, regID[k*4+5], sc->locID[k*4+3], sc->locID[k*4+4]);
+				VkAddComplex(sc, regID[k*4+6], sc->locID[k*4+5], sc->locID[k*4+6]);
+				VkAddComplex(sc, sc->locID[1], regID[k*4+3], regID[k*4+4]);
+
+				if (k == 0) {
+					VkMulComplexNumber(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2]);
+					VkMulComplexNumber(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3]);
+					VkMulComplexNumber(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4]);
+					VkMulComplexNumber(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5]);
+					VkMulComplexNumber(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6]);
+					VkMulComplexNumber(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7]);
+					VkMulComplexNumber(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8]);
+					VkMulComplexNumber(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9]);
+					VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10]);
+				}
+				else {
+					VkMulComplexNumberImag(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2], sc->locID[0]);
+					VkMulComplexNumberImag(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3], sc->locID[0]);
+					VkMulComplexNumberImag(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4], sc->locID[0]);
+					VkMulComplexNumberImag(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5], sc->locID[0]);
+					VkMulComplexNumberImag(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6], sc->locID[0]);
+					VkMulComplexNumberImag(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7], sc->locID[0]);
+					VkMulComplexNumberImag(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8], sc->locID[0]);
+					VkMulComplexNumberImag(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9], sc->locID[0]);
+					VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10], sc->locID[0]);
+				}
+				
+				VkAddComplex(sc, sc->locID[k*4+3], sc->locID[k*4+3], regID[k*4+3]);
+				VkAddComplex(sc, sc->locID[k*4+5], sc->locID[k*4+5], regID[k*4+3]);
+
+				VkAddComplex(sc, sc->locID[k*4+4], sc->locID[k*4+4], regID[k*4+4]);
+				VkAddComplex(sc, sc->locID[k*4+6], sc->locID[k*4+6], regID[k*4+4]);
+
+				VkAddComplex(sc, regID[k*4+5], regID[k*4+5], sc->locID[1]);
+				VkAddComplex(sc, regID[k*4+6], regID[k*4+6], sc->locID[1]);
+
+				VkAddComplex(sc, regID[k*4+3], sc->locID[k*4+3], regID[k*4+5]);
+				VkAddComplex(sc, regID[k*4+4], sc->locID[k*4+4], regID[k*4+5]);
+
+				VkAddComplex(sc, regID[k*4+5], sc->locID[k*4+5], regID[k*4+6]);
+				VkAddComplex(sc, regID[k*4+6], sc->locID[k*4+6], regID[k*4+6]);
+
+			}
+			VkAddComplex(sc, regID[1], regID[0], regID[1]);
+
+			VkMovComplex(sc, sc->locID[5], regID[1]);
+			for (uint32_t i = 0; i < 4; i++) {
+				VkAddComplex(sc, sc->locID[i + 1], regID[1], regID[i + 3]);
+				VkSubComplex(sc, sc->locID[5], sc->locID[5], regID[i + 3]);
+			}
+			VkMovComplex(sc, sc->locID[10], regID[2]);
+			for (uint32_t i = 0; i < 4; i++) {
+				VkAddComplex(sc, sc->locID[i + 6], regID[2], regID[i + 7]);
+				VkSubComplex(sc, sc->locID[10], sc->locID[10], regID[i + 7]);
+			}
+			for (uint32_t i = 0; i < 5; i++) {
+				VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]);
+				VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]);
+			}
+			uint32_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 };
+			VkPermute(sc, permute2, 11, 1, regID);
+
+			for (uint32_t i = 0; i < 20; i++) {
+				free(tf[i]);
+			}
+			break;
+		}
+		case 13: {
+
+			char* tf[20];
+			//char* tf2[4];
+			//char* tf2inv[4];
+			//VkAppendLine(sc, "	{\n");
+			for (uint32_t i = 0; i < 20; i++) {
+				tf[i] = (char*)malloc(sizeof(char) * 40);
+				//tf2[i] = (char*)malloc(sizeof(char) * 40);
+				//tf2inv[i] = (char*)malloc(sizeof(char) * 40);
+			}
+			sprintf(tf[0], "-1.083333333333333%s", LFending);
+			sprintf(tf[1], "-0.300462606288666%s", LFending);
+			sprintf(tf[5], "1.007074065727533%s", LFending);
+			sprintf(tf[6], "0.731245990975348%s", LFending);
+			sprintf(tf[7], "-0.579440018900960%s", LFending);
+			sprintf(tf[8], "0.531932498429674%s", LFending);
+			sprintf(tf[9], "-0.508814921720398%s", LFending);
+			sprintf(tf[10], "-0.007705858903092%s", LFending);
+
+			if (stageAngle < 0) {
+				sprintf(tf[2], "-0.749279330626139%s", LFending);
+				sprintf(tf[3], "0.401002128321867%s", LFending);
+				sprintf(tf[4], "0.174138601152136%s", LFending);
+				sprintf(tf[11], "-2.511393318389568%s", LFending);
+				sprintf(tf[12], "-1.823546408682421%s", LFending);
+				sprintf(tf[13], "1.444979909023996%s", LFending);
+				sprintf(tf[14], "-1.344056915177370%s", LFending);
+				sprintf(tf[15], "-0.975932420775946%s", LFending);
+				sprintf(tf[16], "0.773329778651105%s", LFending);
+				sprintf(tf[17], "1.927725116783469%s", LFending);
+				sprintf(tf[18], "1.399739414729183%s", LFending);
+				sprintf(tf[19], "-1.109154843837551%s", LFending);
+			}
+			else {
+				sprintf(tf[2], "0.749279330626139%s", LFending);
+				sprintf(tf[3], "-0.401002128321867%s", LFending);
+				sprintf(tf[4], "-0.174138601152136%s", LFending);
+				sprintf(tf[11], "2.511393318389568%s", LFending);
+				sprintf(tf[12], "1.823546408682421%s", LFending);
+				sprintf(tf[13], "-1.444979909023996%s", LFending);
+				sprintf(tf[14], "1.344056915177370%s", LFending);
+				sprintf(tf[15], "0.975932420775946%s", LFending);
+				sprintf(tf[16], "-0.773329778651105%s", LFending);
+				sprintf(tf[17], "-1.927725116783469%s", LFending);
+				sprintf(tf[18], "-1.399739414729183%s", LFending);
+				sprintf(tf[19], "1.109154843837551%s", LFending);
+			}
+			for (uint32_t i = radix - 1; i > 0; i--) {
+				if (i == radix - 1) {
+					if (sc->LUT) {
+						sprintf(output + strlen(output), "	%s = twiddleLUT[LUTId];\n", w);
+						if (!sc->inverse)
+							sprintf(output + strlen(output), "	%s.y = -%s.y;\n", w, w);
+					}
+					else {
+						if (!strcmp(floatType, "float")) {
+							sprintf(output + strlen(output), "	%s.x = %s(angle*%.17f);\n", w, cosDef, 2.0 * i / radix);
+							sprintf(output + strlen(output), "	%s.y = %s(angle*%.17f);\n", w, sinDef, 2.0 * i / radix);
+							//sprintf(output + strlen(output), "	w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
+						}
+						if (!strcmp(floatType, "double"))
+							sprintf(output + strlen(output), "	%s = sincos_20(angle*%.17f);\n", w, 2.0 * i / radix);
+					}
+				}
+				else {
+					if (sc->LUT) {
+						sprintf(output + strlen(output), "	%s = twiddleLUT[LUTId+%d];\n\n", w, (radix - 1 - i) * stageSize);
+						if (!sc->inverse)
+							sprintf(output + strlen(output), "	%s.y = -%s.y;\n", w, w);
+					}
+					else {
+						if (!strcmp(floatType, "float")) {
+							sprintf(output + strlen(output), "	%s.x = %s(angle*%.17f);\n", w, cosDef, 2.0 * i / radix);
+							sprintf(output + strlen(output), "	%s.y = %s(angle*%.17f);\n", w, sinDef, 2.0 * i / radix);
+							//sprintf(output + strlen(output), "	w = %s(cos(angle*%.17f), sin(angle*%.17f));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix);
+						}
+						if (!strcmp(floatType, "double"))
+							sprintf(output + strlen(output), "	%s = sincos_20(angle*%.17f);\n", w, 2.0 * i / radix);
+					}
+				}
+				VkMulComplex(sc, sc->locID[i], regID[i], w, 0);
+
+			}
+			VkMovComplex(sc, sc->locID[0], regID[0]);
+			uint32_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 };
+			VkPermute(sc, permute, 13, 0, 0);
+			for (uint32_t i = 0; i < 6; i++) {
+				VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
+				VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
+			}
+			for (uint32_t i = 0; i < 3; i++) {
+				VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]);
+				VkSubComplex(sc, regID[i + 4], sc->locID[i + 1], sc->locID[i + 4]);
+			}
+			for (uint32_t i = 0; i < 4; i++) {
+				VkAddComplex(sc, sc->locID[i + 1], regID[i*3 + 1], regID[i * 3 + 2]);
+				VkSubComplex(sc, sc->locID[i*2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]);
+				VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], regID[i * 3 + 3]);
+				VkSubComplex(sc, sc->locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]);
+			}
+
+			VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]);
+			VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]);
+			VkMulComplexNumber(sc, regID[2], sc->locID[2], tf[1]);
+			for (uint32_t k = 0; k < 3; k++) {
+				VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 3], sc->locID[k * 2 + 4]);
+			
+				if (k == 0) {
+					VkMulComplexNumberImag(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2], sc->locID[0]);
+					VkMulComplexNumberImag(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3], sc->locID[0]);
+					VkMulComplexNumberImag(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4], sc->locID[0]);
+				}
+				else {
+					VkMulComplexNumber(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2]);
+					VkMulComplexNumber(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3]);
+					VkMulComplexNumber(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4]);
+				}
+
+				VkAddComplex(sc, regID[k * 2 + 3], sc->locID[k * 2 + 3], regID[k * 2 + 4]);
+				VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 4], regID[k * 2 + 4]);
+
+			}
+			VkAddComplex(sc, regID[9], sc->locID[9], sc->locID[11]);
+			VkAddComplex(sc, regID[10], sc->locID[10], sc->locID[12]);
+			VkAddComplex(sc, regID[11], sc->locID[9], sc->locID[10]);
+			VkAddComplex(sc, regID[12], sc->locID[11], sc->locID[12]);
+			VkAddComplex(sc, sc->locID[1], regID[9], regID[10]);
+
+			VkMulComplexNumberImag(sc, sc->locID[9], sc->locID[9], tf[11], sc->locID[0]);
+			VkMulComplexNumberImag(sc, sc->locID[10], sc->locID[10], tf[12], sc->locID[0]);
+			VkMulComplexNumberImag(sc, regID[11], regID[11], tf[13], sc->locID[0]);
+			VkMulComplexNumberImag(sc, sc->locID[11], sc->locID[11], tf[14], sc->locID[0]);
+			VkMulComplexNumberImag(sc, sc->locID[12], sc->locID[12], tf[15], sc->locID[0]);
+			VkMulComplexNumberImag(sc, regID[12], regID[12], tf[16], sc->locID[0]);
+			VkMulComplexNumberImag(sc, regID[9], regID[9], tf[17], sc->locID[0]);
+			VkMulComplexNumberImag(sc, regID[10], regID[10], tf[18], sc->locID[0]);
+			VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[19], sc->locID[0]);
+
+			VkAddComplex(sc, sc->locID[9], sc->locID[9], regID[9]);
+			VkAddComplex(sc, sc->locID[11], sc->locID[11], regID[9]);
+			VkAddComplex(sc, sc->locID[10], sc->locID[10], regID[10]);
+			VkAddComplex(sc, sc->locID[12], sc->locID[12], regID[10]);
+			VkAddComplex(sc, regID[11], regID[11], sc->locID[1]);
+			VkAddComplex(sc, regID[12], regID[12], sc->locID[1]);
+
+			VkAddComplex(sc, regID[9], sc->locID[9], regID[11]);
+			VkAddComplex(sc, regID[10], sc->locID[10], regID[11]);
+			VkAddComplex(sc, regID[11], sc->locID[11], regID[12]);
+			VkAddComplex(sc, regID[12], sc->locID[12], regID[12]);
+
+			VkAddComplex(sc, regID[1], regID[0], regID[1]);
+
+			for (uint32_t i = 0; i < 4; i++) {
+				VkAddComplex(sc, sc->locID[i * 3 + 1], regID[i + 1], regID[i * 2 + 5]);
+				VkSubComplex(sc, sc->locID[i * 3 + 3], regID[i + 1], regID[i * 2 + 5]);
+				VkAddComplex(sc, sc->locID[i * 3 + 2], regID[i + 1], regID[i * 2 + 6]);
+				VkSubComplex(sc, sc->locID[i * 3 + 3], sc->locID[i * 3 + 3], regID[i * 2 + 6]);
+			}
+			for (uint32_t i = 0; i < 3; i++) {
+				VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]);
+				VkSubComplex(sc, sc->locID[i + 4], sc->locID[i + 1], sc->locID[i + 4]);
+				VkMovComplex(sc, sc->locID[i + 1], regID[i + 1]);
+			}
+			for (uint32_t i = 0; i < 6; i++) {
+				VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]);
+				VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]);
+			}
+			uint32_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6};
+			VkPermute(sc, permute2, 13, 1, regID);
+
+			for (uint32_t i = 0; i < 20; i++) {
+				free(tf[i]);
+			}
+			break;
+		}
 		}
 	}
 	static inline void appendSharedMemoryVkFFT(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint32_t sharedType) {
@@ -1997,18 +2363,27 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		switch (sharedType) {
 		case 0: case 5: case 6://single_c2c + single_r2c
 		{
-			uint32_t sharedStride = ((maxSequenceSharedMemory - sc->localSize[1] * sc->fftDim / sc->registerBoost) * (sc->localSize[1] - 1) == 0) ? sc->fftDim / sc->registerBoost : sc->fftDim / sc->registerBoost + 1;
-			sprintf(output + strlen(output), "const %s sharedStride = %d; //to avoid bank conflict if we transpose\n", uintType, sharedStride);
+			sc->resolveBankConflictFirstStages = 0;
+			sc->sharedStrideBankConflictFirstStages = (sc->fftDim > sc->numSharedBanks / 2) ? sc->fftDim / sc->registerBoost * (sc->numSharedBanks / 2 + 1) / (sc->numSharedBanks / 2) : sc->fftDim / sc->registerBoost;
+			sc->sharedStrideReadWriteConflict = (sc->numSharedBanks / 2 <= sc->localSize[1]) ? sc->fftDim / sc->registerBoost + 1 : sc->fftDim / sc->registerBoost + (sc->numSharedBanks / 2) / sc->localSize[1];
+			sc->maxSharedStride = (sc->sharedStrideBankConflictFirstStages < sc->sharedStrideReadWriteConflict) ? sc->sharedStrideReadWriteConflict : sc->sharedStrideBankConflictFirstStages;
+			sc->maxSharedStride = (((maxSequenceSharedMemory - sc->localSize[1] * sc->fftDim / sc->registerBoost) == 0) || ((sc->fftDim & (sc->fftDim - 1)) != 0)) ? sc->fftDim / sc->registerBoost : sc->maxSharedStride;
+
+			sc->sharedStrideBankConflictFirstStages = (sc->maxSharedStride == sc->fftDim / sc->registerBoost) ? sc->fftDim / sc->registerBoost : sc->sharedStrideBankConflictFirstStages;
+			sc->sharedStrideReadWriteConflict = (sc->maxSharedStride == sc->fftDim / sc->registerBoost) ? sc->fftDim / sc->registerBoost : sc->sharedStrideReadWriteConflict;
+
+			//printf("%d %d %d %d %d\n", sc->maxSharedStride, sc->sharedStrideBankConflictFirstStages, sc->sharedStrideReadWriteConflict, sc->localSize[1], sc->fftDim);
+			sprintf(output + strlen(output), "%s sharedStride = %d; //to avoid bank conflict if we transpose\n", uintType, sc->sharedStrideReadWriteConflict);
 #if(VKFFT_BACKEND==0)
-			sprintf(output + strlen(output), "%s %s sdata[%d];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sharedStride);
+			sprintf(output + strlen(output), "%s %s sdata[%d];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
 #elif(VKFFT_BACKEND==1)
-			sprintf(output + strlen(output), "%s %s sdata[%d];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sharedStride);
+			sprintf(output + strlen(output), "%s %s sdata[%d];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
 			//sprintf(output + strlen(output), "%s %s sdata[];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType);
 #elif(VKFFT_BACKEND==2)
-			sprintf(output + strlen(output), "%s %s sdata[%d];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sharedStride);
+			sprintf(output + strlen(output), "%s %s sdata[%d];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride);
 			//sprintf(output + strlen(output), "%s %s sdata[];// sharedStride - fft size,  gl_WorkGroupSize.y - grouped consequential ffts\n\n", sharedDefinitions, vecType);
 #endif
-			sc->usedSharedMemory = vecSize * sc->localSize[1] * sharedStride;
+			sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
 			break;
 		}
 		case 1: case 2://grouped_c2c + single_c2c_strided
@@ -2089,6 +2464,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			if (sc->fftDim % 3 == 0) maxNonPow2Radix = 3;
 			if (sc->fftDim % 5 == 0) maxNonPow2Radix = 5;
 			if (sc->fftDim % 7 == 0) maxNonPow2Radix = 7;
+			if (sc->fftDim % 11 == 0) maxNonPow2Radix = 11;
+			if (sc->fftDim % 13 == 0) maxNonPow2Radix = 13;
 			for (uint32_t i = 0; i < maxNonPow2Radix; i++) {
 				sprintf(sc->locID[i], "loc_%d", i);
 				sprintf(sc->tempStr, "	%s %s;\n", vecType, sc->locID[i]);
@@ -2441,7 +2818,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				sprintf(output + strlen(output), "		{ \n");
 			}
 
-			if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim))
+			if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim))
 				sc->readToRegisters = 0;
 			else
 				sc->readToRegisters = 1;
@@ -2618,7 +2995,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		case 1://grouped_c2c
 		{
-			if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim)
+			if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim)
 				sc->readToRegisters = 0;
 			else
 				sc->readToRegisters = 1;
@@ -2693,7 +3070,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		case 2://single_c2c_strided
 		{
-			if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim)
+			if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim)
 				sc->readToRegisters = 0;
 			else
 				sc->readToRegisters = 1;
@@ -2765,7 +3142,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		case 5://single_r2c
 		{
-			if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim))
+			if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim))
 				sc->readToRegisters = 0;
 			else
 				sc->readToRegisters = 1;
@@ -2898,7 +3275,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			break;
 		}
 		case 6: {//single_c2r
-			if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim))
+			if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim))
 				sc->readToRegisters = 0;
 			else
 				sc->readToRegisters = 1;
@@ -2913,12 +3290,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						if ((ceil(sc->min_registers_per_thread / 2.0) != sc->min_registers_per_thread / 2) && (i == (ceil(sc->min_registers_per_thread / 2.0) - 1)))
 							sprintf(output + strlen(output), "if (%s < %d){\n", sc->gl_LocalInvocationID_x, sc->fftDim / 2 - i * sc->localSize[0]);
 
-						sprintf(output + strlen(output), "		inoutID = %s+%d;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
+						sprintf(output + strlen(output), "		inoutID = %s+%d;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1);
 
 						sprintf(output + strlen(output), "		if((inoutID < %d)||(inoutID >= %d)){\n", sc->fft_zeropad_left_read[sc->axis_id], sc->fft_zeropad_right_read[sc->axis_id]);
 
 						sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-						sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+						sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 						indexInputVkFFT(output, sc, uintType, readType, sc->inoutID, index_y, requestCoordinate, requestBatch);
 						sprintf(output + strlen(output), ";\n");
 
@@ -2928,8 +3305,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							sprintf(output + strlen(output), "		temp_0 = %sinputBlocks[%s / %d]%s[%s %% %d]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight);
 
 						sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-						sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, sc->inputStride[1] / 2 + i * sc->localSize[0]);
-						sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+						sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, sc->inputStride[1] + i * sc->localSize[0] + 1);
+						sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 						indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 						sprintf(output + strlen(output), ";\n");
 						//sprintf(output + strlen(output), "		inoutID = indexInput(%s+%d, (%s%s));\n", sc->gl_LocalInvocationID_x, sc->inputStride[1] / 2 + i * sc->localSize[0], sc->gl_GlobalInvocationID_y, shiftY);
@@ -2957,13 +3334,15 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		sdata[sharedStride * %s + %d - %s].y = (-temp_0.y + temp_1.x);\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1, sc->gl_LocalInvocationID_y, sc->fftDim - i * sc->localSize[0] - 1, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim - i * sc->localSize[0] - 1, sc->gl_LocalInvocationID_x);
 						if ((ceil(sc->min_registers_per_thread / 2.0) != sc->min_registers_per_thread / 2) && (i == (ceil(sc->min_registers_per_thread / 2.0) - 1)))
 							sprintf(output + strlen(output), "}\n");
+						if ((uint32_t)ceil(sc->size[1] / 2.0) % sc->localSize[1] != 0)
+							sprintf(output + strlen(output), "		}");
 					}
 					sprintf(output + strlen(output), "\
 	if (%s==0) \n\
 	{\n", sc->gl_LocalInvocationID_x);
 					sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-					sprintf(index_x, "2 * (%s%s)", sc->gl_GlobalInvocationID_y, shiftY);
-					sprintf(index_y, "%d", sc->inputStride[2] / (sc->inputStride[1] + 2));
+					sprintf(index_x, "0");
+					sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 					indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 					sprintf(output + strlen(output), ";\n");
 					//sprintf(output + strlen(output), "		inoutID = indexInput(2 * (%s%s), %d);\n", sc->gl_GlobalInvocationID_y, shiftY, sc->inputStride[2] / (sc->inputStride[1] + 2));
@@ -2973,8 +3352,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						sprintf(output + strlen(output), "		temp_0 = %sinputBlocks[inoutID / %d]%s[inoutID %% %d]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
 
 					sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-					sprintf(index_x, "2 * (%s%s) + 1", sc->gl_GlobalInvocationID_y, shiftY);
-					sprintf(index_y, "%d", sc->inputStride[2] / (sc->inputStride[1] + 2));
+					sprintf(index_x, "%s", sc->inputStride[1]);
+					sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 					indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 					sprintf(output + strlen(output), ";\n");
 					//sprintf(output + strlen(output), "		inoutID = indexInput(2 * (%s%s) + 1, %d);\n", sc->gl_GlobalInvocationID_y, shiftY, sc->inputStride[2] / (sc->inputStride[1] + 2));
@@ -2991,8 +3370,6 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		sdata[sharedStride * %s].x = (temp_0.x - temp_1.y);\n\
 		sdata[sharedStride * %s].y = (temp_0.y + temp_1.x);\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_y);
 					VkAppendLine(sc, "	}\n");
-					if ((uint32_t)ceil(sc->size[1] / 2.0) % sc->localSize[1] != 0)
-						sprintf(output + strlen(output), "		}");
 				}
 			}
 			else {
@@ -3004,8 +3381,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							sprintf(output + strlen(output), "if (%s < %d){\n", sc->gl_LocalInvocationID_x, sc->fftDim / 2 - i * sc->localSize[0]);
 
 						sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-						sprintf(index_x, "%s + %d", sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
-						sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+						sprintf(index_x, "%s + %d", sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1);
+						sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 						indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 						sprintf(output + strlen(output), ";\n");
 						//sprintf(output + strlen(output), "		inoutID = indexInput(%s + %d, (%s%s));\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_GlobalInvocationID_y, shiftY);
@@ -3016,8 +3393,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							sprintf(output + strlen(output), "		temp_0 = %sinputBlocks[inoutID / %d]%s[inoutID %% %d]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
 
 						sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-						sprintf(index_x, "%s + %d", sc->gl_LocalInvocationID_x, sc->inputStride[1] / 2 + i * sc->localSize[0]);
-						sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+						sprintf(index_x, "%s + %d", sc->gl_LocalInvocationID_x, sc->inputStride[1] + i * sc->localSize[0] + 1);
+						sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 						indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 						sprintf(output + strlen(output), ";\n");
 						//sprintf(output + strlen(output), "		inoutID = indexInput(%s+%d, (%s%s));\n", sc->gl_LocalInvocationID_x, sc->inputStride[1] / 2 + i * sc->localSize[0], sc->gl_GlobalInvocationID_y, shiftY);
@@ -3040,13 +3417,15 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		sdata[sharedStride * %s + %d - %s].y = (-temp_0.y + temp_1.x);\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1, sc->gl_LocalInvocationID_y, sc->fftDim - i * sc->localSize[0] - 1, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim - i * sc->localSize[0] - 1, sc->gl_LocalInvocationID_x);
 						if ((ceil(sc->min_registers_per_thread / 2.0) != sc->min_registers_per_thread / 2) && (i == (ceil(sc->min_registers_per_thread / 2.0) - 1)))
 							sprintf(output + strlen(output), "}\n");
+						if ((uint32_t)ceil(sc->size[1] / 2.0) % sc->localSize[1] != 0)
+							sprintf(output + strlen(output), "		}");
 					}
 					sprintf(output + strlen(output), "\
 	if (%s==0) \n\
 	{\n", sc->gl_LocalInvocationID_x);
 					sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-					sprintf(index_x, "2 * (%s%s)", sc->gl_GlobalInvocationID_y, shiftY);
-					sprintf(index_y, "%d", sc->inputStride[2] / (sc->inputStride[1] + 2));
+					sprintf(index_x, "0");
+					sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 					indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 					sprintf(output + strlen(output), ";\n");
 					//sprintf(output + strlen(output), "		inoutID = indexInput(2 * (%s%s), %d);\n", sc->gl_GlobalInvocationID_y, shiftY, sc->inputStride[2] / (sc->inputStride[1] + 2));
@@ -3056,8 +3435,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						sprintf(output + strlen(output), "		temp_0 = %sinputBlocks[inoutID / %d]%s[inoutID %% %d]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight);
 
 					sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-					sprintf(index_x, "2 * (%s%s) + 1", sc->gl_GlobalInvocationID_y, shiftY);
-					sprintf(index_y, "%d", sc->inputStride[2] / (sc->inputStride[1] + 2));
+					sprintf(index_x, "%d", sc->inputStride[1]);
+					sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 					indexInputVkFFT(output, sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
 					sprintf(output + strlen(output), ";\n");
 					//sprintf(output + strlen(output), "		inoutID = indexInput(2 * (%s%s) + 1, %d);\n", sc->gl_GlobalInvocationID_y, shiftY, sc->inputStride[2] / (sc->inputStride[1] + 2));
@@ -3074,8 +3453,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		sdata[sharedStride * %s].x = (temp_0.x - temp_1.y);\n\
 		sdata[sharedStride * %s].y = (temp_0.y + temp_1.x);\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_y);
 					VkAppendLine(sc, "	}\n");
-					if ((uint32_t)ceil(sc->size[1] / 2.0) % sc->localSize[1] != 0)
-						sprintf(output + strlen(output), "		}");
+					
 				}
 				else {
 					//Not implemented
@@ -3111,14 +3489,14 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		if (!strcmp(floatType, "double")) sprintf(LFending, "l");
 #endif
 
-		uint32_t logicalRegistersPerThread = (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 		switch (reorderType) {
 		case 1: {//grouped_c2c
 			char shiftX[500] = "";
 			if (sc->performWorkGroupShift[0])
 				sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
 			if ((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)) {
-				if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim) {
+				if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) {
 					appendBarrierVkFFT(output, 1);
 					sc->readToRegisters = 0;
 				}
@@ -3175,7 +3553,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			if (sc->performWorkGroupShift[0])
 				sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
 			if ((!sc->reorderFourStep) && (sc->inverse)) {
-				if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread / sc->stageRadix[0]) > sc->fftDim) {
+				if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) {
 					appendBarrierVkFFT(output, 1);
 					sc->readToRegisters = 0;
 				}
@@ -3255,7 +3633,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		if (!strcmp(floatType, "double")) sprintf(LFending, "l");
 #endif
 
-		uint32_t logicalRegistersPerThread = (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 		switch (reorderType) {
 		case 1: {//grouped_c2c
 			char shiftX[500] = "";
@@ -3329,7 +3707,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			if (sc->performWorkGroupShift[0])
 				sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x);
 			if (!((!sc->reorderFourStep) && (sc->inverse))) {
-				if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
+				if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
 					appendBarrierVkFFT(output, 1);
 					sc->writeFromRegisters = 0;
 				}
@@ -3421,8 +3799,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			else
 				sprintf(convolutionInverse, ", 1");
 		}
-		uint32_t logicalStoragePerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-		uint32_t logicalRegistersPerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+		uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 		uint32_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
 		if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || (sc->localSize[1] > 1) || ((sc->performR2C) && (sc->inverse)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)))
 			appendBarrierVkFFT(output, 1);
@@ -3444,12 +3822,21 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					for (uint32_t i = 0; i < stageRadix; i++) {
 						uint32_t id = j + i * logicalRegistersPerThread / stageRadix;
 						id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
+
+						sprintf(output + strlen(output), "\
+		%s = %s + %d;\n", sc->sdataID, sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / stageRadix);
+
+						if (sc->resolveBankConflictFirstStages == 1) {
+							sprintf(output + strlen(output), "\
+	%s = (%s / %d) * %d + %s %% %d;", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
+						}
+
 						if (sc->localSize[1] > 1)
 							sprintf(output + strlen(output), "\
-		%s = sdata[sharedStride * %s + %s + %d];\n", sc->regIDs[id], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / stageRadix);
-						else
+		%s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y);
+
 							sprintf(output + strlen(output), "\
-		%s = sdata[%s + %d];\n", sc->regIDs[id], sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / stageRadix);
+		%s = sdata[%s];\n", sc->regIDs[id], sc->sdataID);
 					}
 				}
 				char** regID = (char**)malloc(sizeof(char*) * stageRadix);
@@ -3465,6 +3852,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 				}
 				inlineRadixKernelVkFFT(output, sc, floatType, uintType, stageRadix, stageSize, stageAngle, regID);
+				for (uint32_t i = 0; i < stageRadix; i++) {
+					uint32_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
+					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
+					sprintf(sc->regIDs[id], "%s", regID[i]);
+				}
 				for (uint32_t i = 0; i < stageRadix; i++)
 					free(regID[i]);
 				free(regID);
@@ -3515,8 +3907,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			else
 				sprintf(convolutionInverse, ", 1");
 		}
-		uint32_t logicalStoragePerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-		uint32_t logicalRegistersPerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+		uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 		uint32_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
 		if ((sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)))
 			appendBarrierVkFFT(output, 1);
@@ -3555,6 +3947,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 				}
 				inlineRadixKernelVkFFT(output, sc, floatType, uintType, stageRadix, stageSize, stageAngle, regID);
+				for (uint32_t i = 0; i < stageRadix; i++) {
+					uint32_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
+					id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread;
+					sprintf(sc->regIDs[id], "%s", regID[i]);
+				}
 				for (uint32_t i = 0; i < stageRadix; i++)
 					free(regID[i]);
 				free(regID);
@@ -3593,8 +3990,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
 #endif
 		if (((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) {
-			uint32_t logicalStoragePerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-			uint32_t logicalRegistersPerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+			uint32_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+			uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 			uint32_t* usedIDs = (uint32_t*)malloc(sizeof(uint32_t) * logicalStoragePerThread);
 			/*for (uint32_t i = 1; i < logicalStoragePerThread - 1; i++) {
 				usedIDs[i] = 0;
@@ -3691,10 +4088,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			sprintf(stageNormalization, "%d", stageRadix);
 		char tempNum[50] = "";
 
-		uint32_t logicalStoragePerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-		uint32_t logicalStoragePerThreadNext = (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-		uint32_t logicalRegistersPerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
-		uint32_t logicalRegistersPerThreadNext = (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+		uint32_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;// (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+		uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];// (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 
 		uint32_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
 		uint32_t logicalGroupSizeNext = sc->fftDim / logicalStoragePerThreadNext;
@@ -3771,6 +4168,23 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								t++;
 								sprintf(tempNum, "%d", i * stageSize);
 								VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum);
+								if ((stageSize <= sc->numSharedBanks / 2) && (sc->fftDim > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != sc->fftDim / sc->registerBoost) && ((sc->fftDim & (sc->fftDim - 1)) == 0) && (stageSize * stageRadix != sc->fftDim)) {
+									if (sc->resolveBankConflictFirstStages == 0) {
+										sc->resolveBankConflictFirstStages = 1;
+										sprintf(output + strlen(output), "\
+	%s = %d;", sc->sharedStride, sc->sharedStrideBankConflictFirstStages);
+									}
+									sprintf(output + strlen(output), "\
+	%s = (%s / %d) * %d + %s %% %d;", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2);
+
+								}
+								else {
+									if (sc->resolveBankConflictFirstStages == 1) {
+										sc->resolveBankConflictFirstStages = 0;
+										sprintf(output + strlen(output), "\
+	%s = %d;", sc->sharedStride, sc->sharedStrideReadWriteConflict);
+									}
+								}
 								if (sc->localSize[1] > 1) {
 									VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride);
 									VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID);
@@ -3904,10 +4318,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		char stageNormalization[10] = "";
 		char tempNum[50] = "";
 
-		uint32_t logicalStoragePerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-		uint32_t logicalStoragePerThreadNext = (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
-		uint32_t logicalRegistersPerThread = (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
-		uint32_t logicalRegistersPerThreadNext = (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+		uint32_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+		uint32_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];//(sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
+		uint32_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread;
 
 		uint32_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
 		uint32_t logicalGroupSizeNext = sc->fftDim / logicalStoragePerThreadNext;
@@ -4084,10 +4498,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		case 0: case 5: case 6: {
 			uint32_t logicalStoragePerThread;
 			if (start == 1) {
-				logicalStoragePerThread = (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+				logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
 			}
 			else {
-				logicalStoragePerThread = (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+				logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
 			}
 			uint32_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
 			if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) {
@@ -4099,7 +4513,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					if (start == 0) {
 						sprintf(output + strlen(output), "\
 	if (%s * %d < %d) {\n", sc->gl_GlobalInvocationID_x, logicalStoragePerThread, sc->fftDim);
-						for (uint32_t i = 0; i < sc->registers_per_thread; i++) {
+						for (uint32_t i = 0; i < logicalStoragePerThread/ sc->registerBoost; i++) {
 							sprintf(output + strlen(output), "\
 	sdata[%s + %d] = %s;\n", sc->gl_LocalInvocationID_x, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]);
 						}
@@ -4120,7 +4534,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					if (start == 1) {
 						sprintf(output + strlen(output), "\
 	if (%s * %d < %d) {\n", sc->gl_GlobalInvocationID_x, logicalStoragePerThread, sc->fftDim);
-						for (uint32_t i = 0; i < sc->registers_per_thread; i++) {
+						for (uint32_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) {
 							sprintf(output + strlen(output), "\
 	%s = sdata[%s + %d];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * logicalGroupSize);
 						}
@@ -4142,10 +4556,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		case 1: case 2: {
 			uint32_t logicalStoragePerThread;
 			if (start == 1) {
-				logicalStoragePerThread = (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+				logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
 			}
 			else {
-				logicalStoragePerThread = (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
+				logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost;
 			}
 			uint32_t logicalGroupSize = sc->fftDim / logicalStoragePerThread;
 			if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) {
@@ -4157,7 +4571,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					if (start == 0) {
 						sprintf(output + strlen(output), "\
 	if (%s * %d < %d) {\n", sc->gl_GlobalInvocationID_y, logicalStoragePerThread, sc->fftDim);
-						for (uint32_t i = 0; i < sc->registers_per_thread; i++) {
+						for (uint32_t i = 0; i < logicalStoragePerThread/ sc->registerBoost; i++) {
 							sprintf(output + strlen(output), "\
 	sdata[%s + %s * (%s + %d)] = %s;\n", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupSize_x, sc->gl_LocalInvocationID_y, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]);
 						}
@@ -4178,7 +4592,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					if (start == 1) {
 						sprintf(output + strlen(output), "\
 	if (%s * %d < %d) {\n", sc->gl_GlobalInvocationID_y, logicalStoragePerThread, sc->fftDim);
-						for (uint32_t i = 0; i < sc->registers_per_thread; i++) {
+						for (uint32_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) {
 							sprintf(output + strlen(output), "\
 	%s = sdata[%s + %s * (%s + %d)];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_WorkGroupSize_x, sc->gl_LocalInvocationID_y, i * logicalGroupSize);
 						}
@@ -4674,7 +5088,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		switch (writeType) {
 		case 0:
 		{//single_c2c
-			if ((sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread / sc->stageRadix[sc->numStages - 1]) > sc->fftDim)) {
+			if ((sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim)) {
 				sc->writeFromRegisters = 0;
 				appendBarrierVkFFT(output, 1);
 			}
@@ -5020,7 +5434,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			break;
 		}
 		case 1: {//grouped_c2c
-			if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
+			if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
 				sc->writeFromRegisters = 0;
 				appendBarrierVkFFT(output, 1);
 			}
@@ -5076,7 +5490,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							}
 							else {
 								if (sc->outputBufferBlockNum == 1)
-									sprintf(output + strlen(output), "		%s[inoutID] = %ssdata[%s*(%s+%d) + %s]%s;\n", sc->gl_WorkGroupSize_x, outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
+									sprintf(output + strlen(output), "		%s[inoutID] = %ssdata[%s*(%s+%d) + %s]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
 								else
 									sprintf(output + strlen(output), "		outputBlocks[inoutID / %d]%s[inoutID %% %d] = %ssdata[%s*(%s+%d) + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight);
 							}
@@ -5144,7 +5558,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 		}
 		case 2: {//single_c2c_strided
-			if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
+			if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) {
 				sc->writeFromRegisters = 0;
 				appendBarrierVkFFT(output, 1);
 			}
@@ -5229,8 +5643,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		temp_1.x = sdata[sharedStride * %s].y;\n\
 		temp_1.y = 0;\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_y);
 					sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-					sprintf(index_x, "2 * (%s%s)", sc->gl_GlobalInvocationID_y, shiftY);
-					sprintf(index_y, "%d", sc->outputStride[2] / (sc->outputStride[1] + 2));
+					sprintf(index_x, "0");
+					sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 					indexOutputVkFFT(output, sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 					sprintf(output + strlen(output), ";\n");
 					//sprintf(output + strlen(output), "		inoutID = indexOutput(2 * (%s%s), %d);\n", sc->gl_GlobalInvocationID_y, shiftY, sc->outputStride[2] / (sc->outputStride[1] + 2));
@@ -5240,8 +5654,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					else
 						sprintf(output + strlen(output), "		outputBlocks[inoutID / %d]%s[inoutID %% %d] = %stemp_0%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
 					sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-					sprintf(index_x, "2 * (%s%s)+1", sc->gl_GlobalInvocationID_y, shiftY);
-					sprintf(index_y, "%d", sc->outputStride[2] / (sc->outputStride[1] + 2));
+					sprintf(index_x, "%d", sc->outputStride[1]);
+					sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 					indexOutputVkFFT(output, sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 					sprintf(output + strlen(output), ";\n");
 					//sprintf(output + strlen(output), "		inoutID = indexOutput(2 * (%s%s) + 1, %d);\n", sc->gl_GlobalInvocationID_y, shiftY, sc->outputStride[2] / (sc->outputStride[1] + 2));
@@ -5270,11 +5684,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		temp_1.y = 0.5 * (-sdata[sharedStride * %s + (%d + %s)].x + sdata[sharedStride * %s + (%d - %s)].x);\n", sc->gl_LocalInvocationID_y, 1 + i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim - 1 - i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, 1 + i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim - 1 - i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, 1 + i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim - 1 - i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, 1 + i * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim - 1 - i * sc->localSize[0], sc->gl_LocalInvocationID_x);
 						if (sc->zeropad[1]) {
 
-							sprintf(output + strlen(output), "		inoutID = %s+%d;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
+							sprintf(output + strlen(output), "		inoutID = %s+%d;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1);
 							sprintf(output + strlen(output), "		if((inoutID < %d)||(inoutID >= %d)){\n", sc->fft_zeropad_left_write[sc->axis_id], sc->fft_zeropad_right_write[sc->axis_id]);
 
 							sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-							sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+							sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 							indexOutputVkFFT(output, sc, uintType, writeType, sc->inoutID, index_y, requestCoordinate, requestBatch);
 							sprintf(output + strlen(output), ";\n");
 							if (sc->outputBufferBlockNum == 1)
@@ -5283,8 +5697,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								sprintf(output + strlen(output), "		outputBlocks[%s / %d]%s[%s %% %d] = %stemp_0%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
 
 							sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-							sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, sc->outputStride[1] / 2 + i * sc->localSize[0]);
-							sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+							sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, sc->outputStride[1] + i * sc->localSize[0] + 1);
+							sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 							indexOutputVkFFT(output, sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 							sprintf(output + strlen(output), ";\n");
 							//sprintf(output + strlen(output), "		inoutID = indexOutput(%s+%d, (%s%s));\n", sc->gl_LocalInvocationID_x, sc->outputStride[1] / 2 + i * sc->localSize[0], sc->gl_GlobalInvocationID_y, shiftY);
@@ -5298,8 +5712,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						}
 						else {
 							sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-							sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, i * sc->localSize[0]);
-							sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+							sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, i * sc->localSize[0] + 1);
+							sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 							indexOutputVkFFT(output, sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 							sprintf(output + strlen(output), ";\n");
 							//sprintf(output + strlen(output), "		inoutID = indexOutput(%s+%d, (%s%s));\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_GlobalInvocationID_y, shiftY);
@@ -5310,8 +5724,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								sprintf(output + strlen(output), "		outputBlocks[inoutID / %d]%s[inoutID %% %d] = %stemp_0%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight);
 
 							sprintf(output + strlen(output), "			%s = ", sc->inoutID);
-							sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, sc->outputStride[1] / 2 + i * sc->localSize[0]);
-							sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY);
+							sprintf(index_x, "%s+%d", sc->gl_LocalInvocationID_x, sc->outputStride[1] + i * sc->localSize[0] + 1);
+							sprintf(index_y, "2*%s%s", sc->gl_GlobalInvocationID_y, shiftY);
 							indexOutputVkFFT(output, sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
 							sprintf(output + strlen(output), ";\n");
 							//sprintf(output + strlen(output), "		inoutID = indexOutput(%s+%d, (%s%s));\n", sc->gl_LocalInvocationID_x, sc->outputStride[1] / 2 + i * sc->localSize[0], sc->gl_GlobalInvocationID_y, shiftY);
@@ -5683,6 +6097,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				case 8:
 					stageSizeSum += stageSize * 3;
 					break;
+				case 11:
+					stageSizeSum += stageSize * 10;
+					break;
+				case 13:
+					stageSizeSum += stageSize * 12;
+					break;
 				}
 				if (i == sc->numStages - 1)
 					appendRadixShuffle(output, sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], type);
@@ -5734,6 +6154,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				case 8:
 					stageSizeSum += stageSize * 3;
 					break;
+				case 11:
+					stageSizeSum += stageSize * 10;
+					break;
+				case 13:
+					stageSizeSum += stageSize * 12;
+					break;
 				}
 				if (i == sc->numStages - 1)
 					appendRadixShuffle(output, sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], type);
@@ -5746,6 +6172,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		appendBoostThreadDataReorder(output, sc, floatType, uintType, type, 0);
 		appendReorder4StepWrite(output, sc, floatType, uintType, type);
+		//appendWriteSharedToRegistersVkFFT(output, sc, floatType, floatTypeOutputMemory, uintType, type);
 		appendWriteDataVkFFT(output, sc, floatType, floatTypeOutputMemory, uintType, type);
 		if ((sc->convolutionStep) && (sc->matrixConvolution > 1))
 			VkAppendLine(sc, "	}\n");
@@ -5836,7 +6263,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		uint32_t multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };//split the sequence
 		uint32_t isPowOf2 = (pow(2, (uint32_t)log2(app->configuration.size[axis_id])) == app->configuration.size[axis_id]) ? 1 : 0;
 		uint32_t tempSequence = app->configuration.size[axis_id];
-		for (uint32_t i = 2; i < 8; i++) {
+		for (uint32_t i = 2; i < 14; i++) {
 			if (tempSequence % i == 0) {
 				tempSequence /= i;
 				multipliers[i]++;
@@ -5914,7 +6341,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
 		}
 		if (((uint32_t)log2(app->configuration.size[axis_id]) >= app->configuration.swapTo3Stage4Step) && (app->configuration.swapTo3Stage4Step >= 17)) numPasses = 3;//Force set to 3 stage 4 step algorithm
-		uint32_t* locAxisSplit = (supportAxis) ? FFTPlan->supportAxisSplit[axis_id - 1] : FFTPlan->axisSplit[axis_id];
+		uint32_t* locAxisSplit = FFTPlan->axisSplit[axis_id];
 		if (numPasses == 1) {
 			locAxisSplit[0] = app->configuration.size[axis_id];
 		}
@@ -6160,14 +6587,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				locAxisSplit[i] = swap;
 			}
 		}
-		if (supportAxis)
-			FFTPlan->numSupportAxisUploads[axis_id - 1] = numPasses;
-		else
 			FFTPlan->numAxisUploads[axis_id] = numPasses;
 		for (uint32_t k = 0; k < numPasses; k++) {
 			tempSequence = locAxisSplit[k];
 			uint32_t loc_multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };//split the smaller sequence
-			for (uint32_t i = 2; i < 8; i++) {
+			for (uint32_t i = 2; i < 14; i++) {
 				if (tempSequence % i == 0) {
 					tempSequence /= i;
 					loc_multipliers[i]++;
@@ -6175,1235 +6599,1136 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				}
 			}
 			uint32_t registers_per_thread = 8;
+			uint32_t registers_per_thread_per_radix[14] = {};
 			uint32_t min_registers_per_thread = 8;
 			if (loc_multipliers[2] > 0) {
 				if (loc_multipliers[3] > 0) {
 					if (loc_multipliers[5] > 0) {
 						if (loc_multipliers[7] > 0) {
-							registers_per_thread = 15;
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 11;
+								}
+								else {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 11;
+								}
+							}
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 13;
+								}
+								else {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
 							min_registers_per_thread = 14;
+						}
+							}
 						}
 						else {
 							if ((loc_multipliers[2] == 1)) {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 15;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 15;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 10;
+									}
+									else {
+										registers_per_thread = 15;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 15;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 10;
+									}
+								}
+								else {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 15;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 15;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 10;
+									}
+									else {
 								registers_per_thread = 6;
+										registers_per_thread_per_radix[2] = 6;
+										registers_per_thread_per_radix[3] = 6;
+										registers_per_thread_per_radix[5] = 5;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
 								min_registers_per_thread = 5;
+							}
+								}
 							}
 							else {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 10;
+									}
+									else {
 								registers_per_thread = 12;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
 								min_registers_per_thread = 10;
 							}
+						}
+								else {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 10;
+					}
+									else {
+										registers_per_thread = 12;
+										registers_per_thread_per_radix[2] = 10;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 10;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 10;
+									}
+								}
+							}
 						}
 					}
 					else
 					{
 						if (loc_multipliers[7] > 0) {
 							if ((loc_multipliers[2] == 1)) {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 26;
+										registers_per_thread_per_radix[2] = 22;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 22;
+										registers_per_thread_per_radix[13] = 26;
+										min_registers_per_thread = 21;
+									}
+									else {
+										registers_per_thread = 22;
+										registers_per_thread_per_radix[2] = 22;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 22;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 21;
+									}
+								}
+								else {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 26;
+										registers_per_thread_per_radix[2] = 26;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 26;
+										min_registers_per_thread = 21;
+									}
+									else {
 								registers_per_thread = 7;
+										registers_per_thread_per_radix[2] = 6;
+										registers_per_thread_per_radix[3] = 6;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 7;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
 								min_registers_per_thread = 6;
+							}
+								}
 							}
+							else {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 14;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 14;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 11;
+									}
+									else {
+										registers_per_thread = 14;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 14;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 11;
+									}
+								}
+								else {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 14;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 14;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 12;
+									}
 							else {
 								registers_per_thread = 14;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 14;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
 								min_registers_per_thread = 12;
 							}
+						}
+							}
 						}
 						else {
 							if ((loc_multipliers[2] == 1)) {
-								registers_per_thread = 6;
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 6;
+										registers_per_thread_per_radix[3] = 6;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 6;
+									}
+									else {
+										registers_per_thread = 11;
+										registers_per_thread_per_radix[2] = 6;
+										registers_per_thread_per_radix[3] = 6;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
 								min_registers_per_thread = 6;
 							}
+								}
 							else {
-								registers_per_thread = 12;
-								min_registers_per_thread = 12;
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 6;
+										registers_per_thread_per_radix[3] = 6;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 6;
 							}
+									else {
+										registers_per_thread = 6;
+										registers_per_thread_per_radix[2] = 6;
+										registers_per_thread_per_radix[3] = 6;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 6;
 						}
 					}
 				}
 				else {
-					if (loc_multipliers[5] > 0) {
-						if (loc_multipliers[7] > 0) {
-							registers_per_thread = 10;
-							min_registers_per_thread = 7;
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 11;
 						}
 						else {
-							registers_per_thread = 10;
-							min_registers_per_thread = 10;
+										registers_per_thread = 12;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 11;
 						}
 					}
-					else
-					{
-						if (loc_multipliers[7] > 0) {
-							registers_per_thread = 14;
-							min_registers_per_thread = 14;
+								else {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 12;
 						}
 						else {
-							registers_per_thread = (loc_multipliers[2] > 2) ? 8 : pow(2, loc_multipliers[2]);
-							min_registers_per_thread = (loc_multipliers[2] > 2) ? 8 : pow(2, loc_multipliers[2]);
+										registers_per_thread = 12;
+										registers_per_thread_per_radix[2] = 12;
+										registers_per_thread_per_radix[3] = 12;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 12;
 						}
 					}
 				}
 			}
+					}
+				}
 			else {
-				if (loc_multipliers[3] > 0) {
 					if (loc_multipliers[5] > 0) {
 						if (loc_multipliers[7] > 0) {
-							registers_per_thread = 21;
-							min_registers_per_thread = 15;
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 10;
 						}
 						else {
-							registers_per_thread = 15;
-							min_registers_per_thread = 15;
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 10;
 						}
 					}
-					else
-					{
-						if (loc_multipliers[7] > 0) {
-							if ((loc_multipliers[3] == 1)) {
-								registers_per_thread = 21;
-								min_registers_per_thread = 21;
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 10;
 							}
 							else {
-								registers_per_thread = 9;
+									registers_per_thread = 10;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
 								min_registers_per_thread = 7;
 							}
 						}
+						}
 						else {
-							if ((loc_multipliers[3] == 1)) {
-								registers_per_thread = 3;
-								min_registers_per_thread = 3;
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 10;
 							}
 							else {
-								registers_per_thread = 9;
-								min_registers_per_thread = 9;
-							}
-						}
+									registers_per_thread = 11;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 10;
 					}
 				}
 				else {
-					if (loc_multipliers[5] > 0) {
-						if (loc_multipliers[7] > 0) {
-							registers_per_thread = 7;
-							min_registers_per_thread = 5;
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 10;
 						}
 						else {
-							registers_per_thread = 5;
-							min_registers_per_thread = 5;
+									registers_per_thread = 10;
+									registers_per_thread_per_radix[2] = 10;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 10;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 10;
+								}
+							}
 						}
 					}
 					else
 					{
 						if (loc_multipliers[7] > 0) {
-							registers_per_thread = 7;
-							min_registers_per_thread = 7;
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 11;
 						}
-						else
-							return 11;
+								else {
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 11;
 					}
 				}
-
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 13;
 			}
-			if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
-				registers_per_thread *= 2;
-				min_registers_per_thread *= 2;
+								else {
+									registers_per_thread = 14;
+									registers_per_thread_per_radix[2] = 14;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 14;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 14;
 			}
-			if (registers_per_thread % 8 == 0) {
-				loc_multipliers[8] = loc_multipliers[2] / 3;
-				loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
 			}
-			if (registers_per_thread % 4 == 0) {
-				loc_multipliers[4] = loc_multipliers[2] / 2;
-				loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
 			}
-			if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
-				if (loc_multipliers[4] > 0) {
-					loc_multipliers[4]--;
-					loc_multipliers[2] = 2;
+						else {
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 26;
+									registers_per_thread_per_radix[2] = 22;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 22;
+									registers_per_thread_per_radix[13] = 26;
+									min_registers_per_thread = 22;
 				}
 				else {
-					loc_multipliers[8]--;
-					loc_multipliers[4]++;
-					loc_multipliers[2]++;
+									registers_per_thread = 22;
+									registers_per_thread_per_radix[2] = 22;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 22;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 22;
 				}
 			}
-			if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
-				loc_multipliers[8]--;
-				loc_multipliers[4]++;
-				loc_multipliers[2]++;
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 26;
+									registers_per_thread_per_radix[2] = 26;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 26;
+									min_registers_per_thread = 26;
 			}
-			uint32_t maxBatchCoalesced = ((axis_id == 0) && (((k == 0) && (!app->configuration.reorderFourStep)) || (numPasses == 1))) ? 1 : app->configuration.coalescedMemory / complexSize;
-			if (maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost) > app->configuration.maxThreadsNum)
-			{
-				for (uint32_t i = 2; i < 8; i++) {
-					if (locAxisSplit[k] / (min_registers_per_thread * registerBoost) % i == 0) {
-						min_registers_per_thread *= i;
-						i = 8;
+								else {
+									registers_per_thread = (loc_multipliers[2] > 2) ? 8 : pow(2, loc_multipliers[2]);
+									registers_per_thread_per_radix[2] = (loc_multipliers[2] > 2) ? 8 : pow(2, loc_multipliers[2]);
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = (loc_multipliers[2] > 2) ? 8 : pow(2, loc_multipliers[2]);
 					}
 				}
-				for (uint32_t i = 2; i < 8; i++) {
-					if (locAxisSplit[k] / (registers_per_thread * registerBoost) % i == 0) {
-						registers_per_thread *= i;
-						i = 8;
 					}
 				}
-				if (min_registers_per_thread > registers_per_thread) {
-					uint32_t temp = min_registers_per_thread;
-					min_registers_per_thread = registers_per_thread;
-					registers_per_thread = temp;
 				}
 			}
-			uint32_t j = 0;
-			VkFFTAxis* axes = (supportAxis) ? FFTPlan->supportAxes[axis_id - 1] : FFTPlan->axes[axis_id];
-			axes[k].specializationConstants.registerBoost = registerBoost;
-			axes[k].specializationConstants.registers_per_thread = registers_per_thread;
-			axes[k].specializationConstants.min_registers_per_thread = min_registers_per_thread;
-			axes[k].specializationConstants.numStages = 0;
-			axes[k].specializationConstants.fftDim = locAxisSplit[k];
-			uint32_t tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && (!app->configuration.reorderFourStep)) ? ceil(axes[k].specializationConstants.fftDim / (float)maxSingleSizeNonStrided) : ceil(axes[k].specializationConstants.fftDim / (float)maxSingleSizeStrided);
-			uint32_t switchRegisterBoost = 0;
-			if (tempRegisterBoost > 1) {
-				if (loc_multipliers[tempRegisterBoost] > 0) {
-					loc_multipliers[tempRegisterBoost]--;
-					switchRegisterBoost = tempRegisterBoost;
+			else {
+				if (loc_multipliers[3] > 0) {
+					if (loc_multipliers[5] > 0) {
+						if (loc_multipliers[7] > 0) {
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 21;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 21;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 11;
 				}
 				else {
-					for (uint32_t i = 8; i > 1; i--) {
-						if (loc_multipliers[i] > 0) {
-							loc_multipliers[i]--;
-							switchRegisterBoost = i;
-							i = 1;
+									registers_per_thread = 21;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 21;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 11;
 						}
 					}
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 21;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 21;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 13;
 				}
+								else {
+									registers_per_thread = 21;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 21;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 15;
 			}
-			for (uint32_t i = 8; i > 1; i--) {
-				if (loc_multipliers[i] > 0) {
-					axes[k].specializationConstants.stageRadix[j] = i;
-					loc_multipliers[i]--;
-					i++;
-					j++;
-					axes[k].specializationConstants.numStages++;
 				}
 			}
-			if (switchRegisterBoost > 0) {
-				axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = switchRegisterBoost;
-				axes[k].specializationConstants.numStages++;
+						else {
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 11;
 			}
 			else {
-				if (min_registers_per_thread != registers_per_thread) {
-					j = axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages - 1];
-					axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages - 1] = axes[k].specializationConstants.stageRadix[0];
-					axes[k].specializationConstants.stageRadix[0] = j;
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 11;
 				}
 			}
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 13;
 		}
-		return 0;
+								else {
+									registers_per_thread = 15;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 15;
+									registers_per_thread_per_radix[5] = 15;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 15;
 	}
-	static inline uint32_t VkFFTPlanSupportAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint32_t axis_id, uint32_t axis_upload_id, uint32_t inverse) {
-		//get radix stages
-		VkFFTAxis* axis = &FFTPlan->supportAxes[axis_id - 1][axis_upload_id];
-		axis->specializationConstants.inverse = inverse;
-		axis->specializationConstants.supportAxis = 1;
-		axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel;
-		uint32_t complexSize;
-		if (app->configuration.doublePrecision)
-			complexSize = (2 * sizeof(double));
-		else
-			if (app->configuration.halfPrecision)
-				complexSize = (2 * sizeof(float));
-			else
-				complexSize = (2 * sizeof(float));
-		axis->specializationConstants.complexSize = complexSize;
-		uint32_t maxSequenceLengthSharedMemory = app->configuration.sharedMemorySize / complexSize;
-		uint32_t maxSequenceLengthSharedMemoryPow2 = app->configuration.sharedMemorySizePow2 / complexSize;
-		uint32_t maxSingleSizeStrided = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySize / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySize / complexSize;
-		uint32_t maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > complexSize) ? app->configuration.sharedMemorySizePow2 / (app->configuration.coalescedMemory) : app->configuration.sharedMemorySizePow2 / complexSize;
-
-		axis->specializationConstants.stageStartSize = 1;
-		for (uint32_t i = 0; i < axis_upload_id; i++)
-			axis->specializationConstants.stageStartSize *= FFTPlan->supportAxisSplit[axis_id - 1][i];
-
-		axis->specializationConstants.firstStageStartSize = app->configuration.size[axis_id] / FFTPlan->supportAxisSplit[axis_id - 1][FFTPlan->numSupportAxisUploads[axis_id - 1] - 1];
-
-		axis->specializationConstants.fft_dim_x = app->configuration.size[1];
-		axis->specializationConstants.performR2C = 0;
-		axis->specializationConstants.reorderFourStep = (FFTPlan->numSupportAxisUploads[axis_id - 1] > 1) ? app->configuration.reorderFourStep : 0;
-		uint32_t passID = FFTPlan->numSupportAxisUploads[axis_id - 1] - 1 - axis_upload_id;
-		axis->specializationConstants.fft_dim_full = app->configuration.size[axis_id];
-
-		//allocate LUT 
-		if (app->configuration.useLUT) {
-			double double_PI = 3.1415926535897932384626433832795;
-			uint32_t dimMult = 1;
-			uint32_t maxStageSum = 0;
-			for (uint32_t i = 0; i < axis->specializationConstants.numStages; i++) {
-				switch (axis->specializationConstants.stageRadix[i]) {
-				case 2:
-					maxStageSum += dimMult;
-					break;
-				case 3:
-					maxStageSum += dimMult * 2;
-					break;
-				case 4:
-					maxStageSum += dimMult * 2;
-					break;
-				case 5:
-					maxStageSum += dimMult * 4;
-					break;
-				case 7:
-					maxStageSum += dimMult * 6;
-					break;
-				case 8:
-					maxStageSum += dimMult * 3;
-					break;
 				}
-				dimMult *= axis->specializationConstants.stageRadix[i];
 			}
-			axis->specializationConstants.maxStageSumLUT = maxStageSum;
-			dimMult = 1;
-			if (app->configuration.doublePrecision) {
-				if (axis_upload_id > 0)
-					axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * sizeof(double);
+					}
 				else
-					axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(double);
-				double* tempLUT = (double*)malloc(axis->bufferLUTSize);
-				uint32_t localStageSize = 1;
-				uint32_t localStageSum = 0;
-				for (uint32_t i = 0; i < axis->specializationConstants.numStages; i++) {
-					if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
-						for (uint32_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
-							for (uint32_t j = 0; j < localStageSize; j++) {
-								tempLUT[2 * (j + localStageSum)] = cos(j * double_PI / localStageSize / pow(2, k));
-								tempLUT[2 * (j + localStageSum) + 1] = sin(j * double_PI / localStageSize / pow(2, k));
+					{
+						if (loc_multipliers[7] > 0) {
+							if ((loc_multipliers[3] == 1)) {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 21;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 11;
 							}
-							localStageSum += localStageSize;
+									else {
+										registers_per_thread = 21;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 11;
 						}
-						localStageSize *= axis->specializationConstants.stageRadix[i];
 					}
 					else {
-						for (uint32_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
-							for (uint32_t j = 0; j < localStageSize; j++) {
-								tempLUT[2 * (j + localStageSum)] = cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
-								tempLUT[2 * (j + localStageSum) + 1] = sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 21;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 13;
 							}
-							localStageSum += localStageSize;
+									else {
+										registers_per_thread = 21;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 21;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 21;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 21;
 						}
-						localStageSize *= axis->specializationConstants.stageRadix[i];
 					}
 				}
-
-				if (axis_upload_id > 0)
-					for (uint32_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
-						for (uint32_t j = 0; j < axis->specializationConstants.fftDim; j++) {
-							double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
-							tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = cos(angle);
-							tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = sin(angle);
+							else {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 7;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 7;
 						}
+									else {
+										registers_per_thread = 11;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 7;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 7;
 					}
-				if (!inverse) {
-					axis->bufferLUT = app->localFFTPlan_inverse->supportAxes[axis_id - 1][axis_upload_id].bufferLUT;
-#if(VKFFT_BACKEND==0)
-					axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->supportAxes[axis_id - 1][axis_upload_id].bufferLUTDeviceMemory;
-#endif
-					axis->bufferLUTSize = app->localFFTPlan_inverse->supportAxes[axis_id - 1][axis_upload_id].bufferLUTSize;
-					axis->referenceLUT = 1;
 				}
 				else {
-					if ((axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->supportAxes[0][0].specializationConstants.fft_dim_full)) {
-						axis->bufferLUT = FFTPlan->supportAxes[0][axis_upload_id].bufferLUT;
-#if(VKFFT_BACKEND==0)
-						axis->bufferLUTDeviceMemory = FFTPlan->supportAxes[0][axis_upload_id].bufferLUTDeviceMemory;
-#endif
-						axis->bufferLUTSize = FFTPlan->supportAxes[0][axis_upload_id].bufferLUTSize;
-						axis->referenceLUT = 1;
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 7;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 7;
 					}
 					else {
-#if(VKFFT_BACKEND==0)
-						allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
-						transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize);
-#elif(VKFFT_BACKEND==1)
-						cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
-						cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
-#elif(VKFFT_BACKEND==2)
-						hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
-						hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
-#endif
+										registers_per_thread = 9;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 7;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 7;
 					}
 				}
-				free(tempLUT);
 			}
+						}
 			else {
-				if (axis_upload_id > 0)
-					axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * sizeof(float);
-				else
-					axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(float);
-				float* tempLUT = (float*)malloc(axis->bufferLUTSize);
-				uint32_t localStageSize = 1;
-				uint32_t localStageSum = 0;
-				for (uint32_t i = 0; i < axis->specializationConstants.numStages; i++) {
-					if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) {
-						for (uint32_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) {
-							for (uint32_t j = 0; j < localStageSize; j++) {
-								tempLUT[2 * (j + localStageSum)] = (float)cos(j * double_PI / localStageSize / pow(2, k));
-								tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k));
+							if ((loc_multipliers[3] == 1)) {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 39;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 33;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 33;
+										registers_per_thread_per_radix[13] = 39;
+										min_registers_per_thread = 33;
 							}
-							localStageSum += localStageSize;
+									else {
+										registers_per_thread = 33;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 33;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 33;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 33;
 						}
-						localStageSize *= axis->specializationConstants.stageRadix[i];
 					}
 					else {
-						for (uint32_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
-							for (uint32_t j = 0; j < localStageSize; j++) {
-								tempLUT[2 * (j + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
-								tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 39;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 39;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 39;
+										min_registers_per_thread = 39;
 							}
-							localStageSum += localStageSize;
+									else {
+										registers_per_thread = 3;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 3;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 3;
 						}
-						localStageSize *= axis->specializationConstants.stageRadix[i];
 					}
 				}
-
-				if (axis_upload_id > 0)
-					for (uint32_t i = 0; i < axis->specializationConstants.stageStartSize; i++) {
-						for (uint32_t j = 0; j < axis->specializationConstants.fftDim; j++) {
-							double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim));
-							tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (float)cos(angle);
-							tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (float)sin(angle);
+							else {
+								if (loc_multipliers[11] > 0) {
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 9;
 						}
+									else {
+										registers_per_thread = 11;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 11;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 9;
 					}
-				axis->referenceLUT = 0;
-				if (!inverse) {
-					axis->bufferLUT = app->localFFTPlan_inverse->supportAxes[axis_id - 1][axis_upload_id].bufferLUT;
-#if(VKFFT_BACKEND==0)
-					axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->supportAxes[axis_id - 1][axis_upload_id].bufferLUTDeviceMemory;
-#endif
-					axis->bufferLUTSize = app->localFFTPlan_inverse->supportAxes[axis_id - 1][axis_upload_id].bufferLUTSize;
-					axis->referenceLUT = 1;
 				}
 				else {
-					if ((axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->supportAxes[0][0].specializationConstants.fft_dim_full)) {
-						axis->bufferLUT = FFTPlan->supportAxes[0][axis_upload_id].bufferLUT;
-#if(VKFFT_BACKEND==0)
-						axis->bufferLUTDeviceMemory = FFTPlan->supportAxes[0][axis_upload_id].bufferLUTDeviceMemory;
-#endif
-						axis->bufferLUTSize = FFTPlan->supportAxes[0][axis_upload_id].bufferLUTSize;
-						axis->referenceLUT = 1;
+									if (loc_multipliers[13] > 0) {
+										registers_per_thread = 13;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 13;
+										min_registers_per_thread = 9;
 					}
 					else {
-#if(VKFFT_BACKEND==0)
-						allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize);
-						transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize);
-#elif(VKFFT_BACKEND==1)
-						cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
-						cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice);
-#elif(VKFFT_BACKEND==2)
-						hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize);
-						hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice);
-#endif
+										registers_per_thread = 9;
+										registers_per_thread_per_radix[2] = 0;
+										registers_per_thread_per_radix[3] = 9;
+										registers_per_thread_per_radix[5] = 0;
+										registers_per_thread_per_radix[7] = 0;
+										registers_per_thread_per_radix[11] = 0;
+										registers_per_thread_per_radix[13] = 0;
+										min_registers_per_thread = 9;
 					}
 				}
-				free(tempLUT);
 			}
 		}
-		//configure strides
-		//perform r2c
-		axis->specializationConstants.inputStride[0] = 1;
-		axis->specializationConstants.inputStride[3] = app->configuration.bufferStride[2];
-
-		if (axis_id == 1)
-		{
-
-			//don't transpose 0-1 
-			axis->specializationConstants.inputStride[1] = app->configuration.size[1];
-			axis->specializationConstants.inputStride[2] = app->configuration.bufferStride[1];
 		}
-		if (axis_id == 2)
-		{
-
-			//don't transpose 0-1, don't transpose 1-2
-			axis->specializationConstants.inputStride[1] = app->configuration.bufferStride[1];
-			axis->specializationConstants.inputStride[2] = app->configuration.size[1];
-
 		}
-
-		axis->specializationConstants.inputStride[4] = axis->specializationConstants.inputStride[3] * app->configuration.coordinateFeatures;
-		axis->specializationConstants.outputStride[0] = axis->specializationConstants.inputStride[0];
-		axis->specializationConstants.outputStride[1] = axis->specializationConstants.inputStride[1];
-		axis->specializationConstants.outputStride[2] = axis->specializationConstants.inputStride[2];
-		axis->specializationConstants.outputStride[3] = axis->specializationConstants.inputStride[3];
-		axis->specializationConstants.outputStride[4] = axis->specializationConstants.inputStride[4];
-
-		/*axis->specializationConstants.inputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.inputStride[3];
-		axis->specializationConstants.outputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.outputStride[3];
-
-		axis->specializationConstants.inputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.inputStride[3] * app->configuration.coordinateFeatures;
-		axis->specializationConstants.outputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.outputStride[3] * app->configuration.coordinateFeatures;
-		*/
-		axis->specializationConstants.inputOffset = app->configuration.bufferStride[1] - app->configuration.size[1];
-		axis->specializationConstants.outputOffset = app->configuration.bufferStride[1] - app->configuration.size[1];
-
-		uint32_t storageComplexSize;
-		if (app->configuration.doublePrecision)
-			storageComplexSize = (2 * sizeof(double));
-		else
-			if (app->configuration.halfPrecision)
-				storageComplexSize = (2 * 2);
-			else
-				storageComplexSize = (2 * sizeof(float));
-		uint32_t initPageSize = 0;
-		for (uint32_t i = 0; i < app->configuration.bufferNum; i++) {
-			initPageSize += app->configuration.bufferSize[i];
+				else {
+					if (loc_multipliers[5] > 0) {
+						if (loc_multipliers[7] > 0) {
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 5;
 		}
-		if (app->configuration.performConvolution) {
-			uint32_t initPageSizeKernel = 0;
-			for (uint32_t i = 0; i < app->configuration.kernelNum; i++) {
-				initPageSizeKernel += app->configuration.kernelSize[i];
+								else {
+									registers_per_thread = 11;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 5;
 			}
-			if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel;
 		}
-		if (axis_id == 1) {
-			if ((axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) {
-				initPageSize = app->configuration.localPageSize * 1024;
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 5;
 			}
+								else {
+									registers_per_thread = 7;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 5;
 		}
-		if (axis_id == 2) {
-			if ((app->configuration.bufferStride[2] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) {
-				initPageSize = app->configuration.localPageSize * 1024;
 			}
 		}
-		uint32_t locPageSize = initPageSize;
-		uint64_t totalSize = 0;
-		for (uint32_t i = 0; i < app->configuration.bufferNum; i++) {
-			totalSize += app->configuration.bufferSize[i];
-			if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
-		}
-		axis->specializationConstants.inputBufferBlockSize = locPageSize / storageComplexSize;
-		axis->specializationConstants.inputBufferBlockNum = (uint32_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
-		//if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize;
-		locPageSize = initPageSize;
-		totalSize = 0;
-		for (uint32_t i = 0; i < app->configuration.bufferNum; i++) {
-			totalSize += app->configuration.bufferSize[i];
-			if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i];
+						else {
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 5;
+		}
+								else {
+									registers_per_thread = 11;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 5;
 		}
-		axis->specializationConstants.outputBufferBlockSize = locPageSize / storageComplexSize;
-		axis->specializationConstants.outputBufferBlockNum = (uint32_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
-		//if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize;
-
-		if (app->configuration.performConvolution) {
-			totalSize = 0;
-			locPageSize = initPageSize;
-			for (uint32_t i = 0; i < app->configuration.kernelNum; i++) {
-				totalSize += app->configuration.kernelSize[i];
-				if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i];
 			}
-			axis->specializationConstants.kernelBlockSize = locPageSize / storageComplexSize;
-			axis->specializationConstants.kernelBlockNum = (uint32_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
-			//if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.kernelBlockSize = totalSize / storageComplexSize;
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 5;
 		}
 		else {
-			axis->specializationConstants.kernelBlockSize = 0;
-			axis->specializationConstants.kernelBlockNum = 0;
+									registers_per_thread = 5;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 5;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 5;
 		}
-
-		axis->numBindings = 2;
-		uint32_t numBuffersBound[4] = { axis->specializationConstants.inputBufferBlockNum , axis->specializationConstants.outputBufferBlockNum, 0, 0 };
-#if(VKFFT_BACKEND==0)
-		VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER };
-		descriptorPoolSize.descriptorCount = axis->specializationConstants.inputBufferBlockNum + axis->specializationConstants.outputBufferBlockNum;
-#endif
-		if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) {
-			numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
-#if(VKFFT_BACKEND==0)
-			descriptorPoolSize.descriptorCount += axis->specializationConstants.kernelBlockNum;
-#endif
-			axis->numBindings++;
 		}
-		if ((axis_id == 1) && (axis_upload_id == 0) && (app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) {
-			numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
-#if(VKFFT_BACKEND==0)
-			descriptorPoolSize.descriptorCount += axis->specializationConstants.kernelBlockNum;
-#endif
-			axis->numBindings++;
 		}
-		if ((axis_id == 2) && (axis_upload_id == 0) && (app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) {
-			numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum;
-#if(VKFFT_BACKEND==0)
-			descriptorPoolSize.descriptorCount += axis->specializationConstants.kernelBlockNum;
-#endif
-			axis->numBindings++;
 		}
-		if (app->configuration.useLUT) {
-			numBuffersBound[axis->numBindings] = 1;
-#if(VKFFT_BACKEND==0)
-			descriptorPoolSize.descriptorCount++;
-#endif
-			axis->numBindings++;
+					else
+					{
+						if (loc_multipliers[7] > 0) {
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 7;
 		}
-#if(VKFFT_BACKEND==0)
-		VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
-		descriptorPoolCreateInfo.poolSizeCount = 1;
-		descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize;
-		descriptorPoolCreateInfo.maxSets = 1;
-		vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, NULL, &axis->descriptorPool);
-
-		const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-		VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
-		descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding));
-		for (uint32_t i = 0; i < axis->numBindings; ++i) {
-			descriptorSetLayoutBindings[i].binding = i;
-			descriptorSetLayoutBindings[i].descriptorType = descriptorType;
-			descriptorSetLayoutBindings[i].descriptorCount = numBuffersBound[i];
-			descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+								else {
+									registers_per_thread = 11;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 7;
 		}
-
-		VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
-		descriptorSetLayoutCreateInfo.bindingCount = axis->numBindings;
-		descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings;
-
-		vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, NULL, &axis->descriptorSetLayout);
-		free(descriptorSetLayoutBindings);
-		VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
-		descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool;
-		descriptorSetAllocateInfo.descriptorSetCount = 1;
-		descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout;
-		vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet);
-#endif
-		for (uint32_t i = 0; i < axis->numBindings; ++i) {
-			for (uint32_t j = 0; j < numBuffersBound[i]; ++j) {
-#if(VKFFT_BACKEND==0)
-				VkDescriptorBufferInfo descriptorBufferInfo = { 0 };
-#endif
-				if (i == 0) {
-					uint32_t bufferId = 0;
-					uint32_t offset = j;
-					for (uint32_t l = 0; l < app->configuration.bufferNum; ++l) {
-						if (offset >= (uint32_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
-							bufferId++;
-							offset -= (uint32_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
 						}
 						else {
-							l = app->configuration.bufferNum;
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 7;
 						}
-
+								else {
+									registers_per_thread = 7;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 7;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 7;
 					}
-					axis->inputBuffer = app->configuration.buffer;
-#if(VKFFT_BACKEND==0)
-					descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
-					descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
-					descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
-#endif
 				}
-				if (i == 1) {
-					uint32_t bufferId = 0;
-					uint32_t offset = j;
-					for (uint32_t l = 0; l < app->configuration.bufferNum; ++l) {
-						if (offset >= (uint32_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
-							bufferId++;
-							offset -= (uint32_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
 						}
 						else {
-							l = app->configuration.bufferNum;
+							if (loc_multipliers[11] > 0) {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 11;
 						}
-
+								else {
+									registers_per_thread = 11;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 11;
+									registers_per_thread_per_radix[13] = 0;
+									min_registers_per_thread = 11;
 					}
-					axis->outputBuffer = app->configuration.buffer;
-#if(VKFFT_BACKEND==0)
-					descriptorBufferInfo.buffer = app->configuration.buffer[bufferId];
-					descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
-					descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
-#endif
 				}
-				if ((i == 2) && (app->configuration.performConvolution)) {
-					uint32_t bufferId = 0;
-					uint32_t offset = j;
-					for (uint32_t l = 0; l < app->configuration.kernelNum; ++l) {
-						if (offset >= (uint32_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize))) {
-							bufferId++;
-							offset -= (uint32_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize));
+							else {
+								if (loc_multipliers[13] > 0) {
+									registers_per_thread = 13;
+									registers_per_thread_per_radix[2] = 0;
+									registers_per_thread_per_radix[3] = 0;
+									registers_per_thread_per_radix[5] = 0;
+									registers_per_thread_per_radix[7] = 0;
+									registers_per_thread_per_radix[11] = 0;
+									registers_per_thread_per_radix[13] = 13;
+									min_registers_per_thread = 13;
 						}
 						else {
-							l = app->configuration.bufferNum;
+									return 11;
 						}
-
 					}
-#if(VKFFT_BACKEND==0)
-					descriptorBufferInfo.buffer = app->configuration.kernel[bufferId];
-					descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * storageComplexSize);
-					descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize);
-#endif
 				}
-				if ((i == axis->numBindings - 1) && (app->configuration.useLUT)) {
-#if(VKFFT_BACKEND==0)
-					descriptorBufferInfo.buffer = axis->bufferLUT;
-					descriptorBufferInfo.offset = 0;
-					descriptorBufferInfo.range = axis->bufferLUTSize;
-#endif
 				}
-#if(VKFFT_BACKEND==0)
-				VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET };
-				writeDescriptorSet.dstSet = axis->descriptorSet;
-				writeDescriptorSet.dstBinding = i;
-				writeDescriptorSet.dstArrayElement = j;
-				writeDescriptorSet.descriptorType = descriptorType;
-				writeDescriptorSet.descriptorCount = 1;
-				writeDescriptorSet.pBufferInfo = &descriptorBufferInfo;
-				vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, NULL);
-#endif
 			}
-		}
-		{
-#if(VKFFT_BACKEND==0)
-			VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
-			pipelineLayoutCreateInfo.setLayoutCount = 1;
-			pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout;
-
-			VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT };
-			pushConstantRange.offset = 0;
-			pushConstantRange.size = sizeof(VkFFTPushConstantsLayout);
-			// Push constant ranges are part of the pipeline layout
-			pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
-			pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
-
-
-			vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, NULL, &axis->pipelineLayout);
-#endif
-			uint32_t maxBatchCoalesced;
-			if (app->configuration.doublePrecision)
-				maxBatchCoalesced = app->configuration.coalescedMemory / (2 * sizeof(double));
-			else
-				if (app->configuration.halfPrecision)
-					maxBatchCoalesced = app->configuration.coalescedMemory / (2 * sizeof(float));
-				else
-					maxBatchCoalesced = app->configuration.coalescedMemory / (2 * sizeof(float));
 
-			axis->groupedBatch = maxBatchCoalesced;
-			/*if ((app->configuration.size[0] < 4096) && (app->configuration.size[1] < 512) && (app->configuration.size[2] == 1)) {
-				if (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) {
-					if (1024 / axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim) {
-						if (1024 / axis->specializationConstants.fftDim > axis->groupedBatch)
-							axis->groupedBatch = 1024 / axis->specializationConstants.fftDim;
-						else
-							axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim;
 					}
+			registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2];
+			registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2];
+			if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
+				registers_per_thread *= 2;
+				for (uint32_t i = 2; i < 14; i++) {
+					registers_per_thread_per_radix[i] *= 2;
 				}
+				min_registers_per_thread *= 2;
 			}
-			else {
-				axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch;
-			}*/
-			//axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch;
-			if (((FFTPlan->numSupportAxisUploads[axis_id - 1] == 1) && (axis_id - 1 == 0)) || ((axis_id - 1 == 0) && (!app->configuration.reorderFourStep) && (axis_upload_id == 0))) {
-				axis->groupedBatch = (maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim > axis->groupedBatch) ? maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim : axis->groupedBatch;
+			if (registers_per_thread_per_radix[8] % 8 == 0) {
+				loc_multipliers[8] = loc_multipliers[2] / 3;
+				loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
 			}
-			else {
-				axis->groupedBatch = (maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim > 1) ? maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim * axis->groupedBatch : axis->groupedBatch;
+			if (registers_per_thread_per_radix[4] % 4 == 0) {
+				loc_multipliers[4] = loc_multipliers[2] / 2;
+				loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
 			}
-			if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced;
-			axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
-			if ((app->configuration.halfThreads) && (axis->groupedBatch * axis->specializationConstants.fftDim * complexSize >= app->configuration.sharedMemorySize))
-				axis->groupedBatch = ceil(axis->groupedBatch / 2.0);
-
-			if (axis->groupedBatch > app->configuration.warpSize) axis->groupedBatch = (axis->groupedBatch / app->configuration.warpSize) * app->configuration.warpSize;
-			//if (axis->groupedBatch > maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
-			uint32_t maxThreadNum = maxSequenceLengthSharedMemory / axis->specializationConstants.min_registers_per_thread;
-			if (axis_id == 1) {
-				if (axis_upload_id == 0) {
-					axis->axisBlock[0] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
-					if (axis->axisBlock[0] > maxThreadNum) axis->axisBlock[0] = maxThreadNum;
-					if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
-
-					axis->axisBlock[1] = 1;
-					axis->axisBlock[2] = 1;
-					axis->axisBlock[3] = axis->specializationConstants.fftDim;
+			if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
+				if (loc_multipliers[4] > 0) {
+					loc_multipliers[4]--;
+					loc_multipliers[2] = 2;
 				}
 				else {
-					axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
-
-					axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize;
-					if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
-					axis->axisBlock[2] = 1;
-					axis->axisBlock[3] = axis->specializationConstants.fftDim;
+					loc_multipliers[8]--;
+					loc_multipliers[4]++;
+					loc_multipliers[2]++;
 				}
 			}
-			if (axis_id == 2) {
-				axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
-
-				axis->axisBlock[0] = (app->configuration.size[1] > axis->groupedBatch) ? axis->groupedBatch : app->configuration.size[1];
-				if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0];
-
-				/*if (axis->axisBlock[0] * axis->axisBlock[1] < 64)
-					if (app->configuration.size[1] > 64 / axis->axisBlock[1])
-						axis->axisBlock[0] = 64 / axis->axisBlock[1];
-					else
-						axis->axisBlock[0] = app->configuration.size[0];*/
-				axis->axisBlock[2] = 1;
-				axis->axisBlock[3] = axis->specializationConstants.fftDim;
-			}
-			uint32_t tempSize[3] = { app->configuration.size[0], app->configuration.size[1], app->configuration.size[2] };
-			if (axis_id == 1) {
-				if (axis_upload_id == 0)
-					tempSize[0] = app->configuration.size[1] / axis->specializationConstants.fftDim;
-				else
-					tempSize[0] = app->configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0];
-
-				tempSize[1] = 1;
-				tempSize[2] = app->configuration.size[2];
-				//if (app->configuration.performZeropadding[2]) tempSize[2] = ceil(tempSize[2] / 2.0);
-
-				if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
-				else  axis->specializationConstants.performWorkGroupShift[0] = 0;
-				if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1;
-				else  axis->specializationConstants.performWorkGroupShift[1] = 0;
-				if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1;
-				else  axis->specializationConstants.performWorkGroupShift[2] = 0;
-
-			}
-			if (axis_id == 2) {
-				tempSize[0] = app->configuration.size[1] / axis->axisBlock[0] * app->configuration.size[2] / axis->specializationConstants.fftDim;
-				tempSize[1] = 1;
-				tempSize[2] = 1;
-
-				if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
-				else  axis->specializationConstants.performWorkGroupShift[0] = 0;
-				if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1;
-				else  axis->specializationConstants.performWorkGroupShift[1] = 0;
-				if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1;
-				else  axis->specializationConstants.performWorkGroupShift[2] = 0;
-
-			}
-			axis->specializationConstants.localSize[0] = axis->axisBlock[0];
-			axis->specializationConstants.localSize[1] = axis->axisBlock[1];
-			axis->specializationConstants.localSize[2] = axis->axisBlock[2];
-			//specializationInfo.pData = &axis->specializationConstants;
-			uint32_t registerBoost = (FFTPlan->numSupportAxisUploads[axis_id - 1] > 1) ? app->configuration.registerBoost4Step : app->configuration.registerBoost;
-
-			axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : app->configuration.coordinateFeatures;
-			axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution;
-			if ((app->configuration.FFTdim == 1) && (app->configuration.size[1] == 1) && (app->configuration.numberBatches > 1) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) {
-				app->configuration.size[1] = app->configuration.numberBatches;
-				app->configuration.numberBatches = 1;
-			}
-			axis->specializationConstants.numBatches = app->configuration.numberBatches;
-			axis->specializationConstants.numKernels = app->configuration.numberKernels;
-			axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize;
-			axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2;
-			axis->specializationConstants.normalize = app->configuration.normalize;
-			axis->specializationConstants.size[0] = app->configuration.size[0];
-			axis->specializationConstants.size[1] = app->configuration.size[1];
-			axis->specializationConstants.size[2] = app->configuration.size[2];
-			axis->specializationConstants.axis_id = axis_id;
-			axis->specializationConstants.axis_upload_id = axis_upload_id;
-			for (uint32_t i = 0; i < 3; i++) {
-				axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding;
-				axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on)
-				axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i];
-				axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i];
-			}
-			if ((inverse)) {
-				if ((app->configuration.frequencyZeroPadding) && (((!app->configuration.reorderFourStep) && (axis_upload_id == 0)) || ((app->configuration.reorderFourStep) && (axis_upload_id == FFTPlan->numSupportAxisUploads[axis_id - 1] - 1)))) {
-					axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id];
-					axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id];
-					axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id];
-				}
-				else
-					axis->specializationConstants.zeropad[0] = 0;
-				if ((!app->configuration.frequencyZeroPadding) && (((!app->configuration.reorderFourStep) && (axis_upload_id == FFTPlan->numSupportAxisUploads[axis_id - 1] - 1)) || ((app->configuration.reorderFourStep) && (axis_upload_id == 0)))) {
-					axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id];
-					axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id];
-					axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id];
-				}
-				else
-					axis->specializationConstants.zeropad[1] = 0;
-			}
-			else {
-				if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numSupportAxisUploads[axis_id - 1] - 1)) {
-					axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id];
-					axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id];
-					axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id];
-				}
-				else
-					axis->specializationConstants.zeropad[0] = 0;
-				if (((app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) || ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution))) {
-					axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id];
-					axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id];
-					axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id];
-				}
-				else
-					axis->specializationConstants.zeropad[1] = 0;
-			}
-			if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)) {
-				axis->specializationConstants.convolutionStep = 1;
-			}
-			else
-				axis->specializationConstants.convolutionStep = 0;
-			char floatTypeInputMemory[10];
-			char floatTypeOutputMemory[10];
-			char floatTypeKernelMemory[10];
-			char floatType[10];
-			axis->specializationConstants.unroll = 1;
-			axis->specializationConstants.LUT = app->configuration.useLUT;
-			if (app->configuration.doublePrecision) {
-				sprintf(floatType, "double");
-				sprintf(floatTypeInputMemory, "double");
-				sprintf(floatTypeOutputMemory, "double");
-				sprintf(floatTypeKernelMemory, "double");
-				//axis->specializationConstants.unroll = 1;
-			}
-			else {
-				//axis->specializationConstants.unroll = 0;
-				if (app->configuration.halfPrecision) {
-					sprintf(floatType, "float");
-					sprintf(floatTypeInputMemory, "half");
-					sprintf(floatTypeOutputMemory, "half");
-					sprintf(floatTypeKernelMemory, "half");
-					if (app->configuration.halfPrecisionMemoryOnly) {
-						//only out of place mode, input/output buffer must be different
-						sprintf(floatTypeInputMemory, "float");
-						sprintf(floatTypeOutputMemory, "float");
-						sprintf(floatTypeKernelMemory, "float");
-					}
-				}
-				else {
-					sprintf(floatType, "float");
-					sprintf(floatTypeInputMemory, "float");
-					sprintf(floatTypeOutputMemory, "float");
-					sprintf(floatTypeKernelMemory, "float");
-				}
-			}
-			char uintType[20] = "";
-#if(VKFFT_BACKEND==0)
-			sprintf(uintType, "uint");
-#elif(VKFFT_BACKEND==1)
-			sprintf(uintType, "unsigned int");
-#elif(VKFFT_BACKEND==2)
-			sprintf(uintType, "unsigned int");
-#endif
-			uint32_t LUT = app->configuration.useLUT;
-			uint32_t type;
-			if ((axis_id - 1 == 0) && (axis_upload_id == 0)) type = 0;
-			if (axis_id - 1 != 0) type = 1;
-			if ((axis_id - 1 == 0) && (axis_upload_id > 0)) type = 2;
-			axis->specializationConstants.cacheShuffle = 0;// ((!app->configuration.doublePrecision) && ((type == 0) || (type == 5) || (type == 6))) ? 1 : 0;
-			//if ((axis->specializationConstants.fftDim == 2 * maxSequenceLengthSharedMemory) && (app->configuration.registerBoost >= 2)) type = 3;
-			//if ((axis->specializationConstants.fftDim == 4 * maxSequenceLengthSharedMemory) && (app->configuration.registerBoost >= 4)) type = 4;
-			char* code0 = (char*)malloc(sizeof(char) * 1000000);
-			shaderGenVkFFT(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
-#if(VKFFT_BACKEND==0)
-			const glslang_resource_t default_resource = {
-				/* .MaxLights = */ 32,
-				/* .MaxClipPlanes = */ 6,
-				/* .MaxTextureUnits = */ 32,
-				/* .MaxTextureCoords = */ 32,
-				/* .MaxVertexAttribs = */ 64,
-				/* .MaxVertexUniformComponents = */ 4096,
-				/* .MaxVaryingFloats = */ 64,
-				/* .MaxVertexTextureImageUnits = */ 32,
-				/* .MaxCombinedTextureImageUnits = */ 80,
-				/* .MaxTextureImageUnits = */ 32,
-				/* .MaxFragmentUniformComponents = */ 4096,
-				/* .MaxDrawBuffers = */ 32,
-				/* .MaxVertexUniformVectors = */ 128,
-				/* .MaxVaryingVectors = */ 8,
-				/* .MaxFragmentUniformVectors = */ 16,
-				/* .MaxVertexOutputVectors = */ 16,
-				/* .MaxFragmentInputVectors = */ 15,
-				/* .MinProgramTexelOffset = */ -8,
-				/* .MaxProgramTexelOffset = */ 7,
-				/* .MaxClipDistances = */ 8,
-				/* .MaxComputeWorkGroupCountX = */ 65535,
-				/* .MaxComputeWorkGroupCountY = */ 65535,
-				/* .MaxComputeWorkGroupCountZ = */ 65535,
-				/* .MaxComputeWorkGroupSizeX = */ 1024,
-				/* .MaxComputeWorkGroupSizeY = */ 1024,
-				/* .MaxComputeWorkGroupSizeZ = */ 64,
-				/* .MaxComputeUniformComponents = */ 1024,
-				/* .MaxComputeTextureImageUnits = */ 16,
-				/* .MaxComputeImageUniforms = */ 8,
-				/* .MaxComputeAtomicCounters = */ 8,
-				/* .MaxComputeAtomicCounterBuffers = */ 1,
-				/* .MaxVaryingComponents = */ 60,
-				/* .MaxVertexOutputComponents = */ 64,
-				/* .MaxGeometryInputComponents = */ 64,
-				/* .MaxGeometryOutputComponents = */ 128,
-				/* .MaxFragmentInputComponents = */ 128,
-				/* .MaxImageUnits = */ 8,
-				/* .MaxCombinedImageUnitsAndFragmentOutputs = */ 8,
-				/* .MaxCombinedShaderOutputResources = */ 8,
-				/* .MaxImageSamples = */ 0,
-				/* .MaxVertexImageUniforms = */ 0,
-				/* .MaxTessControlImageUniforms = */ 0,
-				/* .MaxTessEvaluationImageUniforms = */ 0,
-				/* .MaxGeometryImageUniforms = */ 0,
-				/* .MaxFragmentImageUniforms = */ 8,
-				/* .MaxCombinedImageUniforms = */ 8,
-				/* .MaxGeometryTextureImageUnits = */ 16,
-				/* .MaxGeometryOutputVertices = */ 256,
-				/* .MaxGeometryTotalOutputComponents = */ 1024,
-				/* .MaxGeometryUniformComponents = */ 1024,
-				/* .MaxGeometryVaryingComponents = */ 64,
-				/* .MaxTessControlInputComponents = */ 128,
-				/* .MaxTessControlOutputComponents = */ 128,
-				/* .MaxTessControlTextureImageUnits = */ 16,
-				/* .MaxTessControlUniformComponents = */ 1024,
-				/* .MaxTessControlTotalOutputComponents = */ 4096,
-				/* .MaxTessEvaluationInputComponents = */ 128,
-				/* .MaxTessEvaluationOutputComponents = */ 128,
-				/* .MaxTessEvaluationTextureImageUnits = */ 16,
-				/* .MaxTessEvaluationUniformComponents = */ 1024,
-				/* .MaxTessPatchComponents = */ 120,
-				/* .MaxPatchVertices = */ 32,
-				/* .MaxTessGenLevel = */ 64,
-				/* .MaxViewports = */ 16,
-				/* .MaxVertexAtomicCounters = */ 0,
-				/* .MaxTessControlAtomicCounters = */ 0,
-				/* .MaxTessEvaluationAtomicCounters = */ 0,
-				/* .MaxGeometryAtomicCounters = */ 0,
-				/* .MaxFragmentAtomicCounters = */ 8,
-				/* .MaxCombinedAtomicCounters = */ 8,
-				/* .MaxAtomicCounterBindings = */ 1,
-				/* .MaxVertexAtomicCounterBuffers = */ 0,
-				/* .MaxTessControlAtomicCounterBuffers = */ 0,
-				/* .MaxTessEvaluationAtomicCounterBuffers = */ 0,
-				/* .MaxGeometryAtomicCounterBuffers = */ 0,
-				/* .MaxFragmentAtomicCounterBuffers = */ 1,
-				/* .MaxCombinedAtomicCounterBuffers = */ 1,
-				/* .MaxAtomicCounterBufferSize = */ 16384,
-				/* .MaxTransformFeedbackBuffers = */ 4,
-				/* .MaxTransformFeedbackInterleavedComponents = */ 64,
-				/* .MaxCullDistances = */ 8,
-				/* .MaxCombinedClipAndCullDistances = */ 8,
-				/* .MaxSamples = */ 4,
-				/* .maxMeshOutputVerticesNV = */ 256,
-				/* .maxMeshOutputPrimitivesNV = */ 512,
-				/* .maxMeshWorkGroupSizeX_NV = */ 32,
-				/* .maxMeshWorkGroupSizeY_NV = */ 1,
-				/* .maxMeshWorkGroupSizeZ_NV = */ 1,
-				/* .maxTaskWorkGroupSizeX_NV = */ 32,
-				/* .maxTaskWorkGroupSizeY_NV = */ 1,
-				/* .maxTaskWorkGroupSizeZ_NV = */ 1,
-				/* .maxMeshViewCountNV = */ 4,
-				/* .maxDualSourceDrawBuffersEXT = */ 1,
-
-				/* .limits = */ {
-				/* .nonInductiveForLoops = */ 1,
-				/* .whileLoops = */ 1,
-				/* .doWhileLoops = */ 1,
-				/* .generalUniformIndexing = */ 1,
-				/* .generalAttributeMatrixVectorIndexing = */ 1,
-				/* .generalVaryingIndexing = */ 1,
-				/* .generalSamplerIndexing = */ 1,
-				/* .generalVariableIndexing = */ 1,
-				/* .generalConstantMatrixVectorIndexing = */ 1,
-			} };
-			glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
-			glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
-			const glslang_input_t input =
-			{
-				GLSLANG_SOURCE_GLSL,
-				GLSLANG_STAGE_COMPUTE,
-				GLSLANG_CLIENT_VULKAN,
-				client_version,
-				GLSLANG_TARGET_SPV,
-				target_language_version,
-				code0,
-				450,
-				GLSLANG_NO_PROFILE,
-				1,
-				0,
-				GLSLANG_MSG_DEFAULT_BIT,
-				&default_resource,
-			};
-			glslang_optimization_level_t optimization = GLSLANG_OPT_NONE;
-			glslang_shader_t* shader = glslang_shader_create(&input);
-			const char* err;
-			//printf("%s\n", code0);
-			if (!glslang_shader_preprocess(shader, &input))
-			{
-				err = glslang_shader_get_info_log(shader);
-				printf("%s\n", code0);
-				printf("%s\nVkFFT shader type: %d\n", err, type);
-				glslang_shader_delete(shader);
-				free(code0);
-				return 3;
-
-			}
-
-			if (!glslang_shader_parse(shader, &input))
-			{
-				err = glslang_shader_get_info_log(shader);
-				printf("%s\n", code0);
-				printf("%s\nVkFFT shader type: %d\n", err, type);
-				glslang_shader_delete(shader);
-				free(code0);
-				return 3;
-
+			if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
+				loc_multipliers[8]--;
+				loc_multipliers[4]++;
+				loc_multipliers[2]++;
 			}
-			glslang_program_t* program = glslang_program_create();
-			glslang_program_add_shader(program, shader);
-			if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
+			uint32_t maxBatchCoalesced = ((axis_id == 0) && (((k == 0) && (!app->configuration.reorderFourStep)) || (numPasses == 1))) ? 1 : app->configuration.coalescedMemory / complexSize;
+			if (maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost) > app->configuration.maxThreadsNum)
 			{
-				err = glslang_program_get_info_log(program);
-				printf("%s\n", code0);
-				printf("%s\nVkFFT shader type: %d\n", err, type);
-				glslang_shader_delete(shader);
-				free(code0);
-				return 3;
-
+				for (uint32_t i = 2; i < 14; i++) {
+					if (locAxisSplit[k] / (min_registers_per_thread * registerBoost) % i == 0) {
+						min_registers_per_thread *= i;
+						i = 14;
 			}
-
-			glslang_program_SPIRV_generate(program, input.stage);
-
-			if (glslang_program_SPIRV_get_messages(program))
-			{
-				printf("%s", glslang_program_SPIRV_get_messages(program));
 			}
-
-			glslang_shader_delete(shader);
-			VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
-			VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
-			pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
-			VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
-			createInfo.pCode = glslang_program_SPIRV_get_ptr(program);
-			createInfo.codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t);
-			vkCreateShaderModule(app->configuration.device[0], &createInfo, NULL, &pipelineShaderStageCreateInfo.module);
-			pipelineShaderStageCreateInfo.pName = "main";
-			pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo;
-			computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
-			computePipelineCreateInfo.layout = axis->pipelineLayout;
-			vkCreateComputePipelines(app->configuration.device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, NULL, &axis->pipeline);
-			vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, NULL);
-			glslang_program_delete(program);
-#elif(VKFFT_BACKEND==1)
-			nvrtcProgram prog;
-			/*char* includeNames = (char*)malloc(sizeof(char)*100);
-			char* headers = (char*)malloc(sizeof(char) * 100);
-			sprintf(headers, "C://Program Files//NVIDIA GPU Computing Toolkit//CUDA//v11.1//include//cuComplex.h");
-			sprintf(includeNames, "cuComplex.h");*/
-			nvrtcResult result = nvrtcCreateProgram(&prog,         // prog
-				code0,         // buffer
-				"VkFFT.cu",    // name
-				0,             // numHeaders
-				0,          // headers
-				0);        // includeNames
-			//free(includeNames);
-			//free(headers);
-			if (result != NVRTC_SUCCESS) printf("1 error: %s\n", nvrtcGetErrorString(result));
-			//const char opts[20] = "--fmad=false";
-			//result = nvrtcAddNameExpression(prog, "&consts");
-			//if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result));
-			result = nvrtcCompileProgram(prog,  // prog
-				0,     // numOptions
-				NULL); // options
-			if (result != NVRTC_SUCCESS) {
-				printf("2 error: %s\n", nvrtcGetErrorString(result));
-				char* log = (char*)malloc(sizeof(char) * 100000);
-				nvrtcGetProgramLog(prog, log);
-				printf("%s\n", log);
-				free(log);
-				printf("%s\n", code0);
+				for (uint32_t i = 2; i < 14; i++) {
+					for (uint32_t j = 2; j < 14; j++) {
+						if (locAxisSplit[k] / (registers_per_thread_per_radix[i] * registerBoost) % j == 0) {
+							registers_per_thread_per_radix[i] *= j;
+							j = 14;
 			}
-			size_t ptxSize;
-			result = nvrtcGetPTXSize(prog, &ptxSize);
-			if (result != NVRTC_SUCCESS) printf("3 error: %s\n", nvrtcGetErrorString(result));
-			char* ptx = (char*)malloc(ptxSize);
-			result = nvrtcGetPTX(prog, ptx);
-			if (result != NVRTC_SUCCESS) printf("4 error: %s\n", nvrtcGetErrorString(result));
-			//printf("%s\n", ptx);
-			// Destroy the program.
-			result = nvrtcDestroyProgram(&prog);
-			if (result != NVRTC_SUCCESS) printf("5 error: %s\n", nvrtcGetErrorString(result));
-			axis->VkFFTKernel = {};
-			axis->VkFFTModule = {};
-			CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, ptx, 0, 0, 0);
-
-			if (result2 != CUDA_SUCCESS) {
-				printf("6 error: %d\n", result2);
 			}
-			result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main");
-			if (result2 != CUDA_SUCCESS) {
-				printf("7 error: %d\n", result2);
+					registers_per_thread_per_radix[i] *= 2;
+				}
+				for (uint32_t i = 2; i < 14; i++) {
+					if (locAxisSplit[k] / (registers_per_thread * registerBoost) % i == 0) {
+						registers_per_thread *= i;
+						i = 14;
+				}
 			}
-			if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) {
-				result2 = cuFuncSetCacheConfig(axis->VkFFTKernel, CU_FUNC_CACHE_PREFER_SHARED);
-				if (result2 != CUDA_SUCCESS) {
-					printf("7.5 error: %d\n", result2);
+				if (min_registers_per_thread > registers_per_thread) {
+					uint32_t temp = min_registers_per_thread;
+					min_registers_per_thread = registers_per_thread;
+					registers_per_thread = temp;
 				}
+				for (uint32_t i = 2; i < 14; i++) {
+					if (registers_per_thread_per_radix[i] > registers_per_thread) {
+						registers_per_thread = registers_per_thread_per_radix[i];
+				}
+					if (registers_per_thread_per_radix[i] < min_registers_per_thread) {
+						min_registers_per_thread = registers_per_thread_per_radix[i];
 			}
-			size_t size = sizeof(VkFFTPushConstantsLayout);
-			result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts");
-			if (result2 != CUDA_SUCCESS) {
-				printf("8 error: %d\n", result2);
 			}
-			free(ptx);
-#elif(VKFFT_BACKEND==2)
-			hiprtcProgram prog;
-			/*char* includeNames = (char*)malloc(sizeof(char)*100);
-			char* headers = (char*)malloc(sizeof(char) * 100);
-			sprintf(headers, "C://Program Files//NVIDIA GPU Computing Toolkit//CUDA//v11.1//include//cuComplex.h");
-			sprintf(includeNames, "cuComplex.h");*/
-			hiprtcResult result = hiprtcCreateProgram(&prog,         // prog
-				code0,         // buffer
-				"VkFFT.hip",    // name
-				0,             // numHeaders
-				0,          // headers
-				0);        // includeNames
-			if (result != HIPRTC_SUCCESS) printf("1 error: %s\n", hiprtcGetErrorString(result));
-
-			result = hiprtcAddNameExpression(prog, "&consts");
-			if (result != HIPRTC_SUCCESS) printf("1.5 error: %s\n", hiprtcGetErrorString(result));
-
-			result = hiprtcCompileProgram(prog,  // prog
-				0,     // numOptions
-				NULL); // options
-			if (result != HIPRTC_SUCCESS) {
-				printf("2 error: %s\n", hiprtcGetErrorString(result));
-				char* log = (char*)malloc(sizeof(char) * 100000);
-				hiprtcGetProgramLog(prog, log);
-				printf("%s\n", log);
-				free(log);
-				printf("%s\n", code0);
 			}
-			size_t codeSize;
-			result = hiprtcGetCodeSize(prog, &codeSize);
-			if (result != HIPRTC_SUCCESS) printf("3 error: %s\n", hiprtcGetErrorString(result));
-			char* code = (char*)malloc(codeSize);
-			result = hiprtcGetCode(prog, code);
-			if (result != HIPRTC_SUCCESS) printf("4 error: %s\n", hiprtcGetErrorString(result));
-			//printf("%s\n", code);
-			// Destroy the program.
-			result = hiprtcDestroyProgram(&prog);
-			if (result != HIPRTC_SUCCESS) printf("5 error: %s\n", hiprtcGetErrorString(result));
-			axis->VkFFTKernel = {};
-			axis->VkFFTModule = {};
-			hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0);
-
-			if (result2 != hipSuccess) {
-				printf("6 error: %d\n", result2);
+			uint32_t j = 0;
+			VkFFTAxis* axes = FFTPlan->axes[axis_id];
+			axes[k].specializationConstants.registerBoost = registerBoost;
+			axes[k].specializationConstants.registers_per_thread = registers_per_thread;
+			axes[k].specializationConstants.min_registers_per_thread = min_registers_per_thread;
+			for (uint32_t i = 2; i < 14; i++) {
+				axes[k].specializationConstants.registers_per_thread_per_radix[i] = registers_per_thread_per_radix[i];
+					}
+			axes[k].specializationConstants.numStages = 0;
+			axes[k].specializationConstants.fftDim = locAxisSplit[k];
+			uint32_t tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && (!app->configuration.reorderFourStep)) ? ceil(axes[k].specializationConstants.fftDim / (float)maxSingleSizeNonStrided) : ceil(axes[k].specializationConstants.fftDim / (float)maxSingleSizeStrided);
+			uint32_t switchRegisterBoost = 0;
+			if (tempRegisterBoost > 1) {
+				if (loc_multipliers[tempRegisterBoost] > 0) {
+					loc_multipliers[tempRegisterBoost]--;
+					switchRegisterBoost = tempRegisterBoost;
+				}
+				else {
+					for (uint32_t i = 14; i > 1; i--) {
+						if (loc_multipliers[i] > 0) {
+							loc_multipliers[i]--;
+							switchRegisterBoost = i;
+							i = 1;
+				}
 			}
-			result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main");
-			if (result2 != hipSuccess) {
-				printf("7 error: %d\n", result2);
 			}
-			if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) {
-				result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared);
-				if (result2 != hipSuccess) {
-					printf("7.5 error: %d\n", result2);
-				}
 			}
-			size_t size = sizeof(VkFFTPushConstantsLayout);
-			result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts");
-			if (result2 != hipSuccess) {
-				printf("8 error: %d\n", result2);
+			for (uint32_t i = 14; i > 1; i--) {
+				if (loc_multipliers[i] > 0) {
+					axes[k].specializationConstants.stageRadix[j] = i;
+					loc_multipliers[i]--;
+					i++;
+					j++;
+					axes[k].specializationConstants.numStages++;
 			}
-
-			free(code);
-#endif
-			free(code0);
-		}
+			}
+			if (switchRegisterBoost > 0) {
+				axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = switchRegisterBoost;
+				axes[k].specializationConstants.numStages++;
+			}
+			else {
+				if (min_registers_per_thread != registers_per_thread) {
+					j = axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages - 1];
+					axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages - 1] = axes[k].specializationConstants.stageRadix[0];
+					axes[k].specializationConstants.stageRadix[0] = j;
+			}
+			}
+				}
 		return 0;
-
 	}
 	static inline uint32_t VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint32_t axis_id, uint32_t axis_upload_id, uint32_t inverse) {
 		//get radix stages
 		VkFFTAxis* axis = &FFTPlan->axes[axis_id][axis_upload_id];
+		axis->specializationConstants.warpSize = app->configuration.warpSize;
+		axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks;
 		uint32_t complexSize;
 		if (app->configuration.doublePrecision)
 			complexSize = (2 * sizeof(double));
@@ -7468,7 +7793,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		else {
 			if (app->configuration.performR2C)
-				axis->specializationConstants.fft_dim_x = app->configuration.size[0] / 2;
+				axis->specializationConstants.fft_dim_x = app->configuration.size[0] / 2 + 1;
 			else
 				axis->specializationConstants.fft_dim_x = app->configuration.size[0];
 		}
@@ -7537,6 +7862,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				case 8:
 					maxStageSum += dimMult * 3;
 					break;
+				case 11:
+					maxStageSum += dimMult * 10;
+					break;
+				case 13:
+					maxStageSum += dimMult * 12;
+					break;
 				}
 				dimMult *= axis->specializationConstants.stageRadix[i];
 			}
@@ -7673,7 +8004,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					axis->referenceLUT = 1;
 				}
 				else {
-					if (((axis_id == 1) || (axis_id == 2)) && (!((!app->configuration.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full)&&(FFTPlan->numAxisUploads[axis_id] == 1)&&(axis->specializationConstants.fft_dim_full<maxSingleSizeStrided/ axis->specializationConstants.registerBoost))) {
+					if (((axis_id == 1) || (axis_id == 2)) && (!((!app->configuration.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && (axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost))) {
 						axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT;
 #if(VKFFT_BACKEND==0)
 						axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory;
@@ -7711,8 +8042,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		//configure strides
 		uint32_t* axisStride = axis->specializationConstants.inputStride;
 		uint32_t* usedStride = app->configuration.bufferStride;
-		if ((!inverse) && (axis_id == 0) && (axis_upload_id == 0) && (app->configuration.isInputFormatted)) usedStride = app->configuration.inputBufferStride;
-		if ((inverse) && (axis_id == app->configuration.FFTdim - 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted)) usedStride = app->configuration.inputBufferStride;
+		if ((!inverse) && (axis_id == 0) && (axis_upload_id == 0) && ((app->configuration.isInputFormatted) || (app->configuration.performR2C))) usedStride = app->configuration.inputBufferStride;
+		if ((inverse) && (axis_id == app->configuration.FFTdim - 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isOutputFormatted)) usedStride = app->configuration.outputBufferStride;
 
 		axisStride[0] = 1;
 
@@ -7722,13 +8053,13 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		if (axis_id == 1)
 		{
-			axisStride[1] = (app->configuration.performR2C) ? usedStride[0] / 2 : usedStride[0];
+			axisStride[1] = usedStride[0];
 			axisStride[2] = usedStride[1];
 		}
 		if (axis_id == 2)
 		{
 			axisStride[1] = usedStride[1];
-			axisStride[2] = (app->configuration.performR2C) ? usedStride[0] / 2 : usedStride[0];
+			axisStride[2] = usedStride[0];
 		}
 
 		axisStride[3] = usedStride[2];
@@ -7738,7 +8069,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		axisStride = axis->specializationConstants.outputStride;
 		usedStride = app->configuration.bufferStride;
 		if ((!inverse) && (axis_id == app->configuration.FFTdim - 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isOutputFormatted)) usedStride = app->configuration.outputBufferStride;
-		if ((inverse) && (axis_id == 0) && (axis_upload_id == 0) && (app->configuration.isOutputFormatted)) usedStride = app->configuration.outputBufferStride;
+		if ((inverse) && (axis_id == 0) && (axis_upload_id == 0) && ((app->configuration.isInputFormatted) || (app->configuration.performR2C))) usedStride = app->configuration.inputBufferStride;
 
 		axisStride[0] = 1;
 
@@ -7748,13 +8079,13 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		}
 		if (axis_id == 1)
 		{
-			axisStride[1] = (app->configuration.performR2C) ? usedStride[0] / 2 : usedStride[0];
+			axisStride[1] = usedStride[0];
 			axisStride[2] = usedStride[1];
 		}
 		if (axis_id == 2)
 		{
 			axisStride[1] = usedStride[1];
-			axisStride[2] = (app->configuration.performR2C) ? usedStride[0] / 2 : usedStride[0];
+			axisStride[2] = usedStride[0];
 		}
 
 		axisStride[3] = usedStride[2];
@@ -8277,6 +8608,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			}
 			//axis->groupedBatch = 8;
 			//shared memory bank conflict resolve
+#if(VKFFT_BACKEND!=2)//for some reason, hip doesn't get performance increase from having variable shared memory strides.
+			if ((FFTPlan->numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) {
+				axis->groupedBatch = ceil(axis->groupedBatch / 2.0);
+			}
+#endif
 			if ((FFTPlan->numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) {
 				axis->groupedBatch = ceil(axis->groupedBatch / 2.0);
 			}
@@ -8291,6 +8627,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				axis->groupedBatch = ceil(axis->groupedBatch / 2.0);
 			if (axis->groupedBatch > app->configuration.warpSize) axis->groupedBatch = (axis->groupedBatch / app->configuration.warpSize) * app->configuration.warpSize;
 			//if (axis->groupedBatch > maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
+
 			uint32_t maxThreadNum = maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost);
 			if (axis_id == 0) {
 
@@ -8302,8 +8639,13 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						axis->axisBlock[1] = axis->groupedBatch;
 					else {
 						axis->axisBlock[1] = (axis->axisBlock[0] < app->configuration.warpSize) ? app->configuration.warpSize / axis->axisBlock[0] : 1;
+						if (app->configuration.performR2C) {
+							if (app->configuration.size[1]/2 < axis->axisBlock[1]) axis->axisBlock[1] = app->configuration.size[1]/2;
+						}
+						else {
 						if (app->configuration.size[1] < axis->axisBlock[1]) axis->axisBlock[1] = app->configuration.size[1];
 					}
+					}
 					if (axis->axisBlock[1] > app->configuration.maxComputeWorkGroupSize[1]) axis->axisBlock[1] = app->configuration.maxComputeWorkGroupSize[1];
 					axis->axisBlock[2] = 1;
 					axis->axisBlock[3] = axis->specializationConstants.fftDim;
@@ -8324,19 +8666,13 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
 
 				if (app->configuration.performR2C) {
-					if (axis_upload_id == 0) {
+					/*if (axis_upload_id == 0) {
 						VkFFTScheduler(app, FFTPlan, axis_id, 1);
 						for (uint32_t i = 0; i < FFTPlan->numSupportAxisUploads[0]; i++) {
-							/*FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.registers_per_thread = FFTPlan->axes[axis_id][i].specializationConstants.registers_per_thread;
-							FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.min_registers_per_thread = FFTPlan->axes[axis_id][i].specializationConstants.min_registers_per_thread;
-							FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.numStages = FFTPlan->axes[axis_id][i].specializationConstants.numStages;
-							FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.fftDim = FFTPlan->axisSplit[axis_id][i];
-							for (uint32_t j = 0; j < FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.numStages; j++)
-								FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.stageRadix[j] = FFTPlan->axes[axis_id][i].specializationConstants.stageRadix[j];*/
 							VkFFTPlanSupportAxis(app, FFTPlan, 1, i, inverse);
 						}
-					}
-					axis->axisBlock[0] = (app->configuration.size[0] / 2 > axis->groupedBatch) ? axis->groupedBatch : app->configuration.size[0] / 2;
+					}*/
+					axis->axisBlock[0] = (app->configuration.size[0] / 2 + 1 > axis->groupedBatch) ? axis->groupedBatch : app->configuration.size[0] / 2 + 1;
 					/*if (axis->axisBlock[0] * axis->axisBlock[1] < 64)
 						if (app->configuration.size[0]/2 > 64 / axis->axisBlock[1])
 							axis->axisBlock[0] = 64 / axis->axisBlock[1];
@@ -8361,20 +8697,14 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				axis->axisBlock[1] = (axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost > 1) ? axis->specializationConstants.fftDim / axis->specializationConstants.min_registers_per_thread / axis->specializationConstants.registerBoost : 1;
 
 				if (app->configuration.performR2C) {
-					if (axis_upload_id == 0) {
+					/*if (axis_upload_id == 0) {
 						VkFFTScheduler(app, FFTPlan, axis_id, 1);
 						//->numSupportAxisUploads[1] = FFTPlan->numAxisUploads[2];
 						for (uint32_t i = 0; i < FFTPlan->numSupportAxisUploads[1]; i++) {
-							/*FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.registers_per_thread = FFTPlan->axes[axis_id][i].specializationConstants.registers_per_thread;
-							FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.min_registers_per_thread = FFTPlan->axes[axis_id][i].specializationConstants.min_registers_per_thread;
-							FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.numStages = FFTPlan->axes[axis_id][i].specializationConstants.numStages;
-							FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.fftDim = FFTPlan->axisSplit[axis_id][i];
-							for (uint32_t j = 0; j < FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.numStages; j++)
-								FFTPlan->supportAxes[axis_id - 1][i].specializationConstants.stageRadix[j] = FFTPlan->axes[axis_id][i].specializationConstants.stageRadix[j];*/
 							VkFFTPlanSupportAxis(app, FFTPlan, 2, i, inverse);
 						}
-					}
-					axis->axisBlock[0] = (app->configuration.size[0] / 2 > axis->groupedBatch) ? axis->groupedBatch : app->configuration.size[0] / 2;
+					}*/
+					axis->axisBlock[0] = (app->configuration.size[0] / 2 + 1 > axis->groupedBatch) ? axis->groupedBatch : app->configuration.size[0] / 2 + 1;
 					/*if (axis->axisBlock[0] * axis->axisBlock[1] < 64)
 						if (app->configuration.size[0] / 2 > 64 / axis->axisBlock[1])
 							axis->axisBlock[0] = 64 / axis->axisBlock[1];
@@ -8416,10 +8746,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				else  axis->specializationConstants.performWorkGroupShift[2] = 0;
 			}
 			if (axis_id == 1) {
-				tempSize[0] = app->configuration.size[0] / axis->axisBlock[0] * app->configuration.size[1] / axis->specializationConstants.fftDim;
+				tempSize[0] = (app->configuration.performR2C == 1) ? ceil((app->configuration.size[0]/2+1) / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim) : ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 				tempSize[1] = 1;
 				tempSize[2] = app->configuration.size[2];
-				if (app->configuration.performR2C == 1) tempSize[0] = ceil(tempSize[0] / 2.0);
+				//if (app->configuration.performR2C == 1) tempSize[0] = ceil(tempSize[0] / 2.0);
 				//if (app->configuration.performZeropadding[2]) tempSize[2] = ceil(tempSize[2] / 2.0);
 
 				if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
@@ -8431,10 +8761,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 			}
 			if (axis_id == 2) {
-				tempSize[0] = app->configuration.size[0] / axis->axisBlock[0] * app->configuration.size[2] / axis->specializationConstants.fftDim;
+				tempSize[0] = (app->configuration.performR2C == 1) ? ceil((app->configuration.size[0]/2+1) / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim) : ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
 				tempSize[1] = 1;
 				tempSize[2] = app->configuration.size[1];
-				if (app->configuration.performR2C == 1) tempSize[0] = ceil(tempSize[0] / 2.0);
+				//if (app->configuration.performR2C == 1) tempSize[0] = ceil(tempSize[0] / 2.0);
 
 				if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1;
 				else  axis->specializationConstants.performWorkGroupShift[0] = 0;
@@ -8580,7 +8910,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			if ((axis_id == 0) && (!axis->specializationConstants.inverse) && (app->configuration.performR2C)) type = 5;
 			if ((axis_id == 0) && (axis->specializationConstants.inverse) && (app->configuration.performR2C)) type = 6;
 #if(VKFFT_BACKEND==0)
-			axis->specializationConstants.cacheShuffle = ((FFTPlan->numAxisUploads[axis_id] > 1) && ((axis->specializationConstants.fftDim& (axis->specializationConstants.fftDim - 1)) == 0) && (!app->configuration.doublePrecision) && ((type == 0) || (type == 5) || (type == 6))) ? 1 : 0;
+			axis->specializationConstants.cacheShuffle = ((FFTPlan->numAxisUploads[axis_id] > 1) && ((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) == 0) && (!app->configuration.doublePrecision) && ((type == 0) || (type == 5) || (type == 6))) ? 1 : 0;
 #elif(VKFFT_BACKEND==1)
 			axis->specializationConstants.cacheShuffle = 0;
 #elif(VKFFT_BACKEND==2)
@@ -9001,7 +9331,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		if (inputLaunchConfiguration.device == 0) return 1002;
 		app->configuration.device = inputLaunchConfiguration.device;
 		if (inputLaunchConfiguration.num_streams != 0)	app->configuration.num_streams = inputLaunchConfiguration.num_streams;
-
+		if (inputLaunchConfiguration.stream != 0)	app->configuration.stream = inputLaunchConfiguration.stream;
+		app->configuration.streamID = 0;
 		int value = 0;
 		cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->configuration.device[0]);
 		app->configuration.maxThreadsNum = value;
@@ -9025,10 +9356,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		app->configuration.warpSize = value;
 		app->configuration.sharedMemorySizePow2 = (uint32_t)pow(2, (uint32_t)log2(app->configuration.sharedMemorySize));
 		if (app->configuration.num_streams > 1) {
-			app->configuration.stream = (cudaStream_t*)malloc(app->configuration.num_streams * sizeof(cudaStream_t));
 			app->configuration.stream_event = (cudaEvent_t*)malloc(app->configuration.num_streams * sizeof(cudaEvent_t));
 			for (uint32_t i = 0; i < app->configuration.num_streams; i++) {
-				cudaStreamCreate(&app->configuration.stream[i]);
 				cudaEventCreate(&app->configuration.stream_event[i]);
 			}
 		}
@@ -9044,7 +9373,8 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		if (inputLaunchConfiguration.device == 0) return 1002;
 		app->configuration.device = inputLaunchConfiguration.device;
 		if (inputLaunchConfiguration.num_streams != 0)	app->configuration.num_streams = inputLaunchConfiguration.num_streams;
-
+		if (inputLaunchConfiguration.stream != 0)	app->configuration.stream = inputLaunchConfiguration.stream;
+		app->configuration.streamID = 0;
 		int value = 0;
 		hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->configuration.device[0]);
 		app->configuration.maxThreadsNum = value;
@@ -9068,14 +9398,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		app->configuration.warpSize = value;
 		app->configuration.sharedMemorySizePow2 = (uint32_t)pow(2, (uint32_t)log2(app->configuration.sharedMemorySize));
 		if (app->configuration.num_streams > 1) {
-			app->configuration.stream = (hipStream_t*)malloc(app->configuration.num_streams * sizeof(hipStream_t));
 			app->configuration.stream_event = (hipEvent_t*)malloc(app->configuration.num_streams * sizeof(hipEvent_t));
 			for (uint32_t i = 0; i < app->configuration.num_streams; i++) {
-				hipStreamCreate(&app->configuration.stream[i]);
 				hipEventCreate(&app->configuration.stream_event[i]);
 			}
 		}
-
 		app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;
 		app->configuration.useLUT = (app->configuration.doublePrecision) ? 1 : 0;
 		app->configuration.registerBoostNonPow2 = 0;
@@ -9093,18 +9420,30 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 		app->configuration.size[0] = inputLaunchConfiguration.size[0];
 
-		if (inputLaunchConfiguration.bufferStride[0] == 0)
+		if (inputLaunchConfiguration.bufferStride[0] == 0) {
+			if (inputLaunchConfiguration.performR2C)
+				app->configuration.bufferStride[0] = app->configuration.size[0] / 2 + 1;
+			else
 			app->configuration.bufferStride[0] = app->configuration.size[0];
+		}
 		else
 			app->configuration.bufferStride[0] = inputLaunchConfiguration.bufferStride[0];
 
-		if (inputLaunchConfiguration.inputBufferStride[0] == 0)
+		if (inputLaunchConfiguration.inputBufferStride[0] == 0) {
+			if (inputLaunchConfiguration.performR2C)
+				app->configuration.inputBufferStride[0] = app->configuration.size[0] + 2;
+			else
 			app->configuration.inputBufferStride[0] = app->configuration.size[0];
+		}
 		else
 			app->configuration.inputBufferStride[0] = inputLaunchConfiguration.inputBufferStride[0];
 
-		if (inputLaunchConfiguration.outputBufferStride[0] == 0)
+		if (inputLaunchConfiguration.outputBufferStride[0] == 0) {
+			if (inputLaunchConfiguration.performR2C)
+				app->configuration.outputBufferStride[0] = app->configuration.size[0] + 2;
+			else
 			app->configuration.outputBufferStride[0] = app->configuration.size[0];
+		}
 		else
 			app->configuration.outputBufferStride[0] = inputLaunchConfiguration.outputBufferStride[0];
 		for (uint32_t i = 1; i < 3; i++) {
@@ -9207,9 +9546,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 		if (inputLaunchConfiguration.coalescedMemory != 0)	app->configuration.coalescedMemory = inputLaunchConfiguration.coalescedMemory;
 		app->configuration.aimThreads = 128;
 		if (inputLaunchConfiguration.aimThreads != 0)	app->configuration.aimThreads = inputLaunchConfiguration.aimThreads;
+		app->configuration.numSharedBanks = 32;
+		if (inputLaunchConfiguration.numSharedBanks != 0)	app->configuration.numSharedBanks = inputLaunchConfiguration.numSharedBanks;
+
 		if (inputLaunchConfiguration.performR2C != 0) {
 			app->configuration.performR2C = inputLaunchConfiguration.performR2C;
-			if (inputLaunchConfiguration.bufferStride[0] == 0)
+			/*if (inputLaunchConfiguration.bufferStride[0] == 0)
 				app->configuration.bufferStride[0] = app->configuration.size[0];
 
 			if (inputLaunchConfiguration.inputBufferStride[0] == 0)
@@ -9234,7 +9576,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				app->configuration.inputBufferStride[2] = app->configuration.inputBufferStride[1] * app->configuration.size[2];
 
 			if (inputLaunchConfiguration.outputBufferStride[2] == 0)
-				app->configuration.outputBufferStride[2] = app->configuration.outputBufferStride[1] * app->configuration.size[2];
+				app->configuration.outputBufferStride[2] = app->configuration.outputBufferStride[1] * app->configuration.size[2];*/
 		}
 		app->configuration.normalize = 0;
 		if (inputLaunchConfiguration.normalize != 0)	app->configuration.normalize = inputLaunchConfiguration.normalize;
@@ -9391,7 +9733,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							printf("Error: %d\n", result);
 						}
 					}
-					if (app->configuration.num_streams > 1) {
+					if (app->configuration.num_streams >= 1) {
 						result = cuLaunchKernel(axis->VkFFTKernel,
 							maxBlockSize[0], maxBlockSize[1], maxBlockSize[2],     // grid dim
 							axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2],   // block dim
@@ -9436,7 +9778,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						}
 					}
 					//printf("%d %d %d %d %d %d\n",maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]);
-					if (app->configuration.num_streams > 1) {
+					if (app->configuration.num_streams >= 1) {
 						result = hipModuleLaunchKernel(axis->VkFFTKernel,
 							maxBlockSize[0], maxBlockSize[1], maxBlockSize[2],     // grid dim
 							axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2],   // block dim
@@ -9497,6 +9839,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 #elif(VKFFT_BACKEND==2)
 		app->configuration.streamCounter = 0;
 #endif
+		uint32_t localSize0 = (app->configuration.performR2C == 1) ? app->configuration.size[0]/2+1 : app->configuration.size[0];
 		if (inverse != 1) {
 			//FFT axis 0
 			for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
@@ -9546,7 +9889,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 				//FFT axis 1
 				if ((app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) {
-					if (app->configuration.performR2C == 1) {
+					/*if (app->configuration.performR2C == 1) {
 						for (int l = app->localFFTPlan->numSupportAxisUploads[0] - 1; l >= 0; l--) {
 							VkFFTAxis* axis = &app->localFFTPlan->supportAxes[0][l];
 							uint32_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures;
@@ -9575,7 +9918,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								VkFFTSync(app);
 						}
 
-					}
+					}*/
 
 					for (int l = app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) {
 						VkFFTAxis* axis = &app->localFFTPlan->axes[1][l];
@@ -9589,10 +9932,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 							uint32_t dispatchBlock[3];
-							dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
+							dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 							dispatchBlock[1] = 1;
 							dispatchBlock[2] = app->configuration.size[2];
-							if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+							//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 							dispatchEnhanced(app, axis, dispatchBlock);
 						}
@@ -9600,7 +9943,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					}
 				}
 				else {
-					if (app->configuration.performR2C == 1) {
+					/*if (app->configuration.performR2C == 1) {
 						for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
 							for (int l = app->localFFTPlan->numSupportAxisUploads[0] - 1; l >= 0; l--) {
 								VkFFTAxis* axis = &app->localFFTPlan->supportAxes[0][l];
@@ -9627,7 +9970,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 									VkFFTSync(app);
 							}
 						}
-					}
+					}*/
 					for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
 						for (int l = app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) {
 							VkFFTAxis* axis = &app->localFFTPlan->axes[1][l];
@@ -9640,10 +9983,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 #endif
 								uint32_t dispatchBlock[3];
 
-								dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
+								dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 								dispatchBlock[1] = 1;
 								dispatchBlock[2] = app->configuration.size[2];
-								if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+								//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 								//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 								dispatchEnhanced(app, axis, dispatchBlock);
 							}
@@ -9657,7 +10000,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			if (app->configuration.FFTdim > 2) {
 				if ((app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) {
 
-					if (app->configuration.performR2C == 1) {
+					/*if (app->configuration.performR2C == 1) {
 
 						for (int l = app->localFFTPlan->numSupportAxisUploads[1] - 1; l >= 0; l--) {
 							VkFFTAxis* axis = &app->localFFTPlan->supportAxes[1][l];
@@ -9680,7 +10023,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							if (l > 0)
 								VkFFTSync(app);
 						}
-					}
+					}*/
 
 					for (int l = app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) {
 
@@ -9694,10 +10037,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 							uint32_t dispatchBlock[3];
-							dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
+							dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
 							dispatchBlock[1] = 1;
 							dispatchBlock[2] = app->configuration.size[1];
-							if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+							//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							dispatchEnhanced(app, axis, dispatchBlock);
 
 						}
@@ -9705,7 +10048,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 					}
 				}
 				else {
-					if (app->configuration.performR2C == 1) {
+					/*if (app->configuration.performR2C == 1) {
 						for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
 							for (int l = app->localFFTPlan->numSupportAxisUploads[1] - 1; l >= 0; l--) {
 								VkFFTAxis* axis = &app->localFFTPlan->supportAxes[1][l];
@@ -9727,7 +10070,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 									VkFFTSync(app);
 							}
 						}
-					}
+					}*/
 					for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
 						for (int l = app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) {
 							VkFFTAxis* axis = &app->localFFTPlan->axes[2][l];
@@ -9739,10 +10082,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 								uint32_t dispatchBlock[3];
-								dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
+								dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
 								dispatchBlock[1] = 1;
 								dispatchBlock[2] = app->configuration.size[1];
-								if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+								//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 								dispatchEnhanced(app, axis, dispatchBlock);
 							}
 							VkFFTSync(app);
@@ -9758,7 +10101,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 
 				//multiple upload ifft leftovers
 				if (app->configuration.FFTdim == 3) {
-					if (app->configuration.performR2C == 1) {
+					/*if (app->configuration.performR2C == 1) {
 						for (uint32_t j = 0; j < app->configuration.numberKernels; j++) {
 							for (int l = 1; l < app->localFFTPlan_inverse->numSupportAxisUploads[1]; l++) {
 								VkFFTAxis* axis = &app->localFFTPlan_inverse->supportAxes[1][l];
@@ -9781,7 +10124,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 									VkFFTSync(app);
 							}
 						}
-					}
+					}*/
 					for (uint32_t j = 0; j < app->configuration.numberKernels; j++) {
 						for (int l = 1; l < app->localFFTPlan_inverse->numAxisUploads[2]; l++) {
 							VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l];
@@ -9794,17 +10137,17 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 								uint32_t dispatchBlock[3];
-								dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
+								dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
 								dispatchBlock[1] = 1;
 								dispatchBlock[2] = app->configuration.size[1];
-								if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+								//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 								dispatchEnhanced(app, axis, dispatchBlock);
 							}
 							VkFFTSync(app);
 						}
 					}
 				}
-				if (app->configuration.performR2C == 1) {
+				/*if (app->configuration.performR2C == 1) {
 					for (uint32_t j = 0; j < app->configuration.numberKernels; j++) {
 						for (int l = 0; l < app->localFFTPlan_inverse->numSupportAxisUploads[0]; l++) {
 							VkFFTAxis* axis = &app->localFFTPlan_inverse->supportAxes[0][l];
@@ -9830,7 +10173,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								VkFFTSync(app);
 						}
 					}
-				}
+				}*/
 				for (uint32_t j = 0; j < app->configuration.numberKernels; j++) {
 					for (int l = 0; l < app->localFFTPlan_inverse->numAxisUploads[1]; l++) {
 						VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l];
@@ -9842,10 +10185,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 							uint32_t dispatchBlock[3];
-							dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
+							dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 							dispatchBlock[1] = 1;
 							dispatchBlock[2] = app->configuration.size[2];
-							if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+							//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 							dispatchEnhanced(app, axis, dispatchBlock);
 
@@ -9857,7 +10200,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			}
 			if (app->configuration.FFTdim > 1) {
 				if (app->configuration.FFTdim == 2) {
-					if (app->configuration.performR2C == 1) {
+					/*if (app->configuration.performR2C == 1) {
 						for (uint32_t j = 0; j < app->configuration.numberKernels; j++) {
 							for (int l = 1; l < app->localFFTPlan_inverse->numSupportAxisUploads[0]; l++) {
 								VkFFTAxis* axis = &app->localFFTPlan_inverse->supportAxes[0][l];
@@ -9885,7 +10228,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							}
 						}
 
-					}
+					}*/
 					for (uint32_t j = 0; j < app->configuration.numberKernels; j++) {
 						for (int l = 1; l < app->localFFTPlan_inverse->numAxisUploads[1]; l++) {
 							VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l];
@@ -9899,10 +10242,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 								uint32_t dispatchBlock[3];
-								dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
+								dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 								dispatchBlock[1] = 1;
 								dispatchBlock[2] = app->configuration.size[2];
-								if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+								//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 								//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 								dispatchEnhanced(app, axis, dispatchBlock);
 
@@ -9968,10 +10311,10 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 							uint32_t dispatchBlock[3];
-							dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
+							dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 							dispatchBlock[1] = 1;
 							dispatchBlock[2] = app->configuration.size[2];
-							if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+							//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 							dispatchEnhanced(app, axis, dispatchBlock);
 
@@ -9986,7 +10329,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			//we start from axis 2 and go back to axis 0
 			//FFT axis 2
 			if (app->configuration.FFTdim > 2) {
-				if (app->configuration.performR2C == 1) {
+				/*if (app->configuration.performR2C == 1) {
 					for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
 						for (int l = app->localFFTPlan->numSupportAxisUploads[1] - 1; l >= 0; l--) {
 							if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numSupportAxisUploads[1] - 1 - l;
@@ -10012,12 +10355,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								VkFFTSync(app);
 						}
 					}
-				}
+				}*/
 
 				for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
-					for (int l = app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) {
-						if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numAxisUploads[2] - 1 - l;
-						VkFFTAxis* axis = &app->localFFTPlan->axes[2][l];
+					for (int l = app->localFFTPlan_inverse->numAxisUploads[2] - 1; l >= 0; l--) {
+						if (!app->configuration.reorderFourStep) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l;
+						VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l];
 						axis->pushConstants.batch = j;
 						for (uint32_t i = 0; i < app->configuration.coordinateFeatures; i++) {
 							axis->pushConstants.coordinate = i;
@@ -10026,17 +10369,17 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 							uint32_t dispatchBlock[3];
-							dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
+							dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[2] / (double)axis->specializationConstants.fftDim);
 							dispatchBlock[1] = 1;
 							dispatchBlock[2] = app->configuration.size[1];
 							//if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							//if (app->configuration.performZeropaddingInverse[1]) dispatchBlock[1] = ceil(dispatchBlock[1] / 2.0);
 
-							if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+							//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							dispatchEnhanced(app, axis, dispatchBlock);
 						}
 						VkFFTSync(app);
-						if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numAxisUploads[2] - 1 - l;
+						if (!app->configuration.reorderFourStep) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l;
 					}
 				}
 
@@ -10044,7 +10387,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			if (app->configuration.FFTdim > 1) {
 
 				//FFT axis 1
-				if (app->configuration.performR2C == 1) {
+				/*if (app->configuration.performR2C == 1) {
 					for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
 						for (int l = app->localFFTPlan->numSupportAxisUploads[0] - 1; l >= 0; l--) {
 							if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numSupportAxisUploads[0] - 1 - l;
@@ -10072,11 +10415,11 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 								VkFFTSync(app);
 						}
 					}
-				}
+				}*/
 				for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
-					for (int l = app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) {
-						if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numAxisUploads[1] - 1 - l;
-						VkFFTAxis* axis = &app->localFFTPlan->axes[1][l];
+					for (int l = app->localFFTPlan_inverse->numAxisUploads[1] - 1; l >= 0; l--) {
+						if (!app->configuration.reorderFourStep) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l;
+						VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l];
 						axis->pushConstants.batch = j;
 						for (uint32_t i = 0; i < app->configuration.coordinateFeatures; i++) {
 							axis->pushConstants.coordinate = i;
@@ -10085,17 +10428,17 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 							vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL);
 #endif
 							uint32_t dispatchBlock[3];
-							dispatchBlock[0] = ceil(app->configuration.size[0] / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
+							dispatchBlock[0] = ceil(localSize0 / (double)axis->axisBlock[0] * app->configuration.size[1] / (double)axis->specializationConstants.fftDim);
 							dispatchBlock[1] = 1;
 							dispatchBlock[2] = app->configuration.size[2];
-							if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
+							//if (app->configuration.performR2C == 1) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 							//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 							//if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = ceil(dispatchBlock[0] / 2.0);
 
 							dispatchEnhanced(app, axis, dispatchBlock);
 
 						}
-						if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numAxisUploads[1] - 1 - l;
+						if (!app->configuration.reorderFourStep) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l;
 						VkFFTSync(app);
 					}
 				}
@@ -10103,9 +10446,9 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 			}
 			//FFT axis 0
 			for (uint32_t j = 0; j < app->configuration.numberBatches; j++) {
-				for (int l = app->localFFTPlan->numAxisUploads[0] - 1; l >= 0; l--) {
-					if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numAxisUploads[0] - 1 - l;
-					VkFFTAxis* axis = &app->localFFTPlan->axes[0][l];
+				for (int l = app->localFFTPlan_inverse->numAxisUploads[0] - 1; l >= 0; l--) {
+					if (!app->configuration.reorderFourStep) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l;
+					VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l];
 					axis->pushConstants.batch = j;
 					uint32_t maxCoordinate = ((app->configuration.matrixConvolution) > 1 && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures;
 					for (uint32_t i = 0; i < maxCoordinate; i++) {
@@ -10116,12 +10459,12 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 #endif
 						uint32_t dispatchBlock[3];
 						if (l == 0) {
-							if (app->localFFTPlan->numAxisUploads[0] > 2) {
-								dispatchBlock[0] = ceil(ceil(app->configuration.size[0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1];
+							if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) {
+								dispatchBlock[0] = ceil(ceil(app->configuration.size[0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1];
 								dispatchBlock[1] = app->configuration.size[1];
 							}
 							else {
-								if (app->localFFTPlan->numAxisUploads[0] > 1) {
+								if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) {
 									dispatchBlock[0] = ceil(ceil(app->configuration.size[0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]));
 									dispatchBlock[1] = app->configuration.size[1];
 								}
@@ -10141,7 +10484,7 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 						//if (app->configuration.performZeropadding[2]) dispatchBlock[2] = ceil(dispatchBlock[2] / 2.0);
 						dispatchEnhanced(app, axis, dispatchBlock);
 					}
-					if (!app->configuration.reorderFourStep) l = app->localFFTPlan->numAxisUploads[0] - 1 - l;
+					if (!app->configuration.reorderFourStep) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l;
 					VkFFTSync(app);
 				}
 			}
@@ -10153,19 +10496,15 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 #if(VKFFT_BACKEND==1)
 		if (app->configuration.num_streams > 1) {
 			for (uint32_t i = 0; i < app->configuration.num_streams; i++) {
-				cudaStreamDestroy(app->configuration.stream[i]);
 				cudaEventDestroy(app->configuration.stream_event[i]);
 			}
-			free(app->configuration.stream);
 			free(app->configuration.stream_event);
 		}
 #elif(VKFFT_BACKEND==2)
 		if (app->configuration.num_streams > 1) {
 			for (uint32_t i = 0; i < app->configuration.num_streams; i++) {
-				hipStreamDestroy(app->configuration.stream[i]);
 				hipEventDestroy(app->configuration.stream_event[i]);
 			}
-			free(app->configuration.stream);
 			free(app->configuration.stream_event);
 		}
 #endif
@@ -10189,25 +10528,25 @@ layout(std430, binding = %d) readonly buffer DataLUT {\n\
 				deleteAxis(app, &app->localFFTPlan->axes[i][j]);
 		}
 
-		for (uint32_t i = 0; i < app->configuration.FFTdim - 1; i++) {
+		/*for (uint32_t i = 0; i < app->configuration.FFTdim - 1; i++) {
 
 			if (app->configuration.performR2C) {
 				for (uint32_t j = 0; j < app->localFFTPlan->numSupportAxisUploads[i]; j++)
 					deleteAxis(app, &app->localFFTPlan->supportAxes[i][j]);
 			}
-		}
+		}*/
 		free(app->localFFTPlan);
 
 		for (uint32_t i = 0; i < app->configuration.FFTdim; i++) {
 			for (uint32_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++)
 				deleteAxis(app, &app->localFFTPlan_inverse->axes[i][j]);
 		}
-		for (uint32_t i = 0; i < app->configuration.FFTdim - 1; i++) {
+		/*for (uint32_t i = 0; i < app->configuration.FFTdim - 1; i++) {
 			if (app->configuration.performR2C) {
 				for (uint32_t j = 0; j < app->localFFTPlan_inverse->numSupportAxisUploads[i]; j++)
 					deleteAxis(app, &app->localFFTPlan_inverse->supportAxes[i][j]);
 			}
-		}
+		}*/
 		free(app->localFFTPlan_inverse);
 	}
 #ifdef __cplusplus