From 7f8adc8b91bb3d1accf2dad31e5b34acbefa89fe Mon Sep 17 00:00:00 2001 From: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> Date: Fri, 7 Jun 2024 14:31:25 +0800 Subject: [PATCH] Perf: optimize psir_dot function in gint_rho_gpu.cu (#4326) * remove new in gtask_rho.cpp * rename some header files * optimize psir_dot in gint_rho_gpu.cu * fix a memory leak * remove redundant clear() * fix a memory leak * modify cuda_tools.cu * modify some header files * modify synchronization operation * modify psir_dot * modify psir_dot --- .../module_hamilt_lcao/module_gint/gint.cpp | 6 +- .../module_gint/gint_force_gpu.cu | 5 +- .../{gint_force.h => gint_force_gpu.h} | 4 +- .../module_gint/gint_rho_gpu.cu | 63 +++++-------------- .../{gint_rho.h => gint_rho_gpu.h} | 11 +--- .../module_gint/gint_vl_gpu.cu | 6 +- .../module_gint/{gint_vl.h => gint_vl_gpu.h} | 6 +- .../module_gint/grid_technique.cpp | 24 ++----- .../module_gint/grid_technique.h | 6 -- .../module_gint/gtask_force.cpp | 3 +- .../module_gint/gtask_rho.cpp | 18 ++---- .../module_gint/gtask_vl.cpp | 4 +- .../module_gint/kernels/cuda/cuda_tools.cu | 13 ++-- .../module_gint/kernels/cuda/cuda_tools.cuh | 8 ++- .../module_gint/kernels/cuda/gint_force.cu | 8 +-- .../module_gint/kernels/cuda/gint_rho.cu | 47 ++++++++------ .../module_gint/kernels/cuda/gint_rho.cuh | 24 ++----- .../module_gint/kernels/cuda/gint_vl.cu | 2 +- .../kernels/cuda/vbatch_matrix_mul.cuh | 2 +- 19 files changed, 97 insertions(+), 163 deletions(-) rename source/module_hamilt_lcao/module_gint/{gint_force.h => gint_force_gpu.h} (99%) rename source/module_hamilt_lcao/module_gint/{gint_rho.h => gint_rho_gpu.h} (91%) rename source/module_hamilt_lcao/module_gint/{gint_vl.h => gint_vl_gpu.h} (96%) diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 9ccd1edd7d..8b83abdf37 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -1,9 +1,9 @@ #include "gint.h" #if ((defined __CUDA)) -#include "gint_force.h" -#include "gint_rho.h" -#include "gint_vl.h" +#include "gint_force_gpu.h" +#include "gint_rho_gpu.h" +#include "gint_vl_gpu.h" #endif #include "module_base/memory.h" diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu index ad33ad63bc..5579ddec97 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu @@ -3,11 +3,12 @@ #include #include -#include "gint_force.h" +#include "gint_force_gpu.h" #include "kernels/cuda/cuda_tools.cuh" #include "kernels/cuda/gint_force.cuh" #include "module_base/ylm.h" -#include "module_hamilt_lcao/module_gint/gint_tools.h" +#include "gint_tools.h" + namespace GintKernel { diff --git a/source/module_hamilt_lcao/module_gint/gint_force.h b/source/module_hamilt_lcao/module_gint/gint_force_gpu.h similarity index 99% rename from source/module_hamilt_lcao/module_gint/gint_force.h rename to source/module_hamilt_lcao/module_gint/gint_force_gpu.h index 4625035100..c8d307d29c 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force.h +++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.h @@ -1,5 +1,5 @@ -#ifndef GINT_FORCE_H -#define GINT_FORCE_H +#ifndef GINT_FORCE_GPU_H +#define GINT_FORCE_GPU_H #include "module_hamilt_lcao/module_gint/gint.h" #include "module_hamilt_lcao/module_gint/grid_technique.h" diff --git a/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu index c14e53fb76..2de3670550 100644 --- a/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu @@ -1,8 +1,8 @@ #include "kernels/cuda/cuda_tools.cuh" #include "module_base/ylm.h" -#include "module_hamilt_lcao/module_gint/gint_rho.h" -#include "module_hamilt_lcao/module_gint/gint_tools.h" -#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh" +#include "gint_rho_gpu.h" +#include "gint_tools.h" +#include "kernels/cuda/gint_rho.cuh" #include "omp.h" #include @@ -68,7 +68,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, { checkCuda(cudaStreamSynchronize(gridt.streams[i])); } - // calculate the rho for every nbz bigcells #pragma omp parallel for num_threads(gridt.nstreams) collapse(2) @@ -78,6 +77,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, { // get stream id int stream_num = omp_get_thread_num(); + checkCuda(cudaStreamSynchronize(gridt.streams[stream_num])); // psi_input contains data used to generate the psi values. // The suffix "_g" indicates that the data is stored in the GPU, @@ -154,16 +154,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, double* rho_g = gridt.rho_g; // variables for dot product psir * psir_dm - int dot_count = 0; - int* vec_len = &gridt.vec_len[gridt.num_mcell * stream_num]; - double** vec_l = &gridt.vec_l[gridt.num_mcell * stream_num]; - double** vec_r = &gridt.vec_r[gridt.num_mcell * stream_num]; double** dot_product = &gridt.dot_product[gridt.num_mcell * stream_num]; - int* vec_len_g = &gridt.vec_len_g[gridt.num_mcell * stream_num]; - double** vec_l_g = &gridt.vec_l_g[gridt.num_mcell * stream_num]; - double** vec_r_g = &gridt.vec_r_g[gridt.num_mcell * stream_num]; double** dot_product_g = &gridt.dot_product_g[gridt.num_mcell * stream_num]; @@ -172,7 +165,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, int atom_pair_num = 0; const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp; std::vector gpu_matrix_cal_flag(max_size * gridt.nbzp,false); - checkCuda(cudaStreamSynchronize(gridt.streams[stream_num])); // generate GPU tasks, including the calculation of psir, matrix // multiplication, and dot product @@ -211,11 +203,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, max_n, atom_pair_num, rho_g, - vec_l, - vec_r, - dot_product, - vec_len, - dot_count); + dot_product); // Copying data from host to device checkCuda(cudaMemcpyAsync(input_double_g, @@ -286,21 +274,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, cudaMemcpyHostToDevice, gridt.streams[stream_num])); - checkCuda(cudaMemcpyAsync(vec_len_g, - vec_len, - gridt.num_mcell * sizeof(int), - cudaMemcpyHostToDevice, - gridt.streams[stream_num])); - checkCuda(cudaMemcpyAsync(vec_l_g, - vec_l, - gridt.num_mcell * sizeof(double*), - cudaMemcpyHostToDevice, - gridt.streams[stream_num])); - checkCuda(cudaMemcpyAsync(vec_r_g, - vec_r, - gridt.num_mcell * sizeof(double*), - cudaMemcpyHostToDevice, - gridt.streams[stream_num])); checkCuda(cudaMemcpyAsync(dot_product_g, dot_product, gridt.num_mcell * sizeof(double*), @@ -352,20 +325,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, atom_pair_num, gridt.streams[stream_num], ap_alpha_g); + checkCudaLastError(); // Launching kernel to calculate dot product psir * psir_dm - dim3 grid_dot(64); - dim3 block_dot(64); - int incx = 1; - int incy = 1; - psir_dot<<>>( - vec_len_g, - vec_l_g, - incx, - vec_r_g, - incy, - dot_product_g, - dot_count); + const int block_size = 128; + dim3 block_dot(block_size); + dim3 grid_dot(gridt.nbzp, gridt.bxyz); + psir_dot<<>>( + gridt.nbzp, + gridt.bxyz, + max_size * ucell.nwmax, + psir_ylm_left_g, + psir_r_g, + dot_product_g); + checkCudaLastError(); } } @@ -374,13 +347,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer* dm, { checkCuda(cudaStreamSynchronize(gridt.streams[i])); } - // Copy rho from device to host checkCuda(cudaMemcpy(rho, gridt.rho_g, nczp * gridt.ncx * gridt.ncy * sizeof(double), cudaMemcpyDeviceToHost)); - // free the memory checkCuda(cudaFree(dm_matrix_g)); } diff --git a/source/module_hamilt_lcao/module_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.h similarity index 91% rename from source/module_hamilt_lcao/module_gint/gint_rho.h rename to source/module_hamilt_lcao/module_gint/gint_rho_gpu.h index 17b92e5c35..b912e0c90e 100644 --- a/source/module_hamilt_lcao/module_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.h @@ -7,7 +7,6 @@ #include "module_hamilt_lcao/module_gint/gint.h" #include "module_hamilt_lcao/module_gint/grid_technique.h" -cudaError_t checkCuda(cudaError_t result); namespace GintKernel { @@ -90,11 +89,7 @@ void gtask_rho(const Grid_Technique& gridt, * @param max_n Maximum value of n. * @param atom_pair_num Total count of atom pairs, which is also the number of matrix multiplication operations. * @param rho_g Rho. - * @param vec_l Pointers to psir_ylm for vector dot product. - * @param vec_r Pointers to psir_dm for vector dot product. * @param dot_product Pointers to the results of dot products. - * @param vec_len Vector lengths for each dot product. - * @param dot_count Total count of dot products. */ void alloc_mult_dot_rho(const Grid_Technique& gridt, const UnitCell& ucell, @@ -120,11 +115,7 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt, int& max_n, int& atom_pair_num, double* rho_g, - double** vec_l, - double** vec_r, - double** dot_product, - int* vec_len, - int& dot_count); + double** dot_product); } // namespace GintKernel #endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu index cb2b817098..f1ce3e85b8 100644 --- a/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu @@ -2,9 +2,9 @@ #include "kernels/cuda/cuda_tools.cuh" #include "module_base/ylm.h" -#include "module_hamilt_lcao/module_gint/gint_tools.h" -#include "module_hamilt_lcao/module_gint/gint_vl.h" -#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh" +#include "gint_tools.h" +#include "gint_vl_gpu.h" +#include "kernels/cuda/gint_vl.cuh" namespace GintKernel { diff --git a/source/module_hamilt_lcao/module_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.h similarity index 96% rename from source/module_hamilt_lcao/module_gint/gint_vl.h rename to source/module_hamilt_lcao/module_gint/gint_vl_gpu.h index 19988d6daf..7e9c1deaa0 100644 --- a/source/module_hamilt_lcao/module_gint/gint_vl.h +++ b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.h @@ -1,5 +1,5 @@ -#ifndef GINT_VL_H -#define GINT_VL_H +#ifndef GINT_VL_GPU_H +#define GINT_VL_GPU_H #include #include // for CUDA_VERSION #include @@ -7,8 +7,6 @@ #include "module_hamilt_lcao/module_gint/gint.h" #include "module_hamilt_lcao/module_gint/grid_technique.h" -cudaError_t checkCuda(cudaError_t result); - namespace GintKernel { diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.cpp b/source/module_hamilt_lcao/module_gint/grid_technique.cpp index 2eb74becc1..9591864bc0 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.cpp +++ b/source/module_hamilt_lcao/module_gint/grid_technique.cpp @@ -185,7 +185,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in, #if ((defined __CUDA) /* || (defined __ROCM) */) if(GlobalV::device_flag == "gpu") { - this->init_gpu_gint_variables(ucell,orb,num_stream); + this->init_gpu_gint_variables(ucell, orb, num_stream); } #endif @@ -923,22 +923,11 @@ void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Or num_mcell = nbzp * bxyz; checkCudaErrors(cudaMalloc((void**)&rho_g, this->ncxyz * sizeof(double))); checkCudaErrors(cudaMemset(rho_g, 0, this->ncxyz * sizeof(double))); - checkCudaErrors( - cudaMallocHost((void**)&vec_l, num_mcell * nstreams * sizeof(double*))); - checkCudaErrors( - cudaMalloc((void**)&vec_l_g, num_mcell * nstreams * sizeof(double*))); - checkCudaErrors( - cudaMallocHost((void**)&vec_r, num_mcell * nstreams * sizeof(double*))); - checkCudaErrors( - cudaMalloc((void**)&vec_r_g, num_mcell * nstreams * sizeof(double*))); checkCudaErrors(cudaMallocHost((void**)&dot_product, num_mcell * nstreams * sizeof(double*))); checkCudaErrors(cudaMalloc((void**)&dot_product_g, num_mcell * nstreams * sizeof(double*))); - checkCudaErrors( - cudaMallocHost((void**)&vec_len, num_mcell * nstreams * sizeof(int))); - checkCudaErrors( - cudaMalloc((void**)&vec_len_g, num_mcell * nstreams * sizeof(int))); + for (int i = 0; i < nstreams; ++i) { @@ -961,7 +950,10 @@ void Grid_Technique::free_gpu_gint_variables(int nat) return; } for (int i = 0; i < nstreams; ++i) + { checkCudaErrors(cudaStreamDestroy(streams[i])); + } + delete[] streams; checkCudaErrors(cudaFree(ylmcoef_g)); checkCudaErrors(cudaFree(atom_nwl_g)); @@ -1020,14 +1012,8 @@ void Grid_Technique::free_gpu_gint_variables(int nat) checkCudaErrors(cudaFree(dm_global_g)); checkCudaErrors(cudaFree(ap_output_gbl_g)); - checkCudaErrors(cudaFreeHost(vec_len)); - checkCudaErrors(cudaFreeHost(vec_l)); - checkCudaErrors(cudaFreeHost(vec_r)); checkCudaErrors(cudaFreeHost(dot_product)); - checkCudaErrors(cudaFree(vec_len_g)); - checkCudaErrors(cudaFree(vec_l_g)); - checkCudaErrors(cudaFree(vec_r_g)); checkCudaErrors(cudaFree(dot_product_g)); checkCudaErrors(cudaFree(rho_g)); diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.h b/source/module_hamilt_lcao/module_gint/grid_technique.h index b51cb30686..cdab4b4bd4 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.h +++ b/source/module_hamilt_lcao/module_gint/grid_technique.h @@ -218,12 +218,6 @@ class Grid_Technique : public Grid_MeshBall // additional variables for rho calculating int num_mcell; double* rho_g; - int* vec_len; - int* vec_len_g; - double** vec_l; - double** vec_l_g; - double** vec_r; - double** vec_r_g; double** dot_product; double** dot_product_g; diff --git a/source/module_hamilt_lcao/module_gint/gtask_force.cpp b/source/module_hamilt_lcao/module_gint/gtask_force.cpp index 6223325cd9..641afe1182 100644 --- a/source/module_hamilt_lcao/module_gint/gtask_force.cpp +++ b/source/module_hamilt_lcao/module_gint/gtask_force.cpp @@ -1,6 +1,6 @@ #include -#include "gint_force.h" +#include "gint_force_gpu.h" #include "module_base/ylm.h" #include "module_hamilt_lcao/module_gint/gint_tools.h" namespace GintKernel @@ -213,7 +213,6 @@ void alloc_mult_force(const Grid_Technique& gridt, } } atom_pair_num = tid; - gpu_mat_cal_flag.clear(); } diff --git a/source/module_hamilt_lcao/module_gint/gtask_rho.cpp b/source/module_hamilt_lcao/module_gint/gtask_rho.cpp index 80cba8abe7..14cc28cec0 100644 --- a/source/module_hamilt_lcao/module_gint/gtask_rho.cpp +++ b/source/module_hamilt_lcao/module_gint/gtask_rho.cpp @@ -1,4 +1,4 @@ -#include "gint_rho.h" +#include "gint_rho_gpu.h" #include "module_base/ylm.h" #include "module_hamilt_lcao/module_gint/gint_tools.h" #include "omp.h" @@ -118,13 +118,10 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt, int& max_n, int& atom_pair_num, double* rho_g, - double** vec_l, - double** vec_r, - double** dot_product, - int* vec_len, - int& dot_count) + double** dot_product) { int tid = 0; + int dot_count = 0; max_m = 0; max_n = 0; const int nwmax=ucell.nwmax; @@ -203,18 +200,13 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt, gridt.ncy * nczp); for (int i = 0; i < gridt.bxyz; i++) { - vec_l[dot_count] - = psir_ylm_g + (bcell_start_psir + i * max_size * nwmax); - vec_r[dot_count] - = psir_dm_g + (bcell_start_psir + i * max_size * nwmax); dot_product[dot_count] = rho_g + vindex[i]; - vec_len[dot_count] = nwmax * max_size; dot_count++; } + + delete[] vindex; } atom_pair_num = tid; - - gpu_mat_cal_flag.clear(); } } // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/gtask_vl.cpp b/source/module_hamilt_lcao/module_gint/gtask_vl.cpp index 7566c70257..d95aa85480 100644 --- a/source/module_hamilt_lcao/module_gint/gtask_vl.cpp +++ b/source/module_hamilt_lcao/module_gint/gtask_vl.cpp @@ -1,6 +1,6 @@ #include -#include "gint_vl.h" +#include "gint_vl_gpu.h" #include "module_base/ylm.h" #include "module_hamilt_lcao/module_gint/gint_tools.h" namespace GintKernel @@ -197,8 +197,6 @@ void alloc_mult_vlocal(const Grid_Technique& gridt, } } } - - gpu_matrix_calc_flag.clear(); } } // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index 7c4b2289f2..e1fd15a264 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -1,21 +1,22 @@ #include -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" -cudaError_t checkCuda(cudaError_t result) +#include "cuda_tools.cuh" + +cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line) { if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); + fprintf(stderr, "CUDA Runtime Error at %s:%d code=%s \"%s\" \n", file, line, cudaGetErrorString(result), func); + exit(EXIT_FAILURE); } return result; } -cudaError_t checkCudaLastError() +cudaError_t __checkCudaLastError(const char *file, const int line) { cudaError_t result = cudaGetLastError(); if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + fprintf(stderr, "%s(%i) : getLastCudaError():%s\n", file, line, cudaGetErrorString(result)); assert(result == cudaSuccess); } return result; diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index e0d7b4d2c0..e51ffaa495 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -8,8 +8,12 @@ #include #include #include -cudaError_t checkCuda(cudaError_t result); -cudaError_t checkCudaLastError(); + +#define checkCuda(val) check(val, #val, __FILE__, __LINE__) +#define checkCudaLastError() __checkCudaLastError(__FILE__, __LINE__) + +cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line); +cudaError_t __checkCudaLastError(const char *file, const int line); void dump_cuda_array_to_file(double* cuda_array, int width, diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu index db2bcf3321..2c0c89e2e6 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu @@ -1,9 +1,9 @@ #include "gint_force.cuh" #include "interp.cuh" -#include "module_hamilt_lcao/module_gint/gint_force.h" -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" -#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh" -#include "module_hamilt_lcao/module_gint/kernels/cuda/sph.cuh" +#include "module_hamilt_lcao/module_gint/gint_force_gpu.h" +#include "cuda_tools.cuh" +#include "gint_force.cuh" +#include "sph.cuh" #include "cuda_runtime.h" // CUDA kernel to calculate psi and force namespace GintKernel diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu index d7dcf4e6d7..e204b35b46 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu @@ -1,5 +1,5 @@ #include "interp.cuh" -#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh" +#include "gint_rho.cuh" #include "sph.cuh" namespace GintKernel @@ -58,27 +58,38 @@ __global__ void get_psi(const double* const ylmcoef, } } -__global__ void psir_dot(const int* n, - double** vec_l_g, - int incl, - double** vec_r_g, - int incr, - double** results_g, - int batchcount) +__global__ void psir_dot(const int nbzp, + const int bxyz, + const int vec_size, + double* vec_a_g, + double* vec_b_g, + double** results_g) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = id; i < batchcount; i += stride) + extern __shared__ double s_data[]; + int tid = threadIdx.x; + int offset = blockIdx.x * bxyz * vec_size + blockIdx.y * vec_size; + double* vec_a_mcell = vec_a_g + offset; + double* vec_b_mcell = vec_b_g + offset; + + s_data[tid] = 0.0; + + for(unsigned int k = tid; k < vec_size; k += blockDim.x) { - double* sum = results_g[i]; - double* x = vec_l_g[i]; - double* y = vec_r_g[i]; + s_data[tid] += vec_a_mcell[k] * vec_b_mcell[k]; + } + + __syncthreads(); - for (int j = 0; j < n[i]; j++) - { - sum[0] += x[j * incl] * y[j * incr]; + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) { + s_data[tid] += s_data[tid + s]; } + __syncthreads(); } -} + if (tid == 0) { + *results_g[blockIdx.x*bxyz + blockIdx.y] = s_data[0]; + } +} } // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh index 958b598954..78d5e10681 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh @@ -43,24 +43,12 @@ __global__ void get_psi(const double* const ylmcoef, const double* const psi_u, double* psir_ylm); -/** - * @brief Kernel function to calculate batch vector dot products. - * - * @param n vector length. - * @param vec_l_g pointers to left vec. - * @param incl stride between consecutive elements in the `vec_l_g`. - * @param vec_r_g pointers to right vec. - * @param incr stride between consecutive elements in the `vec_r_g`. - * @param results_g dot product results. - * @param batchcount total count of dot products to compute. - */ -__global__ void psir_dot(const int* n, - double** vec_l_g, - int incl, - double** vec_r_g, - int incr, - double** results_g, - int batchcount); +__global__ void psir_dot(const int nbzp, + const int bxyz, + const int vec_size, + double* vec_a_g, + double* vec_b_g, + double** results_g); } // namespace GintKernel #endif // GINT_RHO_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu index 62edcc7f42..25818fa1cc 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu @@ -1,6 +1,6 @@ #include "gint_vl.cuh" #include "interp.cuh" -#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "cuda_tools.cuh" #include "sph.cuh" namespace GintKernel { diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh index 24e8ba91e1..38a2c3c2b1 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -10,7 +10,7 @@ #include #include "module_cell/unitcell.h" #include "module_hamilt_pw/hamilt_pwdft/global.h" -#include +#include "module_base/module_device/device.h" #define sA(i, j) sA[(j)*slda + (i)]