From d73b1d2b51cb59f09a90ece3b1d64dc96ec47f78 Mon Sep 17 00:00:00 2001
From: Erjie Wu <110683255+ErjieWu@users.noreply.github.com>
Date: Thu, 12 Dec 2024 08:55:37 +0800
Subject: [PATCH] Refactor: Combine gamma-only and multi-k versions of some
 functions in DeePKS. (#5717)

* Add support for INPUT deepks_v_delta>0 in multi-k points DeePKS calculations

* Refactor: Change LCAO_Deepks_Interface to template class.

* Remove the h_mat and h_mat_k variables in LCAO_Deepks and change H_V_delta to form consistent with H_V_delta_k.

* Change functions in deepks_hmat to template.

* Combine gamma-only and multi-k for v_delta_precalc.

* Change functions about v_delta_precalc and psialpha in deepks_v_delta calculations to templates.

* Change save_npy_h to template.

* Change some functions in LCAO_deepks_io to templates.

* Remove ld.V_deltaR.

* Change cal_orbital_precalc to template.

* Remove orbital_precalc_k.cpp.

* Change cal_gdmx into template function.

* [pre-commit.ci lite] apply automatic fixes

* Update LCAO_deepks_interface.cpp

* Update FORCE_STRESS.cpp

* Update FORCE_gamma.cpp

* Update deepks_lcao.cpp

* Update LCAO_deepks.cpp

* Update LCAO_deepks.cpp

* Update LCAO_deepks.h

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
---
 source/Makefile.Objects                       |   3 -
 source/module_esolver/esolver_ks_lcao.cpp     |  50 +-
 .../hamilt_lcaodft/FORCE_STRESS.cpp           |  16 +-
 .../hamilt_lcaodft/FORCE_gamma.cpp            |   5 +-
 .../operator_lcao/deepks_lcao.cpp             |   6 +-
 .../module_deepks/CMakeLists.txt              |   3 -
 .../module_deepks/LCAO_deepks.cpp             |  45 +-
 .../module_deepks/LCAO_deepks.h               |  87 +---
 .../module_deepks/LCAO_deepks_interface.cpp   | 460 ++++++------------
 .../module_deepks/LCAO_deepks_interface.h     |  23 +-
 .../module_deepks/LCAO_deepks_io.cpp          | 226 ++++-----
 .../module_deepks/LCAO_deepks_io.h            |  95 ++--
 .../module_deepks/LCAO_deepks_odelta.cpp      |   6 +-
 .../module_deepks/LCAO_deepks_pdm.cpp         |   5 +-
 .../module_deepks/LCAO_deepks_torch.cpp       | 213 ++++----
 .../module_deepks/LCAO_deepks_vdelta.cpp      |   6 +-
 .../module_deepks/cal_gdmx.cpp                |  97 +++-
 .../module_deepks/cal_gdmx_k.cpp              | 249 ----------
 .../module_deepks/deepks_hmat.cpp             | 144 ++----
 .../module_deepks/deepks_hmat.h               |  32 +-
 .../module_deepks/orbital_precalc.cpp         | 290 ++++++++---
 .../module_deepks/orbital_precalc_k.cpp       | 350 -------------
 .../module_deepks/test/LCAO_deepks_test.cpp   |   4 +-
 .../module_deepks/v_delta_precalc.cpp         | 185 +++++--
 .../module_deepks/v_delta_precalc_k.cpp       | 234 ---------
 25 files changed, 946 insertions(+), 1888 deletions(-)
 delete mode 100644 source/module_hamilt_lcao/module_deepks/cal_gdmx_k.cpp
 delete mode 100644 source/module_hamilt_lcao/module_deepks/orbital_precalc_k.cpp
 delete mode 100644 source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp

diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 661db25611..ae07f24df8 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -201,14 +201,11 @@ OBJS_DEEPKS=LCAO_deepks.o\
         deepks_hmat.o\
         LCAO_deepks_interface.o\
         orbital_precalc.o\
-        orbital_precalc_k.o\
         cal_gdmx.o\
-        cal_gdmx_k.o\
         cal_gedm.o\
         cal_gvx.o\
         cal_descriptor.o\
         v_delta_precalc.o\
-        v_delta_precalc_k.o\
         
 
 OBJS_ELECSTAT=elecstate.o\
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index e515c23903..3d089e568e 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -961,7 +961,7 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(UnitCell& ucell, const int istep)
     // 6) write Hamiltonian and Overlap matrix
     for (int ik = 0; ik < this->kv.get_nks(); ++ik)
     {
-        if (PARAM.inp.out_mat_hs[0] || PARAM.inp.deepks_v_delta)
+        if (PARAM.inp.out_mat_hs[0])
         {
             this->p_hamilt->updateHk(ik);
         }
@@ -1000,12 +1000,6 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(UnitCell& ucell, const int istep)
                                     this->pv,
                                     GlobalV::DRANK);
             }
-#ifdef __DEEPKS
-            if (PARAM.inp.deepks_out_labels && PARAM.inp.deepks_v_delta)
-            {
-                DeePKS_domain::save_h_mat(h_mat.p, this->pv.nloc, ik);
-            }
-#endif
         }
     }
 
@@ -1023,24 +1017,30 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(UnitCell& ucell, const int istep)
 
     //! 8) Write DeePKS information
 #ifdef __DEEPKS
-    std::shared_ptr<LCAO_Deepks> ld_shared_ptr(&GlobalC::ld, [](LCAO_Deepks*) {});
-    LCAO_Deepks_Interface LDI = LCAO_Deepks_Interface(ld_shared_ptr);
-    ModuleBase::timer::tick("ESolver_KS_LCAO", "out_deepks_labels");
-    LDI.out_deepks_labels(this->pelec->f_en.etot,
-                          this->pelec->klist->get_nks(),
-                          ucell.nat,
-                          PARAM.globalv.nlocal,
-                          this->pelec->ekb,
-                          this->pelec->klist->kvec_d,
-                          ucell,
-                          orb_,
-                          GlobalC::GridD,
-                          &(this->pv),
-                          *(this->psi),
-                          dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM(),
-                          PARAM.inp.deepks_v_delta);
-
-    ModuleBase::timer::tick("ESolver_KS_LCAO", "out_deepks_labels");
+    if (this->psi != nullptr && (istep % PARAM.inp.out_interval == 0))
+    {
+        hamilt::HamiltLCAO<TK, TR>* p_ham_deepks
+            = dynamic_cast<hamilt::HamiltLCAO<TK, TR>*>(this->p_hamilt);
+        std::shared_ptr<LCAO_Deepks> ld_shared_ptr(&GlobalC::ld, [](LCAO_Deepks*) {});
+        LCAO_Deepks_Interface<TK, TR> LDI(ld_shared_ptr);
+
+        ModuleBase::timer::tick("ESolver_KS_LCAO", "out_deepks_labels");
+        LDI.out_deepks_labels(this->pelec->f_en.etot,
+                            this->pelec->klist->get_nks(),
+                            ucell.nat,
+                            PARAM.globalv.nlocal,
+                            this->pelec->ekb,
+                            this->pelec->klist->kvec_d,
+                            ucell,
+                            orb_,
+                            GlobalC::GridD,
+                            &(this->pv),
+                            *(this->psi),
+                            dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM(),
+                            p_ham_deepks);
+
+        ModuleBase::timer::tick("ESolver_KS_LCAO", "out_deepks_labels");
+    }
 #endif
 
     //! 9) Perform RDMFT calculations
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 43a633daf2..4a262da0e4 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -522,7 +522,7 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
                     {
                         const std::vector<std::vector<double>>& dm_gamma
                             = dynamic_cast<const elecstate::ElecStateLCAO<double>*>(pelec)->get_DM()->get_DMK_vector();
-                        GlobalC::ld.cal_gdmx(dm_gamma[0], ucell, orb, GlobalC::GridD, isstress);
+                        GlobalC::ld.cal_gdmx(dm_gamma, ucell, orb, GlobalC::GridD, kv.get_nks(), kv.kvec_d, isstress);
                     }
                     else
                     {
@@ -531,13 +531,13 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
                                   ->get_DM()
                                   ->get_DMK_vector();
 
-                        GlobalC::ld.cal_gdmx_k(dm_k,
-                                               ucell,
-                                               orb,
-                                               GlobalC::GridD,
-                                               kv.get_nks(),
-                                               kv.kvec_d,
-                                               isstress);
+                        GlobalC::ld.cal_gdmx(dm_k,
+                                             ucell,
+                                             orb,
+                                             GlobalC::GridD,
+                                             kv.get_nks(),
+                                             kv.kvec_d,
+                                             isstress);
                     }
                     if (PARAM.inp.deepks_out_unittest)
                     {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
index c7aad83123..38e5be5051 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
@@ -260,7 +260,8 @@ void Force_LCAO<double>::ftable(const bool isforce,
 
         if (PARAM.inp.deepks_out_unittest)
         {
-            LCAO_deepks_io::print_dm(dm_gamma[0], PARAM.globalv.nlocal, this->ParaV->nrow);
+            const int nks = 1; // 1 for gamma-only
+            LCAO_deepks_io::print_dm(nks, PARAM.globalv.nlocal, this->ParaV->nrow, dm_gamma); 
 
             GlobalC::ld.check_projected_dm();
 
@@ -268,7 +269,7 @@ void Force_LCAO<double>::ftable(const bool isforce,
 
             GlobalC::ld.check_gedm();
 
-            GlobalC::ld.cal_e_delta_band(dm_gamma);
+            GlobalC::ld.cal_e_delta_band(dm_gamma,nks);
 
             std::ofstream ofs("E_delta_bands.dat");
             ofs << std::setprecision(10) << GlobalC::ld.e_delta_band;
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index 95df523db2..47d2f57c9f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -186,7 +186,7 @@ void DeePKS<OperatorLCAO<std::complex<double>, double>>::contributeHR()
     {
         ModuleBase::timer::tick("DeePKS", "contributeHR");
 
-        GlobalC::ld.cal_projected_DM_k(this->DM, *this->ucell, *ptr_orb_, GlobalC::GridD);
+        GlobalC::ld.cal_projected_DM(this->DM, *this->ucell, *ptr_orb_, GlobalC::GridD);
         GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
@@ -219,7 +219,7 @@ void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contribut
     {
         ModuleBase::timer::tick("DeePKS", "contributeHR");
 
-        GlobalC::ld.cal_projected_DM_k(this->DM, *this->ucell, *ptr_orb_, GlobalC::GridD);
+        GlobalC::ld.cal_projected_DM(this->DM, *this->ucell, *ptr_orb_, GlobalC::GridD);
         GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
@@ -497,7 +497,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const double* hr_i
 
 inline void get_h_delta_k(int ik, double*& h_delta_k)
 {
-    h_delta_k = GlobalC::ld.H_V_delta.data();
+    h_delta_k = GlobalC::ld.H_V_delta[ik].data();
     return;
 }
 inline void get_h_delta_k(int ik, std::complex<double>*& h_delta_k)
diff --git a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
index e7e5110c6f..f32151bafe 100644
--- a/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_deepks/CMakeLists.txt
@@ -13,14 +13,11 @@ if(ENABLE_DEEPKS)
       deepks_hmat.cpp
       LCAO_deepks_interface.cpp
       orbital_precalc.cpp
-      orbital_precalc_k.cpp
       cal_gdmx.cpp
-      cal_gdmx_k.cpp
       cal_gedm.cpp
       cal_gvx.cpp      
       cal_descriptor.cpp
       v_delta_precalc.cpp
-      v_delta_precalc_k.cpp
   )
 
   add_library(
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
index 84eb2d6e6b..1676838282 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
@@ -17,7 +17,6 @@
 //4. subroutines that are related to V_delta:
 //  - allocate_V_delta : allocates H_V_delta; if calculating force, it also calls
 //      init_gdmx, as well as allocating F_delta
-//  - allocate_V_deltaR : allcoates H_V_deltaR, for multi-k calculations
 
 #ifdef __DEEPKS
 
@@ -35,7 +34,6 @@ LCAO_Deepks::LCAO_Deepks()
     alpha_index = new ModuleBase::IntArray[1];
     inl_index = new ModuleBase::IntArray[1];
     inl_l = nullptr;
-    H_V_deltaR = nullptr;
     gedm = nullptr;
 }
 
@@ -45,7 +43,6 @@ LCAO_Deepks::~LCAO_Deepks()
     delete[] alpha_index;
     delete[] inl_index;
     delete[] inl_l;
-    delete[] H_V_deltaR;
 
     //=======1. to use deepks, pdm is required==========
     //delete pdm**
@@ -92,7 +89,10 @@ void LCAO_Deepks::init(
     
     int tot_inl = tot_inl_per_atom * nat;
 
-    if(PARAM.inp.deepks_equiv) tot_inl = nat;
+    if(PARAM.inp.deepks_equiv) 
+    {
+        tot_inl = nat;
+    }
 
     this->lmaxd = lm;
     this->nmaxd = nm;
@@ -143,25 +143,6 @@ void LCAO_Deepks::init(
 
     this->pv = &pv_in;
 
-    if(PARAM.inp.deepks_v_delta)
-    {
-        //allocate and init h_mat
-        if(PARAM.globalv.gamma_only_local)
-        {
-            int nloc=this->pv->nloc;
-            this->h_mat.resize(nloc,0.0);
-        }
-        else
-        {
-            int nloc=this->pv->nloc;
-            this->h_mat_k.resize(nks);
-            for (int ik = 0; ik < nks; ik++)
-            {
-                this->h_mat_k[ik].resize(nloc,std::complex<double>(0.0,0.0));
-            }
-        }
-    }
-
     return;
 }
 
@@ -335,8 +316,9 @@ void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
     //initialize the H matrix H_V_delta
     if(PARAM.globalv.gamma_only_local)
     {
-        this->H_V_delta.resize(pv->nloc);
-        ModuleBase::GlobalFunc::ZEROS(this->H_V_delta.data(), pv->nloc);
+        H_V_delta.resize(1); // the first dimension is for the consistence with H_V_delta_k
+        this->H_V_delta[0].resize(pv->nloc);
+        ModuleBase::GlobalFunc::ZEROS(this->H_V_delta[0].data(), pv->nloc);
     }
     else
     {
@@ -387,15 +369,6 @@ void LCAO_Deepks::allocate_V_delta(const int nat, const int nks)
     return;
 }
 
-void LCAO_Deepks::allocate_V_deltaR(const int nnr)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "allocate_V_deltaR");
-    GlobalV::ofs_running << nnr << std::endl;
-    delete[] H_V_deltaR;
-    H_V_deltaR = new double[nnr];
-    ModuleBase::GlobalFunc::ZEROS(H_V_deltaR, nnr);
-}
-
 void LCAO_Deepks::init_orbital_pdm_shell(const int nks)
 {
     
@@ -541,12 +514,12 @@ void LCAO_Deepks::del_v_delta_pdm_shell(const int nks,const int nlocal)
 
 void LCAO_Deepks::dpks_cal_e_delta_band(const std::vector<std::vector<double>>& dm, const int nks)
 {
-    this->cal_e_delta_band(dm);
+    this->cal_e_delta_band(dm, nks);
 }
 
 void LCAO_Deepks::dpks_cal_e_delta_band(const std::vector<std::vector<std::complex<double>>>& dm, const int nks)
 {
-    this->cal_e_delta_band_k(dm, nks);
+    this->cal_e_delta_band(dm, nks);
 }
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
index 57b3609059..51d765f1b2 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
@@ -54,18 +54,10 @@ class LCAO_Deepks
     ///\rho_{HL} = c_{L, \mu}c_{L,\nu} - c_{H, \mu}c_{H,\nu} \f$ (for gamma_only)
     ModuleBase::matrix o_delta;
 
-    ///(Unit: Ry) Hamiltonian matrix in k space
-    /// for gamma only
-    std::vector<double> h_mat;    
-    /// for multi-k
-    std::vector<std::vector<std::complex<double>>> h_mat_k;
-
     /// Correction term to the Hamiltonian matrix: \f$\langle\psi|V_\delta|\psi\rangle\f$ (for gamma only)
-    std::vector<double> H_V_delta;
+    /// The size of first dimension is 1, which is used for the consitence with H_V_delta_k
+    std::vector<std::vector<double>> H_V_delta; 
     /// Correction term to Hamiltonian, for multi-k
-    /// In R space:
-    double* H_V_deltaR;
-    /// In k space:
     std::vector<std::vector<std::complex<double>>> H_V_delta_k;
 
     // F_delta will be deleted soon, mohan 2024-07-25
@@ -213,7 +205,6 @@ class LCAO_Deepks
     // 3. subroutines that are related to V_delta:
     //   - allocate_V_delta : allocates H_V_delta; if calculating force, it also calls
     //       init_gdmx, as well as allocating F_delta
-    //   - allocate_V_deltaR : allcoates H_V_deltaR, for multi-k calculations
 
   public:
     explicit LCAO_Deepks();
@@ -231,8 +222,6 @@ class LCAO_Deepks
     /// Allocate memory for correction to Hamiltonian
     void allocate_V_delta(const int nat, const int nks = 1);
 
-    void allocate_V_deltaR(const int nnr);
-
     // array for storing gdmx, used for calculating gvx
     void init_gdmx(const int nat);
     // void del_gdmx(const int nat);
@@ -299,8 +288,7 @@ class LCAO_Deepks
     // 3. check_projected_dm, which prints pdm to descriptor.dat
 
     // 4. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress) for gamma point
-    // 5. cal_gdmx_k, counterpart of 3, for multi-k
-    // 6. check_gdmx, which prints gdmx to a series of .dat files
+    // 5. check_gdmx, which prints gdmx to a series of .dat files
 
   public:
     /** 
@@ -316,7 +304,7 @@ class LCAO_Deepks
                           const LCAO_Orbitals& orb,
                           Grid_Driver& GridD);
 
-    void cal_projected_DM_k(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
+    void cal_projected_DM(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
                             const UnitCell& ucell,
                             const LCAO_Orbitals& orb,
                             Grid_Driver& GridD);
@@ -335,21 +323,16 @@ class LCAO_Deepks
 
     // calculate the gradient of pdm with regard to atomic positions
     // d/dX D_{Inl,mm'}
+    template <typename TK>
     void cal_gdmx( // const ModuleBase::matrix& dm,
-        const std::vector<double>& dm,
-        const UnitCell& ucell,
-        const LCAO_Orbitals& orb,
-        Grid_Driver& GridD,
-        const bool isstress);
-
-    void cal_gdmx_k( // const std::vector<ModuleBase::ComplexMatrix>& dm,
-        const std::vector<std::vector<std::complex<double>>>& dm,
+        const std::vector<std::vector<TK>>& dm,
         const UnitCell& ucell,
         const LCAO_Orbitals& orb,
         Grid_Driver& GridD,
         const int nks,
         const std::vector<ModuleBase::Vector3<double>>& kvec_d,
         const bool isstress);
+
     void check_gdmx(const int nat);
 
     /** 
@@ -381,10 +364,10 @@ class LCAO_Deepks
   public:
     /// calculate tr(\rho V_delta)
     // void cal_e_delta_band(const std::vector<ModuleBase::matrix>& dm/**<[in] density matrix*/);
-    void cal_e_delta_band(const std::vector<std::vector<double>>& dm /**<[in] density matrix*/);
+    void cal_e_delta_band(const std::vector<std::vector<double>>& dm /**<[in] density matrix*/, const int /*nks*/);
     // void cal_e_delta_band_k(const std::vector<ModuleBase::ComplexMatrix>& dm/**<[in] density matrix*/,
     //     const int nks);
-    void cal_e_delta_band_k(const std::vector<std::vector<std::complex<double>>>& dm /**<[in] density matrix*/,
+    void cal_e_delta_band(const std::vector<std::vector<std::complex<double>>>& dm /**<[in] density matrix*/,
                             const int nks);
 
     //! a temporary interface for cal_e_delta_band and cal_e_delta_band_k
@@ -404,8 +387,9 @@ class LCAO_Deepks
 
   public:
     void cal_o_delta(const std::vector<std::vector<ModuleBase::matrix>>&
-                         dm_hl /**<[in] modified density matrix that contains HOMO and LUMO only*/);
-    void cal_o_delta_k(const std::vector<std::vector<ModuleBase::ComplexMatrix>>&
+                         dm_hl /**<[in] modified density matrix that contains HOMO and LUMO only*/,
+                     const int nks);
+    void cal_o_delta(const std::vector<std::vector<ModuleBase::ComplexMatrix>>&
                            dm_hl /**<[in] modified density matrix that contains HOMO and LUMO only*/,
                        const int nks);
 
@@ -439,18 +423,12 @@ class LCAO_Deepks
     // 10. cal_orbital_precalc : orbital_precalc is usted for training with orbital label,
     //                          which equals gvdm * orbital_pdm_shell,
     //                          orbital_pdm_shell[1,Inl,nm*nm] = dm_hl * overlap * overlap
-    // 11. cal_orbital_precalc_k : orbital_precalc is usted for training with orbital label,
-    //                          for multi-k case, which equals gvdm * orbital_pdm_shell,
-    //                          orbital_pdm_shell[1,Inl,nm*nm] = dm_hl_k * overlap * overlap
-    // 12. cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
+    // 11. cal_v_delta_precalc : v_delta_precalc is used for training with v_delta label,
     //                         which equals gvdm * v_delta_pdm_shell,
     //                         v_delta_pdm_shell = overlap * overlap
-    // 13. cal_v_delta_precalc_k : v_delta_precalc is used for training with v_delta label,
-    //                         for multi-k case, which equals ???
-    //                         ???
-    // 14. check_v_delta_precalc : check v_delta_precalc
-    // 15. prepare_psialpha : prepare psialpha for outputting npy file
-    // 16. prepare_gevdm : prepare gevdm for outputting npy file
+    // 12. check_v_delta_precalc : check v_delta_precalc
+    // 13. prepare_psialpha : prepare psialpha for outputting npy file
+    // 14. prepare_gevdm : prepare gevdm for outputting npy file
 
   public:
     /// Calculates descriptors
@@ -485,30 +463,18 @@ class LCAO_Deepks
     void cal_gedm_equiv(const int nat);
 
     // calculates orbital_precalc
-    void cal_orbital_precalc(const std::vector<std::vector<ModuleBase::matrix>>& dm_hl /**<[in] density matrix*/,
+    template <typename TK, typename TH>
+    void cal_orbital_precalc(const std::vector<std::vector<TH>>& dm_hl /**<[in] density matrix*/,
                              const int nat,
+                             const int nks,
+                             const std::vector<ModuleBase::Vector3<double>>& kvec_d,
                              const UnitCell& ucell,
                              const LCAO_Orbitals& orb,
                              Grid_Driver& GridD);
 
-    // calculates orbital_precalc for multi-k case
-    void cal_orbital_precalc_k(
-        const std::vector<std::vector<ModuleBase::ComplexMatrix>>& dm_hl /**<[in] density matrix*/,
-        const int nat,
-        const int nks,
-        const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-        const UnitCell& ucell,
-        const LCAO_Orbitals& orb,
-        Grid_Driver& GridD);
-
     //calculates v_delta_precalc
+    template <typename TK>
     void cal_v_delta_precalc(const int nlocal,
-        const int nat,
-        const UnitCell &ucell,
-        const LCAO_Orbitals &orb,
-        Grid_Driver &GridD);
-
-    void cal_v_delta_precalc_k(const int nlocal,
         const int nat,
         const int nks,
         const std::vector<ModuleBase::Vector3<double>> &kvec_d,
@@ -516,21 +482,20 @@ class LCAO_Deepks
         const LCAO_Orbitals &orb,
         Grid_Driver &GridD);
 
-    void check_v_delta_precalc(const int nat, const int nks,const int nlocal);
+    template <typename TK>
+    void check_v_delta_precalc(const int nat, const int nks, const int nlocal);
 
     // prepare psialpha for outputting npy file
+    template <typename TK>
     void prepare_psialpha(const int nlocal,
-        const int nat,
-        const UnitCell &ucell,
-        const LCAO_Orbitals &orb,
-        Grid_Driver &GridD);
-    void prepare_psialpha_k(const int nlocal,
         const int nat,
         const int nks,
         const std::vector<ModuleBase::Vector3<double>> &kvec_d,
         const UnitCell &ucell,
         const LCAO_Orbitals &orb,
         Grid_Driver &GridD);
+
+    template <typename TK>
     void check_vdp_psialpha(const int nat, const int nks, const int nlocal);
     
     // prepare gevdm for outputting npy file
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
index 52d3811692..266cac73ee 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
@@ -6,12 +6,15 @@
 #include "module_base/global_variable.h"
 #include "module_base/tool_title.h"
 #include "module_elecstate/cal_dm.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
 
-LCAO_Deepks_Interface::LCAO_Deepks_Interface(std::shared_ptr<LCAO_Deepks> ld_in) : ld(ld_in)
+template <typename TK, typename TR>
+LCAO_Deepks_Interface<TK, TR>::LCAO_Deepks_Interface(std::shared_ptr<LCAO_Deepks> ld_in) : ld(ld_in)
 {
 }
-// gamma-only
-void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
+
+template<typename TK, typename TR>
+void LCAO_Deepks_Interface<TK,TR>::out_deepks_labels(const double& etot,
                                               const int& nks,
                                               const int& nat,
                                               const int& nlocal,
@@ -21,17 +24,20 @@ void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
                                               const LCAO_Orbitals& orb,
                                               Grid_Driver& GridD,
                                               const Parallel_Orbitals* ParaV,
-                                              const psi::Psi<double>& psid,
-                                              const elecstate::DensityMatrix<double, double>* dm,
-                                              const int& deepks_v_delta)
+                                              const psi::Psi<TK>& psi,
+                                              const elecstate::DensityMatrix<TK, double>* dm,
+                                              hamilt::HamiltLCAO<TK,TR>* p_ham)
 {
     ModuleBase::TITLE("LCAO_Deepks_Interface", "out_deepks_labels");
+    ModuleBase::timer::tick("LCAO_Deepks_Interface", "out_deepks_labels");
+
+    // define TH for different types
+    using TH = std::conditional_t<std::is_same<TK, double>::value, ModuleBase::matrix, ModuleBase::ComplexMatrix>;
 
     const int my_rank = GlobalV::MY_RANK;
     const int nspin = PARAM.inp.nspin;
 
-    // calculating deepks correction to bandgap
-    // and save the results
+    // calculating deepks correction to bandgap and save the results
     if (PARAM.inp.deepks_out_labels)
     {
         // mohan updated 2024-07-25
@@ -72,118 +78,163 @@ void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
             if (PARAM.inp.deepks_scf)
             {
                 ModuleBase::matrix wg_hl;
-                wg_hl.create(nspin, PARAM.inp.nbands);
+                std::vector<std::vector<TH>> dm_bandgap;
 
-                std::vector<std::vector<ModuleBase::matrix>> dm_bandgap_gamma;
+                // Calculate O_delta
+                if constexpr (std::is_same<TK, double>::value) // for gamma only
+                {
+                    wg_hl.create(nspin, PARAM.inp.nbands);
+                    dm_bandgap.resize(nspin);
 
-                dm_bandgap_gamma.resize(nspin);
-                for (int is = 0; is < nspin; ++is)
+                    for (int is = 0; is < nspin; ++is)
+                    {
+                        for (int ib = 0; ib < 1; ++ib)
+                        {
+                            wg_hl.zero_out();
+                            wg_hl(is, ib + nocc - 1) = -1.0;
+                            wg_hl(is, ib + nocc) = 1.0;
+                            dm_bandgap[ib].resize(nspin);
+                            elecstate::cal_dm(ParaV, wg_hl, psi, dm_bandgap[ib]);
+                        }
+                    }
+                }
+                else // for multi-k
                 {
-                    for (int ib = 0; ib < 1; ++ib)
+                    wg_hl.create(nks, PARAM.inp.nbands);
+                    dm_bandgap.resize(1);
+
+                    for (int ib = 0; ib < 1; ib++)
                     {
                         wg_hl.zero_out();
-                        wg_hl(is, ib + nocc - 1) = -1.0;
-                        wg_hl(is, ib + nocc) = 1.0;
-                        dm_bandgap_gamma[ib].resize(nspin);
-                        elecstate::cal_dm(ParaV, wg_hl, psid, dm_bandgap_gamma[ib]);
+                        for (int ik = 0; ik < nks; ik++)
+                        {
+                            wg_hl(ik, ib + nocc - 1) = -1.0;
+                            wg_hl(ik, ib + nocc) = 1.0;
+                        }
+                        dm_bandgap[ib].resize(nks);
+                        elecstate::cal_dm(ParaV, wg_hl, psi, dm_bandgap[ib]);
                     }
                 }
+                
+                ld->cal_orbital_precalc<TK,TH>(dm_bandgap, nat, nks, kvec_d, ucell, orb, GridD);
+                ld->cal_o_delta(dm_bandgap, nks);
 
-                ld->cal_orbital_precalc(dm_bandgap_gamma, nat, ucell, orb, GridD);
-
+                // save obase and orbital_precalc
                 LCAO_deepks_io::save_npy_orbital_precalc(nat, 
-                                                         nks, 
-                                                         ld->des_per_atom, 
-                                                         ld->orbital_precalc_tensor, 
-                                                         PARAM.globalv.global_out_dir,
-                                                         my_rank);
-
-                ld->cal_o_delta(dm_bandgap_gamma);
-
+                                                        nks, 
+                                                        ld->des_per_atom, 
+                                                        ld->orbital_precalc_tensor, 
+                                                        PARAM.globalv.global_out_dir,
+                                                        my_rank);
                 const std::string file_obase = PARAM.globalv.global_out_dir + "deepks_obase.npy";
-                LCAO_deepks_io::save_npy_o(deepks_bands - ld->o_delta, file_obase, nks, my_rank);
+                LCAO_deepks_io::save_npy_o(deepks_bands - ld->o_delta, file_obase, nks, my_rank);      
             }     // end deepks_scf == 1
             else  // deepks_scf == 0
             {
                 const std::string file_obase = PARAM.globalv.global_out_dir + "deepks_obase.npy";
                 LCAO_deepks_io::save_npy_o(deepks_bands, file_obase, nks, my_rank); // no scf, o_tot=o_base
-            }                                                    // end deepks_scf == 0
+            }     // end deepks_scf == 0
         } // end bandgap label
 
-        if(deepks_v_delta)//gamma only now
+        // save H(R) matrix
+        if (true) // should be modified later!
+        {
+            const std::string file_hr = PARAM.globalv.global_out_dir + "deepks_hr.npy";
+            const hamilt::HContainer<TR>& hR = *(p_ham->getHR());
+
+            // How to save H(R)?
+        }
+
+        if(PARAM.inp.deepks_v_delta)
         {
-            ModuleBase::matrix h_tot;
-            h_tot.create(nlocal,nlocal);
+            std::vector<TH> h_tot(nks);
+            std::vector<std::vector<TK>> h_mat(nks, std::vector<TK>(ParaV->nloc));
+            for (int ik = 0; ik < nks; ik++)
+            {
+                h_tot[ik].create(nlocal, nlocal);
+                p_ham->updateHk(ik);
+                const TK* hk_ptr = p_ham->getHk();
+                for (int i = 0; i < ParaV->nloc; i++)
+                {
+                    h_mat[ik][i] = hk_ptr[i];
+                }
+            }
 
-            DeePKS_domain::collect_h_mat(*ParaV, ld->h_mat,h_tot,nlocal);
+            DeePKS_domain::collect_h_mat<TK,TH>(*ParaV, h_mat, h_tot, nlocal, nks);
 
             const std::string file_htot = PARAM.globalv.global_out_dir + "deepks_htot.npy";
-            LCAO_deepks_io::save_npy_h(h_tot, file_htot, nlocal, my_rank);
+            LCAO_deepks_io::save_npy_h<TK,TH>(h_tot, file_htot, nlocal, nks, my_rank);
 
             if(PARAM.inp.deepks_scf)
             {
-                ModuleBase::matrix v_delta;
-                v_delta.create(nlocal,nlocal);
-                DeePKS_domain::collect_h_mat(*ParaV, ld->H_V_delta,v_delta,nlocal);
+                std::vector<TH> v_delta(nks);
+                std::vector<TH> h_base(nks);
+                for (int ik = 0; ik < nks; ik++)
+                {
+                    v_delta[ik].create(nlocal, nlocal);
+                    h_base[ik].create(nlocal, nlocal);
+                }
+                std::vector<std::vector<TK>>* H_V_delta = nullptr;
+                if constexpr (std::is_same<TK, double>::value)
+                {
+                    H_V_delta = &ld->H_V_delta;
+                }
+                else
+                {
+                    H_V_delta = &ld->H_V_delta_k;
+                }
+                DeePKS_domain::collect_h_mat<TK,TH>(*ParaV, *H_V_delta,v_delta,nlocal,nks); 
 
+                // save v_delta and h_base
                 const std::string file_hbase = PARAM.globalv.global_out_dir + "deepks_hbase.npy";
-                LCAO_deepks_io::save_npy_h(h_tot-v_delta, file_hbase, nlocal, my_rank);
+                for (int ik = 0; ik < nks; ik++)
+                {
+                    h_base[ik] = h_tot[ik] - v_delta[ik];
+                }
+                LCAO_deepks_io::save_npy_h<TK,TH>(h_base, file_hbase, nlocal, nks, my_rank);
 
                 const std::string file_vdelta = PARAM.globalv.global_out_dir + "deepks_vdelta.npy";
-                LCAO_deepks_io::save_npy_h(v_delta, file_vdelta, nlocal, my_rank);
+                LCAO_deepks_io::save_npy_h<TK,TH>(v_delta, file_vdelta, nlocal, nks, my_rank);
 
-                if(deepks_v_delta==1)//v_delta_precalc storage method 1
+                if(PARAM.inp.deepks_v_delta==1)//v_delta_precalc storage method 1
                 {
-                    ld->cal_v_delta_precalc(nlocal,
-                            nat,
-                            ucell,
-                            orb,
-                            GridD);
-                
-                    const int nks_gamma = 1;
-                    LCAO_deepks_io::save_npy_v_delta_precalc(
-                      nat, 
-                      nks_gamma, 
-                      nlocal,
-                      ld->des_per_atom,
-                      ld->v_delta_precalc_tensor,
-                      PARAM.globalv.global_out_dir,
-                      my_rank);
+                    ld->cal_v_delta_precalc<TK>(nlocal, nat, nks, kvec_d, ucell, orb, GridD);
+
+                    LCAO_deepks_io::save_npy_v_delta_precalc<TK>(nat, 
+                                                                 nks, 
+                                                                 nlocal,
+                                                                 ld->des_per_atom,
+                                                                 ld->v_delta_precalc_tensor,
+                                                                 PARAM.globalv.global_out_dir,
+                                                                 my_rank);
                 }
-                else if(deepks_v_delta==2)//v_delta_precalc storage method 2
+                else if(PARAM.inp.deepks_v_delta==2)//v_delta_precalc storage method 2
                 {
-                    ld->prepare_psialpha(nlocal,
-                                nat,
-                                ucell,
-                                orb,
-                                GridD);
-
-                    const int nks_gamma=1;
-                    LCAO_deepks_io::save_npy_psialpha(nat, 
-                                nks_gamma, 
-                                nlocal,
-                                ld->inlmax,
-                                ld->lmaxd,
-                                ld->psialpha_tensor,
-                                PARAM.globalv.global_out_dir,
-                                my_rank);
-
-                    ld->prepare_gevdm(
-                                nat,
-                                orb);
+                    ld->prepare_psialpha<TK>(nlocal, nat, nks, kvec_d, ucell, orb, GridD);
+
+                    LCAO_deepks_io::save_npy_psialpha<TK>(nat, 
+                                                          nks, 
+                                                          nlocal,
+                                                          ld->inlmax,
+                                                          ld->lmaxd,
+                                                          ld->psialpha_tensor,
+                                                          PARAM.globalv.global_out_dir,
+                                                          my_rank);
+
+                    ld->prepare_gevdm(nat, orb);
 
                     LCAO_deepks_io::save_npy_gevdm(nat,
-                      ld->inlmax,
-                      ld->lmaxd,
-                      ld->gevdm_tensor,
-                      PARAM.globalv.global_out_dir,
-                      my_rank);
+                                                   ld->inlmax,
+                                                   ld->lmaxd,
+                                                   ld->gevdm_tensor,
+                                                   PARAM.globalv.global_out_dir,
+                                                   my_rank);
                 }
             }
             else //deepks_scf == 0
             {
                 const std::string file_hbase = PARAM.globalv.global_out_dir + "deepks_hbase.npy";
-                LCAO_deepks_io::save_npy_h(h_tot, file_hbase, nlocal, my_rank);
+                LCAO_deepks_io::save_npy_h<TK,TH>(h_tot, file_hbase, nlocal, nks, my_rank);
             }
         }//end v_delta label
     
@@ -194,6 +245,7 @@ void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
     if (PARAM.inp.deepks_out_labels || PARAM.inp.deepks_scf)
     {
         // this part is for integrated test of deepks
+        // so it is printed no matter even if deepks_out_labels is not used
         // when deepks_scf is on, the init pdm should be same as the out pdm, so we should not recalculate the pdm
 		if(!PARAM.inp.deepks_scf) 
 		{
@@ -216,258 +268,24 @@ void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
 					PARAM.inp.deepks_equiv, 
 					ld->d_tensor, 
                     PARAM.globalv.global_out_dir,
-					my_rank); // libnpy needed
+					GlobalV::MY_RANK); // libnpy needed
 		}
     }
     
     /// print out deepks information to the screen
     if (PARAM.inp.deepks_scf)
     {
-        ld->cal_e_delta_band(dm->get_DMK_vector());
+        ld->cal_e_delta_band(dm->get_DMK_vector(), nks);
         std::cout << "E_delta_band = " << std::setprecision(8) << ld->e_delta_band << " Ry"
                   << " = " << std::setprecision(8) << ld->e_delta_band * ModuleBase::Ry_to_eV << " eV"
                   << std::endl;
-        std::cout << "E_delta_NN= " << std::setprecision(8) << ld->E_delta << " Ry"
+        std::cout << "E_delta_NN = " << std::setprecision(8) << ld->E_delta << " Ry"
                   << " = " << std::setprecision(8) << ld->E_delta * ModuleBase::Ry_to_eV << " eV" << std::endl;
     }
 }
 
-// multi-k
-void LCAO_Deepks_Interface::out_deepks_labels(const double& etot,
-                                              const int& nks,
-                                              const int& nat,
-                                              const int& nlocal,
-                                              const ModuleBase::matrix& ekb,
-                                              const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                                              const UnitCell& ucell,
-                                              const LCAO_Orbitals& orb,
-                                              Grid_Driver& GridD,
-                                              const Parallel_Orbitals* ParaV,
-                                              const psi::Psi<std::complex<double>>& psi,
-                                              const elecstate::DensityMatrix<std::complex<double>, double>* dm,
-                                              const int& deepks_v_delta)
-{
-    ModuleBase::TITLE("LCAO_Deepks_Interface", "out_deepks_labels");
-    ModuleBase::timer::tick("LCAO_Deepks_Interface", "out_deepks_labels");
-
-    const int my_rank = GlobalV::MY_RANK;
-    const int nspin = PARAM.inp.nspin;
-
-    /// calculating deepks correction to bandgap and save the results
-    if (PARAM.inp.deepks_out_labels)
-    {
-        // mohan updated 2024-07-25
-        const std::string file_etot = PARAM.globalv.global_out_dir + "deepks_etot.npy";
-        const std::string file_ebase = PARAM.globalv.global_out_dir + "deepks_ebase.npy";
-
-        LCAO_deepks_io::save_npy_e(etot, file_etot, my_rank);
-
-        if (PARAM.inp.deepks_scf)
-        {
-            /// ebase :no deepks E_delta including
-            LCAO_deepks_io::save_npy_e(etot - ld->E_delta,
-                           file_ebase, my_rank);
-        }
-        else // deepks_scf = 0; base calculation
-        {
-            LCAO_deepks_io::save_npy_e(etot, file_ebase, my_rank);
-        }
-
-        if (PARAM.inp.deepks_bandgap)
-        {
-            int nocc = PARAM.inp.nelec / 2;
-            ModuleBase::matrix deepks_bands;
-            deepks_bands.create(nks, 1);
-            for (int iks = 0; iks < nks; iks++)
-            {
-                for (int hl = 0; hl < 1; hl++)
-                {
-                    deepks_bands(iks, hl) = ekb(iks, nocc + hl) - ekb(iks, nocc - 1 + hl);
-                }
-            }
-
-            const std::string file_otot = PARAM.globalv.global_out_dir + "deepks_otot.npy";
-            LCAO_deepks_io::save_npy_o(deepks_bands, file_otot, nks, my_rank);
-
-            if (PARAM.inp.deepks_scf)
-            {
-                int nocc = PARAM.inp.nelec / 2; // redundant!
-                ModuleBase::matrix wg_hl;
-                wg_hl.create(nks, PARAM.inp.nbands);
-                std::vector<std::vector<ModuleBase::ComplexMatrix>> dm_bandgap_k;
-                dm_bandgap_k.resize(1);
-
-                for (int ib = 0; ib < 1; ib++)
-                {
-                    wg_hl.zero_out();
-                    for (int ik = 0; ik < nks; ik++)
-                    {
-                        wg_hl(ik, ib + nocc - 1) = -1.0;
-                        wg_hl(ik, ib + nocc) = 1.0;
-                    }
-                    dm_bandgap_k[ib].resize(nks);
-                    elecstate::cal_dm(ParaV, wg_hl, psi, dm_bandgap_k[ib]);
-                }
-
-                // ld->cal_o_delta_k(dm_bandgap_k, ParaV, nks);
-                ld->cal_orbital_precalc_k(dm_bandgap_k, nat, nks, kvec_d, ucell, orb, GridD);
-
-				LCAO_deepks_io::save_npy_orbital_precalc(
-						nat, 
-						nks, 
-						ld->des_per_atom, 
-						ld->orbital_precalc_tensor, 
-                        PARAM.globalv.global_out_dir,
-						GlobalV::MY_RANK);
-
-                ld->cal_o_delta_k(dm_bandgap_k, nks);
-
-                const std::string file_obase = PARAM.globalv.global_out_dir + "deepks_obase.npy";
-                LCAO_deepks_io::save_npy_o(deepks_bands - ld->o_delta, file_obase, nks, my_rank);
-            }     // end deepks_scf == 1
-            else  // deepks_scf == 0
-            {
-                const std::string file_obase = PARAM.globalv.global_out_dir + "deepks_obase.npy";
-                LCAO_deepks_io::save_npy_o(deepks_bands, file_obase, nks, my_rank); // no scf, o_tot=o_base
-            }     // end deepks_scf == 0
-        } // end bandgap label
-        if(deepks_v_delta)
-        {
-            std::vector<ModuleBase::ComplexMatrix> h_tot(nks);
-            for (int ik = 0; ik < nks; ik++)
-            {
-                h_tot[ik].create(nlocal, nlocal);
-            }
-
-            DeePKS_domain::collect_h_mat(*ParaV, ld->h_mat_k,h_tot,nlocal,nks);
-
-            const std::string file_htot = PARAM.globalv.global_out_dir + "deepks_htot.npy";
-            LCAO_deepks_io::save_npy_h(h_tot, file_htot, nlocal, nks, my_rank);
-
-            if(PARAM.inp.deepks_scf)
-            {
-                std::vector<ModuleBase::ComplexMatrix> v_delta(nks);
-                std::vector<ModuleBase::ComplexMatrix> hbase(nks);
-                for (int ik = 0; ik < nks; ik++)
-                {
-                    v_delta[ik].create(nlocal, nlocal);
-                    hbase[ik].create(nlocal, nlocal);
-                }
-                DeePKS_domain::collect_h_mat(*ParaV, ld->H_V_delta_k,v_delta,nlocal,nks); 
-
-                const std::string file_hbase = PARAM.globalv.global_out_dir + "deepks_hbase.npy";
-                for (int ik = 0; ik < nks; ik++)
-                {
-                    hbase[ik] = h_tot[ik] - v_delta[ik];
-                }
-                LCAO_deepks_io::save_npy_h(hbase, file_hbase, nlocal, nks, my_rank);
-
-                const std::string file_vdelta = PARAM.globalv.global_out_dir + "deepks_vdelta.npy";
-                LCAO_deepks_io::save_npy_h(v_delta, file_vdelta, nlocal, nks, my_rank);
-
-                if(deepks_v_delta==1)//v_delta_precalc storage method 1
-                {
-                    ld->cal_v_delta_precalc_k(nlocal,
-                            nat,
-                            nks,
-                            kvec_d,
-                            ucell,
-                            orb,
-                            GridD);
-                
-                    LCAO_deepks_io::save_npy_v_delta_precalc(
-                      nat, 
-                      nks, 
-                      nlocal,
-                      ld->des_per_atom,
-                      ld->v_delta_precalc_tensor,
-                      PARAM.globalv.global_out_dir,
-                      my_rank);
-
-                }
-                else if(deepks_v_delta==2)//v_delta_precalc storage method 2
-                {
-                    ld->prepare_psialpha_k(nlocal,
-                                nat,
-                                nks,
-                                kvec_d,
-                                ucell,
-                                orb,
-                                GridD);
-                    
-                    LCAO_deepks_io::save_npy_psialpha(nat, 
-                                nks, 
-                                nlocal,
-                                ld->inlmax,
-                                ld->lmaxd,
-                                ld->psialpha_tensor,
-                                PARAM.globalv.global_out_dir,
-                                my_rank);
-
-                    ld->prepare_gevdm(
-                                nat,
-                                orb);
-
-                    LCAO_deepks_io::save_npy_gevdm(nat,
-                      ld->inlmax,
-                      ld->lmaxd,
-                      ld->gevdm_tensor,
-                      PARAM.globalv.global_out_dir,
-                      my_rank);
-                }
-            }
-            else //deepks_scf == 0
-            {
-                const std::string file_hbase = PARAM.globalv.global_out_dir + "deepks_hbase.npy";
-                LCAO_deepks_io::save_npy_h(h_tot, file_hbase, nlocal, nks, my_rank);
-            }
-        }
-    }  // end deepks_out_labels
-
-
-    // DeePKS PDM and descriptor
-    if (PARAM.inp.deepks_out_labels || PARAM.inp.deepks_scf)
-    {
-        // this part is for integrated test of deepks
-        // so it is printed no matter even if deepks_out_labels is not used
-        // when deepks_scf is on, the init pdm should be same as the out pdm, so we should not recalculate the pdm
-		if(!PARAM.inp.deepks_scf) 
-		{
-			ld->cal_projected_DM_k(dm, ucell, orb, GridD);
-		}
-
-        ld->check_projected_dm(); // print out the projected dm for NSCF calculaiton
-
-        ld->cal_descriptor(nat);     // final descriptor
-
-        ld->check_descriptor(ucell, PARAM.globalv.global_out_dir);
-
-        if (PARAM.inp.deepks_out_labels)
-        {
-            LCAO_deepks_io::save_npy_d(nat, 
-                                       ld->des_per_atom, 
-									   ld->inlmax, 
-									   ld->inl_l,
-									   PARAM.inp.deepks_equiv, 
-                                       ld->d_tensor, 
-                                       PARAM.globalv.global_out_dir,
-                                       GlobalV::MY_RANK); // libnpy needed
-        }
-    }
-    //
-    if (PARAM.inp.deepks_scf)
-    {
-        ld->cal_e_delta_band_k(dm->get_DMK_vector(), nks);
-
-        std::cout << "E_delta_band = " << std::setprecision(8) << ld->e_delta_band << " Ry"
-                  << " = " << std::setprecision(8) << ld->e_delta_band * ModuleBase::Ry_to_eV << " eV"
-                  << std::endl;
-
-        std::cout << "E_delta_NN= " << std::setprecision(8) << ld->E_delta << " Ry"
-                  << " = " << std::setprecision(8) << ld->E_delta * ModuleBase::Ry_to_eV << " eV" << std::endl;
-    }
-
-    ModuleBase::timer::tick("LCAO_Deepks_Interface", "out_deepks_labels");
-}
+template class LCAO_Deepks_Interface<double, double>;
+template class LCAO_Deepks_Interface<std::complex<double>, double>;
+template class LCAO_Deepks_Interface<std::complex<double>, std::complex<double>>;
 
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.h
index cfb180ea19..1e657f48fe 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.h
@@ -5,8 +5,10 @@
 #include "LCAO_deepks.h"
 #include "module_base/complexmatrix.h"
 #include "module_base/matrix.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
 #include <memory>
 
+template <typename TK, typename TR>
 class LCAO_Deepks_Interface
 {
   public:
@@ -28,7 +30,6 @@ class LCAO_Deepks_Interface
     /// @param[in] psid
     /// @param[in] dm_gamma
     /// @param[in] dm_k
-    /// @param[in] deepks_v_delta
     // for Gamma-only
     void out_deepks_labels(const double& etot,
                            const int& nks,
@@ -40,23 +41,9 @@ class LCAO_Deepks_Interface
                            const LCAO_Orbitals& orb,
                            Grid_Driver& GridD,
                            const Parallel_Orbitals* ParaV,
-                           const psi::Psi<double>& psid,
-                           const elecstate::DensityMatrix<double, double>* dm,
-                           const int& deepks_v_delta);
-  // for multi-k
-  void out_deepks_labels(const double& etot,
-                           const int& nks,
-                           const int& nat,
-                           const int& nlocal,
-                           const ModuleBase::matrix& ekb,
-                           const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                           const UnitCell& ucell,
-                           const LCAO_Orbitals& orb,
-                           Grid_Driver& GridD,
-                           const Parallel_Orbitals* ParaV,
-                           const psi::Psi<std::complex<double>>& psi,
-                           const elecstate::DensityMatrix<std::complex<double>, double>* dm,
-                           const int& deepks_v_delta);
+                           const psi::Psi<TK>& psid,
+                           const elecstate::DensityMatrix<TK, double>* dm,
+                           hamilt::HamiltLCAO<TK, TR>* p_ham);
 
   private:
     std::shared_ptr<LCAO_Deepks> ld;
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
index dd7d39227f..c1aa9cab0a 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
@@ -28,28 +28,11 @@
 #include "LCAO_deepks_io.h"
 #include "npy.hpp"
 
-void LCAO_deepks_io::print_dm(const std::vector<double> &dm, 
-                              const int nlocal, 
-                              const int nrow)
-{
-    std::ofstream ofs("dm");
-    ofs << std::setprecision(15);
-
-    for (int mu=0; mu<nlocal; mu++)
-    {
-        for (int nu=0; nu<nlocal; nu++)
-        {
-            ofs << dm[mu * nrow + nu] << " ";
-        }
-        ofs << std::endl;
-    }
-}
-
-
-void LCAO_deepks_io::print_dm_k(const int nks, 
+template <typename TK>
+void LCAO_deepks_io::print_dm(const int nks, 
                                 const int nlocal,
                                 const int nrow,
-                                const std::vector<std::vector<std::complex<double>>>& dm)
+                                const std::vector<std::vector<TK>>& dm)
 {
     std::stringstream ss;
     for(int ik=0;ik<nks;ik++)
@@ -410,42 +393,8 @@ void LCAO_deepks_io::save_npy_orbital_precalc(const int nat,
     return;
 }
 
-
-//just for gamma only
-void LCAO_deepks_io::save_npy_h(const ModuleBase::matrix &hamilt,
-                                const std::string &h_file,
-                                const int nlocal,
-                                const int rank)
-{
-    ModuleBase::TITLE("LCAO_deepks_io", "save_npy_h");
-	if(rank!=0)
-	{
-		return;
-	}
-    int nks=1;
-
-    const long unsigned hshape[] = {static_cast<unsigned long>(nks),
-                                    static_cast<unsigned long>(nlocal), 
-                                    static_cast<unsigned long>(nlocal) };
-
-    std::vector<double> npy_h;
-    for(int k=0; k<nks; k++)
-    {
-        for (int i=0; i<nlocal; i++)
-        {
-            for (int j=0; j<nlocal; j++)
-            {
-                npy_h.push_back(hamilt(i,j));
-            }
-        }         
-    }
-
-    npy::SaveArrayAsNumpy(h_file, false, 3, hshape, npy_h);
-    return;    
-}
-
-// for multi-k, should be combined with gamma-only version in future
-void LCAO_deepks_io::save_npy_h(const std::vector<ModuleBase::ComplexMatrix> &hamilt,
+template <typename TK, typename TH>
+void LCAO_deepks_io::save_npy_h(const std::vector<TH> &hamilt,
                                 const std::string &h_file,
                                 const int nlocal,
                                 const int nks,
@@ -461,7 +410,7 @@ void LCAO_deepks_io::save_npy_h(const std::vector<ModuleBase::ComplexMatrix> &ha
                                     static_cast<unsigned long>(nlocal), 
                                     static_cast<unsigned long>(nlocal) };
 
-    std::vector<std::complex<double>> npy_h;
+    std::vector<TK> npy_h;
     for(int k=0; k<nks; k++)
     {
         for (int i=0; i<nlocal; i++)
@@ -477,6 +426,7 @@ void LCAO_deepks_io::save_npy_h(const std::vector<ModuleBase::ComplexMatrix> &ha
     return;    
 }
 
+template <typename TK>
 void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat, 
                                               const int nks,
                                               const int nlocal, 
@@ -500,60 +450,39 @@ void LCAO_deepks_io::save_npy_v_delta_precalc(const int nat,
                                     static_cast<unsigned long>(nlocal),
                                     static_cast<unsigned long>(nat),
                                     static_cast<unsigned long>(des_per_atom)};
-    if (nks==1)
+
+    std::vector<TK> npy_v_delta_precalc;    
+    for (int iks = 0; iks < nks; ++iks)
     {
-        std::vector<double> npy_v_delta_precalc;    
-        for (int iks = 0; iks < nks; ++iks)
+        for (int mu = 0; mu < nlocal; ++mu)
         {
-            for (int mu = 0; mu < nlocal; ++mu)
+            for (int nu = 0; nu < nlocal; ++nu)
             {
-                for (int nu = 0; nu < nlocal; ++nu)
+                for (int iat = 0;iat < nat;++iat)
                 {
-                    for (int iat = 0;iat < nat;++iat)
+                    for(int p=0; p<des_per_atom; ++p)
                     {
-                        for(int p=0; p<des_per_atom; ++p)
+                        if constexpr (std::is_same<TK, double>::value)
                         {
                             npy_v_delta_precalc.push_back(v_delta_precalc_tensor.index({iks, mu, nu, iat, p }).item().toDouble());
                         }
-                    }                
-                }
-            }
-        }
-        const std::string file_vdpre = out_dir + "deepks_vdpre.npy";
-        npy::SaveArrayAsNumpy(file_vdpre, false, 5, gshape, npy_v_delta_precalc);
-        return;
-    }
-    else
-    {
-        std::vector<std::complex<double>> npy_v_delta_precalc;
-        for (int iks = 0; iks < nks; ++iks)
-        {
-            for (int mu = 0; mu < nlocal; ++mu)
-            {
-                for (int nu = 0; nu < nlocal; ++nu)
-                {
-                    for (int iat = 0;iat < nat;++iat)
-                    {
-                        for(int p=0; p<des_per_atom; ++p)
+                        else
                         {
-                            auto real_part = torch::real(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>();
-                            auto imag_part = torch::imag(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>();
-                            std::complex<double> value(real_part, imag_part);
+                            std::complex<double> value(torch::real(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>(),
+                                                        torch::imag(v_delta_precalc_tensor.index({iks, mu, nu, iat, p})).item<double>());
                             npy_v_delta_precalc.push_back(value);
                         }
-                    }                
-                }
+                    }
+                }                
             }
         }
-        const std::string file_vdpre = out_dir + "deepks_vdpre.npy";
-        npy::SaveArrayAsNumpy(file_vdpre, false, 5, gshape, npy_v_delta_precalc);
-        return;
     }
-
-    
+    const std::string file_vdpre = out_dir + "deepks_vdpre.npy";
+    npy::SaveArrayAsNumpy(file_vdpre, false, 5, gshape, npy_v_delta_precalc);
+    return;
 }
 
-
+template <typename TK>
 void LCAO_deepks_io::save_npy_psialpha(const int nat, 
                                        const int nks,
                                        const int nlocal,
@@ -578,55 +507,35 @@ void LCAO_deepks_io::save_npy_psialpha(const int nat,
                                     static_cast<unsigned long>(nks),
                                     static_cast<unsigned long>(nlocal),
                                     static_cast<unsigned long>(mmax)};
-    if(nks==1)
+    std::vector<TK> npy_psialpha;
+    for(int iat=0; iat< nat ; iat++) 
     {
-        std::vector<double> npy_psialpha;
-        for(int iat=0; iat< nat ; iat++) 
+        for(int nl = 0; nl < nlmax; nl++)
         {
-            for(int nl = 0; nl < nlmax; nl++)
+            for (int iks = 0; iks < nks ; iks++)
             {
-                for (int iks = 0; iks < nks ; iks++)
+                for(int mu = 0; mu < nlocal ; mu++)
                 {
-                    for(int mu = 0; mu < nlocal ; mu++)
+                    for(int m=0; m< mmax; m++)
                     {
-                        for(int m=0; m< mmax; m++)
+                        if constexpr (std::is_same<TK, double>::value)
                         {
                             npy_psialpha.push_back(psialpha_tensor.index({ iat,nl, iks, mu, m }).item().toDouble());
                         }
-                    }                
-                }
-            }
-        }
-        const std::string file_psialpha = out_dir + "deepks_psialpha.npy";
-        npy::SaveArrayAsNumpy(file_psialpha, false, 5, gshape, npy_psialpha);
-        return;
-    }
-    else
-    {
-        std::vector<std::complex<double>> npy_psialpha;
-        for(int iat=0; iat< nat ; iat++) 
-        {
-            for(int nl = 0; nl < nlmax; nl++)
-            {
-                for (int iks = 0; iks < nks ; iks++)
-                {
-                    for(int mu = 0; mu < nlocal ; mu++)
-                    {
-                        for(int m=0; m< mmax; m++)
+                        else
                         {
-                            std::complex<double> value(torch::real(psialpha_tensor.index({ iat,nl, iks, mu, m })).item<double>(), 
-                                                       torch::imag(psialpha_tensor.index({ iat,nl, iks, mu, m })).item<double>());
+                            std::complex<double> value(torch::real(psialpha_tensor.index({ iat, nl, iks, mu, m })).item<double>(), 
+                                                       torch::imag(psialpha_tensor.index({ iat, nl, iks, mu, m })).item<double>());
                             npy_psialpha.push_back(value);
                         }
-                    }                
-                }
+                    }
+                }                
             }
         }
-        const std::string file_psialpha = out_dir + "deepks_psialpha.npy";
-        npy::SaveArrayAsNumpy(file_psialpha, false, 5, gshape, npy_psialpha);
-        return;
     }
-
+    const std::string file_psialpha = out_dir + "deepks_psialpha.npy";
+    npy::SaveArrayAsNumpy(file_psialpha, false, 5, gshape, npy_psialpha);
+    return;
 }
 
 
@@ -676,4 +585,61 @@ void LCAO_deepks_io::save_npy_gevdm(const int nat,
     return;
 }
 
+
+template void LCAO_deepks_io::print_dm<double>(const int nks, 
+                                               const int nlocal,
+                                               const int nrow,
+                                               const std::vector<std::vector<double>>& dm);
+
+template void LCAO_deepks_io::print_dm<std::complex<double>>(const int nks, 
+                                                             const int nlocal,
+                                                             const int nrow,
+                                                             const std::vector<std::vector<std::complex<double>>>& dm);
+
+template void LCAO_deepks_io::save_npy_h<double>(const std::vector<ModuleBase::matrix> &hamilt,
+                                                 const std::string &h_file,
+                                                 const int nlocal,
+                                                 const int nks,
+                                                 const int rank);
+
+template void LCAO_deepks_io::save_npy_h<std::complex<double>>(const std::vector<ModuleBase::ComplexMatrix> &hamilt,
+                                                               const std::string &h_file,
+                                                               const int nlocal,
+                                                               const int nks,
+                                                               const int rank);
+
+template void LCAO_deepks_io::save_npy_v_delta_precalc<double>(const int nat, 
+                                                               const int nks,
+                                                               const int nlocal, 
+                                                               const int des_per_atom,
+                                                               const torch::Tensor& v_delta_precalc_tensor,
+                                                               const std::string& out_dir,
+                                                               const int rank);
+
+template void LCAO_deepks_io::save_npy_v_delta_precalc<std::complex<double>>(const int nat, 
+                                                                             const int nks,
+                                                                             const int nlocal, 
+                                                                             const int des_per_atom,
+                                                                             const torch::Tensor& v_delta_precalc_tensor,
+                                                                             const std::string& out_dir,
+                                                                             const int rank);
+
+template void LCAO_deepks_io::save_npy_psialpha<double>(const int nat,
+                                                        const int nks,
+                                                        const int nlocal,
+                                                        const int inlmax,
+                                                        const int lmaxd,
+                                                        const torch::Tensor &psialpha_tensor,
+                                                        const std::string& out_dir,
+                                                        const int rank);
+
+template void LCAO_deepks_io::save_npy_psialpha<std::complex<double>>(const int nat,
+                                                                      const int nks,
+                                                                      const int nlocal,
+                                                                      const int inlmax,
+                                                                      const int lmaxd,
+                                                                      const torch::Tensor &psialpha_tensor,
+                                                                      const std::string& out_dir,
+                                                                      const int rank);
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
index 8f88fa20d4..34398a21d7 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.h
@@ -20,33 +20,31 @@ namespace LCAO_deepks_io
     /// It also contains subroutines for printing density matrices
     /// which is used in unit tests
 
-    /// There are 2 subroutines for printing density matrices:
-    /// 1. print_dm : for gamma only
-    /// 2. print_dm_k : for multi-k
+    /// There are 2 subroutines for printing and loading .npy file:
+    /// 1. print_dm : print density matrices
+    /// 2. load_npy_gedm : load gedm from .npy file
 
     /// others print quantities in .npy format
 
-    /// 3. save_npy_d : descriptor ->dm_eig.npy
-    /// 4. save_npy_gvx : gvx ->grad_vx.npy
-    /// 5. save_npy_e : energy
-    /// 6. save_npy_f : force
+    /// 3. save_npy_d : descriptor -> deepks_dm_eig.npy
+    /// 4. save_npy_e : energy
+    /// 5. save_npy_f : force 
+    /// 6. save_npy_gvx : gvx -> deepks_gradvx.npy
     /// 7. save_npy_s : stress
-    /// 8. save_npy_o: orbital
-    /// 9. save_npy_orbital_precalc: orbital_precalc -> orbital_precalc.npy
-    /// 10. save_npy_h : Hamiltonian
-    /// 11. save_npy_v_delta_precalc : v_delta_precalc
-    /// 12. save_npy_psialpha : psialpha
-    /// 13. save_npy_gevdm : grav_evdm , can use psialpha and gevdm to calculate v_delta_precalc
+    /// 8. save_npy_gvepsl : gvepsl -> deepks_gvepsl.npy
+    /// 9. save_npy_o: orbital
+    /// 10. save_npy_orbital_precalc: orbital_precalc -> deepks_orbpre.npy
+    /// 11. save_npy_h : Hamiltonian
+    /// 12. save_npy_v_delta_precalc : v_delta_precalc -> deepks_vdpre.npy
+    /// 13. save_npy_psialpha : psialpha -> deepks_psialpha.npy
+    /// 14. save_npy_gevdm : grav_evdm -> deepks_gevdm.npy, can use psialpha and gevdm to calculate v_delta_precalc
 
 /// print density matrices
-void print_dm(const std::vector<double> &dm,
-		const int nlocal,
-		const int nrow);
-
-void print_dm_k(const int nks,
+template <typename TK>
+void print_dm(const int nks,
 		const int nlocal,
 		const int nrow,
-		const std::vector<std::vector<std::complex<double>>>& dm);
+		const std::vector<std::vector<TK>>& dm);
 
 void load_npy_gedm(const int nat,
 		const int des_per_atom,
@@ -54,18 +52,7 @@ void load_npy_gedm(const int nat,
 		double& e_delta,
 		const int rank);
 
-    ///----------------------------------------------------------------------
-    /// The following 4 functions save the `[dm_eig], [e_base], [f_base], [grad_vx]`
-    /// of current configuration as `.npy` file, when `deepks_scf = 1`.
-    /// After a full group of consfigurations are calculated,
-    /// we need a python script to `load` and `torch.cat` these `.npy` files,
-    /// and get `l_e_delta,npy` and `l_f_delta.npy` corresponding to the exact E, F data.
-    ///
-    /// Unit of energy: Ry
-    ///
-    /// Unit of force: Ry/Bohr
-    ///----------------------------------------------------------------------
-
+/// save descriptor
 void save_npy_d(const int nat,
 		const int des_per_atom,
 		const int inlmax,
@@ -75,33 +62,36 @@ void save_npy_d(const int nat,
 		const std::string& out_dir,
 		const int rank);
 
-void save_npy_gvx(const int nat,
-		const int des_per_atom,
-		const torch::Tensor &gvx_tensor,
-        const std::string& out_dir,
-		const int rank);
-
-void save_npy_gvepsl(const int nat,
-		const int des_per_atom,
-		const torch::Tensor &gvepsl_tensor,
-		const std::string& out_dir,
-		const int rank);
-
+// save energy
 void save_npy_e(const double &e,  /**<[in] \f$E_{base}\f$ or \f$E_{tot}\f$, in Ry*/
 		const std::string &e_file,
 		const int rank);
 
+// save force and gvx
 void save_npy_f(const ModuleBase::matrix &f, /**<[in] \f$F_{base}\f$ or \f$F_{tot}\f$, in Ry/Bohr*/
 		const std::string &f_file,
 		const int nat,
 		const int rank);
 
+void save_npy_gvx(const int nat,
+		const int des_per_atom,
+		const torch::Tensor &gvx_tensor,
+        const std::string& out_dir,
+		const int rank);
+
+// save stress and gvepsl
 void save_npy_s(const ModuleBase::matrix &stress, /**<[in] \f$S_{base}\f$ or \f$S_{tot}\f$, in Ry/Bohr^3*/
 		const std::string &s_file,
 		const double &omega,
 		const int rank);
 
-/// QO added on 2021-12-15
+void save_npy_gvepsl(const int nat,
+		const int des_per_atom,
+		const torch::Tensor &gvepsl_tensor,
+		const std::string& out_dir,
+		const int rank);
+
+/// save orbital and orbital_precalc
 void save_npy_o(const ModuleBase::matrix &bandgap, /**<[in] \f$E_{base}\f$ or \f$E_{tot}\f$, in Ry*/
 		const std::string &o_file,
 		const int nks,
@@ -114,20 +104,15 @@ void save_npy_orbital_precalc(const int nat,
         const std::string& out_dir,
 		const int rank);
 
-/// xinyuan added on 2023-2-20
-/// for gamma only
-void save_npy_h(const ModuleBase::matrix &hamilt,
+// save Hamiltonian and v_delta_precalc(for deepks_v_delta==1)/psialpha+gevdm(for deepks_v_delta==2)
+template <typename TK, typename TH>
+void save_npy_h(const std::vector<TH> &hamilt,
 		const std::string &h_file,
 		const int nlocal,
-		const int rank);
-
-/// for multi-k
-void save_npy_h(const std::vector<ModuleBase::ComplexMatrix> &hamilt,
-        const std::string &h_file,
-        const int nlocal,
         const int nks,
-        const int rank);
+		const int rank);
 
+template <typename TK>
 void save_npy_v_delta_precalc(const int nat,
 		const int nks,
 		const int nlocal,
@@ -136,6 +121,7 @@ void save_npy_v_delta_precalc(const int nat,
 		const std::string& out_dir,
 		const int rank);
 
+template <typename TK>
 void save_npy_psialpha(const int nat,
 		const int nks,
 		const int nlocal,
@@ -145,6 +131,7 @@ void save_npy_psialpha(const int nat,
 		const std::string& out_dir,
 		const int rank);
 
+// Always real, no need for template now
 void save_npy_gevdm(const int nat,
 		const int inlmax,
 		const int lmaxd,
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_odelta.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_odelta.cpp
index 7f9885d76f..eae3c12220 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_odelta.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_odelta.cpp
@@ -13,7 +13,7 @@
 #include "LCAO_deepks.h"
 #include "module_base/parallel_reduce.h"
 
-void LCAO_Deepks::cal_o_delta(const std::vector<std::vector<ModuleBase::matrix>>& dm_hl)
+void LCAO_Deepks::cal_o_delta(const std::vector<std::vector<ModuleBase::matrix>>& dm_hl, const int nks)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_o_delta");
     this->o_delta.zero_out();
@@ -31,7 +31,7 @@ void LCAO_Deepks::cal_o_delta(const std::vector<std::vector<ModuleBase::matrix>>
                     const int index = nu * pv->nrow + mu;
                     for (int is = 0; is < PARAM.inp.nspin; ++is)
                     {
-                        this->o_delta(0,hl) += dm_hl[hl][is](nu, mu) * this->H_V_delta[index];
+                        this->o_delta(0,hl) += dm_hl[hl][is](nu, mu) * this->H_V_delta[0][index];
                     }
                 }
             }
@@ -44,7 +44,7 @@ void LCAO_Deepks::cal_o_delta(const std::vector<std::vector<ModuleBase::matrix>>
 
 //calculating the correction of (LUMO-HOMO) energies, i.e., band gap corrections
 //for multi_k calculations
-void LCAO_Deepks::cal_o_delta_k(const std::vector<std::vector<ModuleBase::ComplexMatrix>>& dm_hl,
+void LCAO_Deepks::cal_o_delta(const std::vector<std::vector<ModuleBase::ComplexMatrix>>& dm_hl,
     const int nks)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_o_delta_k");
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
index 2925dfc69a..29c1fd813e 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
@@ -14,8 +14,7 @@
 //3. check_projected_dm, which prints pdm to descriptor.dat
 
 //4. cal_gdmx, calculating gdmx (and optionally gdm_epsl for stress) for gamma point
-//5. cal_gdmx_k, counterpart of 3, for multi-k
-//6. check_gdmx, which prints gdmx to a series of .dat files
+//5. check_gdmx, which prints gdmx to a series of .dat files
 
 #ifdef __DEEPKS
 
@@ -317,7 +316,7 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
     return;
 }
 
-void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
+void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
     const UnitCell &ucell,
     const LCAO_Orbitals &orb,
     Grid_Driver& GridD)
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
index 97aef62322..5ec6a7acb4 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
@@ -153,8 +153,11 @@ void LCAO_Deepks::load_model(const std::string& deepks_model) {
 }
 
 // prepare_psialpha and prepare_gevdm for deepks_v_delta = 2
+template <typename TK>
 void LCAO_Deepks::prepare_psialpha(const int nlocal,
     const int nat,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
     const UnitCell &ucell,
     const LCAO_Orbitals &orb,
     Grid_Driver &GridD)
@@ -162,109 +165,15 @@ void LCAO_Deepks::prepare_psialpha(const int nlocal,
     ModuleBase::TITLE("LCAO_Deepks", "prepare_psialpha");
     int nlmax = this->inlmax/nat;
     int mmax = 2*this->lmaxd+1;
-    this->psialpha_tensor = torch::zeros({ nat, nlmax, 1, nlocal, mmax }, torch::TensorOptions().dtype(torch::kFloat64)); // support gamma-only
-
-    //cutoff for alpha is same for all types of atoms
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
+    if constexpr (std::is_same<TK, double>::value)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
-        {
-            //iat: atom index on which |alpha> is located
-            const int iat = ucell.itia2iat(T0,I0);
-			const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
-
-            //outermost loop : find all adjacent atoms
-            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ++ad)
-            {
-                const int T1 = GridD.getType(ad);
-                const int I1 = GridD.getNatom(ad);
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-				const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-
-				const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-
-				if (dist1 > Rcut_Alpha + Rcut_AO1)
-				{
-					continue;
-				}
-
-                //middle loop : all atomic basis on the adjacent atom ad
-				for (int iw1=0; iw1<nw1_tot; ++iw1)
-				{
-					const int iw1_all = start1 + iw1;
-					const int iw1_local = pv->global2local_row(iw1_all);
-					const int iw2_local = pv->global2local_col(iw1_all);
-					if(iw1_local < 0 || iw2_local < 0) {continue;
-}
-					const int iw1_0 = iw1/PARAM.globalv.npol;
-					std::vector<double> nlm = this->nlm_save[iat][ad][iw1][0];
-                    
-                    int ib=0;
-                    int nl=0;
-                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                    {
-                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                        {
-                            const int nm = 2*L0+1;
-                            
-                            for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
-                            {
-                                this->psialpha_tensor[iat][nl][0][iw1_all][m1] = nlm[ib+m1];
-                            }
-                            ib+=nm;
-                            nl++;
-                        }
-                    }  
-				}//end iw
-			}//end ad
-		}//end I0
-	}//end T0
-
-#ifdef __MPI
-    double msg[mmax];
-    for(int iat=0; iat< nat ; iat++)
+        this->psialpha_tensor = torch::zeros({ nat, nlmax, nks, nlocal, mmax }, torch::kFloat64); // support gamma-only
+    }
+    else
     {
-        for(int nl = 0; nl < nlmax; nl++)
-        {
-            for(int mu = 0; mu < nlocal ; mu++)
-            {
-                for(int m=0;m<mmax;m++) 
-				{
-					msg[m] = this->psialpha_tensor[iat][nl][0][mu][m].item().toDouble();
-				}
-				Parallel_Reduce::reduce_all(msg,mmax);
-				for(int m=0;m<mmax;m++) 
-				{ 
-					this->psialpha_tensor[iat][nl][0][mu][m] = msg[m];
-				}
-			}
-        }
+        this->psialpha_tensor = torch::zeros({ nat, nlmax, nks, nlocal, mmax }, torch::kComplexDouble); // support multi-k
     }
 
-#endif  
-}
-
-void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
-    const int nat,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
-    const UnitCell &ucell,
-    const LCAO_Orbitals &orb,
-    Grid_Driver &GridD)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "prepare_psialpha");
-    int nlmax = this->inlmax/nat;
-    int mmax = 2*this->lmaxd+1;
-    this->psialpha_tensor = torch::zeros({ nat, nlmax, nks, nlocal, mmax }, torch::kComplexDouble); // support multi-k
-
     //cutoff for alpha is same for all types of atoms
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
 
@@ -304,10 +213,13 @@ void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
 
                 key_tuple key(ibt, dR.x, dR.y, dR.z);
 
-                if (this->nlm_save_k[iat].find(key)
-                    == this->nlm_save_k[iat].end()) 
+                if constexpr (std::is_same<TK, std::complex<double>>::value)
                 {
-                    continue;
+                    if (this->nlm_save_k[iat].find(key)
+                        == this->nlm_save_k[iat].end()) 
+                    {
+                        continue;
+                    }
                 }
 
                 //middle loop : all atomic basis on the adjacent atom ad
@@ -318,14 +230,26 @@ void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
 					const int iw2_local = pv->global2local_col(iw1_all);
 					if(iw1_local < 0 || iw2_local < 0) {continue;}
 					const int iw1_0 = iw1/PARAM.globalv.npol;
-					std::vector<double> nlm = this->nlm_save_k[iat][key][iw1][0];
-                    
+					std::vector<double> nlm;
+                    if constexpr (std::is_same<TK, double>::value)
+                    {
+                        nlm = this->nlm_save[iat][ad][iw1][0];
+                    }
+                    else
+                    {
+                        nlm = this->nlm_save_k[iat][key][iw1][0];
+                    }
+
                     for (int ik = 0; ik <nks; ik++)
                     {
-                        const double arg = kvec_d[ik] * dR * ModuleBase::TWO_PI;
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        const std::complex<double> kphase = std::complex<double>(cosp, sinp);
+                        std::complex<double> kphase = std::complex<double>(1.0, 0.0);
+                        if constexpr (std::is_same<TK, std::complex<double>>::value)
+                        {
+                            const double arg = - (kvec_d[ik] * dR) * ModuleBase::TWO_PI;
+                            double sinp, cosp;
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                            kphase = std::complex<double>(cosp, sinp);
+                        }
                         int ib=0;
                         int nl=0;
                         for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
@@ -335,11 +259,18 @@ void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
                                 const int nm = 2*L0+1;
                                 
                                 for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
-                                {
-                                    std::complex<double> nlm_phase = nlm[ib + m1] * kphase;
-                                    torch::Tensor nlm_tensor = torch::tensor({nlm_phase.real(), nlm_phase.imag()}, torch::kDouble);
-                                    torch::Tensor complex_tensor = torch::complex(nlm_tensor[0], nlm_tensor[1]);
-                                    this->psialpha_tensor[iat][nl][ik][iw1_all][m1] = complex_tensor;
+                                {   
+                                    if constexpr (std::is_same<TK, double>::value)
+                                    {
+                                        this->psialpha_tensor[iat][nl][ik][iw1_all][m1] = nlm[ib+m1];
+                                    }
+                                    else
+                                    {
+                                        std::complex<double> nlm_phase = nlm[ib + m1] * kphase;
+                                        torch::Tensor nlm_tensor = torch::tensor({nlm_phase.real(), nlm_phase.imag()}, torch::kDouble);
+                                        torch::Tensor complex_tensor = torch::complex(nlm_tensor[0], nlm_tensor[1]);
+                                        this->psialpha_tensor[iat][nl][ik][iw1_all][m1] = complex_tensor;
+                                    }
                                 }
                                 ib+=nm;
                                 nl++;
@@ -352,7 +283,7 @@ void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
 	}//end T0
 
 #ifdef __MPI
-    std::complex<double> msg[mmax];
+    TK msg[mmax];
     for(int iat=0; iat< nat ; iat++)
     {
         for(int nl = 0; nl < nlmax; nl++)
@@ -363,15 +294,29 @@ void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
                 {
                     for(int m=0;m<mmax;m++) 
                     {
-                        auto tensor_value = this->psialpha_tensor.index({iat, nl, ik, mu, m});
-                        msg[m] = std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>());
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            msg[m] = this->psialpha_tensor[iat][nl][ik][mu][m].item().toDouble();
+                        }
+                        else
+                        {
+                            auto tensor_value = this->psialpha_tensor.index({iat, nl, ik, mu, m});
+                            msg[m] = std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>());
+                        }
                     }
                     Parallel_Reduce::reduce_all(msg,mmax);
                     for(int m=0;m<mmax;m++) 
                     {
-                        torch::Tensor msg_tensor = torch::tensor({msg[m].real(), msg[m].imag()}, torch::kDouble);
-                        torch::Tensor complex_tensor = torch::complex(msg_tensor[0], msg_tensor[1]);
-                        this->psialpha_tensor[iat][nl][ik][mu][m] = complex_tensor;
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            this->psialpha_tensor[iat][nl][ik][mu][m] = msg[m];
+                        }
+                        else
+                        {
+                            torch::Tensor msg_tensor = torch::tensor({msg[m].real(), msg[m].imag()}, torch::kDouble);
+                            torch::Tensor complex_tensor = torch::complex(msg_tensor[0], msg_tensor[1]);
+                            this->psialpha_tensor[iat][nl][ik][mu][m] = complex_tensor;
+                        }
                     }
                 }
             }
@@ -381,6 +326,7 @@ void LCAO_Deepks::prepare_psialpha_k(const int nlocal,
 #endif  
 }
 
+template <typename TK>
 void LCAO_Deepks::check_vdp_psialpha(const int nat, const int nks, const int nlocal)
 {
     std::ofstream ofs("vdp_psialpha.dat");
@@ -398,7 +344,15 @@ void LCAO_Deepks::check_vdp_psialpha(const int nat, const int nks, const int nlo
                 {
                     for(int m=0; m< mmax; m++)
                     {
-                        ofs << this->psialpha_tensor.index({ iat,nl, iks, mu, m }).item().toDouble() << " ";
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            ofs << this->psialpha_tensor.index({ iat, nl, iks, mu, m }).item().toDouble() << " ";
+                        }
+                        else
+                        {
+                            auto tensor_value = this->psialpha_tensor.index({iat, nl, iks, mu, m});
+                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>()) << " ";
+                        }
                     }
                 }                
             }
@@ -470,4 +424,23 @@ void LCAO_Deepks::check_vdp_gevdm(const int nat)
     ofs.close();
 }
 
+template void LCAO_Deepks::prepare_psialpha<double>(const int nlocal,
+                                                    const int nat,
+                                                    const int nks,
+                                                    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+                                                    const UnitCell &ucell,
+                                                    const LCAO_Orbitals &orb,
+                                                    Grid_Driver &GridD);
+
+template void LCAO_Deepks::prepare_psialpha<std::complex<double>>(const int nlocal,
+                                                                  const int nat,
+                                                                  const int nks,
+                                                                  const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+                                                                  const UnitCell &ucell,
+                                                                  const LCAO_Orbitals &orb,
+                                                                  Grid_Driver &GridD);
+
+template void LCAO_Deepks::check_vdp_psialpha<double>(const int nat, const int nks, const int nlocal);
+template void LCAO_Deepks::check_vdp_psialpha<std::complex<double>>(const int nat, const int nks, const int nlocal);
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_vdelta.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_vdelta.cpp
index 459ff5c10d..9a1f151f70 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_vdelta.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_vdelta.cpp
@@ -18,7 +18,7 @@
 
 //calculating sum of correction band energies
 //for gamma_only calculations
-void LCAO_Deepks::cal_e_delta_band(const std::vector<std::vector<double>>& dm)
+void LCAO_Deepks::cal_e_delta_band(const std::vector<std::vector<double>>& dm, const int /*nks*/)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_e_delta_band");
     this->e_delta_band = 0;
@@ -35,7 +35,7 @@ void LCAO_Deepks::cal_e_delta_band(const std::vector<std::vector<double>>& dm)
                 for (int is = 0; is < dm.size(); ++is)  //dm.size() == PARAM.inp.nspin
                 {
                     //this->e_delta_band += dm[is](nu, mu) * this->H_V_delta[index];
-					this->e_delta_band += dm[is][nu*this->pv->nrow+mu] * this->H_V_delta[index];
+					this->e_delta_band += dm[is][nu*this->pv->nrow+mu] * this->H_V_delta[0][index];
                 }
             }
         }
@@ -48,7 +48,7 @@ void LCAO_Deepks::cal_e_delta_band(const std::vector<std::vector<double>>& dm)
 
 //calculating sum of correction band energies
 //for multi_k calculations
-void LCAO_Deepks::cal_e_delta_band_k(const std::vector<std::vector<std::complex<double>>>& dm,
+void LCAO_Deepks::cal_e_delta_band(const std::vector<std::vector<std::complex<double>>>& dm,
     const int nks)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_e_delta_band");
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp b/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
index 6950e030ea..e28af0685c 100644
--- a/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
+++ b/source/module_hamilt_lcao/module_deepks/cal_gdmx.cpp
@@ -14,10 +14,13 @@
 /// be calculated: 
 /// gdm_epsl = d/d\epsilon_{ab} * 
 ///           sum_{mu,nu} rho_{mu,nu} <chi_mu|alpha_m><alpha_m'|chi_nu>
-void LCAO_Deepks::cal_gdmx(const std::vector<double>& dm,
+template <typename TK>
+void LCAO_Deepks::cal_gdmx(const std::vector<std::vector<TK>>& dm,
     const UnitCell &ucell,
     const LCAO_Orbitals &orb,
     Grid_Driver& GridD,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
     const bool isstress)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_gdmx");
@@ -70,6 +73,8 @@ void LCAO_Deepks::cal_gdmx(const std::vector<double>& dm,
 				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
 				const double Rcut_AO1 = orb.Phi[T1].getRcut(); 
 
+                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z); 
+
 				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
 				{
 					const int T2 = GridD.getType(ad2);
@@ -79,6 +84,7 @@ void LCAO_Deepks::cal_gdmx(const std::vector<double>& dm,
 					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
 					const Atom* atom2 = &ucell.atoms[T2];
 					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
+					ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
 					
 					const double Rcut_AO2 = orb.Phi[T2].getRcut();
                 	const double dist1 = (tau1-tau0).norm() * ucell.lat0;
@@ -104,25 +110,68 @@ void LCAO_Deepks::cal_gdmx(const std::vector<double>& dm,
                     auto row_indexes = pv->get_indexes_row(ibt1);
                     auto col_indexes = pv->get_indexes_col(ibt2);
                     if(row_indexes.size() * col_indexes.size() == 0) continue;
-
-                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, 0, 0, 0, pv);
-                    dm_pair.allocate(nullptr, 1);
-                    if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                    
+                    double* dm_current;
+                    int dRx, dRy, dRz;
+                    if constexpr (std::is_same<TK, double>::value)
                     {
-                        dm_pair.add_from_matrix(dm.data(), pv->get_row_size(), 1.0, 1);
+                        dRx = 0;
+                        dRy = 0;
+                        dRz = 0;
                     }
                     else
                     {
-                        dm_pair.add_from_matrix(dm.data(), pv->get_col_size(), 1.0, 0);
+                        dRx = (dR2-dR1).x;
+                        dRy = (dR2-dR1).y;
+                        dRz = (dR2-dR1).z;
                     }
-                    const double* dm_current = dm_pair.get_pointer();
+                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, dRx, dRy, dRz, pv);
+                    dm_pair.allocate(nullptr, 1);
+                    for(int ik=0;ik<nks;ik++)
+                    {
+                        TK kphase;
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            kphase = 1.0;
+                        }
+                        else
+                        {
+                            const double arg = - (kvec_d[ik] * (dR2-dR1) ) * ModuleBase::TWO_PI;
+                            double sinp, cosp;
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                            kphase = TK(cosp, sinp);
+                        }
+                        if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
+                        {
+                            dm_pair.add_from_matrix(dm[ik].data(), pv->get_row_size(), kphase, 1);
+                        }
+                        else
+                        {
+                            dm_pair.add_from_matrix(dm[ik].data(), pv->get_col_size(), kphase, 0);
+                        }
+                    }
+                    
+                    dm_current = dm_pair.get_pointer();
 
+                    key_tuple key_1(ibt1,dR1.x,dR1.y,dR1.z);
+                    key_tuple key_2(ibt2,dR2.x,dR2.y,dR2.z);
 					for (int iw1=0; iw1<row_indexes.size(); ++iw1)
                     {
                         for (int iw2=0; iw2<col_indexes.size(); ++iw2)
                         {
-                            std::vector<double> nlm1 = this->nlm_save[iat][ad1][row_indexes[iw1]][0];
-                            std::vector<std::vector<double>> nlm2 = this->nlm_save[iat][ad2][col_indexes[iw2]];
+                            std::vector<double> nlm1;
+                            std::vector<std::vector<double>> nlm2;
+
+                            if constexpr (std::is_same<TK, double>::value)
+                            {
+                                nlm1 = this->nlm_save[iat][ad1][row_indexes[iw1]][0];
+                                nlm2 = this->nlm_save[iat][ad2][col_indexes[iw2]];
+                            }
+                            else
+                            {
+                                nlm1 = this->nlm_save_k[iat][key_1][row_indexes[iw1]][0];
+                                nlm2 = this->nlm_save_k[iat][key_2][col_indexes[iw2]];
+                            }
 
                             assert(nlm1.size()==nlm2[0].size());
 
@@ -178,8 +227,16 @@ void LCAO_Deepks::cal_gdmx(const std::vector<double>& dm,
                             assert(ib==nlm1.size());
                             if  (isstress)
                             {
-                                nlm1 = this->nlm_save[iat][ad2][col_indexes[iw2]][0];
-                                nlm2 = this->nlm_save[iat][ad1][row_indexes[iw1]];
+                                if constexpr (std::is_same<TK, double>::value)
+                                {
+                                    nlm1 = this->nlm_save[iat][ad2][col_indexes[iw2]][0];
+                                    nlm2 = this->nlm_save[iat][ad1][row_indexes[iw1]];
+                                }
+                                else
+                                {
+                                    nlm1 = this->nlm_save_k[iat][key_2][col_indexes[iw2]][0];
+                                    nlm2 = this->nlm_save_k[iat][key_1][row_indexes[iw1]];
+                                }
 
                                 assert(nlm1.size()==nlm2[0].size());  
                                 int ib=0;
@@ -278,4 +335,20 @@ void LCAO_Deepks::check_gdmx(const int nat)
     }
 }
 
+template void LCAO_Deepks::cal_gdmx<double>(const std::vector<std::vector<double>>& dm,
+                                            const UnitCell &ucell,
+                                            const LCAO_Orbitals &orb,
+                                            Grid_Driver& GridD,
+                                            const int nks,
+                                            const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                            const bool isstress);
+
+template void LCAO_Deepks::cal_gdmx<std::complex<double>>(const std::vector<std::vector<std::complex<double>>>& dm,
+                                                          const UnitCell &ucell,
+                                                          const LCAO_Orbitals &orb,
+                                                          Grid_Driver& GridD,
+                                                          const int nks,
+                                                          const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                          const bool isstress);
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/cal_gdmx_k.cpp b/source/module_hamilt_lcao/module_deepks/cal_gdmx_k.cpp
deleted file mode 100644
index 62a3a25c26..0000000000
--- a/source/module_hamilt_lcao/module_deepks/cal_gdmx_k.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-#ifdef __DEEPKS
-
-#include "module_parameter/parameter.h"
-#include "LCAO_deepks.h"
-#include "module_base/vector3.h"
-#include "module_base/timer.h"
-#include "module_base/constants.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
-#include "module_base/libm/libm.h"
-
-void LCAO_Deepks::cal_gdmx_k(const std::vector<std::vector<std::complex<double>>>& dm,
-    const UnitCell &ucell,
-    const LCAO_Orbitals &orb,
-    Grid_Driver& GridD,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
-    const bool isstress)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "cal_gdmx_k");
-    ModuleBase::timer::tick("LCAO_Deepks","cal_gdmx_k");
-
-    const int size = (2 * lmaxd + 1) * (2 * lmaxd + 1);
-
-    for (int iat = 0;iat < ucell.nat;iat++)
-    {
-        for (int inl = 0;inl < inlmax;inl++)
-        {
-            ModuleBase::GlobalFunc::ZEROS(gdmx[iat][inl], size);
-            ModuleBase::GlobalFunc::ZEROS(gdmy[iat][inl], size);
-            ModuleBase::GlobalFunc::ZEROS(gdmz[iat][inl], size);
-        }
-    }
-
-    if (isstress)
-    {
-        for (int ipol = 0;ipol < 6;ipol++)
-        {
-            for (int inl = 0;inl < inlmax;inl++)
-            {
-                ModuleBase::GlobalFunc::ZEROS(gdm_epsl[ipol][inl], size);
-            }
-        }
-    }
-
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    int nrow = this->pv->nrow;
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
-    {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
-        {
-            const int iat = ucell.itia2iat(T0,I0);//on which alpha is located
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
-
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
-            {
-                const int T1 = GridD.getType(ad1);
-                const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1,I1); //on which chi_mu is located
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-				const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x, GridD.getBox(ad1).y, GridD.getBox(ad1).z); 
-
-				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-				{
-					const int T2 = GridD.getType(ad2);
-					const int I2 = GridD.getNatom(ad2);
-					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-                    const int ibt2 = ucell.itia2iat(T2,I2);
-					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-					const Atom* atom2 = &ucell.atoms[T2];
-					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x, GridD.getBox(ad2).y, GridD.getBox(ad2).z);
-					
-					const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                	const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-                	const double dist2 = (tau2-tau0).norm() * ucell.lat0;
-
-					if (dist1 > Rcut_Alpha + Rcut_AO1
-							|| dist2 > Rcut_Alpha + Rcut_AO2)
-					{
-						continue;
-					}
-
-                    double r0[3];
-                    double r1[3];
-                    if(isstress)
-                    {
-                        r1[0] = ( tau1.x - tau0.x) ;
-                        r1[1] = ( tau1.y - tau0.y) ;
-                        r1[2] = ( tau1.z - tau0.z) ;
-                        r0[0] = ( tau2.x - tau0.x) ;
-                        r0[1] = ( tau2.y - tau0.y) ;
-                        r0[2] = ( tau2.z - tau0.z) ;
-                    }
-
-                    auto row_indexes = pv->get_indexes_row(ibt1);
-                    auto col_indexes = pv->get_indexes_col(ibt2);
-                    if(row_indexes.size() * col_indexes.size() == 0) continue;
-
-                    hamilt::AtomPair<double> dm_pair(ibt1, ibt2, (dR2-dR1).x, (dR2-dR1).y, (dR2-dR1).z, pv);
-                    dm_pair.allocate(nullptr, 1);
-                    for(int ik=0;ik<nks;ik++)
-                    {
-                        const double arg = - (kvec_d[ik] * (dR2-dR1) ) * ModuleBase::TWO_PI;
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        const std::complex<double> kphase = std::complex<double>(cosp, sinp);
-                        if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
-                        {
-                            dm_pair.add_from_matrix(dm[ik].data(), pv->get_row_size(), kphase, 1);
-                        }
-                        else
-                        {
-                            dm_pair.add_from_matrix(dm[ik].data(), pv->get_col_size(), kphase, 0);
-                        }
-                    }
-                    const double* dm_current = dm_pair.get_pointer();
-
-                    key_tuple key_1(ibt1,dR1.x,dR1.y,dR1.z);
-                    key_tuple key_2(ibt2,dR2.x,dR2.y,dR2.z);
-					for (int iw1l = 0; iw1l < row_indexes.size(); ++iw1l)
-                    {
-                        for (int iw2l = 0; iw2l < col_indexes.size(); ++iw2l)
-                        {
-                            std::vector<double> nlm1 = this->nlm_save_k[iat][key_1][row_indexes[iw1l]][0];
-                            std::vector<std::vector<double>> nlm2 = this->nlm_save_k[iat][key_2][col_indexes[iw2l]];
-
-                            assert(nlm1.size()==nlm2[0].size());
-
-                            int ib=0;
-                            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                            {
-                                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                {
-                                    const int inl = this->inl_index[T0](I0, L0, N0);
-                                    const int nm = 2*L0+1;
-                                    for (int m1 = 0;m1 < 2 * L0 + 1;++m1)
-                                    {
-                                        for (int m2 = 0; m2 < 2 * L0 + 1; ++m2)
-                                        {
-                                            //(<d/dX chi_mu|alpha_m>)<chi_nu|alpha_m'>
-                                            gdmx[iat][inl][m1*nm+m2] += nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmy[iat][inl][m1*nm+m2] += nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmz[iat][inl][m1*nm+m2] += nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;
-
-                                            //(<d/dX chi_nu|alpha_m'>)<chi_mu|alpha_m>
-                                            gdmx[iat][inl][m2*nm+m1] += nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmy[iat][inl][m2*nm+m1] += nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;
-                                            gdmz[iat][inl][m2*nm+m1] += nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;                                            
-
-                                            //(<chi_mu|d/dX alpha_m>)<chi_nu|alpha_m'> = -(<d/dX chi_mu|alpha_m>)<chi_nu|alpha_m'>
-                                            gdmx[ibt2][inl][m1*nm+m2] -= nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current;         
-                                            gdmy[ibt2][inl][m1*nm+m2] -= nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;         
-                                            gdmz[ibt2][inl][m1*nm+m2] -= nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current;
-
-                                            //(<chi_nu|d/dX alpha_m'>)<chi_mu|alpha_m> = -(<d/dX chi_nu|alpha_m'>)<chi_mu|alpha_m>
-                                            gdmx[ibt2][inl][m2*nm+m1] -= nlm2[1][ib+m2] * nlm1[ib+m1] * *dm_current; 
-                                            gdmy[ibt2][inl][m2*nm+m1] -= nlm2[2][ib+m2] * nlm1[ib+m1] * *dm_current;         
-                                            gdmz[ibt2][inl][m2*nm+m1] -= nlm2[3][ib+m2] * nlm1[ib+m1] * *dm_current; 
-
-                                            if (isstress)
-                                            {
-                                                int mm = 0;
-                                                for(int ipol=0;ipol<3;ipol++)
-                                                {
-                                                    for(int jpol=ipol;jpol<3;jpol++)
-                                                    {
-                                                        gdm_epsl[mm][inl][m2*nm+m1] += ucell.lat0 * 
-                                                        *dm_current * (nlm2[jpol+1][ib+m2] * nlm1[ib+m1] * r0[ipol]);
-                                                        mm++;
-                                                    }
-                                                }
-                                            }
-
-                                        }
-                                    }
-                                    ib+=nm;
-                                }
-                            }
-                            assert(ib==nlm1.size());
-
-                            if  (isstress)
-                            {
-                                nlm1 = this->nlm_save_k[iat][key_2][col_indexes[iw2l]][0];
-                                nlm2 = this->nlm_save_k[iat][key_1][row_indexes[iw1l]];
-
-                                assert(nlm1.size()==nlm2[0].size());  
-                                int ib=0;
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                                {
-                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                    {
-                                        const int inl = this->inl_index[T0](I0, L0, N0);
-                                        const int nm = 2*L0+1;
-                                        for (int m1 = 0;m1 < nm; ++m1)
-                                        {
-                                            for (int m2 = 0; m2 < nm; ++m2)
-                                            {
-                                                int mm = 0;
-                                                for(int ipol=0;ipol<3;ipol++)
-                                                {
-                                                    for(int jpol=ipol;jpol<3;jpol++)
-                                                    {
-                                                        gdm_epsl[mm][inl][m2*nm+m1]  += ucell.lat0 * 
-                                                        *dm_current * (nlm1[ib+m1] * nlm2[jpol+1][ib+m2] * r1[ipol]);
-                                                        mm++;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                        ib+=nm;
-                                    }
-                                }
-                            }
-                            dm_current++;
-						}//iw2
-					}//iw1
-				}//ad2
-			}//ad1
-        }//I0
-    }//T0
-
-#ifdef __MPI
-    for(int iat=0;iat<ucell.nat;iat++)
-    {
-        allsum_deepks(this->inlmax,size,this->gdmx[iat]);
-        allsum_deepks(this->inlmax,size,this->gdmy[iat]);
-        allsum_deepks(this->inlmax,size,this->gdmz[iat]);
-    }
-    if (isstress)
-    {
-        for(int ipol=0;ipol<6;ipol++)
-        {
-            allsum_deepks(this->inlmax,size,this->gdm_epsl[ipol]);
-        }
-    }
-#endif
-    ModuleBase::timer::tick("LCAO_Deepks","cal_gdmx_k");
-    return;
-}
-
-#endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp b/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp
index 66125bf042..54d143687c 100644
--- a/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp
+++ b/source/module_hamilt_lcao/module_deepks/deepks_hmat.cpp
@@ -4,98 +4,11 @@
 #include "LCAO_deepks.h"
 #include "module_base/parallel_reduce.h"
 
-
-void DeePKS_domain::save_h_mat(
-		const double *h_mat_in,
-		const int nloc,
-        const int ik)
-{
-	for(int i=0;i<nloc;i++)
-	{
-		GlobalC::ld.h_mat[i]=h_mat_in[i];
-	}  
-}
-
-void DeePKS_domain::save_h_mat(
-		const std::complex<double> *h_mat_in,
-		const int nloc,
-        const int ik)
-{
-    for(int i=0;i<nloc;i++)
-    {
-        GlobalC::ld.h_mat_k[ik][i]=h_mat_in[i];
-    }  
-}
-
-
+template <typename TK, typename TH>
 void DeePKS_domain::collect_h_mat(
         const Parallel_Orbitals &pv,
-		const std::vector<double>& h_in,
-		ModuleBase::matrix &h_out,
-		const int nlocal)
-{
-    ModuleBase::TITLE("DeePKS_domain", "collect_h_tot");
-
-    //construct the total H matrix
-#ifdef __MPI
-    int ir=0;
-    int ic=0;
-    for (int i=0; i<nlocal; i++)
-    {
-        std::vector<double> lineH(nlocal-i,0.0);
-
-        ir = pv.global2local_row(i);
-        if (ir>=0)
-        {
-            // data collection
-            for (int j=i; j<nlocal; j++)
-            {
-                ic = pv.global2local_col(j);
-                if (ic>=0)
-                {
-                    int iic=0;
-                    if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver))
-                    {
-                        iic=ir+ic*pv.nrow;
-                    }
-                    else
-                    {
-                        iic=ir*pv.ncol+ic;
-                    }
-                    lineH[j-i] = h_in[iic];
-                }
-            }
-        }
-        else
-        {
-            //do nothing
-        }
-
-        Parallel_Reduce::reduce_all(lineH.data(),nlocal-i);
-
-        for (int j=i; j<nlocal; j++)
-        {
-            h_out(i,j)=lineH[j-i];
-            h_out(j,i)=h_out(i,j);//H is a symmetric matrix
-        }
-    }
-#else
-    for (int i=0; i<nlocal; i++)
-    {
-        for (int j=i; j<nlocal; j++)
-        {
-            h_out(i,j)=h_in[i*nlocal+j];
-            h_out(j,i)=h_out(i,j);//H is a symmetric matrix
-        }
-    }
-#endif
-}
-
-// for multi-k
-void DeePKS_domain::collect_h_mat(
-        const Parallel_Orbitals &pv,
-		const std::vector<std::vector<std::complex<double>>>& h_in,
-		std::vector<ModuleBase::ComplexMatrix> &h_out,
+		const std::vector<std::vector<TK>>& h_in,
+		std::vector<TH> &h_out,
 		const int nlocal,
         const int nks)
 {
@@ -109,7 +22,7 @@ void DeePKS_domain::collect_h_mat(
         int ic=0;
         for (int i=0; i<nlocal; i++)
         {
-            std::vector<std::complex<double>> lineH(nlocal-i, std::complex<double>(0.0, 0.0));
+            std::vector<TK> lineH(nlocal-i, TK(0.0));
 
             ir = pv.global2local_row(i);
             if (ir>=0)
@@ -159,28 +72,9 @@ void DeePKS_domain::collect_h_mat(
     }
 }
 
-//just for gamma-only now
-void DeePKS_domain::check_h_mat(
-		const ModuleBase::matrix &H,
-		const std::string &h_file,
-		const int nlocal)
-{
-    std::ofstream ofs(h_file.c_str());
-    ofs << std::setprecision(10);
-    for (int i=0; i<nlocal; i++)
-    {
-        for (int j=0; j<nlocal; j++)
-        {
-            ofs << H(i,j) << " ";
-        }
-        ofs << std::endl;
-    }
-    ofs.close();
-}
-
-// for multi-k
+template <typename TH>
 void DeePKS_domain::check_h_mat(
-		const std::vector<ModuleBase::ComplexMatrix> &H,
+		const std::vector<TH> &H,
 		const std::string &h_file,
 		const int nlocal,
         const int nks)
@@ -202,4 +96,30 @@ void DeePKS_domain::check_h_mat(
     ofs.close();
 }
 
+template void DeePKS_domain::collect_h_mat<double, ModuleBase::matrix>(
+        const Parallel_Orbitals &pv,
+        const std::vector<std::vector<double>>& h_in,
+        std::vector<ModuleBase::matrix> &h_out,
+        const int nlocal,
+        const int nks);
+
+template void DeePKS_domain::collect_h_mat<std::complex<double>, ModuleBase::ComplexMatrix>(
+        const Parallel_Orbitals &pv,
+        const std::vector<std::vector<std::complex<double>>>& h_in,
+        std::vector<ModuleBase::ComplexMatrix> &h_out,
+        const int nlocal,
+        const int nks);
+
+template void DeePKS_domain::check_h_mat<ModuleBase::matrix>(
+        const std::vector<ModuleBase::matrix> &H,
+        const std::string &h_file,
+        const int nlocal,
+        const int nks);
+
+template void DeePKS_domain::check_h_mat<ModuleBase::ComplexMatrix>(
+        const std::vector<ModuleBase::ComplexMatrix> &H,
+        const std::string &h_file,
+        const int nlocal,
+        const int nks);
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/deepks_hmat.h b/source/module_hamilt_lcao/module_deepks/deepks_hmat.h
index fe4d51dea5..9668d5ede9 100644
--- a/source/module_hamilt_lcao/module_deepks/deepks_hmat.h
+++ b/source/module_hamilt_lcao/module_deepks/deepks_hmat.h
@@ -14,37 +14,19 @@
 
 namespace DeePKS_domain
 {
-	void save_h_mat(
-			const double *h_mat_in,
-			const int nloc,
-            const int ik);
-
-	void save_h_mat(
-			const std::complex<double> *h_mat_in,
-			const int nloc,
-            const int ik);
-
     //Collect data in h_in to matrix h_out. Note that left lower trianger in h_out is filled
+    template <typename TK, typename TH>
     void collect_h_mat(
         const Parallel_Orbitals &pv,
-        const std::vector<double>& h_in,
-        ModuleBase::matrix &h_out,
-        const int nlocal);
-
-    void collect_h_mat(
-        const Parallel_Orbitals &pv,
-		const std::vector<std::vector<std::complex<double>>>& h_in,
-		std::vector<ModuleBase::ComplexMatrix> &h_out,
-		const int nlocal,
+        const std::vector<std::vector<TK>>& h_in,
+        std::vector<TH> &h_out,
+        const int nlocal,
         const int nks);
 
+    // write h_mat to file h_file for checking // not used in the code now
+    template <typename TH>
     void check_h_mat(
-        const ModuleBase::matrix &H,
-        const std::string &h_file,
-        const int nlocal);
-
-    void check_h_mat(
-		const std::vector<ModuleBase::ComplexMatrix> &H,
+		const std::vector<TH> &H,
 		const std::string &h_file,
 		const int nlocal,
         const int nks);
diff --git a/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp b/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp
index 0e01573331..5f202d1532 100644
--- a/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp
+++ b/source/module_hamilt_lcao/module_deepks/orbital_precalc.cpp
@@ -3,9 +3,6 @@
 /// cal_orbital_precalc : orbital_precalc is usted for training with orbital label,
 ///                          which equals gvdm * orbital_pdm_shell,
 ///                          orbital_pdm_shell[1,Inl,nm*nm] = dm_hl * overlap * overlap
-/// cal_orbital_precalc_k : orbital_precalc is usted for training with orbital label,
-///                          for multi-k case, which equals gvdm * orbital_pdm_shell,
-///                          orbital_pdm_shell[1,Inl,nm*nm] = dm_hl_k * overlap * overlap
 
 #include "LCAO_deepks.h"
 #include "LCAO_deepks_io.h" // mohan add 2024-07-22
@@ -18,21 +15,24 @@
 
 // calculates orbital_precalc[1,NAt,NDscrpt] = gvdm * orbital_pdm_shell;
 // orbital_pdm_shell[2,Inl,nm*nm] = dm_hl * overlap * overlap;
+template <typename TK, typename TH>
 void LCAO_Deepks::cal_orbital_precalc(
-    const std::vector<std::vector<ModuleBase::matrix>>& dm_hl,
+    const std::vector<std::vector<TH>>& dm_hl,
     const int nat,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
     const UnitCell& ucell,
     const LCAO_Orbitals& orb,
     Grid_Driver& GridD) 
 {
-
     ModuleBase::TITLE("LCAO_Deepks", "cal_orbital_precalc");
+    ModuleBase::timer::tick("LCAO_Deepks", "calc_orbital_precalc");
 
     this->cal_gvdm(nat);
 
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
 
-    this->init_orbital_pdm_shell(1);
+    this->init_orbital_pdm_shell(nks);
 
     for (int T0 = 0; T0 < ucell.ntype; T0++) 
     {
@@ -85,6 +85,21 @@ void LCAO_Deepks::cal_orbital_precalc(
                     continue;
                 }
 
+                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
+                                                GridD.getBox(ad1).y,
+                                                GridD.getBox(ad1).z);
+
+                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z);
+
+                if constexpr (std::is_same<TK, std::complex<double>>::value)
+                {
+                    if (this->nlm_save_k[iat].find(key_1)
+                        == this->nlm_save_k[iat].end()) 
+                    {
+                        continue;
+                    }
+                }
+
                 auto row_indexes = pv->get_indexes_row(ibt1);
 
                 const int row_size = row_indexes.size();
@@ -96,12 +111,19 @@ void LCAO_Deepks::cal_orbital_precalc(
 
                 std::vector<double> s_1t(trace_alpha_size * row_size);
 
-				std::vector<double> g_1dmt(trace_alpha_size * row_size, 0.0);
+				std::vector<double> g_1dmt(nks * trace_alpha_size * row_size, 0.0);
 
 				for (int irow = 0; irow < row_size; irow++) 
 				{
-                    const double* row_ptr
-                        = this->nlm_save[iat][ad1][row_indexes[irow]][0].data();
+                    double* row_ptr;
+                    if constexpr (std::is_same<TK, double>::value)
+                    {
+                        row_ptr = this->nlm_save[iat][ad1][row_indexes[irow]][0].data();
+                    }
+                    else
+                    {
+                        row_ptr = this->nlm_save_k[iat][key_1][row_indexes[irow]][0].data();
+                    }
                     for (int i = 0; i < trace_alpha_size; i++) 
                     {
                         s_1t[i * row_size + irow] = row_ptr[trace_alpha_row[i]];
@@ -113,6 +135,13 @@ void LCAO_Deepks::cal_orbital_precalc(
                     const int T2 = GridD.getType(ad2);
                     const int I2 = GridD.getNatom(ad2);
                     const int ibt2 = ucell.itia2iat(T2, I2);
+                    if constexpr (std::is_same<TK, std::complex<double>>::value) // Why only for multi-k?
+                    {
+                        if (ibt1 > ibt2) 
+                        {
+                            continue;
+                        }
+                    }
                     const ModuleBase::Vector3<double> tau2
                         = GridD.getAdjacentTau(ad2);
                     const Atom* atom2 = &ucell.atoms[T2];
@@ -126,6 +155,21 @@ void LCAO_Deepks::cal_orbital_precalc(
                         continue;
                     }
 
+                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
+                                                    GridD.getBox(ad2).y,
+                                                    GridD.getBox(ad2).z);
+
+                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
+                    
+                    if constexpr (std::is_same<TK, std::complex<double>>::value)
+                    {
+                        if (this->nlm_save_k[iat].find(key_2)
+                                == this->nlm_save_k[iat].end()) 
+                        {
+                            continue;
+                        }
+                    }
+
                     auto col_indexes = pv->get_indexes_col(ibt2);
                     const int col_size = col_indexes.size();
 					if (col_size == 0) 
@@ -136,8 +180,15 @@ void LCAO_Deepks::cal_orbital_precalc(
                     std::vector<double> s_2t(trace_alpha_size * col_size);
                     for (int icol = 0; icol < col_size; icol++) 
                     {
-                        const double* col_ptr
-                            = this->nlm_save[iat][ad2][col_indexes[icol]][0].data();
+                        double* col_ptr;
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                           col_ptr = this->nlm_save[iat][ad2][col_indexes[icol]][0].data();
+                        }
+                        else
+                        {
+                            col_ptr = this->nlm_save_k[iat][key_2][col_indexes[icol]][0].data();
+                        }
                         for (int i = 0; i < trace_alpha_size; i++) 
                         {
                             s_2t[i * col_size + icol]
@@ -145,40 +196,94 @@ void LCAO_Deepks::cal_orbital_precalc(
                         }
                     }
 
-                    std::vector<double> dm_array(row_size * col_size, 0.0);
-                    for (int is = 0; is < PARAM.inp.nspin; is++) 
-                    {
-                        hamilt::AtomPair<double> dm_pair(ibt1,
-                                                         ibt2,
-                                                         0,
-                                                         0,
-                                                         0,
-                                                         pv);
+                    std::vector<double> dm_array(row_size * nks * col_size, 0.0);
 
-                        dm_pair.allocate(dm_array.data(), 0);
+                    const int row_size_nks = row_size * nks;
 
-                        if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)) 
+                    if constexpr (std::is_same<TK, double>::value)
+                    {
+                        for (int is = 0; is < PARAM.inp.nspin; is++) 
                         {
-                            dm_pair.add_from_matrix(dm_hl[0][is].c,
-                                                    pv->get_row_size(),
-                                                    1.0,
-                                                    1);
-                        } 
-                        else 
+                            hamilt::AtomPair<double> dm_pair(ibt1,
+                                                            ibt2,
+                                                            0,
+                                                            0,
+                                                            0,
+                                                            pv);
+
+                            dm_pair.allocate(dm_array.data(), 0);
+
+                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)) 
+                            {
+                                dm_pair.add_from_matrix(dm_hl[0][is].c,
+                                                        pv->get_row_size(),
+                                                        1.0,
+                                                        1);
+                            } 
+                            else 
+                            {
+                                dm_pair.add_from_matrix(dm_hl[0][is].c,
+                                                        pv->get_col_size(),
+                                                        1.0,
+                                                        0);
+                            }
+                        } // is
+                    }
+                    else
+                    {
+                        for (int ik = 0; ik < nks; ik++) 
                         {
-                            dm_pair.add_from_matrix(dm_hl[0][is].c,
-                                                    pv->get_col_size(),
-                                                    1.0,
-                                                    0);
-                        }
-                    } // is
+                            hamilt::AtomPair<double> dm_pair(ibt1,
+                                                            ibt2,
+                                                            (dR2 - dR1).x,
+                                                            (dR2 - dR1).y,
+                                                            (dR2 - dR1).z,
+                                                            pv);
+
+                            dm_pair.allocate(&dm_array[ik * row_size * col_size], 0);
+
+                            const double arg
+                                = -(kvec_d[ik] * (dR2 - dR1)) * ModuleBase::TWO_PI;
+
+                            double sinp, cosp;
+
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+
+                            const std::complex<double> kphase
+                                = std::complex<double>(cosp, sinp);
+
+                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)) 
+                            {
+                                dm_pair.add_from_matrix(dm_hl[0][ik].c,
+                                        pv->get_row_size(),
+                                        kphase,
+                                        1);
+                            } 
+                            else 
+                            {
+                                dm_pair.add_from_matrix(dm_hl[0][ik].c,
+                                        pv->get_col_size(),
+                                        kphase,
+                                        0);
+                            }
+                        } // ik
+                    }
 
                     // dgemm for s_2t and dm_array to get g_1dmt
                     constexpr char transa = 'T', transb = 'N';
-                    const double gemm_alpha = 1.0, gemm_beta = 1.0;
+                    double gemm_alpha = 1.0, gemm_beta = 1.0;
+
+                    if constexpr (std::is_same<TK, std::complex<double>>::value) 
+                    {
+                        if (ibt1 < ibt2) 
+                        {
+                            gemm_alpha = 2.0;
+                        }
+                    }
+
                     dgemm_(&transa,
                            &transb,
-                           &row_size,
+                           &row_size_nks,
                            &trace_alpha_size,
                            &col_size,
                            &gemm_alpha,
@@ -188,46 +293,54 @@ void LCAO_Deepks::cal_orbital_precalc(
                            &col_size,
                            &gemm_beta,
                            g_1dmt.data(),
-                           &row_size);
+                           &row_size_nks);
                 } // ad2
 
-                // do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                const double* p_g1dmt = g_1dmt.data();
+                for (int ik = 0; ik < nks; ik++)
+                {
+                    // do dot of g_1dmt and s_1t to get orbital_pdm_shell
+                    const double* p_g1dmt = g_1dmt.data() + ik * row_size;
 
-                int ib = 0, index = 0, inc = 1;
+                    int ib = 0, index = 0, inc = 1;
 
-                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0) 
-                {
-                    for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0) 
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0) 
                     {
-                        const int inl = this->inl_index[T0](I0, L0, N0);
-                        const int nm = 2 * L0 + 1;
-
-                        for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0) 
                         {
-                            for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            const int inl = this->inl_index[T0](I0, L0, N0);
+                            const int nm = 2 * L0 + 1;
+
+                            for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                orbital_pdm_shell[0][0][inl][m1 * nm + m2]
-                                    += ddot_(&row_size,
-                                             p_g1dmt + index * row_size,
-                                             &inc,
-                                             s_1t.data() + index * row_size,
-                                             &inc);
-                                index++;
+                                for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                                {
+                                    orbital_pdm_shell[ik][0][inl][m1 * nm + m2]
+                                        += ddot_(&row_size,
+                                                p_g1dmt + index * row_size * nks,
+                                                &inc,
+                                                s_1t.data() + index * row_size,
+                                                &inc);
+                                    index++;
+                                }
                             }
+                            ib += nm;
                         }
-                        ib += nm;
                     }
                 }
             } // ad1
         }
     }
 #ifdef __MPI
-    for (int inl = 0; inl < this->inlmax; inl++) 
+    for (int iks = 0; iks < nks; iks++) 
     {
-        Parallel_Reduce::reduce_all(this->orbital_pdm_shell[0][0][inl],
-                                    (2 * this->lmaxd + 1)
-                                        * (2 * this->lmaxd + 1));
+        for (int hl = 0; hl < 1; hl++)
+        {
+            for (int inl = 0; inl < this->inlmax; inl++) 
+            {
+                Parallel_Reduce::reduce_all(this->orbital_pdm_shell[iks][hl][inl],
+                                            (2 * this->lmaxd + 1) * (2 * this->lmaxd + 1));
+            }
+        }
     }
 #endif
 
@@ -239,33 +352,35 @@ void LCAO_Deepks::cal_orbital_precalc(
     for (int nl = 0; nl < nlmax; ++nl) 
     {
         std::vector<torch::Tensor> kiammv;
-        for (int iks = 0; iks < 1; ++iks) 
+        for (int iks = 0; iks < nks; ++iks) 
         {
             std::vector<torch::Tensor> iammv;
-            std::vector<torch::Tensor> ammv;
-            for (int iat = 0; iat < nat; ++iat) 
+            for (int hl = 0; hl < 1; ++hl)
             {
-                int inl = iat * nlmax + nl;
-                int nm = 2 * this->inl_l[inl] + 1;
-                std::vector<double> mmv;
-
-                for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                std::vector<torch::Tensor> ammv;
+                for (int iat = 0; iat < nat; ++iat) 
                 {
-                    for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                    int inl = iat * nlmax + nl;
+                    int nm = 2 * this->inl_l[inl] + 1;
+                    std::vector<double> mmv;
+
+                    for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                     {
-                        mmv.push_back(
-                            this->orbital_pdm_shell[iks][0][inl][m1 * nm + m2]);
+                        for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                        {
+                            mmv.push_back(
+                                this->orbital_pdm_shell[iks][hl][inl][m1 * nm + m2]);
+                        }
                     }
-                }
-                torch::Tensor mm = torch::tensor(mmv,
-                     torch::TensorOptions().dtype(torch::kFloat64)).reshape({nm, nm}); // nm*nm
+                    torch::Tensor mm = torch::tensor(mmv,
+                        torch::TensorOptions().dtype(torch::kFloat64)).reshape({nm, nm}); // nm*nm
 
-                ammv.push_back(mm);
+                    ammv.push_back(mm);
+                }
+                torch::Tensor amm = torch::stack(ammv, 0);
+                iammv.push_back(amm);
             }
-            torch::Tensor amm = torch::stack(ammv, 0);
-            iammv.push_back(amm);
             torch::Tensor iamm = torch::stack(iammv, 0); // inl*nm*nm
-            // orbital_pdm_shell_vector.push_back(iamm);
             kiammv.push_back(iamm);
         }
         torch::Tensor kiamm = torch::stack(kiammv, 0);
@@ -284,8 +399,25 @@ void LCAO_Deepks::cal_orbital_precalc(
     }
 
     this->orbital_precalc_tensor = torch::cat(orbital_precalc_vector, -1);
-    this->del_orbital_pdm_shell(1);
+    this->del_orbital_pdm_shell(nks);
+    ModuleBase::timer::tick("LCAO_Deepks", "calc_orbital_precalc");
     return;
 }
 
+template void LCAO_Deepks::cal_orbital_precalc<double, ModuleBase::matrix>(const std::vector<std::vector<ModuleBase::matrix>>& dm_hl,
+                                                                           const int nat,
+                                                                           const int nks,
+                                                                           const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                                           const UnitCell& ucell,
+                                                                           const LCAO_Orbitals& orb,
+                                                                           Grid_Driver& GridD);
+
+
+template void LCAO_Deepks::cal_orbital_precalc<std::complex<double>, ModuleBase::ComplexMatrix>(const std::vector<std::vector<ModuleBase::ComplexMatrix>>& dm_hl,
+                                                                                                const int nat,
+                                                                                                const int nks,
+                                                                                                const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                                                                                const UnitCell& ucell,
+                                                                                                const LCAO_Orbitals& orb,
+                                                                                                Grid_Driver& GridD);
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/orbital_precalc_k.cpp b/source/module_hamilt_lcao/module_deepks/orbital_precalc_k.cpp
deleted file mode 100644
index fd8b24e0c7..0000000000
--- a/source/module_hamilt_lcao/module_deepks/orbital_precalc_k.cpp
+++ /dev/null
@@ -1,350 +0,0 @@
-#ifdef __DEEPKS
-
-#include "LCAO_deepks.h"
-#include "LCAO_deepks_io.h" // mohan add 2024-07-22
-#include "module_base/blas_connector.h"
-#include "module_base/constants.h"
-#include "module_base/libm/libm.h"
-#include "module_base/parallel_reduce.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
-#include "module_parameter/parameter.h"
-
-// calculates orbital_precalc[nks,2,NAt,NDscrpt] = gvdm * orbital_pdm_shell for
-// multi-k case; orbital_pdm_shell[nks,2,Inl,nm*nm] = dm_hl_k * overlap *
-// overlap;
-void LCAO_Deepks::cal_orbital_precalc_k(
-    const std::vector<std::vector<ModuleBase::ComplexMatrix>>& dm_hl_k,
-    const int nat,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    Grid_Driver& GridD) {
-    ModuleBase::TITLE("LCAO_Deepks", "calc_orbital_precalc_k");
-    ModuleBase::timer::tick("LCAO_Deepks", "calc_orbital_precalc_k");
-
-    this->cal_gvdm(nat);
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    this->init_orbital_pdm_shell(nks);
-
-    for (int T0 = 0; T0 < ucell.ntype; T0++) 
-    {
-        Atom* atom0 = &ucell.atoms[T0];
-
-        for (int I0 = 0; I0 < atom0->na; I0++) 
-        {
-            const int iat = ucell.itia2iat(T0, I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
-
-            // trace alpha orbital
-            std::vector<int> trace_alpha_row;
-            std::vector<int> trace_alpha_col;
-            int ib = 0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0) 
-            {
-                for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0) 
-                {
-                    const int inl = this->inl_index[T0](I0, L0, N0);
-                    const int nm = 2 * L0 + 1;
-
-                    for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
-                    {
-                        for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
-                        {
-                            trace_alpha_row.push_back(ib + m1);
-                            trace_alpha_col.push_back(ib + m2);
-                        }
-                    }
-                    ib += nm;
-                }
-            }
-            const int trace_alpha_size = trace_alpha_row.size();
-
-            for (int ad1 = 0; ad1 < GridD.getAdjacentNum() + 1; ++ad1) 
-            {
-                const int T1 = GridD.getType(ad1);
-                const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1, I1);
-                const ModuleBase::Vector3<double> tau1
-                    = GridD.getAdjacentTau(ad1);
-                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-                if (dist1 >= Rcut_Alpha + Rcut_AO1) 
-                {
-                    continue;
-                }
-
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw * PARAM.globalv.npol;
-
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
-                                                GridD.getBox(ad1).y,
-                                                GridD.getBox(ad1).z);
-
-                auto row_indexes = pv->get_indexes_row(ibt1);
-                const int row_size = row_indexes.size();
-                if (row_size == 0) 
-                {
-                    continue;
-                }
-
-                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z);
-                if (this->nlm_save_k[iat].find(key_1)
-                    == this->nlm_save_k[iat].end()) 
-                {
-                    continue;
-                }
-
-                std::vector<double> s_1t(trace_alpha_size * row_size);
-
-                std::vector<double> g_1dmt(nks * trace_alpha_size * row_size, 0.0);
-
-                for (int irow = 0; irow < row_size; irow++) 
-                {
-                    const double* row_ptr
-                        = this->nlm_save_k[iat][key_1][row_indexes[irow]][0].data();
-
-                    for (int i = 0; i < trace_alpha_size; i++) 
-                    {
-                        s_1t[i * row_size + irow] = row_ptr[trace_alpha_row[i]];
-                    }
-                }
-
-                for (int ad2 = 0; ad2 < GridD.getAdjacentNum() + 1; ad2++) 
-                {
-                    const int T2 = GridD.getType(ad2);
-                    const int I2 = GridD.getNatom(ad2);
-                    const int ibt2 = ucell.itia2iat(T2, I2);
-                    if (ibt1 > ibt2) 
-                    {
-                        continue;
-                    }
-
-                    const ModuleBase::Vector3<double> tau2
-                        = GridD.getAdjacentTau(ad2);
-
-                    const Atom* atom2 = &ucell.atoms[T2];
-
-                    const int nw2_tot = atom2->nw * PARAM.globalv.npol;
-
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
-                                                    GridD.getBox(ad2).y,
-                                                    GridD.getBox(ad2).z);
-
-                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
-
-                    if (dist2 >= Rcut_Alpha + Rcut_AO2) 
-                    {
-                        continue;
-                    }
-
-                    auto col_indexes = pv->get_indexes_col(ibt2);
-                    const int col_size = col_indexes.size();
-					if (col_size == 0) 
-					{
-						continue;
-					}
-
-                    std::vector<double> s_2t(trace_alpha_size * col_size);
-                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
-
-					if (this->nlm_save_k[iat].find(key_2)
-							== this->nlm_save_k[iat].end()) 
-					{
-						continue;
-					}
-
-                    for (int icol = 0; icol < col_size; icol++) 
-                    {
-                        const double* col_ptr
-                            = this->nlm_save_k[iat][key_2][col_indexes[icol]][0].data();
-
-                        for (int i = 0; i < trace_alpha_size; i++) 
-                        {
-                            s_2t[i * col_size + icol]
-                                = col_ptr[trace_alpha_col[i]];
-                        }
-                    }
-
-                    std::vector<double> dm_array(row_size * nks * col_size, 0.0);
-
-                    const int row_size_nks = row_size * nks;
-
-                    for (int ik = 0; ik < nks; ik++) 
-                    {
-                        hamilt::AtomPair<double> dm_pair(ibt1,
-                                                         ibt2,
-                                                         (dR2 - dR1).x,
-                                                         (dR2 - dR1).y,
-                                                         (dR2 - dR1).z,
-                                                         pv);
-
-                        dm_pair.allocate(&dm_array[ik * row_size * col_size], 0);
-
-                        const double arg
-                            = -(kvec_d[ik] * (dR2 - dR1)) * ModuleBase::TWO_PI;
-
-                        double sinp, cosp;
-
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-
-                        const std::complex<double> kphase
-                            = std::complex<double>(cosp, sinp);
-
-						if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver)) 
-						{
-							dm_pair.add_from_matrix(dm_hl_k[0][ik].c,
-									pv->get_row_size(),
-									kphase,
-									1);
-						} 
-						else 
-						{
-							dm_pair.add_from_matrix(dm_hl_k[0][ik].c,
-									pv->get_col_size(),
-									kphase,
-									0);
-						}
-                    } // ik
-
-                    // dgemm for s_2t and dm_array to get g_1dmt
-                    constexpr char transa = 'T', transb = 'N';
-                    double gemm_alpha = 1.0, gemm_beta = 1.0;
-
-					if (ibt1 < ibt2) 
-					{
-						gemm_alpha = 2.0;
-					}
-
-					dgemm_(&transa,
-                           &transb,
-                           &row_size_nks,
-                           &trace_alpha_size,
-                           &col_size,
-                           &gemm_alpha,
-                           dm_array.data(),
-                           &col_size,
-                           s_2t.data(),
-                           &col_size,
-                           &gemm_beta,
-                           g_1dmt.data(),
-                           &row_size_nks);
-                } // ad2
-
-				for (int ik = 0; ik < nks; ik++) 
-				{
-					// do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                    const double* p_g1dmt = g_1dmt.data() + ik * row_size;
-
-                    int ib = 0, index = 0, inc = 1;
-
-					for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0) 
-					{
-						for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0) 
-						{
-                            const int inl = this->inl_index[T0](I0, L0, N0);
-                            const int nm = 2 * L0 + 1;
-
-							for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
-							{
-								for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
-								{
-									orbital_pdm_shell[ik][0][inl][m1 * nm + m2]
-										+= ddot_(&row_size,
-												p_g1dmt
-												+ index * row_size * nks,
-												&inc,
-												s_1t.data() + index * row_size,
-												&inc);
-									index++;
-								}
-							}
-							ib += nm;
-						}
-                    }
-                }
-            } // ad1
-        }
-    }
-
-#ifdef __MPI
-	for (int iks = 0; iks < nks; iks++) 
-	{
-		for (int hl = 0; hl < 1; hl++) 
-		{
-			for (int inl = 0; inl < this->inlmax; inl++) 
-			{
-				Parallel_Reduce::reduce_all(
-						this->orbital_pdm_shell[iks][hl][inl],
-						(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1));
-			}
-		}
-	}
-#endif
-
-    // transfer orbital_pdm_shell to orbital_pdm_shell_vector
-
-    int nlmax = this->inlmax / nat;
-
-    std::vector<torch::Tensor> orbital_pdm_shell_vector;
-
-	for (int nl = 0; nl < nlmax; ++nl) 
-	{
-		std::vector<torch::Tensor> kiammv;
-		for (int iks = 0; iks < nks; ++iks) 
-		{
-			std::vector<torch::Tensor> iammv;
-			for (int hl = 0; hl < 1; ++hl) 
-			{
-				std::vector<torch::Tensor> ammv;
-				for (int iat = 0; iat < nat; ++iat) 
-				{
-                    int inl = iat * nlmax + nl;
-                    int nm = 2 * this->inl_l[inl] + 1;
-                    std::vector<double> mmv;
-
-                    for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
-                    {
-                        for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
-                        {
-                            mmv.push_back(
-                                this->orbital_pdm_shell[iks][hl][inl][m1 * nm + m2]);
-                        }
-                    }
-                    torch::Tensor mm
-                        = torch::tensor(
-                              mmv,
-                              torch::TensorOptions().dtype(torch::kFloat64))
-                              .reshape({nm, nm}); // nm*nm
-                    ammv.push_back(mm);
-                }
-                torch::Tensor amm = torch::stack(ammv, 0);
-                iammv.push_back(amm);
-            }
-            torch::Tensor iamm = torch::stack(iammv, 0); // inl*nm*nm
-            kiammv.push_back(iamm);
-        }
-        torch::Tensor kiamm = torch::stack(kiammv, 0);
-        orbital_pdm_shell_vector.push_back(kiamm);
-    }
-
-    assert(orbital_pdm_shell_vector.size() == nlmax);
-
-    // einsum for each nl:
-    std::vector<torch::Tensor> orbital_precalc_vector;
-	for (int nl = 0; nl < nlmax; ++nl) 
-	{
-		orbital_precalc_vector.push_back(
-				at::einsum("kiamn, avmn->kiav",
-					{orbital_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
-    }
-    this->orbital_precalc_tensor = torch::cat(orbital_precalc_vector, -1);
-
-    this->del_orbital_pdm_shell(nks);
-    ModuleBase::timer::tick("LCAO_Deepks", "calc_orbital_precalc_k");
-    return;
-}
-
-#endif
diff --git a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
index 010da2073d..c3d1abe61f 100644
--- a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
+++ b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
@@ -126,11 +126,11 @@ void test_deepks::check_gdmx()
     this->ld.init_gdmx(ucell.nat);
     if (PARAM.sys.gamma_only_local)
     {
-        this->ld.cal_gdmx(dm_new[0], ucell, ORB, Test_Deepks::GridD, 0);
+        this->ld.cal_gdmx(dm_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, 0);
     }
     else
     {
-        this->ld.cal_gdmx_k(dm_k_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, 0);
+        this->ld.cal_gdmx(dm_k_new, ucell, ORB, Test_Deepks::GridD, kv.get_nkstot(), kv.kvec_d, 0);
     }
     this->ld.check_gdmx(ucell.nat);
 
diff --git a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
index 0c91180d9a..25eebd7490 100644
--- a/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
+++ b/source/module_hamilt_lcao/module_deepks/v_delta_precalc.cpp
@@ -18,8 +18,11 @@
 // calculates v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
 // v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap;
 // for deepks_v_delta = 1
+template <typename TK>
 void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
     const int nat,
+    const int nks,
+    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
     const UnitCell &ucell,
     const LCAO_Orbitals &orb,
     Grid_Driver &GridD)
@@ -30,7 +33,7 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
 
     this->cal_gvdm(nat);
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    this->init_v_delta_pdm_shell(1,nlocal); // 1 for gamma-only
+    this->init_v_delta_pdm_shell(nks,nlocal); 
    
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
@@ -46,6 +49,7 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
             {
                 const int T1 = GridD.getType(ad1);
                 const int I1 = GridD.getNatom(ad1);
+                const int ibt1 = ucell.itia2iat(T1, I1); 
                 const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
 				const Atom* atom1 = &ucell.atoms[T1];
@@ -57,11 +61,27 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
                 {
                     continue;
                 }
+                
+                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
+                                            GridD.getBox(ad1).y,
+                                            GridD.getBox(ad1).z);
+
+                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z); 
+
+                if constexpr (std::is_same<TK, std::complex<double>>::value)
+                {
+                    if (this->nlm_save_k[iat].find(key_1)
+                        == this->nlm_save_k[iat].end()) 
+                    {
+                        continue;
+                    }
+                }
 
 				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
 				{
 					const int T2 = GridD.getType(ad2);
 					const int I2 = GridD.getNatom(ad2);
+                    const int ibt2 = ucell.itia2iat(T2, I2);
 					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
 					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
 					const Atom* atom2 = &ucell.atoms[T2];
@@ -75,43 +95,83 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
 						continue;
 					}
 
+                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
+                                                        GridD.getBox(ad2).y,
+                                                        GridD.getBox(ad2).z);
+                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
+
+                    if constexpr (std::is_same<TK, std::complex<double>>::value)
+                    {
+                        if (this->nlm_save_k[iat].find(key_2)
+                            == this->nlm_save_k[iat].end()) 
+                        {
+                            continue;
+                        }
+                    }
+
 					for (int iw1=0; iw1<nw1_tot; ++iw1)
 					{
 						const int iw1_all = start1 + iw1; // this is \mu
 						const int iw1_local = pv->global2local_row(iw1_all);
-						if(iw1_local < 0) {continue;
-}
+						if(iw1_local < 0) {continue;}
 						const int iw1_0 = iw1/PARAM.globalv.npol;
 						for (int iw2=0; iw2<nw2_tot; ++iw2)
 						{
 							const int iw2_all = start2 + iw2; // this is \nu
 							const int iw2_local = pv->global2local_col(iw2_all);
-							if(iw2_local < 0) {continue;
-}
+							if(iw2_local < 0) {continue;}
 							const int iw2_0 = iw2/PARAM.globalv.npol;
 
-                            std::vector<double> nlm1 = this->nlm_save[iat][ad1][iw1][0];
-                            std::vector<double> nlm2 = this->nlm_save[iat][ad2][iw2][0];
+                            std::vector<double> nlm1;
+                            std::vector<double> nlm2;
+                            if constexpr (std::is_same<TK, double>::value)
+                            {
+                                nlm1 = this->nlm_save[iat][ad1][iw1][0];
+                                nlm2 = this->nlm_save[iat][ad2][iw2][0];
+                            }
+                            else
+                            {
+                                nlm1 = this->nlm_save_k[iat][key_1][iw1][0];
+                                nlm2 = this->nlm_save_k[iat][key_2][iw2][0];
+                            }
 
                             assert(nlm1.size()==nlm2.size());
-                            int ib=0;
-                            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                            for (int ik = 0; ik < nks; ik++)
                             {
-                                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                                int ib=0;
+                                std::complex<double> kphase = std::complex<double>(1.0, 0.0);
+                                if constexpr (std::is_same<TK, std::complex<double>>::value)
                                 {
-                                    const int inl = this->inl_index[T0](I0, L0, N0);
-                                    const int nm = 2*L0+1;
-                                    
-                                    for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
+                                    const double arg = - (kvec_d[ik] * (dR1-dR2) ) * ModuleBase::TWO_PI;
+                                    double sinp, cosp;
+                                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                                    kphase = std::complex<double>(cosp, sinp);
+                                }
+                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                                {
+                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                                     {
-                                        for (int m2=0; m2<nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
+                                        const int inl = this->inl_index[T0](I0, L0, N0);
+                                        const int nm = 2*L0+1;
+                                        
+                                        for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
                                         {
-                                            v_delta_pdm_shell[0][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2];
+                                            for (int m2=0; m2<nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
+                                            {
+                                                if constexpr (std::is_same<TK, double>::value)
+                                                {
+                                                    this->v_delta_pdm_shell[ik][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2];
+                                                }
+                                                else
+                                                {
+                                                    this->v_delta_pdm_shell_complex[ik][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2]*kphase;
+                                                }
+                                            }
                                         }
+                                        ib+=nm;
                                     }
-                                    ib+=nm;
-                                }
-                            }                            
+                                }  
+                            } //ik                         
 						}//iw2
 					}//iw1
 				}//ad2
@@ -121,13 +181,23 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
     }
 #ifdef __MPI
     const int mn_size=(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
-    for(int inl = 0; inl < this->inlmax; inl++)
+    for (int ik = 0; ik < nks; ik++)
     {
-        for(int mu = 0; mu < nlocal ; mu++)
+        for(int inl = 0; inl < this->inlmax; inl++)
         {
-            for(int nu=0; nu< nlocal ; nu++)
+            for(int mu = 0; mu < nlocal ; mu++)
             {
-                Parallel_Reduce::reduce_all(this->v_delta_pdm_shell[0][mu][nu][inl],mn_size);
+                for(int nu=0; nu< nlocal ; nu++)
+                {
+                    if constexpr (std::is_same<TK, double>::value)
+                    {
+                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell[ik][mu][nu][inl],mn_size);
+                    }
+                    else
+                    {
+                        Parallel_Reduce::reduce_all(this->v_delta_pdm_shell_complex[ik][mu][nu][inl],mn_size);
+                    }
+                }
             }
         }
     }
@@ -141,7 +211,7 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
     for(int nl = 0; nl < nlmax; ++nl)
     {
         std::vector<torch::Tensor> kuuammv;
-        for(int iks = 0; iks < 1; ++iks)
+        for(int iks = 0; iks < nks; ++iks)
         {
             std::vector<torch::Tensor> uuammv;
             for(int mu = 0; mu < nlocal; ++mu)
@@ -154,17 +224,32 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
                     {
                         int inl = iat*nlmax+nl;
                         int nm = 2*this->inl_l[inl]+1;
-                        std::vector<double> mmv;
+                        std::vector<TK> mmv;
                     
                         for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
                             for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                mmv.push_back(this->v_delta_pdm_shell[iks][mu][nu][inl][m1*nm+m2]);
+                                if constexpr (std::is_same<TK, double>::value)
+                                {
+                                    mmv.push_back(this->v_delta_pdm_shell[iks][mu][nu][inl][m1*nm+m2]);
+                                }
+                                else
+                                {
+                                    mmv.push_back(this->v_delta_pdm_shell_complex[iks][mu][nu][inl][m1*nm+m2]);
+                                }
                             }
                         }
-                        torch::Tensor mm = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64) ).reshape({nm, nm});    //nm*nm
-                        ammv.push_back(mm);
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            torch::Tensor mm = torch::tensor(mmv, torch::TensorOptions().dtype(torch::kFloat64) ).reshape({nm, nm});    //nm*nm
+                            ammv.push_back(mm);
+                        }
+                        else
+                        {
+                            torch::Tensor mm = torch::from_blob(mmv.data(), {nm, nm}, torch::TensorOptions().dtype(torch::kComplexDouble)).clone();    //nm*nm
+                            ammv.push_back(mm);
+                        }
                     }
                     torch::Tensor amm = torch::stack(ammv, 0); 
                     uammv.push_back(amm);                    
@@ -184,12 +269,20 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
     //einsum for each nl: 
     std::vector<torch::Tensor> v_delta_precalc_vector;
     for (int nl = 0; nl<nlmax; ++nl)
-    {
-        v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
+    {   
+        if constexpr (std::is_same<TK, double>::value)
+        {
+            v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], this->gevdm_vector[nl]}));
+        }
+        else
+        {
+            torch::Tensor gevdm_vector_complex = this->gevdm_vector[nl].to(torch::kComplexDouble);
+            v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm_vector_complex}));
+        }
     }
 
     this->v_delta_precalc_tensor = torch::cat(v_delta_precalc_vector, -1);
-    this->del_v_delta_pdm_shell(1,nlocal);
+    this->del_v_delta_pdm_shell(nks,nlocal);
 
     //check_v_delta_precalc(nlocal,nat);
     // timeval t_end;
@@ -198,6 +291,7 @@ void LCAO_Deepks::cal_v_delta_precalc(const int nlocal,
     return;
 }
 
+template <typename TK>
 void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks,const int nlocal)
 {
     std::ofstream ofs("v_delta_precalc.dat");
@@ -212,7 +306,15 @@ void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks,const int n
                 {
                     for(int p=0; p<this->des_per_atom; ++p)
                     {
-                        ofs<<this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p }).item().toDouble()<<" ";
+                        if constexpr (std::is_same<TK, double>::value)
+                        {
+                            ofs << this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p }).item().toDouble() << " " ;
+                        }
+                        else
+                        {
+                            auto tensor_value = this->v_delta_precalc_tensor.index({iks, mu, nu, iat, p});
+                            ofs << std::complex<double>(torch::real(tensor_value).item<double>(), torch::imag(tensor_value).item<double>()) << " " ;
+                        }
                     }
                 }
                 ofs << std::endl;                
@@ -222,4 +324,23 @@ void LCAO_Deepks::check_v_delta_precalc(const int nat, const int nks,const int n
     ofs.close();
 }
 
+template void LCAO_Deepks::cal_v_delta_precalc<double>(const int nlocal,
+                                                       const int nat,
+                                                       const int nks,
+                                                       const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+                                                       const UnitCell &ucell,
+                                                       const LCAO_Orbitals &orb,
+                                                       Grid_Driver &GridD);
+
+template void LCAO_Deepks::cal_v_delta_precalc<std::complex<double>>(const int nlocal,
+                                                                    const int nat,
+                                                                    const int nks,
+                                                                    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
+                                                                    const UnitCell &ucell,
+                                                                    const LCAO_Orbitals &orb,
+                                                                    Grid_Driver &GridD);
+
+template void LCAO_Deepks::check_v_delta_precalc<double>(const int nat, const int nks, const int nlocal);
+template void LCAO_Deepks::check_v_delta_precalc<std::complex<double>>(const int nat, const int nks, const int nlocal);
+
 #endif
diff --git a/source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp b/source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp
deleted file mode 100644
index 1a1d3b4006..0000000000
--- a/source/module_hamilt_lcao/module_deepks/v_delta_precalc_k.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-#ifdef __DEEPKS
-
-#include "LCAO_deepks.h"
-#include "LCAO_deepks_io.h" 
-#include "module_base/blas_connector.h"
-#include "module_base/constants.h"
-#include "module_base/libm/libm.h"
-#include "module_base/parallel_reduce.h"
-#include "module_hamilt_lcao/module_hcontainer/atom_pair.h"
-#include "module_parameter/parameter.h"
-
-
-// calculates v_delta_precalc[nks,nlocal,nlocal,NAt,NDscrpt] = gvdm * v_delta_pdm_shell;
-// v_delta_pdm_shell[nks,nlocal,nlocal,Inl,nm*nm] = overlap * overlap;
-// for deepks_v_delta = 1
-void LCAO_Deepks::cal_v_delta_precalc_k(const int nlocal,
-    const int nat,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>> &kvec_d,
-    const UnitCell &ucell,
-    const LCAO_Orbitals &orb,
-    Grid_Driver &GridD)
-{
-    ModuleBase::TITLE("LCAO_Deepks", "calc_v_delta_precalc");
-    // timeval t_start;
-    // gettimeofday(&t_start,NULL);
-
-    this->cal_gvdm(nat);
-    const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    this->init_v_delta_pdm_shell(nks,nlocal); // multi-k
-   
-    for (int T0 = 0; T0 < ucell.ntype; T0++)
-    {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        
-        for (int I0 =0; I0< atom0->na; I0++)
-        {
-            const int iat = ucell.itia2iat(T0,I0);
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
-
-            for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
-            {
-                const int T1 = GridD.getType(ad1);
-                const int I1 = GridD.getNatom(ad1);
-                const int ibt1 = ucell.itia2iat(T1, I1); 
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad1);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*PARAM.globalv.npol;
-				const double Rcut_AO1 = orb.Phi[T1].getRcut(); 
-
-                const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-                if (dist1 >= Rcut_Alpha + Rcut_AO1)
-                {
-                    continue;
-                }
-
-                ModuleBase::Vector3<double> dR1(GridD.getBox(ad1).x,
-                                                GridD.getBox(ad1).y,
-                                                GridD.getBox(ad1).z);
-                
-                key_tuple key_1(ibt1, dR1.x, dR1.y, dR1.z);
-                
-                if (this->nlm_save_k[iat].find(key_1)
-                    == this->nlm_save_k[iat].end()) 
-                {
-                    continue;
-                }
-
-				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-				{
-					const int T2 = GridD.getType(ad2);
-					const int I2 = GridD.getNatom(ad2);
-                    const int ibt2 = ucell.itia2iat(T2, I2);
-					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-					const ModuleBase::Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
-					const Atom* atom2 = &ucell.atoms[T2];
-					const int nw2_tot = atom2->nw*PARAM.globalv.npol;
-					
-					const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                	const double dist2 = (tau2-tau0).norm() * ucell.lat0;
-
-					if (dist2 >= Rcut_Alpha + Rcut_AO2)
-					{
-						continue;
-					}
-
-                    ModuleBase::Vector3<double> dR2(GridD.getBox(ad2).x,
-                                                    GridD.getBox(ad2).y,
-                                                    GridD.getBox(ad2).z);
-
-                    key_tuple key_2(ibt2, dR2.x, dR2.y, dR2.z);
-
-                    if (this->nlm_save_k[iat].find(key_2)
-                        == this->nlm_save_k[iat].end()) 
-                    {
-                        continue;
-                    }
-
-					for (int iw1=0; iw1<nw1_tot; ++iw1)
-					{
-						const int iw1_all = start1 + iw1; // this is \mu
-						const int iw1_local = pv->global2local_row(iw1_all);
-						if(iw1_local < 0) {continue;}
-						const int iw1_0 = iw1/PARAM.globalv.npol;
-						for (int iw2=0; iw2<nw2_tot; ++iw2)
-						{
-							const int iw2_all = start2 + iw2; // this is \nu
-							const int iw2_local = pv->global2local_col(iw2_all);
-							if(iw2_local < 0) {continue;}
-							const int iw2_0 = iw2/PARAM.globalv.npol;
-                            // Should use nlm_save_k, to be modified here!!!
-                            std::vector<double> nlm1 = this->nlm_save_k[iat][key_1][iw1][0];
-                            std::vector<double> nlm2 = this->nlm_save_k[iat][key_2][iw2][0];
-                            assert(nlm1.size()==nlm2.size());
-                            for (int ik = 0; ik < nks; ik++)
-                            {
-                                int ib=0;
-                                const double arg = - (kvec_d[ik] * (dR1-dR2) ) * ModuleBase::TWO_PI;
-                                double sinp, cosp;
-                                ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                                const std::complex<double> kphase = std::complex<double>(cosp, sinp);
-                                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
-                                {
-                                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
-                                    {
-                                        const int inl = this->inl_index[T0](I0, L0, N0);
-                                        const int nm = 2*L0+1;
-                                        
-                                        for (int m1=0; m1<nm; ++m1) // nm = 1 for s, 3 for p, 5 for d
-                                        {
-                                            for (int m2=0; m2<nm; ++m2) // nm = 1 for s, 3 for p, 5 for d
-                                            {
-                                                v_delta_pdm_shell_complex[ik][iw1_all][iw2_all][inl][m1*nm+m2] += nlm1[ib+m1]*nlm2[ib+m2]*kphase;
-                                            }
-                                        }
-
-                                        ib+=nm;
-                                    }
-                                }
-                            }
-                                                        
-						}//iw2
-					}//iw1
-				}//ad2
-			}//ad1   
-            
-        }
-    }
-
-#ifdef __MPI
-    const int mn_size=(2 * this->lmaxd + 1) * (2 * this->lmaxd + 1);
-    for(int ik = 0; ik < nks; ik++)
-    {
-        for(int inl = 0; inl < this->inlmax; inl++)
-        {
-            for(int mu = 0; mu < nlocal ; mu++)
-            {
-                for(int nu=0; nu< nlocal ; nu++)
-                {
-                    Parallel_Reduce::reduce_all(this->v_delta_pdm_shell_complex[ik][mu][nu][inl],mn_size);
-                }
-            }
-        }
-    }
-#endif    
-    // transfer v_delta_pdm_shell to v_delta_pdm_shell_vector
-    
-    int nlmax = this->inlmax/nat;
-   
-    std::vector<torch::Tensor> v_delta_pdm_shell_vector;
-    for(int nl = 0; nl < nlmax; ++nl)
-    {
-        std::vector<torch::Tensor> kuuammv;
-        for(int iks = 0; iks < nks; ++iks)
-        {
-            std::vector<torch::Tensor> uuammv;
-            for(int mu = 0; mu < nlocal; ++mu)
-            {
-                std::vector<torch::Tensor> uammv;
-                for(int nu =0 ; nu < nlocal; ++nu)
-                {
-                    std::vector<torch::Tensor> ammv;
-                    for (int iat=0; iat<nat; ++iat)
-                    {
-                        int inl = iat*nlmax+nl;
-                        int nm = 2*this->inl_l[inl]+1;
-                        std::vector<std::complex<double>> mmv;
-                    
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
-                        {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
-                            {
-                                mmv.push_back(this->v_delta_pdm_shell_complex[iks][mu][nu][inl][m1*nm+m2]);
-                            }
-                        }
-                        torch::Tensor mm = torch::from_blob(mmv.data(), {nm, nm}, torch::TensorOptions().dtype(torch::kComplexDouble)).clone();    //nm*nm
-                        ammv.push_back(mm);
-                    }
-                    torch::Tensor amm = torch::stack(ammv, 0); 
-                    uammv.push_back(amm);                    
-                }
-                torch::Tensor uamm = torch::stack(uammv, 0); 
-                uuammv.push_back(uamm);
-            }
-            torch::Tensor uuamm = torch::stack(uuammv, 0); 
-            kuuammv.push_back(uuamm);
-        }
-        torch::Tensor kuuamm = torch::stack(kuuammv, 0);  
-        v_delta_pdm_shell_vector.push_back(kuuamm);
-    }
-
-    assert(v_delta_pdm_shell_vector.size() == nlmax);
-    
-    //einsum for each nl: 
-    std::vector<torch::Tensor> v_delta_precalc_vector;
-    for (int nl = 0; nl<nlmax; ++nl)
-    {
-        torch::Tensor gevdm_vector_complex = this->gevdm_vector[nl].to(torch::kComplexDouble);
-        v_delta_precalc_vector.push_back(at::einsum("kxyamn, avmn->kxyav", {v_delta_pdm_shell_vector[nl], gevdm_vector_complex}));
-    }
-
-    this->v_delta_precalc_tensor = torch::cat(v_delta_precalc_vector, -1);
-    this->del_v_delta_pdm_shell(nks,nlocal);
-
-    //check_v_delta_precalc(nlocal,nat);
-    // timeval t_end;
-    // gettimeofday(&t_end,NULL);
-    // std::cout<<"calculate v_delta_precalc time:\t"<<(double)(t_end.tv_sec-t_start.tv_sec) + (double)(t_end.tv_usec-t_start.tv_usec)/1000000.0<<std::endl;
-    return;
-}
-
-#endif
\ No newline at end of file