diff --git a/.github/workflows/rocm-build-ci.yml b/.github/workflows/rocm-build-ci.yml index 30ad92ae1f..c8812aa8c9 100644 --- a/.github/workflows/rocm-build-ci.yml +++ b/.github/workflows/rocm-build-ci.yml @@ -5,7 +5,7 @@ jobs: runs-on: [self-hosted, amd] strategy: matrix: - rocm: [ 5.6.1, 5.7.2 ] + rocm: [ 5.6.1, 6.0.2 ] steps: - uses: actions/checkout@v3 - run: | @@ -22,12 +22,13 @@ jobs: -DROCM_PATH=${ROCM_PATH} \ -DQUDA_DIRAC_CLOVER=ON \ -DQUDA_DIRAC_CLOVER_HASENBUSCH=OFF \ - -DQUDA_DIRAC_DOMAIN_WALL=OFF \ + -DQUDA_DIRAC_DOMAIN_WALL=ON \ -DQUDA_DIRAC_NDEG_TWISTED_MASS=OFF \ -DQUDA_DIRAC_STAGGERED=ON \ -DQUDA_DIRAC_TWISTED_MASS=OFF \ -DQUDA_DIRAC_TWISTED_CLOVER=OFF \ -DQUDA_DIRAC_WILSON=ON \ + -DQUDA_DIRAC_LAPLACE=ON \ -DQUDA_CLOVER_DYNAMIC=ON \ -DQUDA_QDPJIT=OFF \ -DQUDA_INTERFACE_QDPJIT=OFF \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d9ce6c032..165dfb689a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,13 +150,13 @@ option(QUDA_DIRAC_TWISTED_CLOVER "build twisted clover Dirac operators" ${QUDA_D option(QUDA_DIRAC_CLOVER_HASENBUSCH "build clover Hasenbusch twist operators" ${QUDA_DIRAC_DEFAULT}) option(QUDA_DIRAC_NDEG_TWISTED_MASS "build non-degenerate twisted mass Dirac operators" ${QUDA_DIRAC_DEFAULT}) option(QUDA_DIRAC_NDEG_TWISTED_CLOVER "build non-degenerate twisted clover Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_LAPLACE "build laplace operator" ${QUDA_DIRAC_DEFAULT}) option(QUDA_DIRAC_DD "build code for domain decomposition of the Dirac operator" OFF) option(QUDA_CONTRACT "build code for bilinear contraction" OFF) option(QUDA_COVDEV "build code for covariant derivative" OFF) -option(QUDA_LAPLACE "build laplace operator" OFF) option(QUDA_QIO "build QIO code for binary I/O" OFF) option(QUDA_SMEAR_GAUSS_TWOLINK "build code for two-link Gaussian smearing" OFF) diff --git a/README.md b/README.md index a739762cf2..d5995a5c99 100644 --- a/README.md +++ b/README.md @@ -266,12 +266,14 @@ Advanced Scientific Computing (PASC21) [arXiv:2104.05615[hep-lat]]. * Steven Gottlieb (Indiana University) * Anthony Grebe (Fermilab) * Kyriakos Hadjiyiannakou (Cyprus) +* Ben Hoerz (Intel) * Dean Howarth (Lawrence Livermore Lab, Lawrence Berkeley Lab) * Hwancheol Jeong (Indiana University) * Xiangyu Jiang (ITP, Chinese Academy of Sciences) * Balint Joo (OLCF, Oak Ridge National Laboratory, formerly Jefferson Lab) * Hyung-Jin Kim (Samsung Advanced Institute of Technology) * Bartosz Kostrzewa (HPC/A-Lab, University of Bonn) +* Damon McDougall (AMD) * James Osborn (Argonne National Laboratory) * Ferenc Pittler (Cyprus) * Claudio Rebbi (Boston University) diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build index 5322ddfd2d..26adb18840 100644 --- a/ci/docker/Dockerfile.build +++ b/ci/docker/Dockerfile.build @@ -2,6 +2,8 @@ FROM docker.io/nvidia/cuda:11.8.0-devel-ubuntu22.04 ARG DEBIAN_FRONTEND=noninteractive +RUN echo "Running CSCS CI on $(nproc) processors" + RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \ build-essential \ cmake \ @@ -34,7 +36,7 @@ RUN QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \ -DQUDA_CTEST_LAUNCH="" \ -DQUDA_GPU_ARCH=sm_60 \ -DQUDA_MULTIGRID=ON \ - -DQUDA_MULTIGRID_NVEC_LIST=24 \ + -DQUDA_MULTIGRID_NVEC_LIST=6 \ -DQUDA_MDW_FUSED_LS_LIST=4 \ -DQUDA_MPI=ON \ -DQUDA_DIRAC_DEFAULT_OFF=ON \ @@ -42,10 +44,11 @@ RUN QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \ -DQUDA_DIRAC_CLOVER=ON \ -DQUDA_DIRAC_TWISTED_CLOVER=ON \ -DQUDA_DIRAC_STAGGERED=ON \ + -DQUDA_DIRAC_LAPLACE=ON \ -GNinja \ -B /quda/build -RUN cmake --build /quda/build +RUN cmake --build /quda/build -j $(nproc) RUN cmake --install /quda/build diff --git a/include/clover_field.h b/include/clover_field.h index 90d24d53eb..256453da2e 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -20,6 +20,19 @@ namespace quda { #endif } + /** + @brief Helper function that returns whether we have enabled + clover fermions. + */ + constexpr bool is_enabled_twisted_clover() + { +#ifdef GPU_TWISTED_CLOVER_DIRAC + return true; +#else + return false; +#endif + } + namespace clover { diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index a5ffa80543..dad19b48a8 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1052,6 +1052,112 @@ namespace quda } }; + template + struct GhostNOrder { + GhostNOrder() = default; + GhostNOrder(const GhostNOrder &) = default; + GhostNOrder(const ColorSpinorField &, int = 1, Float ** = 0) { } + GhostNOrder &operator=(const GhostNOrder &) = default; + }; + + template + struct GhostNOrder { + static constexpr int length = 2 * Ns * Nc; + static constexpr int length_ghost = spin_project ? length / 2 : length; + // if spin projecting, check that short vector length is compatible, if not halve the vector length + static constexpr int N_ghost = !spin_project ? N : (Ns * Nc) % N == 0 ? N : N / 2; + static constexpr int M_ghost = length_ghost / N_ghost; + using GhostVector = typename VectorType::type; + using real = typename mapper::type; + using complex = complex; + using norm_type = float; + int nParity; + array faceVolumeCB = {}; + mutable array ghost = {}; + mutable array ghost_norm = {}; + + GhostNOrder() = default; + GhostNOrder(const GhostNOrder &) = default; + + GhostNOrder(const ColorSpinorField &a, int nFace = 1, Float **ghost_ = 0) : nParity(a.SiteSubset()) + { + for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } + resetGhost(ghost_ ? (void **)ghost_ : a.Ghost()); + } + + GhostNOrder &operator=(const GhostNOrder &) = default; + + void resetGhost(void *const *ghost_) const + { + for (int dim = 0; dim < 4; dim++) { + for (int dir = 0; dir < 2; dir++) { + ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast(ghost_[2 * dim + dir]) : nullptr; + ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ? + nullptr : + reinterpret_cast(static_cast(ghost_[2 * dim + dir]) + + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float)); + } + } + } + + __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const + { + real v[length_ghost]; + norm_type nrm + = isFixed::value ? vector_load(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x) : 0.0; + +#pragma unroll + for (int i = 0; i < M_ghost; i++) { + GhostVector vecTmp = vector_load( + ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x); +#pragma unroll + for (int j = 0; j < N_ghost; j++) + copy_and_scale(v[i * N_ghost + j], reinterpret_cast(&vecTmp)[j], nrm); + } + +#pragma unroll + for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]); + } + + __device__ __host__ inline void saveGhost(const complex in[length_ghost / 2], int x, int dim, int dir, + int parity = 0) const + { + real v[length_ghost]; +#pragma unroll + for (int i = 0; i < length_ghost / 2; i++) { + v[2 * i + 0] = in[i].real(); + v[2 * i + 1] = in[i].imag(); + } + + if (isFixed::value) { + norm_type max_[length_ghost / 2]; + // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) +#pragma unroll + for (int i = 0; i < length_ghost / 2; i++) + max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2])); + norm_type scale = 0.0; +#pragma unroll + for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale); + ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue::value; + + real scale_inv = fdividef(fixedMaxValue::value, scale); +#pragma unroll + for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv; + } + +#pragma unroll + for (int i = 0; i < M_ghost; i++) { + GhostVector vecTmp; + // first do scalar copy converting into storage type +#pragma unroll + for (int j = 0; j < N_ghost; j++) copy_scaled(reinterpret_cast(&vecTmp)[j], v[i * N_ghost + j]); + // second do vectorized copy into memory + vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x, vecTmp); + } + } + }; + /** @brief Accessor routine for ColorSpinorFields in native field order. @tparam Float Underlying storage data type of the field @@ -1063,21 +1169,18 @@ namespace quda pointer arithmetic for huge allocations (e.g., packed set of vectors). Default is to use 32-bit pointer arithmetic. */ - template - struct FloatNOrder { + template + struct FloatNOrder : GhostNOrder { static_assert((2 * Ns * Nc) % N_ == 0, "Internal degrees of freedom not divisible by short-vector length"); static constexpr int length = 2 * Ns * Nc; - static constexpr int length_ghost = spin_project ? length / 2 : length; static constexpr int N = N_; static constexpr int M = length / N; - // if spin projecting, check that short vector length is compatible, if not halve the vector length - static constexpr int N_ghost = !spin_project ? N : (Ns * Nc) % N == 0 ? N : N / 2; - static constexpr int M_ghost = length_ghost / N_ghost; - using Accessor = FloatNOrder; + using Accessor = FloatNOrder; + using GhostNOrder = GhostNOrder; using real = typename mapper::type; using complex = complex; using Vector = typename VectorType::type; - using GhostVector = typename VectorType::type; using AllocInt = typename AllocType::type; using norm_type = float; Float *field = nullptr; @@ -1085,45 +1188,24 @@ namespace quda AllocInt offset = 0; // offset can be 32-bit or 64-bit AllocInt norm_offset = 0; int volumeCB = 0; - array faceVolumeCB = {}; - mutable array ghost = {}; - mutable array ghost_norm = {}; - int nParity = 0; - void *backup_h = nullptr; //! host memory for backing up the field when tuning size_t bytes = 0; FloatNOrder() = default; FloatNOrder(const FloatNOrder &) = default; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : + GhostNOrder(a, nFace, ghost_), field(buffer ? buffer : a.data()), norm(buffer ? reinterpret_cast(reinterpret_cast(buffer) + a.NormOffset()) : const_cast(reinterpret_cast(a.Norm()))), offset(a.Bytes() / (2 * sizeof(Float) * N)), norm_offset(a.Bytes() / (2 * sizeof(norm_type))), volumeCB(a.VolumeCB()), - nParity(a.SiteSubset()), bytes(a.Bytes()) { - for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } - resetGhost(ghost_ ? (void **)ghost_ : a.Ghost()); } - FloatNOrder &operator=(const FloatNOrder &) = default; - void resetGhost(void *const *ghost_) const - { - for (int dim = 0; dim < 4; dim++) { - for (int dir = 0; dir < 2; dir++) { - ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast(ghost_[2 * dim + dir]) : nullptr; - ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ? - nullptr : - reinterpret_cast(static_cast(ghost_[2 * dim + dir]) - + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float)); - } - } - } - __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const { real v[length]; @@ -1193,20 +1275,71 @@ namespace quda return colorspinor_wrapper(*this, x_cb, parity); } + /** + @brief This accessor routine returns a const + colorspinor_ghost_wrapper to this object, allowing us to + overload various operators for manipulating at the site + level interms of matrix operations. + @param[in] dim Dimensions of the ghost we are requesting + @param[in] ghost_idx Checkerboarded space-time ghost index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a colorspinor_ghost+wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto Ghost(int dim, int dir, int ghost_idx, int parity) const + { + return colorspinor_ghost_wrapper(*this, dim, dir, ghost_idx, parity); + } + + size_t Bytes() const { return bytes; } + }; + + template + struct GhostNOrder { + using Float = short; + static constexpr int Ns = 1; + static constexpr int Nc = 3; + static constexpr int length_ghost = 2 * Ns * Nc; + using GhostVector = int4; // 128-bit packed type + using real = typename mapper::type; + using complex = complex; + using norm_type = float; + int nParity; + array faceVolumeCB = {}; + mutable array ghost = {}; + mutable array ghost_norm = {}; + + GhostNOrder() = default; + GhostNOrder(const GhostNOrder &) = default; + + GhostNOrder(const ColorSpinorField &a, int nFace = 1, Float **ghost_ = 0) : nParity(a.SiteSubset()) + { + for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } + resetGhost(ghost_ ? (void **)ghost_ : a.Ghost()); + } + + GhostNOrder &operator=(const GhostNOrder &) = default; + + void resetGhost(void *const *ghost_) const + { + for (int dim = 0; dim < 4; dim++) { + for (int dir = 0; dir < 2; dir++) { + ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast(ghost_[2 * dim + dir]) : nullptr; + } + } + } + __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const { real v[length_ghost]; - norm_type nrm - = isFixed::value ? vector_load(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x) : 0.0; + GhostVector vecTmp = vector_load(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x); + + // extract the norm + norm_type nrm; + memcpy(&nrm, &vecTmp.w, sizeof(norm_type)); #pragma unroll - for (int i = 0; i < M_ghost; i++) { - GhostVector vecTmp = vector_load( - ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x); -#pragma unroll - for (int j = 0; j < N_ghost; j++) - copy_and_scale(v[i * N_ghost + j], reinterpret_cast(&vecTmp)[j], nrm); - } + for (int i = 0; i < length_ghost; i++) copy_and_scale(v[i], reinterpret_cast(&vecTmp)[i], nrm); #pragma unroll for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]); @@ -1216,78 +1349,34 @@ namespace quda int parity = 0) const { real v[length_ghost]; + #pragma unroll for (int i = 0; i < length_ghost / 2; i++) { v[2 * i + 0] = in[i].real(); v[2 * i + 1] = in[i].imag(); } - if (isFixed::value) { - norm_type max_[length_ghost / 2]; - // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) -#pragma unroll - for (int i = 0; i < length_ghost / 2; i++) - max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2])); - norm_type scale = 0.0; + norm_type max_[length_ghost / 2]; + // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) #pragma unroll - for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale); - ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue::value; - - real scale_inv = fdividef(fixedMaxValue::value, scale); + for (int i = 0; i < length_ghost / 2; i++) + max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length_ghost / 2])); + norm_type scale = 0.0; #pragma unroll - for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv; - } + for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale); + norm_type nrm = scale * fixedInvMaxValue::value; + real scale_inv = fdividef(fixedMaxValue::value, scale); #pragma unroll - for (int i = 0; i < M_ghost; i++) { - GhostVector vecTmp; - // first do scalar copy converting into storage type -#pragma unroll - for (int j = 0; j < N_ghost; j++) copy_scaled(reinterpret_cast(&vecTmp)[j], v[i * N_ghost + j]); - // second do vectorized copy into memory - vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x, vecTmp); - } - } - - /** - @brief This accessor routine returns a const - colorspinor_ghost_wrapper to this object, allowing us to - overload various operators for manipulating at the site - level interms of matrix operations. - @param[in] dim Dimensions of the ghost we are requesting - @param[in] ghost_idx Checkerboarded space-time ghost index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a colorspinor_ghost+wrapper that curries in access to - this field at the above coordinates. - */ - __device__ __host__ inline auto Ghost(int dim, int dir, int ghost_idx, int parity) const - { - return colorspinor_ghost_wrapper(*this, dim, dir, ghost_idx, parity); - } - - /** - @brief Backup the field to the host when tuning - */ - void save() - { - if (backup_h) errorQuda("Already allocated host backup"); - backup_h = safe_malloc(bytes); - qudaMemcpy(backup_h, field, bytes, qudaMemcpyDeviceToHost); - } + for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv; - /** - @brief Restore the field from the host after tuning - */ - void load() - { - qudaMemcpy(field, backup_h, bytes, qudaMemcpyHostToDevice); - host_free(backup_h); - backup_h = nullptr; - } + GhostVector vecTmp; + memcpy(&vecTmp.w, &nrm, sizeof(norm_type)); // pack the norm - size_t Bytes() const - { - return nParity * volumeCB * (Nc * Ns * 2 * sizeof(Float) + (isFixed::value ? sizeof(norm_type) : 0)); + // pack the spinor elements +#pragma unroll + for (int i = 0; i < length_ghost; i++) copy_scaled(reinterpret_cast(&vecTmp)[i], v[i]); + vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x, vecTmp); } }; @@ -1301,54 +1390,39 @@ namespace quda pointer arithmetic for huge allocations (e.g., packed set of vectors). Default is to use 32-bit pointer arithmetic. */ - template - struct FloatNOrder { + template + struct FloatNOrder + : GhostNOrder { using Float = short; static constexpr int Ns = 1; static constexpr int Nc = 3; static constexpr int length = 2 * Ns * Nc; - static constexpr int length_ghost = 2 * Ns * Nc; - using Accessor = FloatNOrder; + using Accessor = FloatNOrder; + using GhostNOrder = GhostNOrder; using real = typename mapper::type; using complex = complex; using Vector = int4; // 128-bit packed type - using GhostVector = int4; // 128-bit packed type using AllocInt = typename AllocType::type; using norm_type = float; Float *field = nullptr; - const AllocInt offset = 0; // offset can be 32-bit or 64-bit + AllocInt offset = 0; // offset can be 32-bit or 64-bit int volumeCB = 0; - array faceVolumeCB = {}; - mutable array ghost = {}; - int nParity = 0; - void *backup_h = nullptr; //! host memory for backing up the field when tuning size_t bytes = 0; FloatNOrder() = default; FloatNOrder(const FloatNOrder &) = default; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : + GhostNOrder(a, nFace, ghost_), field(buffer ? buffer : a.data()), offset(a.Bytes() / (2 * sizeof(Vector))), volumeCB(a.VolumeCB()), - nParity(a.SiteSubset()), bytes(a.Bytes()) { - for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } - resetGhost(ghost_ ? (void **)ghost_ : a.Ghost()); } FloatNOrder &operator=(const FloatNOrder &) = default; - void resetGhost(void *const *ghost_) const - { - for (int dim = 0; dim < 4; dim++) { - for (int dir = 0; dir < 2; dir++) { - ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast(ghost_[2 * dim + dir]) : nullptr; - } - } - } - __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const { real v[length]; @@ -1414,56 +1488,6 @@ namespace quda return colorspinor_wrapper(*this, x_cb, parity); } - __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const - { - real v[length_ghost]; - GhostVector vecTmp = vector_load(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x); - - // extract the norm - norm_type nrm; - memcpy(&nrm, &vecTmp.w, sizeof(norm_type)); - -#pragma unroll - for (int i = 0; i < length_ghost; i++) copy_and_scale(v[i], reinterpret_cast(&vecTmp)[i], nrm); - -#pragma unroll - for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]); - } - - __device__ __host__ inline void saveGhost(const complex in[length_ghost / 2], int x, int dim, int dir, - int parity = 0) const - { - real v[length_ghost]; - -#pragma unroll - for (int i = 0; i < length_ghost / 2; i++) { - v[2 * i + 0] = in[i].real(); - v[2 * i + 1] = in[i].imag(); - } - - norm_type max_[length_ghost / 2]; - // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) -#pragma unroll - for (int i = 0; i < length_ghost / 2; i++) - max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length_ghost / 2])); - norm_type scale = 0.0; -#pragma unroll - for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale); - norm_type nrm = scale * fixedInvMaxValue::value; - - real scale_inv = fdividef(fixedMaxValue::value, scale); -#pragma unroll - for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv; - - GhostVector vecTmp; - memcpy(&vecTmp.w, &nrm, sizeof(norm_type)); // pack the norm - - // pack the spinor elements -#pragma unroll - for (int i = 0; i < length_ghost; i++) copy_scaled(reinterpret_cast(&vecTmp)[i], v[i]); - vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x, vecTmp); - } - /** @brief This accessor routine returns a const colorspinor_ghost_wrapper to this object, allowing us to @@ -1480,30 +1504,7 @@ namespace quda return colorspinor_ghost_wrapper(*this, dim, dir, ghost_idx, parity); } - /** - @brief Backup the field to the host when tuning - */ - void save() - { - if (backup_h) errorQuda("Already allocated host backup"); - backup_h = safe_malloc(bytes); - qudaMemcpy(backup_h, field, bytes, qudaMemcpyDeviceToHost); - } - - /** - @brief Restore the field from the host after tuning - */ - void load() - { - qudaMemcpy(field, backup_h, bytes, qudaMemcpyHostToDevice); - host_free(backup_h); - backup_h = nullptr; - } - - size_t Bytes() const - { - return nParity * volumeCB * (Nc * Ns * 2 * sizeof(Float) + (isFixed::value ? sizeof(norm_type) : 0)); - } + size_t Bytes() const { return bytes; } }; template struct SpaceColorSpinorOrder { @@ -1820,63 +1821,80 @@ namespace quda } // namespace colorspinor // Use traits to reduce the template explosion - template struct colorspinor_mapper { + template + struct colorspinor_mapper { }; // double precision - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; // single precision - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(2), false, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(2), false, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; // half precision - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(4), false, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(4), false, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(4), true, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(4), true, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(2), false, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(2), false, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; // quarter precision - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(4), false, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(4), false, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(4), true, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(4), true, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder(2), false, huge_alloc> type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder(2), false, huge_alloc, disable_ghost> type; }; - template struct colorspinor_mapper { - typedef colorspinor::FloatNOrder type; + template + struct colorspinor_mapper { + typedef colorspinor::FloatNOrder type; }; template struct colorspinor_order_mapper { diff --git a/include/contract_quda.h b/include/contract_quda.h index 1f5c509687..3cdc2ee843 100644 --- a/include/contract_quda.h +++ b/include/contract_quda.h @@ -1,9 +1,20 @@ #pragma once #include -#include namespace quda { void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType); + + /** + @brief Contract the quark field x against the 3-d Laplace eigenvector + set y. At present, this expects a spin-4 fermion field, and the + Laplace eigenvector is a spin-1 field. + @param[out] result array of length 4 * Lt + @param[in] x Input fermion field set we are contracting + @param[in] y Input eigenvector field set we are contracting against + */ + void evecProjectLaplace3D(std::vector &result, cvector_ref &x, + cvector_ref &y); + } // namespace quda diff --git a/include/dirac_quda.h b/include/dirac_quda.h index e9bdb66fc6..bddd6104f6 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -56,6 +56,7 @@ namespace quda { GaugeField *fatGauge; // used by staggered only GaugeField *longGauge; // used by staggered only int laplace3D; + int covdev_mu; CloverField *clover; GaugeField *xInvKD; // used for the Kahler-Dirac operator only @@ -117,6 +118,7 @@ namespace quda { printfQuda("kappa = %g\n", kappa); printfQuda("mass = %g\n", mass); printfQuda("laplace3D = %d\n", laplace3D); + printfQuda("covdev_mu = %d\n", covdev_mu); printfQuda("m5 = %g\n", m5); printfQuda("Ls = %d\n", Ls); printfQuda("matpcType = %d\n", matpcType); @@ -1098,6 +1100,14 @@ namespace quda { void gamma5(ColorSpinorField &out, const ColorSpinorField &in); + /** + @brief Applies a pauli matrices to a spinor doublet + @param[out] out Output field + @param[in] in Input field + @param[d] d index of the pauli matrix + */ + void ApplyTau(ColorSpinorField &out, const ColorSpinorField &in, int d); + // Full twisted mass class DiracTwistedMass : public DiracWilson { @@ -2214,6 +2224,9 @@ namespace quda { */ class GaugeCovDev : public Dirac { + protected: + int covdev_mu; + public: GaugeCovDev(const DiracParam ¶m); GaugeCovDev(const GaugeCovDev &covDev); diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index a2c0300a1d..38079c3cbb 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -1480,7 +1480,7 @@ namespace quda { for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); } else { // stagic phase #pragma unroll - for (int i = 0; i < 18; i++) { out[i] *= phase; } + for (int i = 0; i < 9; i++) { out[i] *= phase; } } } }; diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h index 1c96e2e7ea..5a499cde8c 100644 --- a/include/instantiate_dslash.h +++ b/include/instantiate_dslash.h @@ -166,4 +166,13 @@ namespace quda } #endif + constexpr bool is_enabled_laplace() + { +#ifdef GPU_LAPLACE + return true; +#else + return false; +#endif + } + } // namespace quda diff --git a/include/kernels/clover_outer_product.cuh b/include/kernels/clover_outer_product.cuh index 70e505b149..7327201e09 100644 --- a/include/kernels/clover_outer_product.cuh +++ b/include/kernels/clover_outer_product.cuh @@ -8,13 +8,15 @@ namespace quda { - template + template struct CloverForceArg : kernel_param<> { using real = typename mapper::type; static constexpr int nColor = nColor_; static constexpr int nSpin = 4; static constexpr int dim = dim_; static constexpr int spin_project = true; + static constexpr bool doublet = doublet_; // whether we applying the operator to a doublet + static constexpr int n_flavor = doublet ? 2 : 1; using F = typename colorspinor_mapper::type; using Gauge = typename gauge_mapper::type; using Force = typename gauge_mapper::type; @@ -30,11 +32,16 @@ namespace quda { int displacement; bool partitioned[4]; real coeff; + const unsigned int volume_4d_cb; + const unsigned int ghost_face_4d_cb; CloverForceArg(GaugeField &force, const GaugeField &U, const ColorSpinorField &inA, const ColorSpinorField &inB, const ColorSpinorField &inC, const ColorSpinorField &inD, const unsigned int parity, const double coeff) : - kernel_param(dim3(dim == -1 ? inA.VolumeCB() : inB.GhostFaceCB()[dim])), // inB since it has a ghost allocated + kernel_param(dim3(dim == -1 ? inA.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET ? inA.VolumeCB() / 2 : inA.VolumeCB() : + inA.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET ? + inB.GhostFaceCB()[dim] / 2 : + inB.GhostFaceCB()[dim])), // inB since it has a ghost allocated force(force), inA(inA), inB(inB), @@ -43,7 +50,9 @@ namespace quda { U(U), parity(parity), displacement(1), - coeff(coeff) + coeff(coeff), + volume_4d_cb(inA.VolumeCB() / 2), + ghost_face_4d_cb(inB.GhostFaceCB()[dim] / 2) { for (int i=0; i<4; ++i) this->X[i] = U.X()[i]; for (int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false; @@ -76,31 +85,36 @@ namespace quda { using Spinor = ColorSpinor; using Link = Matrix; - Spinor A = arg.inA(x_cb, 0); - Spinor C = arg.inC(x_cb, 0); +#pragma unroll + for (int flavor = 0; flavor < Arg::n_flavor; ++flavor) { + + const int flavor_offset_idx = flavor * arg.volume_4d_cb; + Spinor A = arg.inA(x_cb + flavor_offset_idx, 0); + Spinor C = arg.inC(x_cb + flavor_offset_idx, 0); #pragma unroll - for (int dim=0; dim<4; ++dim) { - int shift[4] = {0, 0, 0, 0}; - shift[dim] = 1; - const int nbr_idx = neighborIndex(x_cb, shift, arg.partitioned, arg.parity, arg.X); - - if (nbr_idx >= 0) { - Spinor B_shift = arg.inB(nbr_idx, 0); - Spinor D_shift = arg.inD(nbr_idx, 0); - - B_shift = (B_shift.project(dim,1)).reconstruct(dim,1); - Link result = outerProdSpinTrace(B_shift,A); - - D_shift = (D_shift.project(dim,-1)).reconstruct(dim,-1); - result += outerProdSpinTrace(D_shift,C); - - Link temp = arg.force(dim, x_cb, arg.parity); - Link U = arg.U(dim, x_cb, arg.parity); - result = temp + U*result*arg.coeff; - arg.force(dim, x_cb, arg.parity) = result; - } - } // dim + for (int dim = 0; dim < 4; ++dim) { + int shift[4] = {0, 0, 0, 0}; + shift[dim] = 1; + const int nbr_idx = neighborIndex(x_cb, shift, arg.partitioned, arg.parity, arg.X); + + if (nbr_idx >= 0) { + Spinor B_shift = arg.inB(nbr_idx + flavor_offset_idx, 0); + Spinor D_shift = arg.inD(nbr_idx + flavor_offset_idx, 0); + + B_shift = (B_shift.project(dim, 1)).reconstruct(dim, 1); + Link result = outerProdSpinTrace(B_shift, A); + + D_shift = (D_shift.project(dim, -1)).reconstruct(dim, -1); + result += outerProdSpinTrace(D_shift, C); + + Link temp = arg.force(dim, x_cb, arg.parity); + Link U = arg.U(dim, x_cb, arg.parity); + result = temp + U * result * arg.coeff; + arg.force(dim, x_cb, arg.parity) = result; + } + } // dim + } // flavor } }; @@ -117,23 +131,28 @@ namespace quda { using Link = Matrix; int x[4]; - coordsFromIndexExterior(x, x_cb, arg.X, Arg::dim, arg.displacement, arg.parity); - const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1); - Spinor A = arg.inA(bulk_cb_idx, 0); - Spinor C = arg.inC(bulk_cb_idx, 0); - - HalfSpinor projected_tmp = arg.inB.Ghost(Arg::dim, 1, x_cb, 0); - Spinor B_shift = projected_tmp.reconstruct(Arg::dim, 1); - Link result = outerProdSpinTrace(B_shift,A); - - projected_tmp = arg.inD.Ghost(Arg::dim, 1, x_cb, 0); - Spinor D_shift = projected_tmp.reconstruct(Arg::dim,-1); - result += outerProdSpinTrace(D_shift,C); - - Link temp = arg.force(Arg::dim, bulk_cb_idx, arg.parity); - Link U = arg.U(Arg::dim, bulk_cb_idx, arg.parity); - result = temp + U*result*arg.coeff; - arg.force(Arg::dim, bulk_cb_idx, arg.parity) = result; +#pragma unroll + for (int flavor = 0; flavor < Arg::n_flavor; ++flavor) { + const int flavor_offset_bulk_idx = flavor * arg.volume_4d_cb; + const int flavor_offset_ghost_idx = flavor * arg.ghost_face_4d_cb; + coordsFromIndexExterior(x, x_cb, arg.X, Arg::dim, arg.displacement, arg.parity); + const unsigned int bulk_cb_idx = ((((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0]) >> 1); + Spinor A = arg.inA(bulk_cb_idx + flavor_offset_bulk_idx, 0); + Spinor C = arg.inC(bulk_cb_idx + flavor_offset_bulk_idx, 0); + + HalfSpinor projected_tmp = arg.inB.Ghost(Arg::dim, 1, x_cb + flavor_offset_ghost_idx, 0); + Spinor B_shift = projected_tmp.reconstruct(Arg::dim, 1); + Link result = outerProdSpinTrace(B_shift, A); + + projected_tmp = arg.inD.Ghost(Arg::dim, 1, x_cb + flavor_offset_ghost_idx, 0); + Spinor D_shift = projected_tmp.reconstruct(Arg::dim, -1); + result += outerProdSpinTrace(D_shift, C); + + Link temp = arg.force(Arg::dim, bulk_cb_idx, arg.parity); + Link U = arg.U(Arg::dim, bulk_cb_idx, arg.parity); + result = temp + U * result * arg.coeff; + arg.force(Arg::dim, bulk_cb_idx, arg.parity) = result; + } } }; diff --git a/include/kernels/clover_sigma_outer_product.cuh b/include/kernels/clover_sigma_outer_product.cuh index e4b901cdc1..0e39dd83a1 100644 --- a/include/kernels/clover_sigma_outer_product.cuh +++ b/include/kernels/clover_sigma_outer_product.cuh @@ -11,23 +11,25 @@ namespace quda // FIXME - make this multi-RHS once we have the multi-RHS framework developed #define MAX_NVECTOR 1 - template - struct CloverSigmaOprodArg : kernel_param<> { + template struct CloverSigmaOprodArg : kernel_param<> { using real = typename mapper::type; static constexpr int nColor = nColor_; static constexpr int nSpin = 4; static constexpr int nvector = nvector_; + static constexpr bool doublet = doublet_; // whether we applying the operator to a doublet + static constexpr int n_flavor = doublet ? 2 : 1; using Oprod = typename gauge_mapper::type; using F = typename colorspinor_mapper::type; Oprod oprod; + const unsigned int volume_4d_cb; F inA[nvector]; F inB[nvector]; array_2d coeff; CloverSigmaOprodArg(GaugeField &oprod, cvector_ref &inA, cvector_ref &inB, const std::vector> &coeff_) : - kernel_param(dim3(oprod.VolumeCB(), 2, 6)), oprod(oprod) + kernel_param(dim3(oprod.VolumeCB(), 2, 6)), oprod(oprod), volume_4d_cb(inA[0].VolumeCB() / 2) { for (int i = 0; i < nvector; i++) { this->inA[i] = inA[i]; @@ -46,10 +48,14 @@ namespace quda #pragma unroll for (int i = 0; i < Arg::nvector; i++) { - const Spinor A = arg.inA[i](x_cb, parity); - const Spinor B = arg.inB[i](x_cb, parity); - Spinor C = A.sigma(nu, mu); // multiply by sigma_mu_nu - result += arg.coeff[i][parity] * outerProdSpinTrace(C, B); +#pragma unroll + for (int flavor = 0; flavor < Arg::n_flavor; flavor++) { + const int flavor_offset_idx = flavor * (arg.volume_4d_cb); + const Spinor A = arg.inA[i](x_cb + flavor_offset_idx, parity); + const Spinor B = arg.inB[i](x_cb + flavor_offset_idx, parity); + Spinor C = A.sigma(nu, mu); // multiply by sigma_mu_nu + result += arg.coeff[i][parity] * outerProdSpinTrace(C, B); + } } result -= conj(result); diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh index 5261ea5b32..12fd8ee88d 100644 --- a/include/kernels/dslash_gamma_helper.cuh +++ b/include/kernels/dslash_gamma_helper.cuh @@ -20,7 +20,8 @@ namespace quda { const F in; // input vector field const int d; // which gamma matrix are we applying const int nParity; // number of parities we're working on - bool doublet; // whether we applying the operator to a doublet + const bool doublet; // whether we applying the operator to a doublet + const int n_flavor; // number of flavors const int volumeCB; // checkerboarded volume real a; // scale factor real b; // chiral twist @@ -31,6 +32,7 @@ namespace quda { bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID) : out(out), in(in), d(d), nParity(in.SiteSubset()), doublet(in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET), + n_flavor(doublet ? 2 : 1), volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0) { checkPrecision(out, in); @@ -76,13 +78,15 @@ namespace quda { __device__ __host__ void operator()(int x_cb, int parity) { - ColorSpinor in = arg.in(x_cb, parity); - switch(arg.d) { - case 0: arg.out(x_cb, parity) = in.gamma(0); break; - case 1: arg.out(x_cb, parity) = in.gamma(1); break; - case 2: arg.out(x_cb, parity) = in.gamma(2); break; - case 3: arg.out(x_cb, parity) = in.gamma(3); break; - case 4: arg.out(x_cb, parity) = in.gamma(4); break; + for (int i = 0; i < arg.n_flavor; i++) { + ColorSpinor in = arg.in(x_cb + i * arg.volumeCB, parity); + switch(arg.d) { + case 0: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(0); break; + case 1: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(1); break; + case 2: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(2); break; + case 3: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(3); break; + case 4: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(4); break; + } } } }; @@ -103,12 +107,78 @@ namespace quda { fermion_t in = arg.in(x_cb, parity); arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(d)); } else { - fermion_t in_1 = arg.in(x_cb+0*arg.volumeCB, parity); - fermion_t in_2 = arg.in(x_cb+1*arg.volumeCB, parity); + fermion_t in_1 = arg.in(x_cb + 0 * arg.volumeCB, parity); + fermion_t in_2 = arg.in(x_cb + 1 * arg.volumeCB, parity); arg.out(x_cb + 0 * arg.volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(d) + arg.c * in_2); arg.out(x_cb + 1 * arg.volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(d) + arg.c * in_1); } } }; + /** + @brief Parameter structure for driving the Tau operator + */ + template struct TauArg : kernel_param<> { + using real = typename mapper::type; + constexpr static int nColor = nColor_; + typedef typename colorspinor_mapper::type F; + + F out; // output vector field + const F in; // input vector field + const int d; // which gamma matrix are we applying + const int nParity; // number of parities we're working on + bool doublet; // whether we applying the operator to a doublet + const int volumeCB; // checkerboarded volume + + TauArg(ColorSpinorField &out, const ColorSpinorField &in, int d) : + out(out), + in(in), + d(d), + nParity(in.SiteSubset()), + doublet(in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET), + volumeCB(doublet ? in.VolumeCB() / 2 : in.VolumeCB()) + { + checkPrecision(out, in); + checkLocation(out, in); + if (d < 1 || d > 3) errorQuda("Undefined tau matrix %d", d); + if (in.Nspin() != 4) errorQuda("Cannot apply tau to nSpin=%d field", in.Nspin()); + if (!in.isNative() || !out.isNative()) + errorQuda("Unsupported field order out=%d in=%d\n", out.FieldOrder(), in.FieldOrder()); + if (!doublet) errorQuda("tau matrix can be applyed only to spinor doublet"); + + this->threads = dim3(doublet ? in.VolumeCB() / 2 : in.VolumeCB(), in.SiteSubset(), 1); + } + }; + /** + @brief Application of Gamma matrix to a color spinor field + */ + template struct Tau { + using fermion_t = ColorSpinor; + const Arg &arg; + constexpr Tau(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ void operator()(int x_cb, int parity) + { + fermion_t in_1 = arg.in(x_cb + 0 * arg.volumeCB, parity); + fermion_t in_2 = arg.in(x_cb + 1 * arg.volumeCB, parity); + const complex j(0.0, 1.0); + const typename Arg::real m1(-1); + + switch (arg.d) { + case 1: + arg.out(x_cb + 0 * arg.volumeCB, parity) = in_2; + arg.out(x_cb + 1 * arg.volumeCB, parity) = in_1; + break; + case 2: + arg.out(x_cb + 0 * arg.volumeCB, parity) = -j * in_2; + arg.out(x_cb + 1 * arg.volumeCB, parity) = j * in_1; + break; + case 3: + arg.out(x_cb + 0 * arg.volumeCB, parity) = in_1; + arg.out(x_cb + 1 * arg.volumeCB, parity) = m1 * in_2; + break; + } + } + }; } diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh new file mode 100644 index 0000000000..cdd2ac9716 --- /dev/null +++ b/include/kernels/evec_project.cuh @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace quda { + + using spinor_array = array; + + constexpr unsigned long max_nx = 4; + constexpr unsigned long max_ny = 4; + + template + struct EvecProjectionArg : public ReduceArg + { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpinX = 4; + static constexpr int nSpinY = 1; + + typedef typename colorspinor_mapper::type F4; + typedef typename colorspinor_mapper::type F1; + + static constexpr unsigned int max_n_batch_block = 8; + int_fastdiv nx; + int_fastdiv ny; + F4 x[max_nx]; + F1 y[max_ny]; + + int_fastdiv X[4]; // grid dimensions + + EvecProjectionArg(cvector_ref &x, cvector_ref &y) : + ReduceArg(dim3(x.Volume()/x[0].X()[3], 1, x.size() * y.size() * x[0].X()[3]), + x.size() * y.size() * x[0].X()[3]), + nx(x.size()), ny(y.size()) + { + if (x.size() > max_nx) errorQuda("Requested vector size %lu greater than max %lu", x.size(), max_nx); + if (y.size() > max_ny) errorQuda("Requested vector size %lu greater than max %lu", y.size(), max_ny); + for (int i = 0 ; i < 4; i++) X[i] = x[0].X()[i]; + for (auto i = 0u; i < x.size(); i++) this->x[i] = x[i]; + for (auto i = 0u; i < y.size(); i++) this->y[i] = y[i]; + } + __device__ __host__ spinor_array init() const { return spinor_array(); } + }; + + template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) + { + int x[4]; +#pragma unroll + for (int d = 0; d < 4; d++) { + if (d != reduction_dim) { + x[d] = xyz % X[d]; + xyz = xyz / X[d]; + } + } + x[reduction_dim] = t; + return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); + } + + template struct EvecProjection : plus { + using reduce_t = spinor_array; + using plus::operator(); + static constexpr int reduce_block_dim = 1; // only doing a reduct in the x thread dimension + + const Arg &arg; + constexpr EvecProjection(const Arg &arg) : arg(arg) {} + static constexpr const char *filename() { return KERNEL_FILE; } + // overload comm_reduce to defer until the entire "tile" is complete + template static inline void comm_reduce(U &) { } + + // Final param is unused in the MultiReduce functor in this use case. + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int, int ijt) + { + int i = ijt % arg.nx; + int jt = ijt / arg.nx; + int j = jt % arg.ny; + int t = jt / arg.ny; + + constexpr int nSpinX = Arg::nSpinX; + constexpr int nSpinY = Arg::nSpinY; + constexpr int nColor = Arg::nColor; + using real = typename Arg::real; + using Vector4 = ColorSpinor; + using Vector1 = ColorSpinor; + + // Collect vector data + int parity = 0; + // Always use the dim in this kernel + int idx = idx_from_t_xyz<3>(t, xyz, arg.X); + + // This helper will change the value of 'parity' to the correct one, + // and return the checkerboard index. + int idx_cb = getParityCBFromFull(parity, arg.X, idx); + Vector4 x = arg.x[i](idx_cb, parity); + Vector1 y = arg.y[j](idx_cb, parity); + + // Compute the inner product over colour + reduce_t result_local; + for (int mu = 0; mu < nSpinX; mu++) { + complex prod = innerProduct(y, x, 0, mu); + result_local[2 * mu + 0] = prod.real(); + result_local[2 * mu + 1] = prod.imag(); + } + + return plus::operator()(result_local, result); + } + }; +} diff --git a/include/quda.h b/include/quda.h index de88d1d0d2..cc62cefdb8 100644 --- a/include/quda.h +++ b/include/quda.h @@ -132,10 +132,12 @@ extern "C" { double mu; /**< Twisted mass parameter */ double tm_rho; /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ double epsilon; /**< Twisted mass parameter */ + double evmax; /** maximum of the eigenvalues of the ndeg twisted mass operator needed for fermionic forces **/ QudaTwistFlavorType twist_flavor; /**< Twisted mass flavor */ int laplace3D; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ + int covdev_mu; /**< Apply forward/backward covariant derivative in direction mu(mu<=3)/mu-4(mu>3) */ double tol; /**< Solver tolerance in the L2 residual norm */ double tol_restart; /**< Solver tolerance in the L2 residual norm (used to restart InitCG) */ @@ -1797,6 +1799,24 @@ extern "C" { */ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param); + /** + * @brief Performs contractions between a set of quark fields and + * eigenvectors of the 3-d Laplace operator. + * @param[in,out] host_sinks An array representing the inner + * products between the quark fields and the eigen-vector fields. + * Ordered as [nQuark][nEv][Lt][nSpin][complexity]. + * @param[in] host_quark Array of quark fields we are taking the inner over + * @param[in] n_quark Number of quark fields + * @param[in] tile_quark Tile size for quark fields (batch size) + * @param[in] host_evec Array of eigenvectors we are taking the inner over + * @param[in] n_evec Number of eigenvectors + * @param[in] tile_evec Tile size for eigenvectors (batch size) + * @param[in] inv_param Meta-data structure + * @param[in] X Lattice dimensions + */ + void laphSinkProject(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, + void **host_evec, int nevec, int tile_evec, QudaInvertParam *inv_param, const int X[4]); + #ifdef __cplusplus } #endif diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 2692833f38..374fd725df 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -119,8 +119,8 @@ #define GPU_STAGGERED_DIRAC #endif -#cmakedefine QUDA_LAPLACE -#ifdef QUDA_LAPLACE +#cmakedefine QUDA_DIRAC_LAPLACE +#ifdef QUDA_DIRAC_LAPLACE /** * @def GPU_LAPLACE * @brief This macro is set when we have the Laplace operator enabled diff --git a/include/quda_internal.h b/include/quda_internal.h index dd8a6c8177..37337b0d4b 100644 --- a/include/quda_internal.h +++ b/include/quda_internal.h @@ -30,7 +30,7 @@ #define NSPIN2 #endif -#if defined(GPU_STAGGERED_DIRAC) +#if defined(GPU_STAGGERED_DIRAC) || defined(GPU_LAPLACE) #define NSPIN1 #endif diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h index f567246949..5501e3cac6 100644 --- a/include/reference_wrapper_helper.h +++ b/include/reference_wrapper_helper.h @@ -295,6 +295,49 @@ namespace quda for (auto i = 0u; i < vector::size(); i++) odd.push_back(operator[](i).Odd()); return odd; } + + template + std::enable_if_t, ColorSpinorField>, QudaPrecision> Precision() const + { + for (auto i = 1u; i < vector::size(); i++) + if (operator[](i - 1).Precision() != operator[](i).Precision()) + errorQuda("Precisions %d %d do not match", operator[](i - 1).Precision(), operator[](i).Precision()); + return operator[](0).Precision(); + } + + template + std::enable_if_t, ColorSpinorField>, int> Ncolor() const + { + for (auto i = 1u; i < vector::size(); i++) + if (operator[](i - 1).Ncolor() != operator[](i).Ncolor()) + errorQuda("Ncolors do not match %d != %d", operator[](i - 1).Ncolor(), operator[](i).Ncolor()); + return operator[](0).Ncolor(); + } + + template std::enable_if_t, ColorSpinorField>, int> Nspin() const + { + for (auto i = 1u; i < vector::size(); i++) + if (operator[](i - 1).Nspin() != operator[](i).Nspin()) + errorQuda("Nspins do not match %d != %d", operator[](i - 1).Nspin(), operator[](i).Nspin()); + return operator[](0).Nspin(); + } + + template + std::enable_if_t, ColorSpinorField>, size_t> Volume() const + { + for (auto i = 1u; i < vector::size(); i++) + if (operator[](i - 1).Volume() != operator[](i).Volume()) + errorQuda("Volumes do not match %lu != %lu", operator[](i - 1).Volume(), operator[](i).Volume()); + return operator[](0).Volume(); + } + + template + std::enable_if_t, ColorSpinorField>, size_t> Bytes() const + { + size_t bytes = 0; + for (auto i = 0u; i < vector::size(); i++) bytes += operator[](i).Bytes(); + return bytes; + } }; template using cvector_ref = const vector_ref; diff --git a/include/timer.h b/include/timer.h index 6c7891f103..a98928ec39 100644 --- a/include/timer.h +++ b/include/timer.h @@ -215,6 +215,8 @@ namespace quda { TimeProfile(std::string fname, bool use_global) : fname(fname), switchOff(false), use_global(use_global) { ; } + auto Name() const { return fname; } + /**< Print out the profile information */ void Print(); @@ -245,6 +247,7 @@ namespace quda { double &secs; double &gflops; uint64_t flops; + bool active = false; pushProfile(TimeProfile &profile, double &secs = secs_dummy, double &gflops = gflops_dummy); virtual ~pushProfile(); }; diff --git a/include/tunable_reduction.h b/include/tunable_reduction.h index 17241fcccc..c575cb8135 100644 --- a/include/tunable_reduction.h +++ b/include/tunable_reduction.h @@ -338,7 +338,7 @@ namespace quda param.shared_bytes = shared_bytes; return true; } else { // we have run off the end so let's reset - param.block.z = 1; + param.block.z = std::max(1u, (n_batch + device::max_grid_size(2) - 1) / device::max_grid_size(2)); param.grid.z = (n_batch + param.block.z - 1) / param.block.z; return false; } @@ -352,7 +352,8 @@ namespace quda void initTuneParam(TuneParam ¶m) const { TunableReduction2D::initTuneParam(param); - param.block = {param.block.x, param.block.y, 1}; + param.block = {param.block.x, param.block.y, + std::max(1u, (n_batch + device::max_grid_size(2) - 1) / device::max_grid_size(2))}; param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z}; setSharedBytes(param); } @@ -364,7 +365,8 @@ namespace quda void defaultTuneParam(TuneParam ¶m) const { TunableReduction2D::defaultTuneParam(param); - param.block = {param.block.x, param.block.y, 1}; + param.block = {param.block.x, param.block.y, + std::max(1u, (n_batch + device::max_grid_size(2) - 1) / device::max_grid_size(2))}; param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z}; setSharedBytes(param); } diff --git a/include/tune_quda.h b/include/tune_quda.h index c2cc221d4e..f11763f068 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -96,7 +96,7 @@ namespace quda { by the autotuner. This defaults to twice the number of processors on the GPU, since it's unlikely a large grid size will help (if a kernels needs more parallelism, the autotuner - will find this through increased block size. + will find this through increased block size). */ virtual unsigned int maxGridSize() const { return 2 * device::processor_count(); } diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index e9b7945ea7..4ed829ff7c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -36,6 +36,7 @@ set (QUDA_OBJS field_cache.cpp gauge_covdev.cpp dirac.cpp clover_field.cpp lattice_field.cpp gauge_field.cpp + evec_project.cu extract_gauge_ghost.cu gauge_norm.cu gauge_update_quda.cu max_clover.cu dirac_clover.cpp dirac_wilson.cpp dirac_staggered.cpp diff --git a/lib/check_params.h b/lib/check_params.h index cb5dd701f7..dffba68cab 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -365,9 +365,11 @@ void printQudaInvertParam(QudaInvertParam *param) { P(Ls, INVALID_INT); P(mu, INVALID_DOUBLE); P(epsilon, INVALID_DOUBLE); + P(evmax, INVALID_DOUBLE); P(tm_rho, 0.0); P(twist_flavor, QUDA_TWIST_INVALID); P(laplace3D, INVALID_INT); + P(covdev_mu, INVALID_INT); #else // asqtad and domain wall use mass parameterization if (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH @@ -389,6 +391,7 @@ void printQudaInvertParam(QudaInvertParam *param) { } if (param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { P(tm_rho, INVALID_DOUBLE); } if (param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { P(epsilon, INVALID_DOUBLE); } + if (param->dslash_type == QUDA_COVDEV_DSLASH) { P(covdev_mu, INVALID_INT); } #endif P(tol, INVALID_DOUBLE); diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index ceaa2a1ab7..6944f46d89 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -247,6 +247,11 @@ namespace quda { cloverInvert(clover_inverse, true); copy(clover_inverse, true); dynamic_inverse_copy = false; + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + getProfile().TPSTOP(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + getProfile().TPSTOP(QUDA_PROFILE_H2D); + } return; } diff --git a/lib/clover_force.cpp b/lib/clover_force.cpp index 7ab80e29a0..f342d54b60 100644 --- a/lib/clover_force.cpp +++ b/lib/clover_force.cpp @@ -44,24 +44,39 @@ namespace quda for (auto i = 0u; i < x.size(); i++) { gamma5(p[i][parity], x[i][parity]); - if (dagger) dirac->Dagger(QUDA_DAG_YES); dirac->Dslash(x[i][other_parity], p[i][parity], other_parity); // want to apply \hat Q_{-} = \hat M_{+}^\dagger \gamma_5 to get Y_o dirac->M(p[i][parity], p[i][parity]); // this is the odd part of Y if (dagger) dirac->Dagger(QUDA_DAG_NO); + if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + blas::ax(1.0 / inv_param.evmax, p[i][parity]); + ApplyTau(x[i][other_parity], x[i][other_parity], 1); + ApplyTau(p[i][parity], p[i][parity], 1); + Complex a(0.0, -inv_param.offset[i]); + blas::caxpy(a, x[i][parity], p[i][parity]); + } + gamma5(x[i][other_parity], x[i][other_parity]); - if (detratio) blas::xpy(x0[i][parity], p[i][parity]); + if (detratio && inv_param.twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) blas::xpy(x0[i][parity], p[i][parity]); - if (not_dagger) dirac->Dagger(QUDA_DAG_YES); + if (not_dagger || inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) dirac->Dagger(QUDA_DAG_YES); + if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) gamma5(p[i][parity], p[i][parity]); dirac->Dslash(p[i][other_parity], p[i][parity], other_parity); // and now the even part of Y - if (not_dagger) dirac->Dagger(QUDA_DAG_NO); - // up to here x.odd match X.odd in tmLQCD and p.odd=-Y.odd of tmLQCD - // x.Even= X.Even.tmLQCD/kappa and p.Even=-Y.Even.tmLQCD/kappa - - // the gamma5 application in tmLQCD is done inside deriv_Sb - gamma5(p[i], p[i]); + if (not_dagger || inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) dirac->Dagger(QUDA_DAG_NO); + + if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + ApplyTau(p[i][other_parity], p[i][other_parity], 1); + // up to here x.odd match X.odd in tmLQCD and p.odd=- gamma5 Y.odd of tmLQCD + // x.Even= X.Even.tmLQCD/kappa and p.Even=- gamma5 Y.Even.tmLQCD/kappa + // the gamma5 application in tmLQCD inside deriv_Sb is otimized away in here + } else { + // up to here x.odd match X.odd in tmLQCD and p.odd=-Y.odd of tmLQCD + // x.Even= X.Even.tmLQCD/kappa and p.Even=-Y.Even.tmLQCD/kappa + // the gamma5 application in tmLQCD is done inside deriv_Sb + gamma5(p[i], p[i]); + } } // derivative of the wilson operator it correspond to deriv_Sb(OE,...) plus deriv_Sb(EO,...) in tmLQCD @@ -71,6 +86,12 @@ namespace quda // derivative of pseudofermion sw term, first term term of (A12) in hep-lat/0112051, sw_spinor_eo(EE,..) plus // sw_spinor_eo(OO,..) in tmLQCD + if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + for (auto i = 0u; i < x.size(); i++) { + ApplyTau(p[i][parity], p[i][parity], 1); + ApplyTau(p[i][other_parity], p[i][other_parity], 1); + } + } computeCloverSigmaOprod(oprod, inv_param.dagger == QUDA_DAG_YES ? p : x, inv_param.dagger == QUDA_DAG_YES ? x : p, epsilon); diff --git a/lib/clover_outer_product.cu b/lib/clover_outer_product.cu index 01573951ad..f9040110e5 100644 --- a/lib/clover_outer_product.cu +++ b/lib/clover_outer_product.cu @@ -9,7 +9,7 @@ namespace quda { template class CloverForce : public TunableKernel1D { using real = typename mapper::type; - template using Arg = CloverForceArg; + template using Arg = CloverForceArg; GaugeField &force; const GaugeField &U; const ColorSpinorField &inA; @@ -18,14 +18,18 @@ namespace quda { const ColorSpinorField &inD; const int parity; const real coeff; + const bool doublet; // whether we applying the operator to a doublet + const int n_flavor; OprodKernelType kernel; int dir; - unsigned int minThreads() const override { return kernel == INTERIOR ? inB.VolumeCB() : inB.GhostFaceCB()[dir]; } + unsigned int minThreads() const override + { + return (kernel == INTERIOR ? inB.VolumeCB() : inB.GhostFaceCB()[dir]) / n_flavor; + } public: - CloverForce(const GaugeField &U, GaugeField &force, const ColorSpinorField& inA, - const ColorSpinorField& inB, const ColorSpinorField& inC, const ColorSpinorField& inD, - int parity, double coeff) : + CloverForce(const GaugeField &U, GaugeField &force, const ColorSpinorField &inA, const ColorSpinorField &inB, + const ColorSpinorField &inC, const ColorSpinorField &inD, int parity, double coeff) : TunableKernel1D(force), force(force), U(U), @@ -34,8 +38,11 @@ namespace quda { inC(inC), inD(inD), parity(parity), - coeff(static_cast(coeff)) + coeff(static_cast(coeff)), + doublet(inA.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET), + n_flavor(doublet ? 2 : 1) { + if (doublet) strcat(aux, ",doublet"); char aux2[TuneKey::aux_n]; strcpy(aux2, aux); strcat(aux, ",interior"); @@ -58,13 +65,40 @@ namespace quda { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if (kernel == INTERIOR) { - launch(tp, stream, Arg<>(force, U, inA, inB, inC, inD, parity, coeff)); + if (doublet) + launch(tp, stream, Arg<-1, true>(force, U, inA, inB, inC, inD, parity, coeff)); + else + launch(tp, stream, Arg<>(force, U, inA, inB, inC, inD, parity, coeff)); } else if (kernel == EXTERIOR) { switch (dir) { - case 0: launch(tp, stream, Arg<0>(force, U, inA, inB, inC, inD, parity, coeff)); break; - case 1: launch(tp, stream, Arg<1>(force, U, inA, inB, inC, inD, parity, coeff)); break; - case 2: launch(tp, stream, Arg<2>(force, U, inA, inB, inC, inD, parity, coeff)); break; - case 3: launch(tp, stream, Arg<3>(force, U, inA, inB, inC, inD, parity, coeff)); break; + case 0: { + if (doublet) + launch(tp, stream, Arg<0, true>(force, U, inA, inB, inC, inD, parity, coeff)); + else + launch(tp, stream, Arg<0>(force, U, inA, inB, inC, inD, parity, coeff)); + break; + } + case 1: { + if (doublet) + launch(tp, stream, Arg<1, true>(force, U, inA, inB, inC, inD, parity, coeff)); + else + launch(tp, stream, Arg<1>(force, U, inA, inB, inC, inD, parity, coeff)); + break; + } + case 2: { + if (doublet) + launch(tp, stream, Arg<2, true>(force, U, inA, inB, inC, inD, parity, coeff)); + else + launch(tp, stream, Arg<2>(force, U, inA, inB, inC, inD, parity, coeff)); + break; + } + case 3: { + if (doublet) + launch(tp, stream, Arg<3, true>(force, U, inA, inB, inC, inD, parity, coeff)); + else + launch(tp, stream, Arg<3>(force, U, inA, inB, inC, inD, parity, coeff)); + break; + } default: errorQuda("Unexpected direction %d", dir); } } @@ -74,14 +108,22 @@ namespace quda { void postTune() override { force.restore(); } // spin trace + multiply-add (ignore spin-project) - long long flops() const override { return minThreads() * (144 + 234) * (kernel == INTERIOR ? 4 : 1); } + long long flops() const override + { + int oprod_flops = nColor * nColor * (8 * inA.Nspin() - 2); + int gemm_flops = nColor * nColor * (8 * nColor - 2); + int mat_size = 2 * nColor * nColor; + + return minThreads() * n_flavor * (2 * oprod_flops + gemm_flops + 3 * mat_size) * (kernel == INTERIOR ? 4 : 1); + } long long bytes() const override { if (kernel == INTERIOR) { - return inA.Bytes() + inC.Bytes() + 4*(inB.Bytes() + inD.Bytes()) + force.Bytes() + U.Bytes() / 2; + return inA.Bytes() + inC.Bytes() + 4 * (inB.Bytes() + inD.Bytes()) + force.Bytes() + U.Bytes() / 2; } else { - return minThreads() * (nColor * (4 * 2 + 2 * 2) + 2 * force.Reconstruct() + U.Reconstruct()) + return minThreads() * n_flavor + * (nColor * (4 * inA.Nspin() + 2 * inA.Nspin() / 2) * 2 + 2 * force.Reconstruct() + U.Reconstruct()) * sizeof(Float); } } diff --git a/lib/clover_sigma_outer_product.cu b/lib/clover_sigma_outer_product.cu index a6b9750e79..0cf740479a 100644 --- a/lib/clover_sigma_outer_product.cu +++ b/lib/clover_sigma_outer_product.cu @@ -12,18 +12,25 @@ namespace quda { template class CloverSigmaOprod : public TunableKernel3D { - template using Arg = CloverSigmaOprodArg; + template using Arg = CloverSigmaOprodArg; GaugeField &oprod; cvector_ref &inA; cvector_ref &inB; const std::vector> &coeff; + const bool doublet; // whether we are applying the operator to a doublet unsigned int minThreads() const override { return oprod.VolumeCB(); } public: CloverSigmaOprod(GaugeField &oprod, cvector_ref &inA, cvector_ref &inB, const std::vector> &coeff) : - TunableKernel3D(oprod, 2, 6), oprod(oprod), inA(inA), inB(inB), coeff(coeff) + TunableKernel3D(oprod, 2, 6), + oprod(oprod), + inA(inA), + inB(inB), + coeff(coeff), + doublet(inA[0].TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET) { + if (doublet) strcat(aux, ",doublet"); char tmp[16]; sprintf(tmp, ",nvector=%lu", inA.size()); strcat(aux, tmp); @@ -34,7 +41,12 @@ namespace quda { { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); switch (inA.size()) { - case 1: launch(tp, stream, Arg<1>(oprod, inA, inB, coeff)); break; + case 1: + if (doublet) + launch(tp, stream, Arg<1, true>(oprod, inA, inB, coeff)); + else + launch(tp, stream, Arg<1, false>(oprod, inA, inB, coeff)); + break; default: errorQuda("Unsupported nvector = %lu\n", inA.size()); } } // apply @@ -44,7 +56,11 @@ namespace quda { long long flops() const override { - return ((144 + 18) * inA.size() + 18) * 6 * oprod.Volume(); // spin trace + multiply-add + int n_flavor = doublet ? 2 : 1; + int oprod_flops = inA.Ncolor() * inA.Ncolor() * (8 * inA.Nspin() - 2); + int mat_size = 2 * inA.Ncolor() * inA.Ncolor(); + // ((spin trace + multiply-add) * n_flavor * n_vector + projection) * 6 dir * sites + return ((oprod_flops + 2 * mat_size) * n_flavor * inA.size() + mat_size) * 6 * oprod.Volume(); } long long bytes() const override { return (inA[0].Bytes() + inB[0].Bytes()) * inA.size() * 6 + 2 * oprod.Bytes(); } }; // CloverSigmaOprod @@ -53,7 +69,6 @@ namespace quda { cvector_ref &p, const std::vector> &coeff) { if constexpr (is_enabled_clover()) { - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (x.size() > MAX_NVECTOR) { // divide and conquer computeCloverSigmaOprod(oprod, cvector_ref {x.begin(), x.begin() + x.size() / 2}, @@ -66,6 +81,7 @@ namespace quda { return; } + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(oprod, x, p, coeff); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } else { diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index cc469500e9..620e9bbe85 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -12,6 +12,7 @@ namespace quda { FloatOut *Out; FloatIn *In; + bool advanceSharedBytes(TuneParam &) const { return false; } // Don't tune shared mem unsigned int minThreads() const { return in.VolumeCB() == out.VolumeCB() ? in.VolumeCB() : in.LocalVolumeCB(); } public: diff --git a/lib/copy_gauge_helper.hpp b/lib/copy_gauge_helper.hpp index f43fa15e2c..9703488e01 100644 --- a/lib/copy_gauge_helper.hpp +++ b/lib/copy_gauge_helper.hpp @@ -15,6 +15,7 @@ namespace quda GaugeField &out; const GaugeField ∈ + bool advanceSharedBytes(TuneParam &) const override { return false; } // Don't tune shared mem unsigned int minThreads() const override { return size; } public: diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp index d41c5753d7..8e71a689a1 100644 --- a/lib/dirac_wilson.cpp +++ b/lib/dirac_wilson.cpp @@ -144,7 +144,7 @@ namespace quda { // we desire solution to full system for (auto i = 0u; i < b.size(); i++) { // src = b_e + k D_eo b_o - DslashXpay(x[i][other_parity], b[i][other_parity], this_parity, b[this_parity], kappa); + DslashXpay(x[i][other_parity], b[i][other_parity], this_parity, b[i][this_parity], kappa); src[i] = x[i][other_parity].create_alias(); sol[i] = x[i][this_parity].create_alias(); } diff --git a/lib/dslash_domain_wall_4d_fused_m5.hpp b/lib/dslash_domain_wall_4d_fused_m5.hpp index 3ba9e77c39..cb66d36470 100644 --- a/lib/dslash_domain_wall_4d_fused_m5.hpp +++ b/lib/dslash_domain_wall_4d_fused_m5.hpp @@ -36,6 +36,9 @@ namespace quda } } + int blockStep() const override { return 8; } + int blockMin() const override { return 8; } + public: DomainWall4DFusedM5(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) : Dslash(arg, out, in, get_app_base()) @@ -44,14 +47,14 @@ namespace quda TunableKernel3D::resizeStep(in.X(4), 1); } - void apply(const qudaStream_t &stream) + void apply(const qudaStream_t &stream) override { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); Dslash::setParam(tp); Dslash::template instantiate(tp, stream); } - unsigned int sharedBytesPerThread() const + unsigned int sharedBytesPerThread() const override { // spin components in shared depend on inversion algorithm if (mobius_m5::use_half_vector()) { @@ -61,10 +64,9 @@ namespace quda } } - void initTuneParam(TuneParam ¶m) const + void initTuneParam(TuneParam ¶m) const override { Dslash::initTuneParam(param); - param.block.y = arg.Ls; // Ls must be contained in the block param.grid.y = 1; param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z; @@ -95,7 +97,7 @@ namespace quda return (12ll * n * Ls) * in.Volume(); } - long long flops() const + long long flops() const override { long long flops_ = 0; switch (Arg::dslash5_type) { @@ -112,7 +114,7 @@ namespace quda return flops_ + Dslash::flops(); } - long long bytes() const + long long bytes() const override { if (Arg::dslash5_type == Dslash5Type::M5_INV_MOBIUS_M5_INV_DAG) { return arg.y.Bytes() + Dslash::bytes(); @@ -121,7 +123,7 @@ namespace quda } } - void defaultTuneParam(TuneParam ¶m) const { initTuneParam(param); } + void defaultTuneParam(TuneParam ¶m) const override { initTuneParam(param); } }; template struct Dslash5TypeList { diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index 66c523df20..bab4aa23b9 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -10,7 +10,7 @@ namespace quda { ColorSpinorField &out; const ColorSpinorField ∈ const int d; - unsigned int minThreads() const { return in.VolumeCB(); } + unsigned int minThreads() const { return in.VolumeCB() / (in.Ndim() == 5 ? in.X(4) : 1); } public: GammaApply(ColorSpinorField &out, const ColorSpinorField &in, int d) : @@ -103,4 +103,39 @@ namespace quda { // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma) void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,4); } + template class TauApply : public TunableKernel2D + { + ColorSpinorField &out; + const ColorSpinorField ∈ + const int d; + unsigned int minThreads() const { return in.VolumeCB() / 2; } + + public: + TauApply(ColorSpinorField &out, const ColorSpinorField &in, int d) : + TunableKernel2D(in, in.SiteSubset()), out(out), in(in), d(d) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, GammaArg(out, in, d)); + } + + void preTune() { out.backup(); } + void postTune() { out.restore(); } + long long bytes() const { return out.Bytes() + in.Bytes(); } + }; + + // Apply the tau1 matrix to a doublet colorspinor field + // out(x) = tau_1*in +#ifdef GPU_TWISTED_MASS_DIRAC + void ApplyTau(ColorSpinorField &out, const ColorSpinorField &in, int d) { instantiate(out, in, d); } +#else + void ApplyTau(ColorSpinorField &, const ColorSpinorField &, int) + { + errorQuda("Twisted mass dslash has not been built"); + } +#endif // GPU_TWISTED_MASS_DIRAC } diff --git a/lib/evec_project.cu b/lib/evec_project.cu new file mode 100644 index 0000000000..394148a9a7 --- /dev/null +++ b/lib/evec_project.cu @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include + +namespace quda +{ + + template class EvecProjectLaplace3D : TunableMultiReduction + { + cvector_ref &x; + cvector_ref &y; + std::vector &result; + bool tuneSharedBytes() const override { return false; } + + public: + EvecProjectLaplace3D(cvector_ref &x, cvector_ref &y, + std::vector &result) : + TunableMultiReduction(x[0], 1, x.size() * y.size() * x[0].X()[3], 8), x(x), y(y), result(result) + { + strcat(aux, ",nx="); + char rhs_str[16]; + i32toa(rhs_str, x.size()); + strcat(aux, rhs_str); + strcat(aux, ",ny="); + i32toa(rhs_str, y.size()); + strcat(aux, rhs_str); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + EvecProjectionArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const override { return 8 * x.size() * y.size() * x.Nspin() * x.Ncolor() * x.Volume(); } + long long bytes() const override { return x.Bytes() * y.size() + y.Bytes() * x.size(); } + }; + + void evecProjectLaplace3D(std::vector &result, cvector_ref &x, + cvector_ref &y) + { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + checkNative(x[0], y[0]); + if (x.Nspin() != 4 || y.Nspin() != 1) errorQuda("Unexpected number of spins x=%d y=%d", x.Nspin(), y.Nspin()); + + // deploy as a tile computation + auto Lt = x[0].X()[3]; + + for (auto tx = 0u; tx < x.size(); tx += max_nx) { + for (auto ty = 0u; ty < y.size(); ty += max_ny) { + // compute remainder here + auto tile_x = std::min(max_nx, x.size() - tx); + auto tile_y = std::min(max_ny, y.size() - ty); + std::vector result_tile(2 * x.Nspin() * Lt * tile_x * tile_y); + + instantiate(cvector_ref {x.begin() + tx, x.begin() + tx + tile_x}, + cvector_ref {y.begin() + ty, y.begin() + ty + tile_y}, + result_tile); + + for (auto i = 0u; i < tile_x; i++) { + for (auto j = 0u; j < tile_y; j++) { + for (auto t = 0; t < Lt; t++) { + for (auto s = 0u; s < 4; s++) { + result[(((tx + i) * y.size() + (ty + j)) * Lt + t) * 4 + s] + = {result_tile[(((t * tile_y + j) * tile_x + i) * 4 + s) * 2 + 0], + result_tile[(((t * tile_y + j) * tile_x + i) * 4 + s) * 2 + 1]}; + } + } + } + } + } + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + } + +} // namespace quda diff --git a/lib/gauge_covdev.cpp b/lib/gauge_covdev.cpp index 52c5f1c1b4..0ddcf0ac44 100644 --- a/lib/gauge_covdev.cpp +++ b/lib/gauge_covdev.cpp @@ -6,15 +6,16 @@ namespace quda { - GaugeCovDev::GaugeCovDev(const DiracParam ¶m) : Dirac(param) { } + GaugeCovDev::GaugeCovDev(const DiracParam ¶m) : Dirac(param), covdev_mu(param.covdev_mu) { } - GaugeCovDev::GaugeCovDev(const GaugeCovDev &covDev) : Dirac(covDev) { } + GaugeCovDev::GaugeCovDev(const GaugeCovDev &covDev) : Dirac(covDev), covdev_mu(covDev.covdev_mu) { } GaugeCovDev::~GaugeCovDev() { } GaugeCovDev& GaugeCovDev::operator=(const GaugeCovDev &covDev) { if (&covDev != this) Dirac::operator=(covDev); + covdev_mu = covDev.covdev_mu; return *this; } @@ -42,9 +43,9 @@ namespace quda { MCD(out, tmp, (mu+4)%8); } - void GaugeCovDev::Dslash(ColorSpinorField &, const ColorSpinorField &, const QudaParity) const + void GaugeCovDev::Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const { - //do nothing + DslashCD(out, in, parity, covdev_mu); } void GaugeCovDev::DslashXpay(ColorSpinorField &, const ColorSpinorField &, const QudaParity, const ColorSpinorField &, @@ -53,14 +54,14 @@ namespace quda { //do nothing } - void GaugeCovDev::M(ColorSpinorField &, const ColorSpinorField &) const + void GaugeCovDev::M(ColorSpinorField &out, const ColorSpinorField &in) const { - //do nothing + MCD(out, in, covdev_mu); } - void GaugeCovDev::MdagM(ColorSpinorField &, const ColorSpinorField &) const + void GaugeCovDev::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const { - //do nothing + MdagMCD(out, in, covdev_mu); } void GaugeCovDev::prepare(cvector_ref &, cvector_ref &, diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 249315f9c3..b6f504f352 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -207,6 +207,9 @@ static TimeProfile profileCovDev("covDevQuda"); //!< Profiler for momentum action static TimeProfile profileMomAction("momActionQuda"); +//!< Profiler for sink projection +static TimeProfile profileSinkProject("sinkProjectQuda"); + //!< Profiler for endQuda static TimeProfile profileEnd("endQuda"); @@ -1393,6 +1396,7 @@ void endQuda(void) profileProject.Print(); profilePhase.Print(); profileMomAction.Print(); + profileSinkProject.Print(); profileEnd.Print(); profileInit2End.Print(); @@ -1445,9 +1449,10 @@ namespace quda { } diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC; diracParam.Ls = inv_param->Ls; - if (sizeof(Complex) != sizeof(double _Complex)) { - errorQuda("Irreconcilable difference between interface and internal complex number conventions"); - } + // check we are safe to cast into a Complex (= std::complex) + static_assert(sizeof(Complex) == sizeof(double _Complex), + "Irreconcilable difference between interface and internal complex number conventions"); + memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls); memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls); diracParam.eofa_shift = inv_param->eofa_shift; @@ -1505,6 +1510,7 @@ namespace quda { break; case QUDA_COVDEV_DSLASH: diracParam.type = QUDA_GAUGE_COVDEV_DIRAC; + diracParam.covdev_mu = inv_param->covdev_mu; break; default: errorQuda("Unsupported dslash_type %d", inv_param->dslash_type); @@ -3895,8 +3901,8 @@ void createCloverQuda(QudaInvertParam* invertParam) // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general) lat_dim_t R; for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); - GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : - createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); + GaugeField *gauge + = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, getProfile(), false, recon); GaugeField *ex = gauge; if (gauge->Precision() < cloverPrecise->Precision()) { @@ -4439,7 +4445,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double // Make sure extendedGaugeResident has the correct R if (extendedGaugeResident) delete extendedGaugeResident; - extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileCloverForce); + extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, getProfile()); GaugeField &gaugeEx = *extendedGaugeResident; computeCloverForce(cudaMom, gaugeEx, *gaugePrecise, *cloverPrecise, x, x0, force_coeff, ferm_epsilon, @@ -4499,7 +4505,7 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef ColorSpinorField cpuQuarkX(cpuParam); x[i][parity] = cpuQuarkX; // in tmLQCD-parlance this is the odd part of X - if (detratio) { + if (detratio && inv_param->twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) { x0[i] = ColorSpinorField(qParam); ColorSpinorParam cpuParam0(h_x0[i], *inv_param, gParamMom.x, true, inv_param->input_location); ColorSpinorField cpuQuarkX0(cpuParam0); @@ -5205,3 +5211,99 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param) gaugeObservables(*gauge, *param); } + +static void check_param(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec, + int n_evec, int tile_evec, QudaInvertParam *inv_param, const int X[4]) +{ + if (host_sinks == nullptr) errorQuda("Invalid host_sink ptr"); + if (host_quark == nullptr) errorQuda("Invalid host_quark ptr"); + for (auto i = 0; i < n_quark; i++) + if (host_quark[i] == nullptr) errorQuda("Invalid host_quark[%d] ptr", i); + if (tile_quark < 1) errorQuda("Invalid tiling parameter %d (must be positive)", tile_quark); + if (host_evec == nullptr) errorQuda("Invalid host_evec ptr"); + for (auto i = 0; i < n_evec; i++) + if (host_evec[i] == nullptr) errorQuda("Invalid host_evec[%d] ptr", i); + if (tile_evec < 1) errorQuda("Invalid tiling parameter %d (must be positive)", tile_evec); + if (inv_param == nullptr) errorQuda("Invalid QudaInvertParam ptr"); + for (int i = 0; i < 4; i++) + if (X[i] < 1 || X[i] > 512) errorQuda("Invalid lattice dimension %d", i); +} + +void laphSinkProject(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec, + int n_evec, int tile_evec, QudaInvertParam *inv_param, const int X[4]) +{ + auto profile = pushProfile(profileSinkProject, inv_param->secs, inv_param->gflops); + + // check parameters are valid + check_param(host_sinks, host_quark, n_quark, tile_quark, host_evec, n_evec, tile_evec, inv_param, X); + + // Parameter object describing the sources and smeared quarks + lat_dim_t x = {X[0], X[1], X[2], X[3]}; + ColorSpinorParam cpu_quark_param(host_quark, *inv_param, x, false, QUDA_CPU_FIELD_LOCATION); + cpu_quark_param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + + // QUDA style wrapper around the host data + std::vector quark(n_quark); + for (auto i = 0; i < n_quark; i++) { + cpu_quark_param.v = host_quark[i]; + quark[i] = ColorSpinorField(cpu_quark_param); + } + + // Parameter object describing evecs + ColorSpinorParam cpu_evec_param(host_evec, *inv_param, x, false, QUDA_CPU_FIELD_LOCATION); + // Switch to spin 1 + cpu_evec_param.nSpin = 1; + // QUDA style wrapper around the host data + std::vector evec(n_evec); + for (auto i = 0; i < n_evec; i++) { + cpu_evec_param.v = host_evec[i]; + evec[i] = ColorSpinorField(cpu_evec_param); + } + + // Create device vectors + ColorSpinorParam quda_quark_param(cpu_quark_param, *inv_param, QUDA_CUDA_FIELD_LOCATION); + quda_quark_param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + std::vector quda_quark(tile_quark, quda_quark_param); + + // Create device vectors for evecs + ColorSpinorParam quda_evec_param(cpu_evec_param, *inv_param, QUDA_CUDA_FIELD_LOCATION); + std::vector quda_evec(tile_evec, quda_evec_param); + + auto Lt = x[3] * comm_dim(3); + std::vector hostSink(n_quark * n_evec * Lt * 4); + + for (auto i = 0; i < n_quark; i += tile_quark) { // iterate over all quarks + auto tile_i = std::min(tile_quark, n_quark - i); // handle remainder here + for (auto tq = 0; tq < tile_i; tq++) quda_quark[tq] = quark[i + tq]; // download quarks + + for (auto j = 0; j < n_evec; j += tile_evec) { // iterate over all EV + auto tile_j = std::min(tile_evec, n_evec - j); // handle remainder here + for (auto te = 0; te < tile_j; te++) quda_evec[te] = evec[j + te]; // download evecs + + std::vector tmp(tile_i * tile_j * x[3] * 4); + + // We now perform the projection onto the eigenspace. The data + // is placed in host_sinks in T, spin order + evecProjectLaplace3D(tmp, {quda_quark.begin(), quda_quark.begin() + tile_i}, + {quda_evec.begin(), quda_evec.begin() + tile_j}); + + for (auto tq = 0; tq < tile_i; tq++) { + for (auto te = 0; te < tile_j; te++) { + for (auto t = 0; t < x[3]; t++) { + auto t_global = X[3] * comm_coord(3) + t; + for (auto s = 0; s < 4; s++) { + hostSink[(((i + tq) * n_evec + (j + te)) * Lt + t_global) * 4 + s] + = tmp[((tq * tile_j + te) * x[3] + t) * 4 + s]; + } + } + } + } + } + } + + comm_allreduce_sum(hostSink); + + for (auto i = 0; i < n_quark * n_evec * Lt * 4; i++) { // iterate over all quarks + reinterpret_cast *>(host_sinks)[i] = hostSink[i]; + } +} diff --git a/lib/laplace.cu b/lib/laplace.cu index a0f18e4dd4..c476aed6d2 100644 --- a/lib/laplace.cu +++ b/lib/laplace.cu @@ -50,8 +50,7 @@ namespace quda long long flops() const override { int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops - int num_mv_multiply = in.Nspin() == 4 ? 2 : 1; - int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin()); + int ghost_flops = (in.Nspin() * mv_flops + 2 * in.Ncolor() * in.Nspin()); int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3); // 3D or 4D operator @@ -73,8 +72,7 @@ namespace quda case UBER_KERNEL: case KERNEL_POLICY: { long long sites = in.Volume(); - flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered) - num_dir * num_mv_multiply * mv_flops + // SU(3) matrix-vector multiplies + flops_ = (num_dir * in.Nspin() * mv_flops + // SU(3) matrix-vector multiplies ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin())) * sites; // accumulation if (arg.xpay) flops_ += xpay_flops * sites; @@ -144,41 +142,20 @@ namespace quda template struct LaplaceApply { -#if (defined(GPU_STAGGERED_DIRAC) || defined(GPU_WILSON_DIRAC)) && defined(GPU_LAPLACE) - inline LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, - double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, - TimeProfile &profile) -#else - inline LaplaceApply(ColorSpinorField &, const ColorSpinorField &in, const GaugeField &, int, - double, double, const ColorSpinorField &, int, bool, const int *, TimeProfile &) -#endif + LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, + double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int * comm_override, TimeProfile &profile) { - if (in.Nspin() == 1) { -#if defined(GPU_STAGGERED_DIRAC) && defined(GPU_LAPLACE) - constexpr int nDim = 4; - constexpr int nSpin = 1; - LaplaceArg arg(out, in, U, dir, a, b, x, parity, dagger, comm_override); - Laplace laplace(arg, out, in); - - dslash::DslashPolicyTune policy(laplace, in, in.VolumeCB(), - in.GhostFaceCB(), profile); -#else - errorQuda("nSpin=%d Laplace operator required staggered dslash and laplace to be enabled", in.Nspin()); -#endif - } else if (in.Nspin() == 4) { -#if defined(GPU_WILSON_DIRAC) && defined(GPU_LAPLACE) - constexpr int nDim = 4; - constexpr int nSpin = 4; - LaplaceArg arg(out, in, U, dir, a, b, x, parity, dagger, comm_override); - Laplace laplace(arg, out, in); - - dslash::DslashPolicyTune policy(laplace, in, in.VolumeCB(), - in.GhostFaceCB(), profile); -#else - errorQuda("nSpin=%d Laplace operator required wilson dslash and laplace to be enabled", in.Nspin()); -#endif - } else { - errorQuda("Unsupported nSpin= %d", in.Nspin()); + if constexpr (is_enabled_laplace()) { + if (in.Nspin() == 1) { + constexpr int nDim = 4; + constexpr int nSpin = 1; + LaplaceArg arg(out, in, U, dir, a, b, x, parity, dagger, comm_override); + Laplace laplace(arg, out, in); + + dslash::DslashPolicyTune policy(laplace, in, in.VolumeCB(), in.GhostFaceCB(), profile); + } else { + errorQuda("Unsupported nSpin= %d", in.Nspin()); + } } } }; diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index 01a792558a..23372a3374 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -114,10 +114,12 @@ module quda_fortran real(8) :: mu ! Chiral twisted mass parameter real(8) :: tm_rho ! Chiral twisted mass shift used for Hasenbusch mass preconditioning for twisted clover real(8) :: epsilon ! Flavor twisted mass parameter + real(8) :: evmax ! Maximum of the eigenvalues of the ndeg twisted mass operator needed for fermionic forces QudaTwistFlavorType :: twist_flavor ! Twisted mass flavor integer(4) :: laplace3D ! direction to omit in Laplace - + integer(4) :: covdev_mu ! Apply forward/backward covariant derivative in direction mu(mu<=3)/mu-4(mu>3) + real(8) :: tol ! Requested L2 residual norm real(8) :: tol_restart ! Solver tolerance in the L2 residual norm (used to restart InitCG) real(8) :: tol_hq ! Requested heavy quark residual norm diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake index 576cc5c5d8..6ba7dcc2e9 100644 --- a/lib/targets/cuda/target_cuda.cmake +++ b/lib/targets/cuda/target_cuda.cmake @@ -17,8 +17,8 @@ endif() set(QUDA_GPU_ARCH ${QUDA_DEFAULT_GPU_ARCH} - CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80)") -set_property(CACHE QUDA_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80) + CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)") +set_property(CACHE QUDA_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90) set(QUDA_GPU_ARCH_SUFFIX "" CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.") diff --git a/lib/targets/hip/malloc.cpp b/lib/targets/hip/malloc.cpp index 6fd0986b94..f46c44c8d0 100644 --- a/lib/targets/hip/malloc.cpp +++ b/lib/targets/hip/malloc.cpp @@ -528,15 +528,21 @@ namespace quda errorQuda("hipPointerGetAttributes returned error: %s\n", hipGetErrorString(error)); } - switch (attr.type) { #if HIP_VERSION_MAJOR >= 6 + switch (attr.type) { case hipMemoryTypeUnregistered: return QUDA_CPU_FIELD_LOCATION; +#else + switch (attr.memoryType) { #endif // HIP_VERSION_MAJOR >= 6 case hipMemoryTypeHost: return QUDA_CPU_FIELD_LOCATION; case hipMemoryTypeDevice: return QUDA_CUDA_FIELD_LOCATION; case hipMemoryTypeArray: return QUDA_CUDA_FIELD_LOCATION; case hipMemoryTypeUnified: return QUDA_CUDA_FIELD_LOCATION; ///< Not used currently +#if HIP_VERSION_MAJOR >= 6 default: errorQuda("Unknown memory type %d\n", attr.type); return QUDA_INVALID_FIELD_LOCATION; +#else + default: errorQuda("Unknown memory type %d\n", attr.memoryType); return QUDA_INVALID_FIELD_LOCATION; +#endif } } diff --git a/lib/timer.cpp b/lib/timer.cpp index 542b4ab9d6..4729a88406 100644 --- a/lib/timer.cpp +++ b/lib/timer.cpp @@ -245,20 +245,26 @@ namespace quda { profile(profile), secs(secs), gflops(gflops), flops(Tunable::flops_global()) { - profile.TPSTART(QUDA_PROFILE_TOTAL); - tp_stack.push(&profile); + if (profile.Name() != getProfile().Name()) { + // only push to stack if this profile not already the active one + profile.TPSTART(QUDA_PROFILE_TOTAL); + tp_stack.push(&profile); + active = true; + } } pushProfile::~pushProfile() { - if (tp_stack.empty()) errorQuda("popProfile() called with empty stack"); - auto &profile = *(tp_stack.top()); - if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one"); - tp_stack.pop(); - profile.TPSTOP(QUDA_PROFILE_TOTAL); - secs = profile.Last(QUDA_PROFILE_TOTAL); - gflops = (Tunable::flops_global() - flops) * 1e-9; - if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops); + if (active == true) { + if (tp_stack.empty()) errorQuda("popProfile() called with empty stack"); + auto &profile = *(tp_stack.top()); + if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one"); + tp_stack.pop(); + profile.TPSTOP(QUDA_PROFILE_TOTAL); + secs = profile.Last(QUDA_PROFILE_TOTAL); + gflops = (Tunable::flops_global() - flops) * 1e-9; + if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops); + } } TimeProfile &getProfile() diff --git a/lib/vector_io.cpp b/lib/vector_io.cpp index ed469750f7..505b26c45a 100644 --- a/lib/vector_io.cpp +++ b/lib/vector_io.cpp @@ -55,12 +55,16 @@ namespace quda for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data() + j * stride; } } + // if we're loading inflated vectors, we need to grab the spinor size from `tmp` instead of `v0`. + auto spinor_X = (create_tmp && parity_inflate) ? tmp[0].X() : v0.X(); + auto spinor_site_subset = (create_tmp && parity_inflate) ? tmp[0].SiteSubset() : v0.SiteSubset(); + // time loading quda::host_timer_t host_timer; host_timer.start(); // start the timer - read_spinor_field(filename.c_str(), V.data(), v0.Precision(), v0.X(), v0.SiteSubset(), - spinor_parity, v0.Ncolor(), v0.Nspin(), Nvec * Ls, 0, nullptr); + read_spinor_field(filename.c_str(), V.data(), v0.Precision(), spinor_X, spinor_site_subset, spinor_parity, + v0.Ncolor(), v0.Nspin(), Nvec * Ls, 0, nullptr); host_timer.stop(); // stop the timer logQuda(QUDA_SUMMARIZE, "Time spent loading vectors from %s = %g secs\n", filename.c_str(), host_timer.last()); @@ -140,12 +144,16 @@ namespace quda for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data() + j * stride; } } + // if we performed parity inflation, we need to grab the spinor size from `tmp` instead of `v0`. + auto spinor_X = (create_tmp && parity_inflate) ? tmp[0].X() : v0.X(); + auto spinor_site_subset = (create_tmp && parity_inflate) ? tmp[0].SiteSubset() : v0.SiteSubset(); + // time saving quda::host_timer_t host_timer; host_timer.start(); // start the timer - write_spinor_field(filename.c_str(), V.data(), save_prec, v0.X(), v0.SiteSubset(), spinor_parity, v0.Ncolor(), - v0.Nspin(), Nvec * Ls, 0, nullptr, partfile); + write_spinor_field(filename.c_str(), V.data(), save_prec, spinor_X, spinor_site_subset, spinor_parity, + v0.Ncolor(), v0.Nspin(), Nvec * Ls, 0, nullptr, partfile); host_timer.stop(); // stop the timer logQuda(QUDA_SUMMARIZE, "Time spent saving vectors to %s = %g secs\n", filename.c_str(), host_timer.last()); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7a45a72a08..6136864249 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -89,6 +89,13 @@ if(QUDA_DIRAC_WILSON target_link_libraries(eigensolve_test ${TEST_LIBS}) quda_checkbuildtest(eigensolve_test QUDA_BUILD_ALL_TESTS) install(TARGETS eigensolve_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) + + if(QUDA_DIRAC_LAPLACE) + add_executable(laph_test laph_test.cpp) + target_link_libraries(laph_test ${TEST_LIBS}) + quda_checkbuildtest(laph_test QUDA_BUILD_ALL_TESTS) + install(TARGETS laph_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) + endif() endif() if(QUDA_DIRAC_WILSON @@ -301,13 +308,13 @@ if(QUDA_DIRAC_WILSON OR QUDA_DIRAC_DOMAIN_WALL) add_test(NAME blas_test_parity_wilson COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 + --dim 2 4 6 8 --niter 1 --nsrc 8 --msrc 9 --solve-type direct-pc --gtest_output=xml:blas_test_parity.xml) add_test(NAME blas_test_full_wilson COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 + --dim 2 4 6 8 --niter 1 --nsrc 8 --msrc 9 --solve-type direct --gtest_output=xml:blas_test_full.xml) @@ -316,14 +323,14 @@ endif() if(QUDA_DIRAC_STAGGERED) add_test(NAME blas_test_parity_staggered COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 + --dim 2 4 6 8 --niter 1 --nsrc 8 --msrc 9 --dslash-type staggered --solve-type direct-pc --gtest_output=xml:blas_test_parity.xml) add_test(NAME blas_test_full_staggered COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 + --dim 2 4 6 8 --niter 1 --nsrc 8 --msrc 9 --dslash-type staggered --solve-type direct @@ -974,179 +981,170 @@ elseif(single_prec) endif() # Wilson-type Inversions -foreach(prec IN LISTS TEST_PRECS) - - if(${prec} STREQUAL "double") - set(tol 1e-12) - elseif(${prec} STREQUAL "single") - set(tol 1e-6) - endif() +if(QUDA_DIRAC_WILSON) + add_test(NAME invert_test_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --niter 1000 + --enable-testing true + --gtest_output=xml:invert_test_wilson.xml) + + if(DEFINED ENV{QUDA_ENABLE_TUNING}) + if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) + add_test(NAME invert_test_splitgrid_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_wilson.xml) - if(QUDA_DIRAC_WILSON) - add_test(NAME invert_test_wilson_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --enable-testing true - --gtest_output=xml:invert_test_wilson_${prec}.xml) - - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) + endif() endif() +endif() - if(QUDA_DIRAC_TWISTED_MASS) - add_test(NAME invert_test_twisted_mass_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even --enable-testing true - --gtest_output=xml:invert_test_twisted_mass_sym_${prec}.xml) - - add_test(NAME invert_test_twisted_mass_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:invert_test_twisted_mass_asym_${prec}.xml) - endif() +if(QUDA_DIRAC_TWISTED_MASS) + add_test(NAME invert_test_twisted_mass_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even --enable-testing true + --gtest_output=xml:invert_test_twisted_mass_sym.xml) + + add_test(NAME invert_test_twisted_mass_asym} + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:invert_test_twisted_mass_asym.xml) +endif() - if(QUDA_DIRAC_NDEG_TWISTED_MASS) - add_test(NAME invert_test_ndeg_twisted_mass_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even --flavor nondeg-doublet - --enable-testing true - --gtest_output=xml:invert_test_ndeg_twisted_mass_sym_${prec}.xml) +if(QUDA_DIRAC_NDEG_TWISTED_MASS) + add_test(NAME invert_test_ndeg_twisted_mass_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even --flavor nondeg-doublet + --enable-testing true + --gtest_output=xml:invert_test_ndeg_twisted_mass_sym.xml) - add_test(NAME invert_test_ndeg_twisted_mass_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym --flavor nondeg-doublet - --enable-testing true - --gtest_output=xml:invert_test_ndeg_twisted_mass_asym_${prec}.xml) - endif() + add_test(NAME invert_test_ndeg_twisted_mass_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym --flavor nondeg-doublet + --enable-testing true + --gtest_output=xml:invert_test_ndeg_twisted_mass_asym.xml) +endif() - if(QUDA_DIRAC_CLOVER) - add_test(NAME invert_test_clover_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even --enable-testing true - --gtest_output=xml:invert_test_clover_sym_${prec}.xml) - - add_test(NAME invert_test_clover_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:invert_test_clover_asym_${prec}.xml) - endif() +if(QUDA_DIRAC_CLOVER) + add_test(NAME invert_test_clover_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type clover --compute-clover true + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even --enable-testing true + --gtest_output=xml:invert_test_clover_sym.xml) + + add_test(NAME invert_test_clover_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type clover --compute-clover true + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:invert_test_clover_asym.xml) +endif() - if(QUDA_DIRAC_TWISTED_CLOVER) - add_test(NAME invert_test_twisted_clover_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even --enable-testing true - --gtest_output=xml:invert_test_twisted_clover_sym_${prec}.xml) - - add_test(NAME invert_test_twisted_clover_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:invert_test_twisted_clover_asym_${prec}.xml) - endif() +if(QUDA_DIRAC_TWISTED_CLOVER) + add_test(NAME invert_test_twisted_clover_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even --enable-testing true + --gtest_output=xml:invert_test_twisted_clover_sym.xml) + + add_test(NAME invert_test_twisted_clover_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:invert_test_twisted_clover_asym.xml) +endif() - if(QUDA_DIRAC_NDEG_TWISTED_CLOVER) - add_test(NAME invert_test_ndeg_twisted_clover_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even --flavor nondeg-doublet - --enable-testing true - --gtest_output=xml:invert_test_ndeg_twisted_clover_sym_${prec}.xml) +if(QUDA_DIRAC_NDEG_TWISTED_CLOVER) + add_test(NAME invert_test_ndeg_twisted_clover_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even --flavor nondeg-doublet + --enable-testing true + --gtest_output=xml:invert_test_ndeg_twisted_clover_sym.xml) - add_test(NAME invert_test_ndeg_twisted_clover_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym --flavor nondeg-doublet - --enable-testing true - --gtest_output=xml:invert_test_ndeg_twisted_clover_asym_${prec}.xml) - endif() + add_test(NAME invert_test_ndeg_twisted_clover_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym --flavor nondeg-doublet + --enable-testing true + --gtest_output=xml:invert_test_ndeg_twisted_clover_asym.xml) +endif() - if(QUDA_DIRAC_DOMAIN_WALL) - add_test(NAME invert_test_domain_wall_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type domain-wall - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even - --enable-testing true - --gtest_output=xml:invert_test_domain_wall_${prec}.xml) +if(QUDA_DIRAC_DOMAIN_WALL) + add_test(NAME invert_test_domain_wall + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type domain-wall + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even + --enable-testing true + --gtest_output=xml:invert_test_domain_wall.xml) - add_test(NAME invert_test_domain_wall_4d_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type domain-wall-4d - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even - --enable-testing true - --gtest_output=xml:invert_test_domain_wall_4d_sym_${prec}.xml) + add_test(NAME invert_test_domain_wall_4d_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type domain-wall-4d + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even + --enable-testing true + --gtest_output=xml:invert_test_domain_wall_4d_sym.xml) - add_test(NAME invert_test_domain_wall_4d_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type domain-wall-4d - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym - --enable-testing true - --gtest_output=xml:invert_test_domain_wall_4d_asym_${prec}.xml) + add_test(NAME invert_test_domain_wall_4d_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type domain-wall-4d + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym + --enable-testing true + --gtest_output=xml:invert_test_domain_wall_4d_asym.xml) - add_test(NAME invert_test_mobius_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even - --enable-testing true - --gtest_output=xml:invert_test_mobius_sym_${prec}.xml) + add_test(NAME invert_test_mobius_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even + --enable-testing true + --gtest_output=xml:invert_test_mobius_sym.xml) - add_test(NAME invert_test_mobius_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym - --enable-testing true - --gtest_output=xml:invert_test_mobius_asym_${prec}.xml) + add_test(NAME invert_test_mobius_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym + --enable-testing true + --gtest_output=xml:invert_test_mobius_asym.xml) - add_test(NAME invert_test_mobius_eofa_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius-eofa - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even - --enable-testing true - --gtest_output=xml:invert_test_mobius_eofa_sym_${prec}.xml) + add_test(NAME invert_test_mobius_eofa_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius-eofa + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even + --enable-testing true + --gtest_output=xml:invert_test_mobius_eofa_sym.xml) - add_test(NAME invert_test_mobius_eofa_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius-eofa - --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8 - --matpc even-even-asym - --enable-testing true - --gtest_output=xml:invert_test_mobius_eofa_asym_${prec}.xml) - endif() -endforeach(prec) + add_test(NAME invert_test_mobius_eofa_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius-eofa + --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8 + --matpc even-even-asym + --enable-testing true + --gtest_output=xml:invert_test_mobius_eofa_asym.xml) +endif() # Staggered-type Inversions foreach(prec IN LISTS TEST_PRECS) @@ -1215,171 +1213,162 @@ foreach(prec IN LISTS TEST_PRECS) endforeach(prec) # Wilson-type eigensolves -foreach(prec IN LISTS TEST_PRECS) - - if(${prec} STREQUAL "double") - set(tol 1e-12) - elseif(${prec} STREQUAL "single") - set(tol 1e-6) - endif() - - if(QUDA_DIRAC_WILSON) - add_test(NAME eigensolve_test_wilson_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - --enable-testing true - --gtest_output=xml:eigensolve_test_wilson_${prec}.xml) - endif() +if(QUDA_DIRAC_WILSON) + add_test(NAME eigensolve_test_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000 + --enable-testing true + --gtest_output=xml:eigensolve_test_wilson.xml) +endif() - if(QUDA_DIRAC_TWISTED_MASS) - add_test(NAME eigensolve_test_twisted_mass_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_twisted_mass_sym_${prec}.xml) +if(QUDA_DIRAC_TWISTED_MASS) + add_test(NAME eigensolve_test_twisted_mass_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_twisted_mass_sym.xml) - add_test(NAME eigensolve_test_twisted_mass_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:eigensolve_test_twisted_mass_asym_${prec}.xml) - endif() + add_test(NAME eigensolve_test_twisted_mass_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:eigensolve_test_twisted_mass_asym.xml) +endif() - if(QUDA_DIRAC_NDEG_TWISTED_MASS) - add_test(NAME eigensolve_test_ndeg_twisted_mass_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --flavor nondeg-doublet - --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol} - --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_sym_${prec}.xml) - - add_test(NAME eigensolve_test_ndeg_twisted_mass_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --flavor nondeg-doublet - --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol} - --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_asym_${prec}.xml) - endif() +if(QUDA_DIRAC_NDEG_TWISTED_MASS) + add_test(NAME eigensolve_test_ndeg_twisted_mass_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --flavor nondeg-doublet + --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol} + --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_sym.xml) + + add_test(NAME eigensolve_test_ndeg_twisted_mass_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --flavor nondeg-doublet + --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol} + --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_asym.xml) +endif() - if(QUDA_DIRAC_CLOVER) - add_test(NAME eigensolve_test_clover_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_clover_sym_${prec}.xml) - - add_test(NAME eigensolve_test_clover_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:eigensolve_test_clover_asym_${prec}.xml) - endif() +if(QUDA_DIRAC_CLOVER) + add_test(NAME eigensolve_test_clover_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type clover --compute-clover true + --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_clover_sym.xml) + + add_test(NAME eigensolve_test_clover_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type clover --compute-clover true + --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:eigensolve_test_clover_asym.xml) +endif() - if(QUDA_DIRAC_TWISTED_CLOVER) - add_test(NAME eigensolve_test_twisted_clover_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_twisted_clover_sym_${prec}.xml) +if(QUDA_DIRAC_TWISTED_CLOVER) + add_test(NAME eigensolve_test_twisted_clover_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_twisted_clover_sym.xml) - add_test(NAME eigensolve_test_twisted_clover_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:eigensolve_test_twisted_clover_asym_${prec}.xml) - endif() + add_test(NAME eigensolve_test_twisted_clover_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:eigensolve_test_twisted_clover_asym.xml) +endif() - if(QUDA_DIRAC_NDEG_TWISTED_CLOVER) - add_test(NAME eigensolve_test_ndeg_twisted_clover_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --flavor nondeg-doublet - --enable-testing true - --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_sym_${prec}.xml) +if(QUDA_DIRAC_NDEG_TWISTED_CLOVER) + add_test(NAME eigensolve_test_ndeg_twisted_clover_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --flavor nondeg-doublet + --enable-testing true + --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_sym.xml) - add_test(NAME eigensolve_test_ndeg_twisted_clover_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type twisted-clover --compute-clover true - --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --flavor nondeg-doublet - --enable-testing true - --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_asym_${prec}.xml) - endif() - - if(QUDA_DIRAC_DOMAIN_WALL) - add_test(NAME eigensolve_test_domain_wall_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type domain-wall --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_domain_wall_${prec}.xml) - - add_test(NAME eigensolve_test_domain_wall_4d_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type domain-wall-4d --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_domain_wall_4d_sym_${prec}.xml) - - add_test(NAME eigensolve_test_domain_wall_4d_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type domain-wall-4d --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:eigensolve_test_domain_wall_4d_asym_${prec}.xml) - - add_test(NAME eigensolve_test_mobius_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_mobius_sym_${prec}.xml) - - add_test(NAME eigensolve_test_mobius_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:eigensolve_test_mobius_asym_${prec}.xml) - - add_test(NAME eigensolve_test_mobius_eofa_sym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius-eofa --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even --enable-testing true - --gtest_output=xml:eigensolve_test_mobius_eofa_sym_${prec}.xml) - - add_test(NAME eigensolve_test_mobius_eofa_asym_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dslash-type mobius-eofa --prec ${prec} - --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 - --eig-tol ${tol} --eig-max-restarts 1000 - --matpc even-even-asym --enable-testing true - --gtest_output=xml:eigensolve_test_mobius_eofa_asym_${prec}.xml) - endif() -endforeach(prec) + add_test(NAME eigensolve_test_ndeg_twisted_clover_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type twisted-clover --compute-clover true + --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --flavor nondeg-doublet + --enable-testing true + --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_asym.xml) +endif() + +if(QUDA_DIRAC_DOMAIN_WALL) + add_test(NAME eigensolve_test_domain_wall + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type domain-wall + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_domain_wall.xml) + + add_test(NAME eigensolve_test_domain_wall_4d_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type domain-wall-4d + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_domain_wall_4d_sym.xml) + + add_test(NAME eigensolve_test_domain_wall_4d_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type domain-wall-4d + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:eigensolve_test_domain_wall_4d_asym.xml) + + add_test(NAME eigensolve_test_mobius_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_mobius_sym.xml) + + add_test(NAME eigensolve_test_mobius_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:eigensolve_test_mobius_asym.xml) + + add_test(NAME eigensolve_test_mobius_eofa_sym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius-eofa + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even --enable-testing true + --gtest_output=xml:eigensolve_test_mobius_eofa_sym.xml) + + add_test(NAME eigensolve_test_mobius_eofa_asym + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dslash-type mobius-eofa + --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128 + --eig-tol ${tol} --eig-max-restarts 1000 + --matpc even-even-asym --enable-testing true + --gtest_output=xml:eigensolve_test_mobius_eofa_asym.xml) +endif() # Staggered-type eigensolves foreach(prec IN LISTS TEST_PRECS) @@ -1425,6 +1414,14 @@ foreach(prec IN LISTS TEST_PRECS) endif() endforeach(prec) +if (QUDA_DIRAC_LAPLACE AND QUDA_DIRAC_WILSON) + add_test(NAME laph_test + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dim 2 4 6 8 + --enable-testing true + --gtest_output=xml:laph_test.xml) +endif() + if(QUDA_DIRAC_STAGGERED) add_test(NAME hisq_stencil @@ -1433,35 +1430,23 @@ if(QUDA_DIRAC_STAGGERED) --gtest_output=xml:hisq_stencil_test.xml) endif() -foreach(prec IN LISTS TEST_PRECS) - - add_test(NAME gauge_path_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 --prec ${prec} - --gtest_output=xml:gauge_path_test_${prec}.xml) +if(QUDA_DIRAC_CLOVER OR QUDA_DIRAC_TWISTED_CLOVER OR QUDA_DIRAC_NDEG_TWISTED_CLOVER) + add_test(NAME clover_force_test + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dim 2 4 6 8 + --compute-clover true + --solution-type mat-pc-dag-mat-pc + --matpc odd-odd-asym --dagger + --enable-testing true --niter 1 + --gtest_output=xml:clover_force_test.xml) +endif() +add_test(NAME gauge_path + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dim 2 4 6 8 --enable-testing true --niter 1 + --gtest_output=xml:gauge_path_test.xml) - if(QUDA_DIRAC_CLOVER) - add_test(NAME clover_force_test_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 --prec ${prec} - --dslash-type clover --compute-clover true - --solution-type mat-pc-dag-mat-pc - --matpc odd-odd-asym --dagger - --enable-testing true - --gtest_output=xml:clover_force_test_${prec}.xml) - endif() - - if(QUDA_DIRAC_TWISTED_CLOVER) - add_test(NAME tmc_clover_force_test_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 2 4 6 8 --prec ${prec} - --dslash-type twisted-clover --compute-clover true - --solution-type mat-pc-dag-mat-pc - --matpc odd-odd-asym --dagger - --enable-testing true - --gtest_output=xml:tmc_clover_force_test_${prec}.xml) - endif() +foreach(prec IN LISTS TEST_PRECS) if(QUDA_DIRAC_STAGGERED) add_test(NAME unitarize_link_${prec} @@ -1480,18 +1465,6 @@ foreach(prec IN LISTS TEST_PRECS) --gtest_output=xml:hisq_unitarize_force_test_${prec}.xml) endif() - add_test(NAME gauge_alg_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 4 6 8 10 --prec ${prec} - --gtest_output=xml:gauge_alg_test_${prec}.xml) - - if (TARGET dilution_test) - add_test(NAME dilution_test_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 4 6 8 10 --dilution-block-size 4 6 4 5 --prec ${prec} - --gtest_output=xml:dilution_test_${prec}.xml) - endif() - if(QUDA_SMEAR_GAUSS_TWOLINK) set(KERNEL_TYPE TwoLink GaussianSmear) foreach(kerneltp IN LISTS KERNEL_TYPE) @@ -1514,6 +1487,18 @@ foreach(prec IN LISTS TEST_PRECS) endforeach(prec) +add_test(NAME gauge_alg + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dim 4 6 8 10 + --gtest_output=xml:gauge_alg_test.xml) + +if (TARGET dilution_test) + add_test(NAME dilution_test + COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} + --dim 4 6 8 10 --dilution-block-size 4 6 4 5 + --gtest_output=xml:dilution_test.xml) +endif() + if(QUDA_QIO) add_test(NAME io_test COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} diff --git a/tests/blas_interface_test_gtest.hpp b/tests/blas_interface_test_gtest.hpp index cc6421306a..3ba10e1101 100644 --- a/tests/blas_interface_test_gtest.hpp +++ b/tests/blas_interface_test_gtest.hpp @@ -71,7 +71,7 @@ TEST_P(BLASTest, verify) switch (test_type) { case QUDA_BLAS_GEMM: { auto deviation_gemm = gemm_test(param); - decltype(deviation_gemm) tol_gemm; + decltype(deviation_gemm) tol_gemm = 0.0; // initialize to suppress warning switch (data_type) { case QUDA_BLAS_DATATYPE_S: case QUDA_BLAS_DATATYPE_C: tol_gemm = 10 * std::numeric_limits::epsilon(); break; @@ -84,7 +84,7 @@ TEST_P(BLASTest, verify) } case QUDA_BLAS_LU_INV: { auto deviation_lu_inv = lu_inv_test(param); - decltype(deviation_lu_inv) tol_lu_inv; + decltype(deviation_lu_inv) tol_lu_inv = 0.0; // initialize to suppress warning // We allow a factor of 5000 (500x more than the gemm tolerance factor) // due to variations in algorithmic implementation, order of arithmetic // operations, and possible near singular eigenvalues or degeneracies. diff --git a/tests/clover_force_test.cpp b/tests/clover_force_test.cpp index d1cbd089ce..149e064003 100644 --- a/tests/clover_force_test.cpp +++ b/tests/clover_force_test.cpp @@ -5,14 +5,15 @@ #include "clover_force_reference.h" #include "misc.h" #include // convenient quark field container +#include #include #include #include #include +#include #include static int force_check; -static int path_check; static double force_deviation; QudaGaugeParam gauge_param; QudaInvertParam inv_param; @@ -22,6 +23,10 @@ quda::GaugeField mom_ref; std::vector clover; std::vector clover_inv; +QudaPrecision last_prec = QUDA_INVALID_PRECISION; +QudaDslashType last_dslash = QUDA_INVALID_DSLASH; +QudaTwistFlavorType last_twist_flavor = QUDA_TWIST_INVALID; + void init(int argc, char **argv) { // Set QUDA's internal parameters @@ -38,12 +43,6 @@ void init(int argc, char **argv) param.order = QUDA_QDP_GAUGE_ORDER; gauge = quda::GaugeField(param); - printfQuda("Randomizing gauge fields... "); - constructHostGaugeField(gauge, gauge_param, argc, argv); - - printfQuda("Sending gauge field to GPU\n"); - loadGaugeQuda(gauge.raw_pointer(), &gauge_param); - param.order = QUDA_MILC_GAUGE_ORDER; param.link_type = QUDA_ASQTAD_MOM_LINKS; param.reconstruct = QUDA_RECONSTRUCT_10; @@ -51,16 +50,25 @@ void init(int argc, char **argv) mom = quda::GaugeField(param); mom_ref = quda::GaugeField(param); - // Allocate host side memory for clover terms if needed. - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - clover.resize(V * clover_site_size * host_clover_data_type_size); - clover_inv.resize(V * clover_site_size * host_spinor_data_type_size); - compute_clover = true; - constructHostCloverField(clover.data(), clover_inv.data(), inv_param); - // Load the clover terms to the device - loadCloverQuda(clover.data(), clover_inv.data(), &inv_param); - } else { - errorQuda("dslash type ( dslash_type = %d ) must have the clover", dslash_type); + printfQuda("Randomizing gauge fields... "); + constructHostGaugeField(gauge, gauge_param, argc, argv); + + clover.resize(V * clover_site_size * host_clover_data_type_size); + clover_inv.resize(V * clover_site_size * host_spinor_data_type_size); + + if (!enable_testing) { + printfQuda("Sending gauge field to GPU\n"); + loadGaugeQuda(gauge.raw_pointer(), &gauge_param); + + // Allocate host side memory for clover terms if needed. + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + compute_clover = true; + constructHostCloverField(clover.data(), clover_inv.data(), inv_param); + // Load the clover terms to the device + loadCloverQuda(clover.data(), clover_inv.data(), &inv_param); + } else { + errorQuda("dslash type ( dslash_type = %d ) must have the clover", dslash_type); + } } } @@ -71,12 +79,23 @@ void destroy() mom_ref = {}; } -using test_t = ::testing::tuple; +using test_t = ::testing::tuple; std::tuple clover_force_test(test_t param) { - bool detratio = ::testing::get<0>(param); - int nvector = ::testing::get<1>(param); + int nvector = ::testing::get<3>(param); + gauge_param.cuda_prec = ::testing::get<0>(param); + inv_param.cuda_prec = ::testing::get<0>(param); + inv_param.dslash_type = ::testing::get<1>(param); + inv_param.twist_flavor = ::testing::get<4>(param); + if (inv_param.dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + inv_param.epsilon = epsilon; + inv_param.evmax = evmax; + if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + for (int i = 0; i < nvector; i++) inv_param.offset[i] = 0.06 + i * i * 0.1; + } + } + bool detratio = ::testing::get<2>(param); std::vector out_nvector(nvector); std::vector in(nvector); @@ -123,16 +142,15 @@ std::tuple clover_force_test(test_t param) gflops += inv_param.gflops; } - int *check_out = true ? &force_check : &path_check; + int *check_out = &force_check; std::array u = {gauge.data(0), gauge.data(1), gauge.data(2), gauge.data(3)}; if (verify_results) { gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; mom_ref.zero(); TMCloverForce_reference(mom_ref.data(), in.data(), in0.data(), coeff.data(), nvector, u, clover, clover_inv, &gauge_param, &inv_param, detratio); - *check_out - = compare_floats(mom.data(), mom_ref.data(), 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec); - // if (compute_force) + *check_out = compare_floats(mom.data(), mom_ref.data(), 4 * V * mom_site_size, getTolerance(gauge_param.cuda_prec), + gauge_param.cpu_prec); strong_check_mom(mom.data(), mom_ref.data(), 4 * V, gauge_param.cpu_prec); } @@ -155,15 +173,58 @@ class CloverForceTest : public ::testing::TestWithParam protected: test_t param; + bool skip() + { + if (!quda::is_enabled(::testing::get<0>(param))) return true; // skip if precision is not enabled + // check requested dslash is enabled + if ((::testing::get<1>(param) == QUDA_CLOVER_WILSON_DSLASH && !quda::is_enabled_clover()) + || (::testing::get<1>(param) == QUDA_TWISTED_CLOVER_DSLASH && !quda::is_enabled_twisted_clover())) + return true; + + return false; + } + public: CloverForceTest() : param(GetParam()) { } + + virtual void SetUp() + { + if (skip()) GTEST_SKIP(); + // check if precision has changed and update if it has + if (::testing::get<0>(param) != last_prec) { + if (last_prec != QUDA_INVALID_PRECISION) freeGaugeQuda(); + gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; + gauge_param.cuda_prec = ::testing::get<0>(param); + loadGaugeQuda(gauge.raw_pointer(), &gauge_param); + } + + if (::testing::get<0>(param) != last_prec || ::testing::get<1>(param) != last_dslash + || ::testing::get<4>(param) != last_twist_flavor) { + if (last_prec != QUDA_INVALID_PRECISION) freeCloverQuda(); + // Load the clover terms to the device + inv_param.clover_cuda_prec = ::testing::get<0>(param); + inv_param.dslash_type = ::testing::get<1>(param); + if (inv_param.dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + inv_param.twist_flavor = ::testing::get<4>(param); + inv_param.epsilon = epsilon; + inv_param.evmax = evmax; + } + constructHostCloverField(clover.data(), clover_inv.data(), inv_param); + loadCloverQuda(clover.data(), clover_inv.data(), &inv_param); + } + last_twist_flavor = ::testing::get<4>(param); + last_prec = ::testing::get<0>(param); + last_dslash = ::testing::get<1>(param); + } }; TEST_P(CloverForceTest, verify) { + if (skip()) GTEST_SKIP(); + auto deviation = clover_force_test(GetParam()); ASSERT_EQ(std::get<0>(deviation), 1) << "CPU and QUDA force implementations do not agree"; - ASSERT_LE(std::get<1>(deviation), getTolerance(cuda_prec)) + ASSERT_LE(std::get<1>(deviation), getTolerance(::testing::get<0>(param))) << "CPU and QUDA momentum action implementations do not agree"; } @@ -185,8 +246,11 @@ static void display_test_info() std::string gettestname(::testing::TestParamInfo param) { std::string name; - if (::testing::get<0>(param.param)) name += std::string("ratio_"); - name += std::string("nvector_") + std::to_string(::testing::get<1>(param.param)); + name += std::string(get_prec_str(::testing::get<0>(param.param))) + "_"; + name += std::string(get_dslash_str(::testing::get<1>(param.param))) + "_"; + name += std::string(get_TwistFlavor_str(::testing::get<4>(param.param))); + if (::testing::get<2>(param.param)) name += std::string("ratio_"); + name += std::string("nvector_") + std::to_string(::testing::get<3>(param.param)); return name; } @@ -197,6 +261,9 @@ int main(int argc, char **argv) // return code for google test int test_rc = 0; + // set default dslash to clover + dslash_type = QUDA_CLOVER_WILSON_DSLASH; + // command line options auto app = make_app(); add_clover_force_option_group(app); @@ -220,7 +287,7 @@ int main(int argc, char **argv) test_rc = RUN_ALL_TESTS(); } else { - clover_force_test({detratio, Nsrc}); + clover_force_test({prec, dslash_type, detratio, Nsrc, twist_flavor}); } destroy(); @@ -229,5 +296,28 @@ int main(int argc, char **argv) return test_rc; } +using ::testing::Combine; +using ::testing::Values; + +#ifdef GPU_CLOVER_DIRAC INSTANTIATE_TEST_SUITE_P(CloverForceTest, CloverForceTest, - ::testing::Combine(::testing::Values(false, true), ::testing::Values(1, 8)), gettestname); + Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(QUDA_CLOVER_WILSON_DSLASH), + Values(false, true), Values(1, 8), Values(QUDA_TWIST_NO)), + gettestname); +#endif + +#ifdef GPU_TWISTED_CLOVER_DIRAC +INSTANTIATE_TEST_SUITE_P(CloverForceTest_TwistedClover, CloverForceTest, + Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(QUDA_TWISTED_CLOVER_DSLASH), + Values(false, true), Values(1, 8), Values(QUDA_TWIST_SINGLET)), + gettestname); + +#endif + +#ifdef GPU_NDEG_TWISTED_CLOVER_DIRAC +// twisted clover non degenerate doublet +INSTANTIATE_TEST_SUITE_P(CloverForceTest_ndeg_TwistedClover, CloverForceTest, + Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(QUDA_TWISTED_CLOVER_DSLASH), + Values(false, true), Values(1, 8), Values(QUDA_TWIST_NONDEG_DOUBLET)), + gettestname); +#endif diff --git a/tests/dilution_test.cpp b/tests/dilution_test.cpp index 71bfa57aa7..429692f446 100644 --- a/tests/dilution_test.cpp +++ b/tests/dilution_test.cpp @@ -9,20 +9,22 @@ #include #include -using test_t = ::testing::tuple; +using test_t = ::testing::tuple; class DilutionTest : public ::testing::TestWithParam { protected: + QudaPrecision precision; QudaSiteSubset site_subset; QudaDilutionType dilution_type; int nSpin; public: DilutionTest() : - site_subset(::testing::get<0>(GetParam())), - dilution_type(testing::get<1>(GetParam())), - nSpin(testing::get<2>(GetParam())) + precision(::testing::get<0>(GetParam())), + site_subset(::testing::get<1>(GetParam())), + dilution_type(testing::get<2>(GetParam())), + nSpin(testing::get<3>(GetParam())) { } }; @@ -31,7 +33,7 @@ TEST_P(DilutionTest, verify) { using namespace quda; - if (!is_enabled_spin(nSpin)) GTEST_SKIP(); + if (!is_enabled_spin(nSpin) || !is_enabled(precision)) GTEST_SKIP(); // Set some parameters QudaGaugeParam gauge_param = newQudaGaugeParam(); @@ -44,7 +46,7 @@ TEST_P(DilutionTest, verify) param.siteSubset = site_subset; if (site_subset == QUDA_PARITY_SITE_SUBSET) param.x[0] /= 2; param.nSpin = nSpin; - param.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true); // change order to native order + param.setPrecision(precision, precision, true); // change order to native order param.location = QUDA_CUDA_FIELD_LOCATION; param.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField src(param); @@ -97,57 +99,58 @@ TEST_P(DilutionTest, verify) } using ::testing::Combine; +using ::testing::get; using ::testing::Values; +auto test_str = [](testing::TestParamInfo param) { + return std::string(get_prec_str(get<0>(param.param))) + "_" + get_dilution_type_str(get<2>(param.param)); +}; + +auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION); + INSTANTIATE_TEST_SUITE_P(WilsonFull, DilutionTest, - Combine(Values(QUDA_FULL_SITE_SUBSET), + Combine(precisions, Values(QUDA_FULL_SITE_SUBSET), Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD, QUDA_DILUTION_BLOCK), Values(4)), - [](testing::TestParamInfo param) { - return get_dilution_type_str(::testing::get<1>(param.param)); - }); + test_str); INSTANTIATE_TEST_SUITE_P( WilsonParity, DilutionTest, - Combine(Values(QUDA_PARITY_SITE_SUBSET), + Combine(precisions, Values(QUDA_PARITY_SITE_SUBSET), Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_BLOCK), Values(4)), - [](testing::TestParamInfo param) { return get_dilution_type_str(::testing::get<1>(param.param)); }); + test_str); -INSTANTIATE_TEST_SUITE_P( - CoarseFull, DilutionTest, - Combine(Values(QUDA_FULL_SITE_SUBSET), - Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD), - Values(2)), - [](testing::TestParamInfo param) { return get_dilution_type_str(::testing::get<1>(param.param)); }); +INSTANTIATE_TEST_SUITE_P(CoarseFull, DilutionTest, + Combine(precisions, Values(QUDA_FULL_SITE_SUBSET), + Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, + QUDA_DILUTION_SPIN_COLOR_EVEN_ODD), + Values(2)), + test_str); INSTANTIATE_TEST_SUITE_P(CoarseParity, DilutionTest, - Combine(Values(QUDA_PARITY_SITE_SUBSET), + Combine(precisions, Values(QUDA_PARITY_SITE_SUBSET), Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(2)), - [](testing::TestParamInfo param) { - return get_dilution_type_str(::testing::get<1>(param.param)); - }); + test_str); -INSTANTIATE_TEST_SUITE_P( - StaggeredFull, DilutionTest, - Combine(Values(QUDA_FULL_SITE_SUBSET), - Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD), - Values(1)), - [](testing::TestParamInfo param) { return get_dilution_type_str(::testing::get<1>(param.param)); }); +INSTANTIATE_TEST_SUITE_P(StaggeredFull, DilutionTest, + Combine(precisions, Values(QUDA_FULL_SITE_SUBSET), + Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, + QUDA_DILUTION_SPIN_COLOR_EVEN_ODD), + Values(1)), + test_str); INSTANTIATE_TEST_SUITE_P(StaggeredParity, DilutionTest, - Combine(Values(QUDA_PARITY_SITE_SUBSET), + Combine(precisions, Values(QUDA_PARITY_SITE_SUBSET), Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(1)), - [](testing::TestParamInfo param) { - return get_dilution_type_str(::testing::get<1>(param.param)); - }); + test_str); struct dilution_test : quda_test { void display_info() const override { quda_test::display_info(); - printfQuda("prec S_dimension T_dimension Ls_dimension\n"); - printfQuda("%6s %3d/%3d/%3d %3d %2d\n", get_prec_str(prec), xdim, ydim, zdim, tdim, Lsdim); + printfQuda("S_dimension T_dimension Ls_dimension\n"); + printfQuda("%3d/%3d/%3d %3d %2d\n", xdim, ydim, zdim, tdim, Lsdim); } dilution_test(int argc, char **argv) : quda_test("Dilution Test", argc, argv) { } diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index d26f8ca662..f5d2d6e4e8 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -91,13 +91,17 @@ struct DslashTestWrapper { void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon) { - cuda_prec = getPrecision(precision); - - gauge_param = newQudaGaugeParam(); - inv_param = newQudaInvertParam(); - setWilsonGaugeParam(gauge_param); - setInvertParam(inv_param); + static bool first_time = true; + if (first_time) { + gauge_param = newQudaGaugeParam(); + setWilsonGaugeParam(gauge_param); + inv_param = newQudaInvertParam(); + setInvertParam(inv_param); + init_host(argc, argv); + first_time = false; + } + cuda_prec = getPrecision(precision); gauge_param.cuda_prec = cuda_prec; gauge_param.cuda_prec_sloppy = cuda_prec; gauge_param.cuda_prec_precondition = cuda_prec; @@ -109,24 +113,22 @@ struct DslashTestWrapper { gauge_param.reconstruct_sloppy = link_recon; inv_param.cuda_prec = cuda_prec; + inv_param.clover_cuda_prec = cuda_prec; + inv_param.clover_cuda_prec_sloppy = cuda_prec; + inv_param.clover_cuda_prec_precondition = cuda_prec; + inv_param.clover_cuda_prec_refinement_sloppy = cuda_prec; - static bool first_time = true; - if (first_time) { - init_host(argc, argv); - first_time = false; - } init(); } void init_test(int argc, char **argv) { - gauge_param = newQudaGaugeParam(); - inv_param = newQudaInvertParam(); - setWilsonGaugeParam(gauge_param); - setInvertParam(inv_param); - static bool first_time = true; if (first_time) { + gauge_param = newQudaGaugeParam(); + setWilsonGaugeParam(gauge_param); + inv_param = newQudaInvertParam(); + setInvertParam(inv_param); init_host(argc, argv); first_time = false; } diff --git a/tests/eigensolve_test.cpp b/tests/eigensolve_test.cpp index 7c17540a60..6f5114a182 100644 --- a/tests/eigensolve_test.cpp +++ b/tests/eigensolve_test.cpp @@ -21,6 +21,12 @@ QudaGaugeParam gauge_param; QudaInvertParam eig_inv_param; QudaEigParam eig_param; +std::vector gauge_; +std::array gauge; +std::vector clover; +std::vector clover_inv; +QudaPrecision last_prec = QUDA_INVALID_PRECISION; + // if "--enable-testing true" is passed, we run the tests defined in here #include @@ -28,10 +34,9 @@ void display_test_info(QudaEigParam ¶m) { printfQuda("running the following test:\n"); - printfQuda("prec sloppy_prec link_recon sloppy_link_recon S_dimension T_dimension Ls_dimension\n"); - printfQuda("%s %s %s %s %d/%d/%d %d %d\n", get_prec_str(prec), - get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim, - tdim, Lsdim); + printfQuda("prec link_recon sloppy_link_recon S_dimension T_dimension Ls_dimension\n"); + printfQuda("%s %s %s %d/%d/%d %d %d\n", get_prec_str(param.cuda_prec_ritz), + get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim, tdim, Lsdim); printfQuda("\n Eigensolver parameters\n"); printfQuda(" - solver mode %s\n", get_eig_type_str(param.eig_type)); @@ -65,11 +70,6 @@ void display_test_info(QudaEigParam ¶m) dimPartitioned(3)); } -std::vector gauge_; -std::array gauge; -std::vector clover; -std::vector clover_inv; - void init(int argc, char **argv) { // Construct QUDA param structures to define the problem @@ -116,16 +116,19 @@ void init(int argc, char **argv) // Load the clover terms to the device loadCloverQuda(clover.data(), clover_inv.data(), &eig_inv_param); } + last_prec = gauge_param.cuda_prec; } std::vector eigensolve(test_t test_param) { // Collect testing parameters from gtest - eig_param.eig_type = ::testing::get<0>(test_param); - eig_param.use_norm_op = ::testing::get<1>(test_param); - eig_param.use_pc = ::testing::get<2>(test_param); - eig_param.compute_svd = ::testing::get<3>(test_param); - eig_param.spectrum = ::testing::get<4>(test_param); + eig_inv_param.cuda_prec = ::testing::get<0>(test_param); + eig_inv_param.cuda_prec_eigensolver = ::testing::get<0>(test_param); + eig_param.eig_type = ::testing::get<1>(test_param); + eig_param.use_norm_op = ::testing::get<2>(test_param); + eig_param.use_pc = ::testing::get<3>(test_param); + eig_param.compute_svd = ::testing::get<4>(test_param); + eig_param.spectrum = ::testing::get<5>(test_param); // For preconditioned matrices, spinors are only half the normal length. // QUDA constructs spinors based on the "solution type" which is @@ -198,6 +201,7 @@ std::vector eigensolve(test_t test_param) if (eig_param.arpack_check && !(eig_inv_param.cpu_prec == QUDA_DOUBLE_PRECISION)) { errorQuda("ARPACK check only available in double precision"); } + eigensolveQuda(host_evecs_ptr.data(), evals.data(), &eig_param); host_timer.stop(); printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last()); @@ -249,8 +253,7 @@ int main(int argc, char **argv) && dslash_type != QUDA_TWISTED_MASS_DSLASH && dslash_type != QUDA_TWISTED_CLOVER_DSLASH && dslash_type != QUDA_MOBIUS_DWF_DSLASH && dslash_type != QUDA_MOBIUS_DWF_EOFA_DSLASH && dslash_type != QUDA_DOMAIN_WALL_DSLASH && dslash_type != QUDA_DOMAIN_WALL_4D_DSLASH) { - printfQuda("dslash_type %d not supported\n", dslash_type); - exit(0); + errorQuda("dslash_type %d not supported", dslash_type); } // Initialize the QUDA library @@ -270,16 +273,13 @@ int main(int argc, char **argv) if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } result = RUN_ALL_TESTS(); } else { - eigensolve( - test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum}); + eigensolve(test_t {prec, eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, + eig_param.spectrum}); } // Memory clean-up freeGaugeQuda(); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - freeCloverQuda(); - } - + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda(); // Finalize the QUDA library endQuda(); finalizeComms(); diff --git a/tests/eigensolve_test_gtest.hpp b/tests/eigensolve_test_gtest.hpp index a872963413..41b2443932 100644 --- a/tests/eigensolve_test_gtest.hpp +++ b/tests/eigensolve_test_gtest.hpp @@ -1,6 +1,6 @@ #include -using test_t = ::testing::tuple; +using test_t = ::testing::tuple; class EigensolveTest : public ::testing::TestWithParam { @@ -9,12 +9,37 @@ class EigensolveTest : public ::testing::TestWithParam public: EigensolveTest() : param(GetParam()) { } + + virtual void SetUp() + { + // check if outer precision has changed and update if it has + if (::testing::get<0>(param) != last_prec) { + if (last_prec != QUDA_INVALID_PRECISION) { + freeGaugeQuda(); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda(); + } + + // Load the gauge field to the device + gauge_param.cuda_prec = ::testing::get<0>(param); + loadGaugeQuda(gauge.data(), &gauge_param); + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + // Load the clover terms to the device + eig_inv_param.clover_cuda_prec = ::testing::get<0>(param); + loadCloverQuda(clover.data(), clover_inv.data(), &eig_inv_param); + } + last_prec = ::testing::get<0>(param); + } + } }; bool skip_test(test_t param) { + auto prec = ::testing::get<0>(param); + if (!(QUDA_PRECISION & prec)) return true; // precision not enabled so skip it + // dwf-style solves must use a normal solver - if (is_chiral(dslash_type) && (::testing::get<1>(param) == QUDA_BOOLEAN_FALSE)) return true; + if (is_chiral(dslash_type) && (::testing::get<2>(param) == QUDA_BOOLEAN_FALSE)) return true; return false; } @@ -23,29 +48,32 @@ std::vector eigensolve(test_t test_param); TEST_P(EigensolveTest, verify) { if (skip_test(GetParam())) GTEST_SKIP(); - double factor = 1.0; + + auto tol = ::testing::get<0>(GetParam()) == QUDA_SINGLE_PRECISION ? 1e-6 : 1e-12; + eig_param.tol = tol; + // The IRAM eigensolver will sometimes report convergence with tolerances slightly // higher than requested. The same phenomenon occurs in ARPACK. This factor // prevents failure when IRAM has solved to say 2e-6 when 1e-6 is requested. // The solution to avoid this is to use a Krylov space (eig-n-kr) about 3-4 times the // size of the search space (eig-n-ev), or use a well chosen Chebyshev polynomial, // or use a tighter than necessary tolerance. - if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; - auto tol = factor * eig_param.tol; + if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) tol *= 10; for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); } std::string gettestname(::testing::TestParamInfo param) { std::string name; - name += get_eig_type_str(::testing::get<0>(param.param)) + std::string("_"); - name += (::testing::get<1>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("normop") : std::string("direct")) + name += get_prec_str(::testing::get<0>(param.param)) + std::string("_"); + name += get_eig_type_str(::testing::get<1>(param.param)) + std::string("_"); + name += (::testing::get<2>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("normop") : std::string("direct")) + std::string("_"); - name += (::testing::get<2>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("evenodd") : std::string("full")) + name += (::testing::get<3>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("evenodd") : std::string("full")) + std::string("_"); - name += (::testing::get<3>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("withSVD") : std::string("noSVD")) + name += (::testing::get<4>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("withSVD") : std::string("noSVD")) + std::string("_"); - name += get_eig_spectrum_str(::testing::get<4>(param.param)); + name += get_eig_spectrum_str(::testing::get<5>(param.param)); return name; } @@ -63,25 +91,28 @@ auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, QUDA_SPECTRUM_SM_EIG, QUDA_SPECTRUM_LI_EIG, QUDA_SPECTRUM_SI_EIG); +auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION); + // preconditioned normal solves INSTANTIATE_TEST_SUITE_P(NormalEvenOdd, EigensolveTest, - ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE), - Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum), + ::testing::Combine(precisions, hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), + Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum), gettestname); // full system normal solve INSTANTIATE_TEST_SUITE_P(NormalFull, EigensolveTest, - ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE), - Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum), + ::testing::Combine(precisions, hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), + Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum), gettestname); // preconditioned direct solves INSTANTIATE_TEST_SUITE_P(DirectEvenOdd, EigensolveTest, - ::testing::Combine(non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_TRUE), - Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), + ::testing::Combine(precisions, non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), gettestname); // full system direct solve INSTANTIATE_TEST_SUITE_P(DirectFull, EigensolveTest, - ::testing::Combine(non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), - Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), + ::testing::Combine(precisions, non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), + non_hermitian_spectrum), gettestname); diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp index 00ba1f3689..c5197af83c 100644 --- a/tests/gauge_alg_test.cpp +++ b/tests/gauge_alg_test.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -34,14 +35,17 @@ bool gauge_store; std::array, 4> host_gauge; -class GaugeAlgTest : public ::testing::Test -{ -protected: +using test_t = ::testing::tuple; + +struct GaugeAlgTest : public ::testing::TestWithParam { + QudaPrecision precision; QudaGaugeParam param; Timer a0, a1; GaugeField *U; double3 plaq; + GaugeAlgTest() : precision(::testing::get<0>(GetParam())) { } + void SetReunitarizationConsts() { const double unitarize_eps = 1e-14; @@ -59,14 +63,14 @@ class GaugeAlgTest : public ::testing::Test auto a1 = std::abs(a.y - b.y); auto a2 = std::abs(a.z - b.z); double prec_val = 1.0e-5; - if (prec == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2; + if (precision == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2; return ((a0 < prec_val) && (a1 < prec_val) && (a2 < prec_val)); } bool CheckDeterminant(double2 detu) { double prec_val = 5e-8; - if (prec == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2; + if (precision == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2; return (std::abs(1.0 - detu.x) < prec_val && std::abs(detu.y) < prec_val); } @@ -80,12 +84,18 @@ class GaugeAlgTest : public ::testing::Test GTEST_SKIP(); } #endif + if (!is_enabled(precision)) { + execute = false; + GTEST_SKIP(); + } + if (execute) { setVerbosity(verbosity); param = newQudaGaugeParam(); // Setup gauge container. setWilsonGaugeParam(param); + param.cuda_prec = precision; param.t_boundary = QUDA_PERIODIC_T; // Reunitarization setup @@ -102,7 +112,7 @@ class GaugeAlgTest : public ::testing::Test gParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.reconstruct = link_recon; - gParam.setPrecision(prec, true); + gParam.setPrecision(precision, true); for (int d = 0; d < 4; d++) { if (comm_dim_partitioned(d)) gParam.r[d] = 2; gParam.x[d] += 2 * gParam.r[d]; @@ -280,7 +290,7 @@ class GaugeAlgTest : public ::testing::Test } }; -TEST_F(GaugeAlgTest, Generation) +TEST_P(GaugeAlgTest, Generation) { if (execute && !gauge_load) { auto detu = getLinkDeterminant(*U); @@ -288,7 +298,7 @@ TEST_F(GaugeAlgTest, Generation) } } -TEST_F(GaugeAlgTest, Landau_Overrelaxation) +TEST_P(GaugeAlgTest, Landau_Overrelaxation) { if (execute) { printfQuda("Landau gauge fixing with overrelaxation\n"); @@ -301,7 +311,7 @@ TEST_F(GaugeAlgTest, Landau_Overrelaxation) } } -TEST_F(GaugeAlgTest, Coulomb_Overrelaxation) +TEST_P(GaugeAlgTest, Coulomb_Overrelaxation) { if (execute) { printfQuda("Coulomb gauge fixing with overrelaxation\n"); @@ -314,7 +324,7 @@ TEST_F(GaugeAlgTest, Coulomb_Overrelaxation) } } -TEST_F(GaugeAlgTest, Landau_FFT) +TEST_P(GaugeAlgTest, Landau_FFT) { if (execute) { if (!comm_partitioned()) { @@ -329,7 +339,7 @@ TEST_F(GaugeAlgTest, Landau_FFT) } } -TEST_F(GaugeAlgTest, Coulomb_FFT) +TEST_P(GaugeAlgTest, Coulomb_FFT) { if (execute) { if (!comm_partitioned()) { @@ -377,6 +387,12 @@ struct gauge_alg_test : quda_test { gauge_alg_test(int argc, char **argv) : quda_test("Gauge Alg Test", argc, argv) { } }; +INSTANTIATE_TEST_SUITE_P(GaugeAlgTest, GaugeAlgTest, + testing::Combine(testing::Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION)), + [](testing::TestParamInfo param) { + return std::string(get_prec_str(testing::get<0>(param.param))); + }); + int main(int argc, char **argv) { gauge_alg_test test(argc, argv); diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp index 370470fc50..fee31cb53c 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "misc.h" #include "gauge_force_reference.h" #include @@ -81,15 +82,20 @@ static double force_deviation; static double loop_deviation; static double plaq_deviation; +using force_test_t = ::testing::tuple; + // The same function is used to test computePath. // If compute_force is false then a path is computed -void gauge_force_test(bool compute_force = true) +void gauge_force_test(force_test_t test_param) { int max_length = 6; QudaGaugeParam gauge_param = newQudaGaugeParam(); setGaugeParam(gauge_param); + gauge_param.cuda_prec = ::testing::get<0>(test_param); + bool compute_force = ::testing::get<1>(test_param); + gauge_param.gauge_order = gauge_order; gauge_param.t_boundary = QUDA_PERIODIC_T; @@ -234,15 +240,26 @@ void gauge_force_test(bool compute_force = true) for (int i = 0; i < num_paths; i++) host_free(input_path_buf[dir][i]); host_free(input_path_buf[dir]); } + + if (compute_force) { + ASSERT_EQ(force_check, 1) << "CPU and QUDA force implementations do not agree"; + ASSERT_LE(force_deviation, getTolerance(cuda_prec)) << "CPU and QUDA momentum action implementations do not agree"; + } else { + // path check + ASSERT_EQ(path_check, 1) << "CPU and QUDA path implementations do not agree"; + } } -void gauge_loop_test() +using loop_test_t = ::testing::tuple; + +void gauge_loop_test(loop_test_t loop_param) { int max_length = 6; QudaGaugeParam gauge_param = newQudaGaugeParam(); setWilsonGaugeParam(gauge_param); + gauge_param.cuda_prec = ::testing::get<0>(loop_param); gauge_param.gauge_order = gauge_order; gauge_param.t_boundary = QUDA_PERIODIC_T; @@ -368,28 +385,53 @@ void gauge_loop_test() for (int i = 0; i < num_paths; i++) delete[] trace_path_p[i]; delete[] trace_path_p; + + ASSERT_LE(loop_deviation, getTolerance(cuda_prec)) << "CPU and QUDA loop trace implementations do not agree"; + ASSERT_LE(plaq_deviation, getTolerance(cuda_prec)) + << "Plaquette from QUDA loop trace and QUDA dedicated plaquette function do not agree"; } -TEST(force, verify) { ASSERT_EQ(force_check, 1) << "CPU and QUDA force implementations do not agree"; } +struct GaugePathTest : public ::testing::TestWithParam +{ + force_test_t param; + GaugePathTest() : param(GetParam()) { } +}; -TEST(action, verify) +TEST_P(GaugePathTest, verify) { - ASSERT_LE(force_deviation, getTolerance(cuda_prec)) << "CPU and QUDA momentum action implementations do not agree"; + QudaPrecision prec= ::testing::get<0>(param); + if (!quda::is_enabled(prec)) GTEST_SKIP(); + gauge_force_test(param); } -TEST(path, verify) { ASSERT_EQ(path_check, 1) << "CPU and QUDA path implementations do not agree"; } - -TEST(loop_traces, verify) +struct GaugeLoopTest : public ::testing::TestWithParam { - ASSERT_LE(loop_deviation, getTolerance(cuda_prec)) << "CPU and QUDA loop trace implementations do not agree"; -} + loop_test_t param; + GaugeLoopTest() : param(GetParam()) { } +}; -TEST(plaquette, verify) +TEST_P(GaugeLoopTest, verify) { - ASSERT_LE(plaq_deviation, getTolerance(cuda_prec)) - << "Plaquette from QUDA loop trace and QUDA dedicated plaquette function do not agree"; + QudaPrecision prec= ::testing::get<0>(param); + if (!quda::is_enabled(prec)) GTEST_SKIP(); + gauge_loop_test(param); } +using ::testing::Combine; +using ::testing::Values; + +INSTANTIATE_TEST_SUITE_P(GaugePathTest, GaugePathTest, + Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), + Values(true, false)), + [](testing::TestParamInfo param) { + return std::string(get_prec_str(testing::get<0>(param.param))) + + "_" + (::testing::get<1>(param.param) ? "force" : "path"); }); + +INSTANTIATE_TEST_SUITE_P(GaugeLoopTest, GaugeLoopTest, + Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION)), + [](testing::TestParamInfo param) + { return std::string(get_prec_str(testing::get<0>(param.param))); }); + static void display_test_info() { printfQuda("running the following test:\n"); @@ -414,6 +456,7 @@ int main(int argc, char **argv) CLI::TransformPairs gauge_order_map {{"milc", QUDA_MILC_GAUGE_ORDER}, {"qdp", QUDA_QDP_GAUGE_ORDER}}; app->add_option("--gauge-order", gauge_order, "")->transform(CLI::QUDACheckedTransformer(gauge_order_map)); + add_testing_option_group(app); try { app->parse(argc, argv); } catch (const CLI::ParseError &e) { @@ -421,27 +464,23 @@ int main(int argc, char **argv) } initComms(argc, argv, gridsize_from_cmdline); - initQuda(device_ordinal); setVerbosity(verbosity); display_test_info(); - gauge_force_test(); - - // The same test is also used for gauge path (compute_force=false) - gauge_force_test(false); - - gauge_loop_test(); - - if (verify_results) { + if (enable_testing) { // Ensure gtest prints only from rank 0 ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners(); if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } test_rc = RUN_ALL_TESTS(); if (test_rc != 0) warningQuda("Tests failed"); + } else { + gauge_force_test({prec, true}); + gauge_force_test({prec, false}); + gauge_loop_test({prec}); } endQuda(); diff --git a/tests/host_reference/clover_force_reference.cpp b/tests/host_reference/clover_force_reference.cpp index 3d4c7e4e44..536ce9f4b0 100644 --- a/tests/host_reference/clover_force_reference.cpp +++ b/tests/host_reference/clover_force_reference.cpp @@ -225,6 +225,27 @@ void CloverForce_reference(void *h_mom, std::array gauge, std::vector std::vector &p, std::vector force_coeff) { int dag = 1; + // Get spinor ghost fields + // First wrap the input spinor into a ColorSpinorField + quda::ColorSpinorParam csParam[4]; + for (int i = 0; i < 4; ++i) { + csParam[i].location = QUDA_CPU_FIELD_LOCATION; + // csParam[i].v = in; + csParam[i].nColor = 3; + csParam[i].nSpin = 4; + csParam[i].nDim = 4; + for (int d = 0; d < 4; d++) csParam[i].x[d] = Z[d]; + csParam[i].setPrecision(QUDA_DOUBLE_PRECISION); + csParam[i].pad = 0; + csParam[i].siteSubset = QUDA_PARITY_SITE_SUBSET; + csParam[i].x[0] /= 2; + csParam[i].siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + csParam[i].fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + csParam[i].gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + csParam[i].create = QUDA_REFERENCE_FIELD_CREATE; + csParam[i].pc_type = QUDA_4D_PC; + } + for (auto i = 0u; i < x.size(); i++) { for (int parity = 0; parity < 2; parity++) { quda::ColorSpinorField &inA = (parity & 1) ? x[i].Odd() : x[i].Even(); @@ -238,6 +259,24 @@ void CloverForce_reference(void *h_mom, std::array gauge, std::vector CloverForce_kernel_host(gauge, h_mom, inA, inB, 1, parity, force_coeff[i]); inD.exchangeGhost((QudaParity)(1 - parity), nFace, 1 - dag); CloverForce_kernel_host(gauge, h_mom, inC, inD, -1, parity, force_coeff[i]); + + if (x[0].TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET) { + csParam[0].v = inA.data(); + csParam[1].v = inB.data(); + csParam[2].v = inC.data(); + csParam[3].v = inD.data(); + + for (int j = 0; j < 4; ++j) { csParam[j].v = (char *)csParam[j].v + Vh * spinor_site_size * sizeof(double); } + quda::ColorSpinorField in2A(csParam[0]); + quda::ColorSpinorField in2B(csParam[1]); + quda::ColorSpinorField in2C(csParam[2]); + quda::ColorSpinorField in2D(csParam[3]); + + in2B.exchangeGhost((QudaParity)(1 - parity), nFace, dag); + CloverForce_kernel_host(gauge, h_mom, in2A, in2B, 1, parity, force_coeff[i]); + in2D.exchangeGhost((QudaParity)(1 - parity), nFace, 1 - dag); + CloverForce_kernel_host(gauge, h_mom, in2C, in2D, -1, parity, force_coeff[i]); + } } } } @@ -826,40 +865,44 @@ void CloverSigmaOprod_reference(void *oprod_, quda::ColorSpinorField &inp, quda: sFloat *x = (sFloat *)inx.data(); sFloat *p = (sFloat *)inp.data(); + int flavors = inx.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET ? inx.TwistFlavor() : 1; + for (int parity = 0; parity < 2; parity++) { #pragma omp parallel for for (int i = 0; i < Vh; i++) { for (int mu = 1; mu < 4; mu++) { for (int nu = 0; nu < mu; nu++) { - - sFloat temp[spinor_site_size], temp_munu[spinor_site_size], temp_numu[spinor_site_size]; - multiplySpinorByDiracgamma(temp, nu, &p[spinor_site_size * (i + Vh * parity)]); - multiplySpinorByDiracgamma(temp_munu, mu, temp); - - multiplySpinorByDiracgamma(temp, mu, &p[spinor_site_size * (i + Vh * parity)]); - multiplySpinorByDiracgamma(temp_numu, nu, temp); - for (int s = 0; s < 4; s++) { - for (int t = 0; t < 3; t++) { - temp[s * (3 * 2) + t * (2) + 0] - = -temp_munu[s * (3 * 2) + t * (2) + 0] + temp_numu[s * (3 * 2) + t * (2) + 0]; - temp[s * (3 * 2) + t * (2) + 1] - = -temp_munu[s * (3 * 2) + t * (2) + 1] + temp_numu[s * (3 * 2) + t * (2) + 1]; + for (int flavor = 0; flavor < flavors; ++flavor) { + + sFloat temp[spinor_site_size], temp_munu[spinor_site_size], temp_numu[spinor_site_size]; + multiplySpinorByDiracgamma(temp, nu, &p[spinor_site_size * (i + Vh * flavor + Vh * flavors * parity)]); + multiplySpinorByDiracgamma(temp_munu, mu, temp); + + multiplySpinorByDiracgamma(temp, mu, &p[spinor_site_size * (i + Vh * flavor + Vh * flavors * parity)]); + multiplySpinorByDiracgamma(temp_numu, nu, temp); + for (int s = 0; s < 4; s++) { + for (int t = 0; t < 3; t++) { + temp[s * (3 * 2) + t * (2) + 0] + = -temp_munu[s * (3 * 2) + t * (2) + 0] + temp_numu[s * (3 * 2) + t * (2) + 0]; + temp[s * (3 * 2) + t * (2) + 1] + = -temp_munu[s * (3 * 2) + t * (2) + 1] + temp_numu[s * (3 * 2) + t * (2) + 1]; + } } - } - gFloat oprod_f[gauge_site_size]; - gFloat oprod_imx2[gauge_site_size]; - outerProdSpinTrace(oprod_f, temp, &x[spinor_site_size * (i + Vh * parity)]); - su3_imagx2(oprod_imx2, oprod_f); + gFloat oprod_f[gauge_site_size]; + gFloat oprod_imx2[gauge_site_size]; + outerProdSpinTrace(oprod_f, temp, &x[spinor_site_size * (i + Vh * flavor + Vh * flavors * parity)]); + su3_imagx2(oprod_imx2, oprod_f); - int munu = (mu - 1) * mu / 2 + nu; + int munu = (mu - 1) * mu / 2 + nu; - for (int ci = 0; ci < nColor; ci++) { // row - for (int cj = 0; cj < nColor; cj++) { // col - int color = ci * nColor + cj; - int id = 2 * (i + Vh * (color + 9 * (munu + parity * 6))); - oprod[id + 0] += coeff[parity] * oprod_imx2[color * 2 + 0] / 2.0; - oprod[id + 1] += coeff[parity] * oprod_imx2[color * 2 + 1] / 2.0; + for (int ci = 0; ci < nColor; ci++) { // row + for (int cj = 0; cj < nColor; cj++) { // col + int color = ci * nColor + cj; + int id = 2 * (i + Vh * (color + 9 * (munu + parity * 6))); + oprod[id + 0] += coeff[parity] * oprod_imx2[color * 2 + 0] / 2.0; + oprod[id + 1] += coeff[parity] * oprod_imx2[color * 2 + 1] / 2.0; + } } } } @@ -895,12 +938,36 @@ void Gamma5_host(double *out, double *in, const int V) } } } + +void tau1_host(double *out, double *in, const int V) +{ + int V2 = V * 24 / 2; +#pragma omp parallel for + for (int i = 0; i < V2; i++) { + double up = in[i]; + double down = in[i + V2]; + out[i] = down; + out[i + V2] = up; + } +} + void axpbyz_host(double a, double *x, double b, double *y, double *z, const int V) { #pragma omp parallel for for (int i = 0; i < V * 24; i++) { z[i] = a * x[i] + b * y[i]; } } +void caxpy_host(double a_re, double a_im, double *x, double *y, const int V) +{ +#pragma omp parallel for + for (int i = 0; i < V * 12; i++) { + double re = a_re * x[i * 2 + 0] - a_im * x[i * 2 + 1] + y[i * 2 + 0]; + double im = a_re * x[i * 2 + 1] + a_im * x[i * 2 + 0] + y[i * 2 + 1]; + y[i * 2 + 0] = re; + y[i * 2 + 1] = im; + } +} + void Gamma5_host_UKQCD(double *out, double *in, const int V) { #pragma omp parallel for @@ -968,20 +1035,30 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff if (myMatPCType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || myMatPCType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - tmc_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, - inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, - *gauge_param); + if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { + tmc_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, + inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, + *gauge_param); + } else if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + tmc_ndeg_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), + inv_param->kappa, inv_param->mu, inv_param->epsilon, parity, myMatPCType, QUDA_DAG_YES, + inv_param->cpu_prec, *gauge_param); + } } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) { clover_dslash(x[i].Even().data(), gauge.data(), clover_inv.data(), tmp.data(), parity, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param); } else { errorQuda("TMCloverForce_reference: dslash_type not supported\n"); } - Gamma5_host(x[i].Even().data(), x[i].Even().data(), x[i].Even().VolumeCB()); if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - tmc_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, - inv_param->mu, inv_param->twist_flavor, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param); + if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { + tmc_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, + inv_param->mu, inv_param->twist_flavor, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param); + } else if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + tmc_ndeg_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, + inv_param->mu, inv_param->epsilon, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param); + } } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) { clover_matpc(p[i].Odd().data(), gauge.data(), clover.data(), clover_inv.data(), tmp.data(), inv_param->kappa, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param); @@ -989,7 +1066,18 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff errorQuda("TMCloverForce_reference: dslash_type not supported\n"); } - if (detratio) { + if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + axpbyz_host(1.0 / inv_param->evmax, p[i].Odd().data(), 0, p[i].Odd().data(), + p[i].Odd().data(), p[i].Odd().VolumeCB()); + tau1_host(x[i].Even().data(), x[i].Even().data(), x[i].Even().VolumeCB()); + tau1_host(p[i].Odd().data(), p[i].Odd().data(), p[i].Odd().VolumeCB()); + caxpy_host(0.0, -inv_param->offset[i], x[i].Odd().data(), p[i].Odd().data(), + p[i].Odd().VolumeCB()); + } + + Gamma5_host(x[i].Even().data(), x[i].Even().data(), x[i].Even().VolumeCB()); + + if (detratio && inv_param->twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) { qParam.v = h_x0[i]; quda::ColorSpinorField load_half(qParam); x0[i].Odd() = load_half; @@ -997,10 +1085,19 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff p[i].Odd().VolumeCB()); } + if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + Gamma5_host(p[i].Odd().data(), p[i].Odd().data(), p[i].Odd().VolumeCB()); + } + if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - tmc_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(), - inv_param->kappa, inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_NO, - inv_param->cpu_prec, *gauge_param); + if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) + tmc_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(), + inv_param->kappa, inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_NO, + inv_param->cpu_prec, *gauge_param); + else if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) + tmc_ndeg_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(), + inv_param->kappa, inv_param->mu, inv_param->epsilon, parity, myMatPCType, QUDA_DAG_YES, + inv_param->cpu_prec, *gauge_param); } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) { clover_dslash(p[i].Even().data(), gauge.data(), clover_inv.data(), p[i].Odd().data(), parity, QUDA_DAG_NO, inv_param->cpu_prec, *gauge_param); @@ -1008,6 +1105,12 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff errorQuda("TMCloverForce_reference: dslash_type not supported\n"); } + if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + Gamma5_host(p[i].Odd().data(), p[i].Odd().data(), p[i].Odd().VolumeCB()); + Gamma5_host(p[i].Even().data(), p[i].Even().data(), p[i].Even().VolumeCB()); + tau1_host(p[i].Even().data(), p[i].Even().data(), p[i].Even().VolumeCB()); + } + } else { errorQuda("TMCloverForce_reference: MATPC type not supported\n"); } @@ -1058,6 +1161,10 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff ferm_epsilon[i].reserve(2); ferm_epsilon[i][0] = k_csw_ov_8 * coeff[i]; ferm_epsilon[i][1] = k_csw_ov_8 * coeff[i] / (inv_param->kappa * inv_param->kappa); + if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { + tau1_host(p[i].Even().data(), p[i].Even().data(), p[i].Even().VolumeCB()); + tau1_host(p[i].Odd().data(), p[i].Odd().data(), p[i].Odd().VolumeCB()); + } } // derivative of pseudofermion sw term, first term term of (A12) in hep-lat/0112051, sw_spinor_eo(EE,..) plus // sw_spinor_eo(OO,..) in tmLQCD diff --git a/tests/host_reference/clover_reference.cpp b/tests/host_reference/clover_reference.cpp index 1a89ead931..085d7fe4bd 100644 --- a/tests/host_reference/clover_reference.cpp +++ b/tests/host_reference/clover_reference.cpp @@ -503,13 +503,15 @@ void tmc_ndeg_dslash(void *out, void **gauge, void *in, void *clover, void *cInv if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { void *tmptmp1 = safe_malloc(Vh * spinor_site_size * precision); void *tmptmp2 = safe_malloc(Vh * spinor_site_size * precision); - wil_dslash(tmptmp1, gauge, tmp1, oddBit, daggerBit, precision, gauge_param); - wil_dslash(tmptmp2, gauge, tmp2, oddBit, daggerBit, precision, gauge_param); + wil_dslash(tmptmp1, gauge, in1, oddBit, daggerBit, precision, gauge_param); + wil_dslash(tmptmp2, gauge, in2, oddBit, daggerBit, precision, gauge_param); ndegTwistCloverGamma5(out1, out2, tmptmp1, tmptmp2, clover, cInv, daggerBit, kappa, mu, epsilon, oddBit, QUDA_TWIST_GAMMA5_INVERSE, precision); host_free(tmptmp1); host_free(tmptmp2); } else { + ndegTwistCloverGamma5(tmp1, tmp2, in1, in2, clover, cInv, daggerBit, kappa, mu, epsilon, 1 - oddBit, + QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(out1, gauge, tmp1, oddBit, daggerBit, precision, gauge_param); wil_dslash(out2, gauge, tmp2, oddBit, daggerBit, precision, gauge_param); } diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index eda5c778a7..a1215f051a 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -22,6 +22,12 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL]; QudaEigParam eig_param; bool use_split_grid = false; +std::vector gauge_; +std::array gauge; +std::vector clover; +std::vector clover_inv; +QudaPrecision last_prec = QUDA_INVALID_PRECISION; + // if --enable-testing true is passed, we run the tests defined in here #include @@ -107,11 +113,6 @@ void display_test_info() dimPartitioned(3)); } -std::vector gauge_; -std::array gauge; -std::vector clover; -std::vector clover_inv; - void init(int argc, char **argv) { // Set QUDA's internal parameters @@ -163,8 +164,6 @@ void init(int argc, char **argv) gauge_.resize(4 * V * gauge_site_size * host_gauge_data_type_size); for (int i = 0; i < 4; i++) gauge[i] = gauge_.data() + i * V * gauge_site_size * host_gauge_data_type_size; constructHostGaugeField(gauge.data(), gauge_param, argc, argv); - // Load the gauge field to the device - loadGaugeQuda(gauge.data(), &gauge_param); // Allocate host side memory for clover terms if needed. //---------------------------------------------------------------------------- @@ -173,36 +172,48 @@ void init(int argc, char **argv) clover.resize(V * clover_site_size * host_clover_data_type_size); clover_inv.resize(V * clover_site_size * host_spinor_data_type_size); constructHostCloverField(clover.data(), clover_inv.data(), inv_param); + } + + // Load the gauge field to the device + loadGaugeQuda(gauge.data(), &gauge_param); + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { // Load the clover terms to the device loadCloverQuda(clover.data(), clover_inv.data(), &inv_param); } + last_prec = gauge_param.cuda_prec; } std::vector> solve(test_t param) { - inv_param.inv_type = ::testing::get<0>(param); - inv_param.solution_type = ::testing::get<1>(param); - inv_param.solve_type = ::testing::get<2>(param); - inv_param.cuda_prec_sloppy = ::testing::get<3>(param); - inv_param.clover_cuda_prec_sloppy = ::testing::get<3>(param); - multishift = ::testing::get<4>(param); - inv_param.solution_accumulator_pipeline = ::testing::get<5>(param); + inv_param.cuda_prec = ::testing::get<0>(param); + inv_param.clover_cuda_prec = ::testing::get<0>(param); + inv_param.cuda_prec_sloppy = ::testing::get<1>(param); + inv_param.cuda_prec_refinement_sloppy = ::testing::get<1>(param); + inv_param.clover_cuda_prec_sloppy = ::testing::get<1>(param); + inv_param.clover_cuda_prec_refinement_sloppy = ::testing::get<1>(param); + inv_param.inv_type = ::testing::get<2>(param); + inv_param.solution_type = ::testing::get<3>(param); + inv_param.solve_type = ::testing::get<4>(param); + multishift = ::testing::get<5>(param); + inv_param.solution_accumulator_pipeline = ::testing::get<6>(param); // schwarz parameters - auto schwarz_param = ::testing::get<6>(param); + auto schwarz_param = ::testing::get<7>(param); inv_param.schwarz_type = ::testing::get<0>(schwarz_param); inv_param.inv_type_precondition = ::testing::get<1>(schwarz_param); inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param); inv_param.clover_cuda_prec_precondition = ::testing::get<2>(schwarz_param); - inv_param.residual_type = ::testing::get<7>(param); + inv_param.residual_type = ::testing::get<8>(param); // reset lambda_max if we're doing a testing loop to ensure correct lambma_max if (enable_testing) inv_param.ca_lambda_max = -1.0; - logQuda(QUDA_SUMMARIZE, "Solution = %s, Solve = %s, Solver = %s, Sloppy precision = %s\n", + logQuda(QUDA_SUMMARIZE, "Solution = %s, Solve = %s, Solver = %s, Precision = %s, Sloppy precision = %s\n", get_solution_str(inv_param.solution_type), get_solve_str(inv_param.solve_type), - get_solver_str(inv_param.inv_type), get_prec_str(inv_param.cuda_prec_sloppy)); + get_solver_str(inv_param.inv_type), get_prec_str(inv_param.cuda_prec), + get_prec_str(inv_param.cuda_prec_sloppy)); // params corresponds to split grid for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; @@ -407,7 +418,6 @@ int main(int argc, char **argv) // All parameters have been set. Display the parameters via stdout display_test_info(); - // Initialize the QUDA library initQuda(device_ordinal); init(argc, argv); @@ -423,7 +433,7 @@ int main(int argc, char **argv) if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); } result = RUN_ALL_TESTS(); } else { - solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline, + solve(test_t {prec, prec_sloppy, inv_type, solution_type, solve_type, multishift, solution_accumulator_pipeline, schwarz_t {precon_schwarz_type, inv_multigrid ? QUDA_MG_INVERTER : precon_type, prec_precondition}, inv_param.residual_type}); } diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp index ecc23ed12d..39e7ce8f68 100644 --- a/tests/invert_test_gtest.hpp +++ b/tests/invert_test_gtest.hpp @@ -5,8 +5,8 @@ // tuple containing parameters for Schwarz solver using schwarz_t = ::testing::tuple; -using test_t - = ::testing::tuple; +using test_t = ::testing::tuple; class InvertTest : public ::testing::TestWithParam { @@ -15,20 +15,44 @@ class InvertTest : public ::testing::TestWithParam public: InvertTest() : param(GetParam()) { } + + virtual void SetUp() + { + // check if outer precision has changed and update if it has + if (::testing::get<0>(param) != last_prec) { + if (last_prec != QUDA_INVALID_PRECISION) { + freeGaugeQuda(); + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda(); + } + + // Load the gauge field to the device + gauge_param.cuda_prec = ::testing::get<0>(param); + loadGaugeQuda(gauge.data(), &gauge_param); + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + // Load the clover terms to the device + inv_param.clover_cuda_prec = ::testing::get<0>(param); + loadCloverQuda(clover.data(), clover_inv.data(), &inv_param); + } + last_prec = ::testing::get<0>(param); + } + } }; bool skip_test(test_t param) { - auto inverter_type = ::testing::get<0>(param); - auto solution_type = ::testing::get<1>(param); - auto solve_type = ::testing::get<2>(param); - auto prec_sloppy = ::testing::get<3>(param); - auto multishift = ::testing::get<4>(param); - auto solution_accumulator_pipeline = ::testing::get<5>(param); - auto schwarz_param = ::testing::get<6>(param); + auto prec = ::testing::get<0>(param); + auto prec_sloppy = ::testing::get<1>(param); + auto inverter_type = ::testing::get<2>(param); + auto solution_type = ::testing::get<3>(param); + auto solve_type = ::testing::get<4>(param); + auto multishift = ::testing::get<5>(param); + auto solution_accumulator_pipeline = ::testing::get<6>(param); + auto schwarz_param = ::testing::get<7>(param); auto prec_precondition = ::testing::get<2>(schwarz_param); if (prec < prec_sloppy) return true; // outer precision >= sloppy precision + if (!(QUDA_PRECISION & prec)) return true; // precision not enabled so skip it if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION) return true; // precision not enabled so skip it @@ -66,18 +90,19 @@ TEST_P(InvertTest, verify) { if (skip_test(GetParam())) GTEST_SKIP(); + auto tol = ::testing::get<0>(GetParam()) == QUDA_SINGLE_PRECISION ? 1e-6 : 1e-12; + auto tol_hq = ::testing::get<0>(GetParam()) == QUDA_SINGLE_PRECISION ? 1e-6 : 1e-12; inv_param.tol = 0.0; inv_param.tol_hq = 0.0; - auto res_t = ::testing::get<7>(GetParam()); + auto res_t = ::testing::get<8>(GetParam()); if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol; if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq; - auto tol = inv_param.tol; if (is_chiral(inv_param.dslash_type)) { tol *= std::sqrt(static_cast(inv_param.Ls)); } // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this - if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50; + if (is_normal_residual(::testing::get<2>(GetParam()))) tol *= 50; // Slight loss of precision possible when reconstructing full solution - if (is_full_solution(::testing::get<1>(GetParam())) && is_preconditioned_solve(::testing::get<2>(GetParam()))) + if (is_full_solution(::testing::get<3>(GetParam())) && is_preconditioned_solve(::testing::get<4>(GetParam()))) tol *= 10; for (auto rsd : solve(GetParam())) { @@ -89,21 +114,22 @@ TEST_P(InvertTest, verify) std::string gettestname(::testing::TestParamInfo param) { std::string name; - name += get_solver_str(::testing::get<0>(param.param)) + std::string("_"); - name += get_solution_str(::testing::get<1>(param.param)) + std::string("_"); - name += get_solve_str(::testing::get<2>(param.param)) + std::string("_"); - name += get_prec_str(::testing::get<3>(param.param)); - if (::testing::get<4>(param.param) > 1) - name += std::string("_shift") + std::to_string(::testing::get<4>(param.param)); + name += get_prec_str(::testing::get<0>(param.param)) + std::string("_"); + name += get_prec_str(::testing::get<1>(param.param)) + std::string("_"); + name += get_solver_str(::testing::get<2>(param.param)) + std::string("_"); + name += get_solution_str(::testing::get<3>(param.param)) + std::string("_"); + name += get_solve_str(::testing::get<4>(param.param)); if (::testing::get<5>(param.param) > 1) - name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<5>(param.param)); - auto &schwarz_param = ::testing::get<6>(param.param); + name += std::string("_shift") + std::to_string(::testing::get<5>(param.param)); + if (::testing::get<6>(param.param) > 1) + name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<6>(param.param)); + auto &schwarz_param = ::testing::get<7>(param.param); if (::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) { name += std::string("_") + get_schwarz_str(::testing::get<0>(schwarz_param)); name += std::string("_") + get_solver_str(::testing::get<1>(schwarz_param)); name += std::string("_") + get_prec_str(::testing::get<2>(schwarz_param)); } - auto res_t = ::testing::get<7>(param.param); + auto res_t = ::testing::get<8>(param.param); if (res_t & QUDA_L2_RELATIVE_RESIDUAL) name += std::string("_l2"); if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) name += std::string("_heavy_quark"); return name; @@ -118,6 +144,8 @@ auto direct_solvers = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGN QUDA_CG3NE_INVERTER, QUDA_CG3NR_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER, QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER); +auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION); + auto sloppy_precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION); @@ -129,41 +157,43 @@ auto no_heavy_quark = Values(QUDA_L2_RELATIVE_RESIDUAL); // preconditioned normal solves INSTANTIATE_TEST_SUITE_P(NormalEvenOdd, InvertTest, - Combine(normal_solvers, Values(QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MAT_SOLUTION), - Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1), - solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + Combine(precisions, sloppy_precisions, normal_solvers, + Values(QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE), + Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), gettestname); // full system normal solve INSTANTIATE_TEST_SUITE_P(NormalFull, InvertTest, - Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE), - sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + Combine(precisions, sloppy_precisions, normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION), + Values(QUDA_NORMOP_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz, + no_heavy_quark), gettestname); // preconditioned direct solves INSTANTIATE_TEST_SUITE_P(EvenOdd, InvertTest, - Combine(direct_solvers, Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION), - Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1), - solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + Combine(precisions, sloppy_precisions, direct_solvers, + Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE), + Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), gettestname); // full system direct solve INSTANTIATE_TEST_SUITE_P(Full, InvertTest, - Combine(direct_solvers, Values(QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_SOLVE), sloppy_precisions, - Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), + Combine(precisions, sloppy_precisions, direct_solvers, Values(QUDA_MAT_SOLUTION), + Values(QUDA_DIRECT_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz, + no_heavy_quark), gettestname); // preconditioned multi-shift solves INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, InvertTest, - Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION), - Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(10), + Combine(precisions, sloppy_precisions, Values(QUDA_CG_INVERTER), + Values(QUDA_MATPCDAG_MATPC_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE), Values(10), solution_accumulator_pipelines, no_schwarz, no_heavy_quark), gettestname); // Schwarz-preconditioned normal solves INSTANTIATE_TEST_SUITE_P(SchwarzNormal, InvertTest, - Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION), - Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1), + Combine(precisions, sloppy_precisions, Values(QUDA_PCG_INVERTER), + Values(QUDA_MATPCDAG_MATPC_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE), Values(1), solution_accumulator_pipelines, Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER), Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)), @@ -172,8 +202,8 @@ INSTANTIATE_TEST_SUITE_P(SchwarzNormal, InvertTest, // Schwarz-preconditioned direct solves INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, InvertTest, - Combine(Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE), - sloppy_precisions, Values(1), solution_accumulator_pipelines, + Combine(precisions, sloppy_precisions, Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), + Values(QUDA_DIRECT_PC_SOLVE), Values(1), solution_accumulator_pipelines, Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_MR_INVERTER, QUDA_CA_GCR_INVERTER), Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)), no_heavy_quark), @@ -181,7 +211,7 @@ INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, InvertTest, // Heavy-Quark preconditioned solves INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, InvertTest, - Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE), - sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, + Combine(precisions, sloppy_precisions, Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), + Values(QUDA_NORMOP_PC_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz, Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)), gettestname); diff --git a/tests/io_test.cpp b/tests/io_test.cpp index af708baf3b..2618b0e479 100644 --- a/tests/io_test.cpp +++ b/tests/io_test.cpp @@ -71,12 +71,15 @@ TEST_P(GaugeIOTest, verify) for (int dir = 0; dir < 4; dir++) { host_free(gauge[dir]); } } -using cs_test_t = ::testing::tuple; +using cs_test_t = ::testing::tuple; class ColorSpinorIOTest : public ::testing::TestWithParam { protected: - QudaSiteSubset site_subset; + QudaSiteSubset site_subset_saved; + QudaSiteSubset site_subset_loaded; + QudaParity parity; bool inflate; QudaPrecision prec; QudaPrecision prec_io; @@ -86,13 +89,15 @@ class ColorSpinorIOTest : public ::testing::TestWithParam public: ColorSpinorIOTest() : - site_subset(::testing::get<0>(GetParam())), - inflate(::testing::get<1>(GetParam())), - prec(::testing::get<2>(GetParam())), - prec_io(::testing::get<3>(GetParam())), - nSpin(::testing::get<4>(GetParam())), - partfile(::testing::get<5>(GetParam())), - location(::testing::get<6>(GetParam())) + site_subset_saved(::testing::get<0>(GetParam())), + site_subset_loaded(::testing::get<1>(GetParam())), + parity(::testing::get<2>(GetParam())), + inflate(::testing::get<3>(GetParam())), + prec(::testing::get<4>(GetParam())), + prec_io(::testing::get<5>(GetParam())), + nSpin(::testing::get<6>(GetParam())), + partfile(::testing::get<7>(GetParam())), + location(::testing::get<8>(GetParam())) { } }; @@ -117,37 +122,79 @@ TEST_P(ColorSpinorIOTest, verify) || (prec < QUDA_SINGLE_PRECISION && nSpin == 2)) GTEST_SKIP(); + // There's no meaningful way to save a full parity spinor field and load a + // single-parity field + if (site_subset_saved == QUDA_FULL_SITE_SUBSET && site_subset_loaded == QUDA_PARITY_SITE_SUBSET) GTEST_SKIP(); + + // Site subsets must be the same except for inflation tests + if (site_subset_saved != site_subset_loaded && !inflate) GTEST_SKIP(); + + // Unique test: saving an inflated single-parity field and loading the full-parity field + bool mixed_inflated_full_test + = (site_subset_saved == QUDA_PARITY_SITE_SUBSET && site_subset_loaded == QUDA_FULL_SITE_SUBSET && inflate); + QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); - ColorSpinorParam param; setWilsonGaugeParam(gauge_param); setInvertParam(inv_param); - constructWilsonTestSpinorParam(¶m, &inv_param, &gauge_param); - param.siteSubset = site_subset; - param.suggested_parity = QUDA_EVEN_PARITY; - param.nSpin = nSpin; - param.setPrecision(prec, prec, true); // change order to native order - param.location = location; - if (location == QUDA_CPU_FIELD_LOCATION) param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - param.create = QUDA_NULL_FIELD_CREATE; + + // We create a separate parameter struct for saved and loaded ColorSpinorFields + ColorSpinorParam param_save, param_load; + + // Saved + constructWilsonTestSpinorParam(¶m_save, &inv_param, &gauge_param); + param_save.siteSubset = site_subset_saved; + param_save.suggested_parity = parity; // ignored for full parity + param_save.nSpin = nSpin; + param_save.setPrecision(prec, prec, true); // change order to native order + param_save.location = location; + if (location == QUDA_CPU_FIELD_LOCATION) param_save.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + param_save.create = QUDA_NULL_FIELD_CREATE; + + // Loaded + param_load = param_save; + param_load.siteSubset = site_subset_loaded; + + // Update ColorSpinorField size appropriately depending on the site subset + if (site_subset_saved == QUDA_PARITY_SITE_SUBSET) param_save.x[0] /= 2; + if (site_subset_loaded == QUDA_PARITY_SITE_SUBSET) param_load.x[0] /= 2; // create some random vectors auto n_vector = 1; - std::vector v(n_vector, param); - std::vector u(n_vector, param); + std::vector v(n_vector, param_save); + std::vector u(n_vector, param_load); RNG rng(v[0], 1234); for (auto &vi : v) spinorNoise(vi, rng, QUDA_NOISE_GAUSS); auto file = "dummy.cs"; - VectorIO io(file, inflate, partfile); - - io.save({v.begin(), v.end()}, prec_io, n_vector); - io.load(u); + // create a separate VectorIO for saving and loading + // this lets us test saving a single-parity field with inflation + // then loading the full parity field. + VectorIO io_save(file, inflate, partfile); + VectorIO io_load(file, inflate, partfile); + + io_save.save({v.begin(), v.end()}, prec_io, n_vector); + io_load.load(u); + + // This lambda simplifies returning the correct subset of `u` by reference if we're doing + // tests where we save an inflated single-parity vector and load a full-parity vector. + auto get_u_subset = [&](int i) -> const ColorSpinorField & { + if (mixed_inflated_full_test) + if (this->parity == QUDA_EVEN_PARITY) // lambdas don't automatically capture `this` + return u[i].Even(); + else + return u[i].Odd(); + else + return u[i]; + }; for (auto i = 0u; i < v.size(); i++) { - auto dev = blas::max_deviation(u[i], v[i]); + // grab the correct subset of u if we're doing mixed parity/full tests + const ColorSpinorField &u_subset = get_u_subset(i); + + auto dev = blas::max_deviation(u_subset, v[i]); if (prec == prec_io) EXPECT_EQ(dev[0], 0.0); else @@ -185,33 +232,38 @@ INSTANTIATE_TEST_SUITE_P(Gauge, GaugeIOTest, Combine(Values(QUDA_DOUBLE_PRECISIO // colorspinor full field IO test INSTANTIATE_TEST_SUITE_P(Full, ColorSpinorIOTest, - Combine(Values(QUDA_FULL_SITE_SUBSET), Values(false), + Combine(Values(QUDA_FULL_SITE_SUBSET), Values(QUDA_FULL_SITE_SUBSET), + Values(QUDA_INVALID_PARITY), Values(false), Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION), Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), Values(1, 2, 4), Values(false, true), Values(QUDA_CUDA_FIELD_LOCATION, QUDA_CPU_FIELD_LOCATION)), [](testing::TestParamInfo param) { std::string name; - name += get_prec_str(::testing::get<2>(param.param)) + std::string("_"); - name += get_prec_str(::testing::get<3>(param.param)) + std::string("_"); - name += std::string("spin") + std::to_string(::testing::get<4>(param.param)); - name += ::testing::get<5>(param.param) ? "_singlefile" : "_partfile"; - name += ::testing::get<6>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host"; + name += get_prec_str(::testing::get<4>(param.param)) + std::string("_"); + name += get_prec_str(::testing::get<5>(param.param)) + std::string("_"); + name += std::string("spin") + std::to_string(::testing::get<6>(param.param)); + name += ::testing::get<7>(param.param) ? "_singlefile" : "_partfile"; + name += ::testing::get<8>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host"; return name; }); // colorspinor parity field IO test INSTANTIATE_TEST_SUITE_P(Parity, ColorSpinorIOTest, - Combine(Values(QUDA_PARITY_SITE_SUBSET), Values(false, true), + Combine(Values(QUDA_PARITY_SITE_SUBSET), Values(QUDA_PARITY_SITE_SUBSET, QUDA_FULL_SITE_SUBSET), + Values(QUDA_EVEN_PARITY, QUDA_ODD_PARITY), Values(false, true), Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION), Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), Values(1, 2, 4), Values(false, true), Values(QUDA_CUDA_FIELD_LOCATION, QUDA_CPU_FIELD_LOCATION)), [](testing::TestParamInfo param) { std::string name; - if (::testing::get<1>(param.param)) name += std::string("inflate_"); - name += get_prec_str(::testing::get<2>(param.param)) + std::string("_"); - name += get_prec_str(::testing::get<3>(param.param)) + std::string("_"); - name += std::string("spin") + std::to_string(::testing::get<4>(param.param)); - name += ::testing::get<5>(param.param) ? "_singlefile" : "_partfile"; - name += ::testing::get<6>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host"; + name += ::testing::get<1>(param.param) == QUDA_PARITY_SITE_SUBSET ? "load_parity_" : + "load_full_"; + name += ::testing::get<2>(param.param) == QUDA_EVEN_PARITY ? "even_" : "odd_"; + if (::testing::get<3>(param.param)) name += std::string("inflate_"); + name += get_prec_str(::testing::get<4>(param.param)) + std::string("_"); + name += get_prec_str(::testing::get<5>(param.param)) + std::string("_"); + name += std::string("spin") + std::to_string(::testing::get<6>(param.param)); + name += ::testing::get<7>(param.param) ? "_singlefile" : "_partfile"; + name += ::testing::get<8>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host"; return name; }); diff --git a/tests/laph_test.cpp b/tests/laph_test.cpp new file mode 100644 index 0000000000..1ef7f3c883 --- /dev/null +++ b/tests/laph_test.cpp @@ -0,0 +1,163 @@ +#include +#include +#include + +#include "instantiate.h" +#include "test.h" +#include "misc.h" + +/* + This test will perhaps eventually evolve into a full test for 3-d + Laplace eigenvector generation and sink projection. For now the + focus is eigen-vector projection + */ + +using test_t = std::tuple; + +using ::testing::Combine; +using ::testing::get; +using ::testing::Values; + +struct LaphTest : ::testing::TestWithParam { + test_t param; + LaphTest() : param(GetParam()) { } +}; + +auto laph_test(test_t param) +{ + using namespace quda; + + QudaPrecision precision = get<0>(param); + int nSink = get<1>(param); + int nEv = get<2>(param); + int tileSink = get<3>(param); + int tileEv = get<4>(param); + + constexpr int nSpin = 4; + constexpr int nColor = 3; + + QudaInvertParam invParam = newQudaInvertParam(); + invParam.cpu_prec = QUDA_DOUBLE_PRECISION; + invParam.cuda_prec = precision; + invParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invParam.dirac_order = QUDA_DIRAC_ORDER; + + ColorSpinorParam cs_param; + cs_param.nColor = nColor; + cs_param.nSpin = nSpin; + cs_param.x = {xdim, ydim, zdim, tdim}; + cs_param.siteSubset = QUDA_FULL_SITE_SUBSET; + cs_param.setPrecision(invParam.cpu_prec); + cs_param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + cs_param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + cs_param.gammaBasis = invParam.gamma_basis; + cs_param.pc_type = QUDA_4D_PC; + cs_param.location = QUDA_CPU_FIELD_LOCATION; + cs_param.create = QUDA_NULL_FIELD_CREATE; + + ColorSpinorField rngDummy(cs_param); + RNG rng(rngDummy, 1234); + + // initialize quark sinks + std::vector sinkList(nSink); + for (auto &s : sinkList) { + s = ColorSpinorField(cs_param); + spinorNoise(s, rng, QUDA_NOISE_GAUSS); + } + + // initialize EVs + cs_param.nSpin = 1; + std::vector evList(nEv); + for (auto &e : evList) { + e = ColorSpinorField(cs_param); + spinorNoise(e, rng, QUDA_NOISE_GAUSS); + } + + // host reference [nSink][nEv][Lt][nSpin][complexity] + auto Lt = tdim * comm_dim(3); + std::vector hostRes(nSink * nEv * Lt * nSpin, 0.); + +#pragma omp parallel for collapse(4) + for (int iEv = 0; iEv < nEv; ++iEv) { + for (int iSink = 0; iSink < nSink; ++iSink) { + for (int iSpin = 0; iSpin < nSpin; ++iSpin) { + for (int iT = 0; iT < tdim; ++iT) { + int globT = comm_coord(3) * tdim + iT; + for (int iZ = 0; iZ < zdim; ++iZ) { + for (int iY = 0; iY < ydim; ++iY) { + for (int iX = 0; iX < xdim; ++iX) { + int coord[4] = {iX, iY, iZ, iT}; + int linInd; + evList[iEv].OffsetIndex(linInd, coord); + + for (int iCol = 0; iCol < nColor; ++iCol) + hostRes[iSpin + nSpin * (globT + Lt * (iEv + nEv * iSink))] + += conj(evList[iEv].data()[iCol + nColor * linInd]) + * sinkList[iSink].data()[iCol + nColor * (iSpin + nSpin * linInd)]; + } + } + } + } // volume loops + } // spin loop + } // sink loop + } // ev loop + + comm_allreduce_sum(hostRes); + + // QUDA proper + void *snkPtr[nSink]; + for (int iSink = 0; iSink < nSink; ++iSink) snkPtr[iSink] = sinkList[iSink].data(); + + void *evPtr[nEv]; + for (int iEv = 0; iEv < nEv; ++iEv) evPtr[iEv] = evList[iEv].data(); + + std::vector qudaRes(nSink * nEv * Lt * nSpin, 0.); + + int X[4] = {xdim, ydim, zdim, tdim}; + laphSinkProject((__complex__ double *)qudaRes.data(), (void **)snkPtr, nSink, tileSink, + (void **)evPtr, nEv, tileEv, &invParam, X); + printfQuda("laphSinkProject Done: %g secs, %g Gflops\n", invParam.secs, invParam.gflops / invParam.secs); + + auto tol = getTolerance(cuda_prec); + int rtn = 0; + for (unsigned int i = 0; i < qudaRes.size(); i++) { + auto deviation = abs(qudaRes[i] - hostRes[i]) / abs(hostRes[i]); + if (deviation > tol) { + printfQuda("EV projection test failed at iEl=%d: (%f,%f) [QUDA], (%f,%f) [host]\n", i, qudaRes[i].real(), + qudaRes[i].imag(), hostRes[i].real(), hostRes[i].imag()); + EXPECT_LE(deviation, tol); + rtn = 1; + } + } + return rtn; +} + +TEST_P(LaphTest, verify) +{ + if (!quda::is_enabled(get<0>(GetParam()))) GTEST_SKIP(); + laph_test(GetParam()); +} + +INSTANTIATE_TEST_SUITE_P(LaphTest, LaphTest, + Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(1, 4, 7), Values(2), + Values(4), Values(2)), + [](testing::TestParamInfo param) { + return std::string(get_prec_str(get<0>(param.param))) + "_" + + std::to_string(get<1>(param.param)) + "_" + std::to_string(get<2>(param.param)) + "_" + + std::to_string(get<3>(param.param)) + "_" + std::to_string(get<4>(param.param)); + }); + +int main(int argc, char **argv) +{ + quda_test test("laph_test", argc, argv); + test.init(); + + int result = 0; + if (enable_testing) { + result = test.execute(); + } else { + result = laph_test({cuda_prec, Msrc, Nsrc, Msrc_tile, Nsrc_tile}); + } + + return result; +} diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index 8cfb639c2f..4da9b24b01 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -50,6 +50,8 @@ double gaussian_sigma = 0.2; std::string gauge_outfile; int Nsrc = 1; int Msrc = 1; +int Nsrc_tile = 1; +int Msrc_tile = 1; int niter = 100; int maxiter_precondition = 10; QudaVerbosity verbosity_precondition = QUDA_SUMMARIZE; @@ -100,6 +102,7 @@ double mass = 0.1; double kappa = -1.0; double mu = 0.1; double epsilon = 0.01; +double evmax = 0.1; double m5 = -1.5; double b5 = 1.5; double c5 = 0.5; @@ -481,6 +484,8 @@ std::shared_ptr make_app(std::string app_description, std::string app_n ->transform(CLI::QUDACheckedTransformer(dslash_type_map)); quda_app->add_option("--epsilon", epsilon, "Twisted-Mass flavor twist of Dirac operator (default 0.01)"); + quda_app->add_option( + "--evmax", evmax, "Twisted-Mass non-degenerate of Dirac operator max eigenvector to scale the force (default 0.01)"); quda_app->add_option("--epsilon-naik", eps_naik, "Epsilon factor on Naik term (default 0.0, suggested non-zero -0.1)"); quda_app->add_option("--flavor", twist_flavor, "Set the twisted mass flavor type (singlet (default), nondeg-doublet)") @@ -510,6 +515,7 @@ std::shared_ptr make_app(std::string app_description, std::string app_n ->transform(CLI::QUDACheckedTransformer(matpc_type_map)); quda_app->add_option("--msrc", Msrc, "Used for testing non-square block blas routines where nsrc defines the other dimension"); + quda_app->add_option("--msrc-tile", Msrc_tile, "Set the Msrc tile size (where applicable)"); quda_app->add_option("--mu", mu, "Twisted-Mass chiral twist of Dirac operator (default 0.1)"); quda_app->add_option("--m5", m5, "Mass of shift of five-dimensional Dirac operators (default -1.5)"); quda_app->add_option("--b5", b5, "Mobius b5 parameter (default 1.5)"); @@ -547,6 +553,7 @@ std::shared_ptr make_app(std::string app_description, std::string app_n ->transform(CLI::QUDACheckedTransformer(verbosity_map)); quda_app->add_option("--nsrc", Nsrc, "How many spinors to apply the dslash to simultaneusly (experimental for staggered only)"); + quda_app->add_option("--nsrc-tile", Nsrc_tile, "Set the Nsrc tile size (where applicable)"); quda_app->add_option("--pipeline", pipeline, "The pipeline length for fused operations in GCR, BiCGstab-l (default 0, no pipelining)"); diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index 4e88d9313e..fa1c21905f 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -188,6 +188,8 @@ extern double gaussian_sigma; extern std::string gauge_outfile; extern int Nsrc; extern int Msrc; +extern int Nsrc_tile; +extern int Msrc_tile; extern int niter; extern int maxiter_precondition; extern QudaVerbosity verbosity_precondition; @@ -238,6 +240,7 @@ extern double mass; extern double kappa; extern double mu; extern double epsilon; +extern double evmax; extern double m5; extern double b5; extern double c5; diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 629f70e51b..140eccd8a8 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -243,6 +243,7 @@ void constructWilsonTestSpinorParam(quda::ColorSpinorParam *cs_param, const Quda } else { cs_param->nDim = 4; } + cs_param->twistFlavor = inv_param->twist_flavor; cs_param->pc_type = inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC; for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d]; bool pc = is_pc_solution(inv_param->solution_type); diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 24a8668e7d..d001fd6eea 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -43,7 +43,7 @@ extern QudaPrecision &cuda_prec_ritz; // Determine if the Laplace operator has been defined constexpr bool is_enabled_laplace() { -#ifdef QUDA_LAPLACE +#ifdef QUDA_DIRAC_LAPLACE return true; #else return false; diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index 196777d574..29a448d7a2 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -372,3 +372,17 @@ const char *get_blas_type_str(QudaBLASType type) } return s; } + +const char *get_TwistFlavor_str(QudaTwistFlavorType type) +{ + const char *ret; + + switch (type) { + case QUDA_TWIST_SINGLET: ret = "singlet_"; break; + case QUDA_TWIST_NONDEG_DOUBLET: ret = "ndeg_doublet_"; break; + case QUDA_TWIST_NO: ret = ""; break; + default: ret = ""; break; + } + + return ret; +} diff --git a/tests/utils/misc.h b/tests/utils/misc.h index bf9a8d3039..625cb2ca01 100644 --- a/tests/utils/misc.h +++ b/tests/utils/misc.h @@ -25,6 +25,7 @@ const char *get_contract_str(QudaContractType type); const char *get_gauge_smear_str(QudaGaugeSmearType type); std::string get_dilution_type_str(QudaDilutionType type); const char *get_blas_type_str(QudaBLASType type); +const char *get_TwistFlavor_str(QudaTwistFlavorType type); #define XUP 0 #define YUP 1