diff --git a/.github/workflows/rocm-build-ci.yml b/.github/workflows/rocm-build-ci.yml
index 30ad92ae1f..c8812aa8c9 100644
--- a/.github/workflows/rocm-build-ci.yml
+++ b/.github/workflows/rocm-build-ci.yml
@@ -5,7 +5,7 @@ jobs:
     runs-on: [self-hosted, amd]
     strategy:
       matrix:
-        rocm: [ 5.6.1, 5.7.2 ]
+        rocm: [ 5.6.1, 6.0.2 ]
     steps:
       - uses: actions/checkout@v3
       - run: |
@@ -22,12 +22,13 @@ jobs:
             -DROCM_PATH=${ROCM_PATH} \
             -DQUDA_DIRAC_CLOVER=ON \
             -DQUDA_DIRAC_CLOVER_HASENBUSCH=OFF \
-            -DQUDA_DIRAC_DOMAIN_WALL=OFF \
+            -DQUDA_DIRAC_DOMAIN_WALL=ON \
             -DQUDA_DIRAC_NDEG_TWISTED_MASS=OFF \
             -DQUDA_DIRAC_STAGGERED=ON \
             -DQUDA_DIRAC_TWISTED_MASS=OFF \
             -DQUDA_DIRAC_TWISTED_CLOVER=OFF \
             -DQUDA_DIRAC_WILSON=ON \
+            -DQUDA_DIRAC_LAPLACE=ON \
             -DQUDA_CLOVER_DYNAMIC=ON \
             -DQUDA_QDPJIT=OFF \
             -DQUDA_INTERFACE_QDPJIT=OFF \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d9ce6c032..165dfb689a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,13 +150,13 @@ option(QUDA_DIRAC_TWISTED_CLOVER "build twisted clover Dirac operators" ${QUDA_D
 option(QUDA_DIRAC_CLOVER_HASENBUSCH "build clover Hasenbusch twist operators" ${QUDA_DIRAC_DEFAULT})
 option(QUDA_DIRAC_NDEG_TWISTED_MASS "build non-degenerate twisted mass Dirac operators" ${QUDA_DIRAC_DEFAULT})
 option(QUDA_DIRAC_NDEG_TWISTED_CLOVER "build non-degenerate twisted clover Dirac operators" ${QUDA_DIRAC_DEFAULT})
+option(QUDA_DIRAC_LAPLACE "build laplace operator" ${QUDA_DIRAC_DEFAULT})
 
 option(QUDA_DIRAC_DD "build code for domain decomposition of the Dirac operator" OFF)
 
 option(QUDA_CONTRACT "build code for bilinear contraction" OFF)
 option(QUDA_COVDEV "build code for covariant derivative" OFF)
 
-option(QUDA_LAPLACE "build laplace operator" OFF)
 option(QUDA_QIO "build QIO code for binary I/O" OFF)
 
 option(QUDA_SMEAR_GAUSS_TWOLINK "build code for two-link Gaussian smearing" OFF)
diff --git a/README.md b/README.md
index a739762cf2..d5995a5c99 100644
--- a/README.md
+++ b/README.md
@@ -266,12 +266,14 @@ Advanced Scientific Computing (PASC21) [arXiv:2104.05615[hep-lat]].
 *  Steven Gottlieb (Indiana University) 
 *  Anthony Grebe (Fermilab)
 *  Kyriakos Hadjiyiannakou (Cyprus)
+*  Ben Hoerz (Intel)
 *  Dean Howarth (Lawrence Livermore Lab, Lawrence Berkeley Lab)
 *  Hwancheol Jeong (Indiana University)
 *  Xiangyu Jiang (ITP, Chinese Academy of Sciences)
 *  Balint Joo (OLCF, Oak Ridge National Laboratory, formerly Jefferson Lab)
 *  Hyung-Jin Kim (Samsung Advanced Institute of Technology)
 *  Bartosz Kostrzewa (HPC/A-Lab, University of Bonn)
+*  Damon McDougall (AMD)
 *  James Osborn (Argonne National Laboratory)
 *  Ferenc Pittler (Cyprus)
 *  Claudio Rebbi (Boston University) 
diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build
index 5322ddfd2d..26adb18840 100644
--- a/ci/docker/Dockerfile.build
+++ b/ci/docker/Dockerfile.build
@@ -2,6 +2,8 @@ FROM docker.io/nvidia/cuda:11.8.0-devel-ubuntu22.04
 
 ARG DEBIAN_FRONTEND=noninteractive
 
+RUN echo "Running CSCS CI on $(nproc) processors"
+
 RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
     build-essential \
     cmake \
@@ -34,7 +36,7 @@ RUN  QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \
     -DQUDA_CTEST_LAUNCH="" \
     -DQUDA_GPU_ARCH=sm_60 \
     -DQUDA_MULTIGRID=ON \
-    -DQUDA_MULTIGRID_NVEC_LIST=24 \
+    -DQUDA_MULTIGRID_NVEC_LIST=6 \
     -DQUDA_MDW_FUSED_LS_LIST=4 \
     -DQUDA_MPI=ON  \
     -DQUDA_DIRAC_DEFAULT_OFF=ON \
@@ -42,10 +44,11 @@ RUN  QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \
     -DQUDA_DIRAC_CLOVER=ON \
     -DQUDA_DIRAC_TWISTED_CLOVER=ON \
     -DQUDA_DIRAC_STAGGERED=ON \
+    -DQUDA_DIRAC_LAPLACE=ON \
     -GNinja \
     -B /quda/build 
 
-RUN cmake --build /quda/build
+RUN cmake --build /quda/build -j $(nproc)
 
 RUN cmake --install /quda/build
 
diff --git a/include/clover_field.h b/include/clover_field.h
index 90d24d53eb..256453da2e 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -20,6 +20,19 @@ namespace quda {
 #endif
   }
 
+  /**
+     @brief Helper function that returns whether we have enabled
+     clover fermions.
+   */
+  constexpr bool is_enabled_twisted_clover()
+  {
+#ifdef GPU_TWISTED_CLOVER_DIRAC
+    return true;
+#else
+    return false;
+#endif
+  }
+
   namespace clover
   {
 
diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index a5ffa80543..dad19b48a8 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1052,6 +1052,112 @@ namespace quda
       }
     };
 
+    template <typename Float, int Ns, int Nc, int N, bool spin_project = false, bool huge_alloc = false,
+              bool disable_ghost = false>
+    struct GhostNOrder {
+      GhostNOrder() = default;
+      GhostNOrder(const GhostNOrder &) = default;
+      GhostNOrder(const ColorSpinorField &, int = 1, Float ** = 0) { }
+      GhostNOrder &operator=(const GhostNOrder &) = default;
+    };
+
+    template <typename Float, int Ns, int Nc, int N, bool spin_project, bool huge_alloc>
+    struct GhostNOrder<Float, Ns, Nc, N, spin_project, huge_alloc, false> {
+      static constexpr int length = 2 * Ns * Nc;
+      static constexpr int length_ghost = spin_project ? length / 2 : length;
+      // if spin projecting, check that short vector length is compatible, if not halve the vector length
+      static constexpr int N_ghost = !spin_project ? N : (Ns * Nc) % N == 0 ? N : N / 2;
+      static constexpr int M_ghost = length_ghost / N_ghost;
+      using GhostVector = typename VectorType<Float, N_ghost>::type;
+      using real = typename mapper<Float>::type;
+      using complex = complex<real>;
+      using norm_type = float;
+      int nParity;
+      array<int, 4> faceVolumeCB = {};
+      mutable array<Float *, 8> ghost = {};
+      mutable array<norm_type *, 8> ghost_norm = {};
+
+      GhostNOrder() = default;
+      GhostNOrder(const GhostNOrder &) = default;
+
+      GhostNOrder(const ColorSpinorField &a, int nFace = 1, Float **ghost_ = 0) : nParity(a.SiteSubset())
+      {
+        for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
+        resetGhost(ghost_ ? (void **)ghost_ : a.Ghost());
+      }
+
+      GhostNOrder &operator=(const GhostNOrder &) = default;
+
+      void resetGhost(void *const *ghost_) const
+      {
+        for (int dim = 0; dim < 4; dim++) {
+          for (int dir = 0; dir < 2; dir++) {
+            ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast<Float *>(ghost_[2 * dim + dir]) : nullptr;
+            ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ?
+              nullptr :
+              reinterpret_cast<norm_type *>(static_cast<char *>(ghost_[2 * dim + dir])
+                                            + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float));
+          }
+        }
+      }
+
+      __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
+      {
+        real v[length_ghost];
+        norm_type nrm
+          = isFixed<Float>::value ? vector_load<float>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x) : 0.0;
+
+#pragma unroll
+        for (int i = 0; i < M_ghost; i++) {
+          GhostVector vecTmp = vector_load<GhostVector>(
+            ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x);
+#pragma unroll
+          for (int j = 0; j < N_ghost; j++)
+            copy_and_scale(v[i * N_ghost + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
+        }
+
+#pragma unroll
+        for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
+      }
+
+      __device__ __host__ inline void saveGhost(const complex in[length_ghost / 2], int x, int dim, int dir,
+                                                int parity = 0) const
+      {
+        real v[length_ghost];
+#pragma unroll
+        for (int i = 0; i < length_ghost / 2; i++) {
+          v[2 * i + 0] = in[i].real();
+          v[2 * i + 1] = in[i].imag();
+        }
+
+        if (isFixed<Float>::value) {
+          norm_type max_[length_ghost / 2];
+          // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
+#pragma unroll
+          for (int i = 0; i < length_ghost / 2; i++)
+            max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2]));
+          norm_type scale = 0.0;
+#pragma unroll
+          for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
+          ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue<Float>::value;
+
+          real scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
+#pragma unroll
+          for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv;
+        }
+
+#pragma unroll
+        for (int i = 0; i < M_ghost; i++) {
+          GhostVector vecTmp;
+          // first do scalar copy converting into storage type
+#pragma unroll
+          for (int j = 0; j < N_ghost; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N_ghost + j]);
+          // second do vectorized copy into memory
+          vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x, vecTmp);
+        }
+      }
+    };
+
     /**
        @brief Accessor routine for ColorSpinorFields in native field order.
        @tparam Float Underlying storage data type of the field
@@ -1063,21 +1169,18 @@ namespace quda
        pointer arithmetic for huge allocations (e.g., packed set of
        vectors).  Default is to use 32-bit pointer arithmetic.
      */
-    template <typename Float, int Ns, int Nc, int N_, bool spin_project = false, bool huge_alloc = false>
-    struct FloatNOrder {
+    template <typename Float, int Ns, int Nc, int N_, bool spin_project = false, bool huge_alloc = false,
+              bool disable_ghost = false>
+    struct FloatNOrder : GhostNOrder<Float, Ns, Nc, N_, spin_project, huge_alloc, disable_ghost> {
       static_assert((2 * Ns * Nc) % N_ == 0, "Internal degrees of freedom not divisible by short-vector length");
       static constexpr int length = 2 * Ns * Nc;
-      static constexpr int length_ghost = spin_project ? length / 2 : length;
       static constexpr int N = N_;
       static constexpr int M = length / N;
-      // if spin projecting, check that short vector length is compatible, if not halve the vector length
-      static constexpr int N_ghost = !spin_project ? N : (Ns * Nc) % N == 0 ? N : N / 2;
-      static constexpr int M_ghost = length_ghost / N_ghost;
-      using Accessor = FloatNOrder<Float, Ns, Nc, N, spin_project, huge_alloc>;
+      using Accessor = FloatNOrder<Float, Ns, Nc, N, spin_project, huge_alloc, disable_ghost>;
+      using GhostNOrder = GhostNOrder<Float, Ns, Nc, N, spin_project, huge_alloc, disable_ghost>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using Vector = typename VectorType<Float, N>::type;
-      using GhostVector = typename VectorType<Float, N_ghost>::type;
       using AllocInt = typename AllocType<huge_alloc>::type;
       using norm_type = float;
       Float *field = nullptr;
@@ -1085,45 +1188,24 @@ namespace quda
       AllocInt offset = 0; // offset can be 32-bit or 64-bit
       AllocInt norm_offset = 0;
       int volumeCB = 0;
-      array<int, 4> faceVolumeCB = {};
-      mutable array<Float *, 8> ghost = {};
-      mutable array<norm_type *, 8> ghost_norm = {};
-      int nParity = 0;
-      void *backup_h = nullptr; //! host memory for backing up the field when tuning
       size_t bytes = 0;
 
       FloatNOrder() = default;
       FloatNOrder(const FloatNOrder &) = default;
 
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
+        GhostNOrder(a, nFace, ghost_),
         field(buffer ? buffer : a.data<Float *>()),
         norm(buffer ? reinterpret_cast<norm_type *>(reinterpret_cast<char *>(buffer) + a.NormOffset()) :
                       const_cast<norm_type *>(reinterpret_cast<const norm_type *>(a.Norm()))),
         offset(a.Bytes() / (2 * sizeof(Float) * N)),
         norm_offset(a.Bytes() / (2 * sizeof(norm_type))),
         volumeCB(a.VolumeCB()),
-        nParity(a.SiteSubset()),
         bytes(a.Bytes())
       {
-        for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
-        resetGhost(ghost_ ? (void **)ghost_ : a.Ghost());
       }
-
       FloatNOrder &operator=(const FloatNOrder &) = default;
 
-      void resetGhost(void *const *ghost_) const
-      {
-        for (int dim = 0; dim < 4; dim++) {
-          for (int dir = 0; dir < 2; dir++) {
-            ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast<Float *>(ghost_[2 * dim + dir]) : nullptr;
-            ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ?
-              nullptr :
-              reinterpret_cast<norm_type *>(static_cast<char *>(ghost_[2 * dim + dir])
-                                            + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float));
-          }
-        }
-      }
-
       __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
       {
         real v[length];
@@ -1193,20 +1275,71 @@ namespace quda
         return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
       }
 
+      /**
+         @brief This accessor routine returns a const
+         colorspinor_ghost_wrapper to this object, allowing us to
+         overload various operators for manipulating at the site
+         level interms of matrix operations.
+         @param[in] dim Dimensions of the ghost we are requesting
+         @param[in] ghost_idx Checkerboarded space-time ghost index we are requesting
+         @param[in] parity Parity we are requesting
+         @return Instance of a colorspinor_ghost+wrapper that curries in access to
+         this field at the above coordinates.
+      */
+      __device__ __host__ inline auto Ghost(int dim, int dir, int ghost_idx, int parity) const
+      {
+        return colorspinor_ghost_wrapper<real, Accessor>(*this, dim, dir, ghost_idx, parity);
+      }
+
+      size_t Bytes() const { return bytes; }
+    };
+
+    template <int N, bool spin_project, bool huge_alloc>
+    struct GhostNOrder<short, 1, 3, N, spin_project, huge_alloc, false> {
+      using Float = short;
+      static constexpr int Ns = 1;
+      static constexpr int Nc = 3;
+      static constexpr int length_ghost = 2 * Ns * Nc;
+      using GhostVector = int4; // 128-bit packed type
+      using real = typename mapper<Float>::type;
+      using complex = complex<real>;
+      using norm_type = float;
+      int nParity;
+      array<int, 4> faceVolumeCB = {};
+      mutable array<Float *, 8> ghost = {};
+      mutable array<norm_type *, 8> ghost_norm = {};
+
+      GhostNOrder() = default;
+      GhostNOrder(const GhostNOrder &) = default;
+
+      GhostNOrder(const ColorSpinorField &a, int nFace = 1, Float **ghost_ = 0) : nParity(a.SiteSubset())
+      {
+        for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
+        resetGhost(ghost_ ? (void **)ghost_ : a.Ghost());
+      }
+
+      GhostNOrder &operator=(const GhostNOrder &) = default;
+
+      void resetGhost(void *const *ghost_) const
+      {
+        for (int dim = 0; dim < 4; dim++) {
+          for (int dir = 0; dir < 2; dir++) {
+            ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast<Float *>(ghost_[2 * dim + dir]) : nullptr;
+          }
+        }
+      }
+
       __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
       {
         real v[length_ghost];
-        norm_type nrm
-          = isFixed<Float>::value ? vector_load<float>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x) : 0.0;
+        GhostVector vecTmp = vector_load<GhostVector>(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x);
+
+        // extract the norm
+        norm_type nrm;
+        memcpy(&nrm, &vecTmp.w, sizeof(norm_type));
 
 #pragma unroll
-        for (int i = 0; i < M_ghost; i++) {
-          GhostVector vecTmp = vector_load<GhostVector>(
-            ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x);
-#pragma unroll
-          for (int j = 0; j < N_ghost; j++)
-            copy_and_scale(v[i * N_ghost + j], reinterpret_cast<Float *>(&vecTmp)[j], nrm);
-        }
+        for (int i = 0; i < length_ghost; i++) copy_and_scale(v[i], reinterpret_cast<Float *>(&vecTmp)[i], nrm);
 
 #pragma unroll
         for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
@@ -1216,78 +1349,34 @@ namespace quda
                                                 int parity = 0) const
       {
         real v[length_ghost];
+
 #pragma unroll
         for (int i = 0; i < length_ghost / 2; i++) {
           v[2 * i + 0] = in[i].real();
           v[2 * i + 1] = in[i].imag();
         }
 
-        if (isFixed<Float>::value) {
-          norm_type max_[length_ghost / 2];
-          // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
-#pragma unroll
-          for (int i = 0; i < length_ghost / 2; i++)
-            max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2]));
-          norm_type scale = 0.0;
+        norm_type max_[length_ghost / 2];
+        // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
 #pragma unroll
-          for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
-          ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue<Float>::value;
-
-          real scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
+        for (int i = 0; i < length_ghost / 2; i++)
+          max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length_ghost / 2]));
+        norm_type scale = 0.0;
 #pragma unroll
-          for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv;
-        }
+        for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
+        norm_type nrm = scale * fixedInvMaxValue<Float>::value;
 
+        real scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
 #pragma unroll
-        for (int i = 0; i < M_ghost; i++) {
-          GhostVector vecTmp;
-          // first do scalar copy converting into storage type
-#pragma unroll
-          for (int j = 0; j < N_ghost; j++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[j], v[i * N_ghost + j]);
-          // second do vectorized copy into memory
-          vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] * M_ghost + i * faceVolumeCB[dim] + x, vecTmp);
-        }
-      }
-
-      /**
-         @brief This accessor routine returns a const
-         colorspinor_ghost_wrapper to this object, allowing us to
-         overload various operators for manipulating at the site
-         level interms of matrix operations.
-         @param[in] dim Dimensions of the ghost we are requesting
-         @param[in] ghost_idx Checkerboarded space-time ghost index we are requesting
-         @param[in] parity Parity we are requesting
-         @return Instance of a colorspinor_ghost+wrapper that curries in access to
-         this field at the above coordinates.
-      */
-      __device__ __host__ inline auto Ghost(int dim, int dir, int ghost_idx, int parity) const
-      {
-        return colorspinor_ghost_wrapper<real, Accessor>(*this, dim, dir, ghost_idx, parity);
-      }
-
-      /**
-         @brief Backup the field to the host when tuning
-      */
-      void save()
-      {
-        if (backup_h) errorQuda("Already allocated host backup");
-        backup_h = safe_malloc(bytes);
-        qudaMemcpy(backup_h, field, bytes, qudaMemcpyDeviceToHost);
-      }
+        for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv;
 
-      /**
-         @brief Restore the field from the host after tuning
-      */
-      void load()
-      {
-        qudaMemcpy(field, backup_h, bytes, qudaMemcpyHostToDevice);
-        host_free(backup_h);
-        backup_h = nullptr;
-      }
+        GhostVector vecTmp;
+        memcpy(&vecTmp.w, &nrm, sizeof(norm_type)); // pack the norm
 
-      size_t Bytes() const
-      {
-        return nParity * volumeCB * (Nc * Ns * 2 * sizeof(Float) + (isFixed<Float>::value ? sizeof(norm_type) : 0));
+        // pack the spinor elements
+#pragma unroll
+        for (int i = 0; i < length_ghost; i++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[i], v[i]);
+        vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x, vecTmp);
       }
     };
 
@@ -1301,54 +1390,39 @@ namespace quda
        pointer arithmetic for huge allocations (e.g., packed set of
        vectors).  Default is to use 32-bit pointer arithmetic.
      */
-    template <int N_, bool spin_project, bool huge_alloc>
-    struct FloatNOrder<short, 1, 3, N_, spin_project, huge_alloc> {
+    template <int N_, bool spin_project, bool huge_alloc, bool disable_ghost>
+    struct FloatNOrder<short, 1, 3, N_, spin_project, huge_alloc, disable_ghost>
+      : GhostNOrder<short, 1, 3, N_, spin_project, huge_alloc, disable_ghost> {
       using Float = short;
       static constexpr int Ns = 1;
       static constexpr int Nc = 3;
       static constexpr int length = 2 * Ns * Nc;
-      static constexpr int length_ghost = 2 * Ns * Nc;
-      using Accessor = FloatNOrder<Float, Ns, Nc, N_, spin_project, huge_alloc>;
+      using Accessor = FloatNOrder<Float, Ns, Nc, N_, spin_project, huge_alloc, disable_ghost>;
+      using GhostNOrder = GhostNOrder<Float, Ns, Nc, N_, spin_project, huge_alloc, disable_ghost>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using Vector = int4;      // 128-bit packed type
-      using GhostVector = int4; // 128-bit packed type
       using AllocInt = typename AllocType<huge_alloc>::type;
       using norm_type = float;
       Float *field = nullptr;
-      const AllocInt offset = 0; // offset can be 32-bit or 64-bit
+      AllocInt offset = 0; // offset can be 32-bit or 64-bit
       int volumeCB = 0;
-      array<int, 4> faceVolumeCB = {};
-      mutable array<Float *, 8> ghost = {};
-      int nParity = 0;
-      void *backup_h = nullptr; //! host memory for backing up the field when tuning
       size_t bytes = 0;
 
       FloatNOrder() = default;
       FloatNOrder(const FloatNOrder &) = default;
 
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
+        GhostNOrder(a, nFace, ghost_),
         field(buffer ? buffer : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Vector))),
         volumeCB(a.VolumeCB()),
-        nParity(a.SiteSubset()),
         bytes(a.Bytes())
       {
-        for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
-        resetGhost(ghost_ ? (void **)ghost_ : a.Ghost());
       }
 
       FloatNOrder &operator=(const FloatNOrder &) = default;
 
-      void resetGhost(void *const *ghost_) const
-      {
-        for (int dim = 0; dim < 4; dim++) {
-          for (int dir = 0; dir < 2; dir++) {
-            ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast<Float *>(ghost_[2 * dim + dir]) : nullptr;
-          }
-        }
-      }
-
       __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
       {
         real v[length];
@@ -1414,56 +1488,6 @@ namespace quda
         return colorspinor_wrapper<real, Accessor>(*this, x_cb, parity);
       }
 
-      __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
-      {
-        real v[length_ghost];
-        GhostVector vecTmp = vector_load<GhostVector>(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x);
-
-        // extract the norm
-        norm_type nrm;
-        memcpy(&nrm, &vecTmp.w, sizeof(norm_type));
-
-#pragma unroll
-        for (int i = 0; i < length_ghost; i++) copy_and_scale(v[i], reinterpret_cast<Float *>(&vecTmp)[i], nrm);
-
-#pragma unroll
-        for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
-      }
-
-      __device__ __host__ inline void saveGhost(const complex in[length_ghost / 2], int x, int dim, int dir,
-                                                int parity = 0) const
-      {
-        real v[length_ghost];
-
-#pragma unroll
-        for (int i = 0; i < length_ghost / 2; i++) {
-          v[2 * i + 0] = in[i].real();
-          v[2 * i + 1] = in[i].imag();
-        }
-
-        norm_type max_[length_ghost / 2];
-        // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
-#pragma unroll
-        for (int i = 0; i < length_ghost / 2; i++)
-          max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length_ghost / 2]));
-        norm_type scale = 0.0;
-#pragma unroll
-        for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
-        norm_type nrm = scale * fixedInvMaxValue<Float>::value;
-
-        real scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
-#pragma unroll
-        for (int i = 0; i < length_ghost; i++) v[i] = v[i] * scale_inv;
-
-        GhostVector vecTmp;
-        memcpy(&vecTmp.w, &nrm, sizeof(norm_type)); // pack the norm
-
-        // pack the spinor elements
-#pragma unroll
-        for (int i = 0; i < length_ghost; i++) copy_scaled(reinterpret_cast<Float *>(&vecTmp)[i], v[i]);
-        vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x, vecTmp);
-      }
-
       /**
          @brief This accessor routine returns a const
          colorspinor_ghost_wrapper to this object, allowing us to
@@ -1480,30 +1504,7 @@ namespace quda
         return colorspinor_ghost_wrapper<real, Accessor>(*this, dim, dir, ghost_idx, parity);
       }
 
-      /**
-         @brief Backup the field to the host when tuning
-      */
-      void save()
-      {
-        if (backup_h) errorQuda("Already allocated host backup");
-        backup_h = safe_malloc(bytes);
-        qudaMemcpy(backup_h, field, bytes, qudaMemcpyDeviceToHost);
-      }
-
-      /**
-         @brief Restore the field from the host after tuning
-      */
-      void load()
-      {
-        qudaMemcpy(field, backup_h, bytes, qudaMemcpyHostToDevice);
-        host_free(backup_h);
-        backup_h = nullptr;
-      }
-
-      size_t Bytes() const
-      {
-        return nParity * volumeCB * (Nc * Ns * 2 * sizeof(Float) + (isFixed<Float>::value ? sizeof(norm_type) : 0));
-      }
+      size_t Bytes() const { return bytes; }
     };
 
     template <typename Float, int Ns, int Nc> struct SpaceColorSpinorOrder {
@@ -1820,63 +1821,80 @@ namespace quda
   } // namespace colorspinor
 
   // Use traits to reduce the template explosion
-  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false> struct colorspinor_mapper {
+  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false, bool disable_ghost = false>
+  struct colorspinor_mapper {
   };
 
   // double precision
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 4, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<double, 4, Nc, 2, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<double, 4, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<double, 4, Nc, 2, false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 4, Nc, true, huge_alloc> {
-    typedef colorspinor::FloatNOrder<double, 4, Nc, 2, true, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<double, 4, Nc, true, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<double, 4, Nc, 2, true, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 2, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<double, 2, Nc, 2, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<double, 2, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<double, 2, Nc, 2, false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<double, 1, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<double, 1, Nc, 2, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<double, 1, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<double, 1, Nc, 2, false, huge_alloc, disable_ghost> type;
   };
 
   // single precision
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 4, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<float, 4, Nc, 4, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<float, 4, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<float, 4, Nc, 4, false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 4, Nc, true, huge_alloc> {
-    typedef colorspinor::FloatNOrder<float, 4, Nc, 4, true, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<float, 4, Nc, true, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<float, 4, Nc, 4, true, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 2, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<float, 2, Nc, colorspinor::getNative<float>(2), false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<float, 2, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<float, 2, Nc, colorspinor::getNative<float>(2), false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<float, 1, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<float, 1, Nc, 2, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<float, 1, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<float, 1, Nc, 2, false, huge_alloc, disable_ghost> type;
   };
 
   // half precision
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 4, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<short, 4, Nc, colorspinor::getNative<short>(4), false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<short, 4, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<short, 4, Nc, colorspinor::getNative<short>(4), false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 4, Nc, true, huge_alloc> {
-    typedef colorspinor::FloatNOrder<short, 4, Nc, colorspinor::getNative<short>(4), true, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<short, 4, Nc, true, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<short, 4, Nc, colorspinor::getNative<short>(4), true, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 2, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<short, 2, Nc, colorspinor::getNative<short>(2), false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<short, 2, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<short, 2, Nc, colorspinor::getNative<short>(2), false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<short, 1, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<short, 1, Nc, 2, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<short, 1, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<short, 1, Nc, 2, false, huge_alloc, disable_ghost> type;
   };
 
   // quarter precision
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 4, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<int8_t, 4, Nc, colorspinor::getNative<int8_t>(4), false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<int8_t, 4, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<int8_t, 4, Nc, colorspinor::getNative<int8_t>(4), false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 4, Nc, true, huge_alloc> {
-    typedef colorspinor::FloatNOrder<int8_t, 4, Nc, colorspinor::getNative<int8_t>(4), true, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<int8_t, 4, Nc, true, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<int8_t, 4, Nc, colorspinor::getNative<int8_t>(4), true, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 2, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<int8_t, 2, Nc, colorspinor::getNative<int8_t>(2), false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<int8_t, 2, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<int8_t, 2, Nc, colorspinor::getNative<int8_t>(2), false, huge_alloc, disable_ghost> type;
   };
-  template <int Nc, bool huge_alloc> struct colorspinor_mapper<int8_t, 1, Nc, false, huge_alloc> {
-    typedef colorspinor::FloatNOrder<int8_t, 1, Nc, 2, false, huge_alloc> type;
+  template <int Nc, bool huge_alloc, bool disable_ghost>
+  struct colorspinor_mapper<int8_t, 1, Nc, false, huge_alloc, disable_ghost> {
+    typedef colorspinor::FloatNOrder<int8_t, 1, Nc, 2, false, huge_alloc, disable_ghost> type;
   };
 
   template <typename T, QudaFieldOrder order, int Ns, int Nc> struct colorspinor_order_mapper {
diff --git a/include/contract_quda.h b/include/contract_quda.h
index 1f5c509687..3cdc2ee843 100644
--- a/include/contract_quda.h
+++ b/include/contract_quda.h
@@ -1,9 +1,20 @@
 #pragma once
 
 #include <quda_internal.h>
-#include <quda.h>
 
 namespace quda
 {
   void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, QudaContractType cType);
+
+  /**
+     @brief Contract the quark field x against the 3-d Laplace eigenvector
+     set y.  At present, this expects a spin-4 fermion field, and the
+     Laplace eigenvector is a spin-1 field.
+     @param[out] result array of length 4 * Lt
+     @param[in] x Input fermion field set we are contracting
+     @param[in] y Input eigenvector field set we are contracting against
+   */
+  void evecProjectLaplace3D(std::vector<Complex> &result, cvector_ref<const ColorSpinorField> &x,
+                            cvector_ref<const ColorSpinorField> &y);
+
 } // namespace quda
diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index e9bdb66fc6..bddd6104f6 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -56,6 +56,7 @@ namespace quda {
     GaugeField *fatGauge;  // used by staggered only
     GaugeField *longGauge; // used by staggered only
     int laplace3D;
+    int covdev_mu;
     CloverField *clover;
     GaugeField *xInvKD; // used for the Kahler-Dirac operator only
 
@@ -117,6 +118,7 @@ namespace quda {
       printfQuda("kappa = %g\n", kappa);
       printfQuda("mass = %g\n", mass);
       printfQuda("laplace3D = %d\n", laplace3D);
+      printfQuda("covdev_mu = %d\n", covdev_mu);
       printfQuda("m5 = %g\n", m5);
       printfQuda("Ls = %d\n", Ls);
       printfQuda("matpcType = %d\n", matpcType);
@@ -1098,6 +1100,14 @@ namespace quda {
 
   void gamma5(ColorSpinorField &out, const ColorSpinorField &in);
 
+  /**
+     @brief Applies a pauli matrices to a spinor doublet
+     @param[out] out Output field
+     @param[in] in Input field
+     @param[d] d index of the pauli matrix
+  */
+  void ApplyTau(ColorSpinorField &out, const ColorSpinorField &in, int d);
+
   // Full twisted mass
   class DiracTwistedMass : public DiracWilson {
 
@@ -2214,6 +2224,9 @@ namespace quda {
   */
   class GaugeCovDev : public Dirac {
 
+  protected:
+    int covdev_mu;
+
   public:
     GaugeCovDev(const DiracParam &param);
     GaugeCovDev(const GaugeCovDev &covDev);
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index a2c0300a1d..38079c3cbb 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1480,7 +1480,7 @@ namespace quda {
             for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
           } else { // stagic phase
 #pragma unroll
-            for (int i = 0; i < 18; i++) { out[i] *= phase; }
+            for (int i = 0; i < 9; i++) { out[i] *= phase; }
           }
         }
       };
diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h
index 1c96e2e7ea..5a499cde8c 100644
--- a/include/instantiate_dslash.h
+++ b/include/instantiate_dslash.h
@@ -166,4 +166,13 @@ namespace quda
   }
 #endif
 
+  constexpr bool is_enabled_laplace()
+  {
+#ifdef GPU_LAPLACE
+    return true;
+#else
+    return false;
+#endif
+  }
+
 } // namespace quda
diff --git a/include/kernels/clover_outer_product.cuh b/include/kernels/clover_outer_product.cuh
index 70e505b149..7327201e09 100644
--- a/include/kernels/clover_outer_product.cuh
+++ b/include/kernels/clover_outer_product.cuh
@@ -8,13 +8,15 @@
 
 namespace quda {
 
-  template <typename Float, int nColor_, QudaReconstructType recon, int dim_ = -1>
+  template <typename Float, int nColor_, QudaReconstructType recon, int dim_ = -1, bool doublet_ = false>
   struct CloverForceArg : kernel_param<> {
     using real = typename mapper<Float>::type;
     static constexpr int nColor = nColor_;
     static constexpr int nSpin = 4;
     static constexpr int dim = dim_;
     static constexpr int spin_project = true;
+    static constexpr bool doublet = doublet_; // whether we applying the operator to a doublet
+    static constexpr int n_flavor = doublet ? 2 : 1;
     using F = typename colorspinor_mapper<Float, nSpin, nColor, spin_project>::type;
     using Gauge = typename gauge_mapper<Float, recon, 18>::type;
     using Force = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO, 18>::type;
@@ -30,11 +32,16 @@ namespace quda {
     int displacement;
     bool partitioned[4];
     real coeff;
+    const unsigned int volume_4d_cb;
+    const unsigned int ghost_face_4d_cb;
 
     CloverForceArg(GaugeField &force, const GaugeField &U, const ColorSpinorField &inA, const ColorSpinorField &inB,
                    const ColorSpinorField &inC, const ColorSpinorField &inD, const unsigned int parity,
                    const double coeff) :
-      kernel_param(dim3(dim == -1 ? inA.VolumeCB() : inB.GhostFaceCB()[dim])), // inB since it has a ghost allocated
+      kernel_param(dim3(dim == -1 ? inA.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET ? inA.VolumeCB() / 2 : inA.VolumeCB() :
+                          inA.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET ?
+                                    inB.GhostFaceCB()[dim] / 2 :
+                                    inB.GhostFaceCB()[dim])), // inB since it has a ghost allocated
       force(force),
       inA(inA),
       inB(inB),
@@ -43,7 +50,9 @@ namespace quda {
       U(U),
       parity(parity),
       displacement(1),
-      coeff(coeff)
+      coeff(coeff),
+      volume_4d_cb(inA.VolumeCB() / 2),
+      ghost_face_4d_cb(inB.GhostFaceCB()[dim] / 2)
     {
       for (int i=0; i<4; ++i) this->X[i] = U.X()[i];
       for (int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
@@ -76,31 +85,36 @@ namespace quda {
       using Spinor = ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin>;
       using Link = Matrix<Complex, Arg::nColor>;
 
-      Spinor A = arg.inA(x_cb, 0);
-      Spinor C = arg.inC(x_cb, 0);
+#pragma unroll
+      for (int flavor = 0; flavor < Arg::n_flavor; ++flavor) {
+
+        const int flavor_offset_idx = flavor * arg.volume_4d_cb;
+        Spinor A = arg.inA(x_cb + flavor_offset_idx, 0);
+        Spinor C = arg.inC(x_cb + flavor_offset_idx, 0);
 
 #pragma unroll
-      for (int dim=0; dim<4; ++dim) {
-        int shift[4] = {0, 0, 0, 0};
-        shift[dim] = 1;
-        const int nbr_idx = neighborIndex(x_cb, shift, arg.partitioned, arg.parity, arg.X);
-
-        if (nbr_idx >= 0) {
-          Spinor B_shift = arg.inB(nbr_idx, 0);
-          Spinor D_shift = arg.inD(nbr_idx, 0);
-
-          B_shift = (B_shift.project(dim,1)).reconstruct(dim,1);
-          Link result = outerProdSpinTrace(B_shift,A);
-
-          D_shift = (D_shift.project(dim,-1)).reconstruct(dim,-1);
-          result += outerProdSpinTrace(D_shift,C);
-
-          Link temp = arg.force(dim, x_cb, arg.parity);
-          Link U = arg.U(dim, x_cb, arg.parity);
-          result = temp + U*result*arg.coeff;
-          arg.force(dim, x_cb, arg.parity) = result;
-        }
-      } // dim
+        for (int dim = 0; dim < 4; ++dim) {
+          int shift[4] = {0, 0, 0, 0};
+          shift[dim] = 1;
+          const int nbr_idx = neighborIndex(x_cb, shift, arg.partitioned, arg.parity, arg.X);
+
+          if (nbr_idx >= 0) {
+            Spinor B_shift = arg.inB(nbr_idx + flavor_offset_idx, 0);
+            Spinor D_shift = arg.inD(nbr_idx + flavor_offset_idx, 0);
+
+            B_shift = (B_shift.project(dim, 1)).reconstruct(dim, 1);
+            Link result = outerProdSpinTrace(B_shift, A);
+
+            D_shift = (D_shift.project(dim, -1)).reconstruct(dim, -1);
+            result += outerProdSpinTrace(D_shift, C);
+
+            Link temp = arg.force(dim, x_cb, arg.parity);
+            Link U = arg.U(dim, x_cb, arg.parity);
+            result = temp + U * result * arg.coeff;
+            arg.force(dim, x_cb, arg.parity) = result;
+          }
+        } // dim
+      }   // flavor
     }
   };
 
@@ -117,23 +131,28 @@ namespace quda {
       using Link = Matrix<Complex, Arg::nColor>;
 
       int x[4];
-      coordsFromIndexExterior(x, x_cb, arg.X, Arg::dim, arg.displacement, arg.parity);
-      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);
-      Spinor A = arg.inA(bulk_cb_idx, 0);
-      Spinor C = arg.inC(bulk_cb_idx, 0);
-
-      HalfSpinor projected_tmp = arg.inB.Ghost(Arg::dim, 1, x_cb, 0);
-      Spinor B_shift = projected_tmp.reconstruct(Arg::dim, 1);
-      Link result = outerProdSpinTrace(B_shift,A);
-
-      projected_tmp = arg.inD.Ghost(Arg::dim, 1, x_cb, 0);
-      Spinor D_shift = projected_tmp.reconstruct(Arg::dim,-1);
-      result += outerProdSpinTrace(D_shift,C);
-
-      Link temp = arg.force(Arg::dim, bulk_cb_idx, arg.parity);
-      Link U = arg.U(Arg::dim, bulk_cb_idx, arg.parity);
-      result = temp + U*result*arg.coeff;
-      arg.force(Arg::dim, bulk_cb_idx, arg.parity) = result;
+#pragma unroll
+      for (int flavor = 0; flavor < Arg::n_flavor; ++flavor) {
+        const int flavor_offset_bulk_idx = flavor * arg.volume_4d_cb;
+        const int flavor_offset_ghost_idx = flavor * arg.ghost_face_4d_cb;
+        coordsFromIndexExterior(x, x_cb, arg.X, Arg::dim, arg.displacement, arg.parity);
+        const unsigned int bulk_cb_idx = ((((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0]) >> 1);
+        Spinor A = arg.inA(bulk_cb_idx + flavor_offset_bulk_idx, 0);
+        Spinor C = arg.inC(bulk_cb_idx + flavor_offset_bulk_idx, 0);
+
+        HalfSpinor projected_tmp = arg.inB.Ghost(Arg::dim, 1, x_cb + flavor_offset_ghost_idx, 0);
+        Spinor B_shift = projected_tmp.reconstruct(Arg::dim, 1);
+        Link result = outerProdSpinTrace(B_shift, A);
+
+        projected_tmp = arg.inD.Ghost(Arg::dim, 1, x_cb + flavor_offset_ghost_idx, 0);
+        Spinor D_shift = projected_tmp.reconstruct(Arg::dim, -1);
+        result += outerProdSpinTrace(D_shift, C);
+
+        Link temp = arg.force(Arg::dim, bulk_cb_idx, arg.parity);
+        Link U = arg.U(Arg::dim, bulk_cb_idx, arg.parity);
+        result = temp + U * result * arg.coeff;
+        arg.force(Arg::dim, bulk_cb_idx, arg.parity) = result;
+      }
     }
   };
 
diff --git a/include/kernels/clover_sigma_outer_product.cuh b/include/kernels/clover_sigma_outer_product.cuh
index e4b901cdc1..0e39dd83a1 100644
--- a/include/kernels/clover_sigma_outer_product.cuh
+++ b/include/kernels/clover_sigma_outer_product.cuh
@@ -11,23 +11,25 @@ namespace quda
   // FIXME - make this multi-RHS once we have the multi-RHS framework developed
 #define MAX_NVECTOR 1
 
-  template <typename Float, int nColor_, int nvector_>
-  struct CloverSigmaOprodArg : kernel_param<> {
+  template <typename Float, int nColor_, int nvector_, bool doublet_> struct CloverSigmaOprodArg : kernel_param<> {
     using real = typename mapper<Float>::type;
     static constexpr int nColor = nColor_;
     static constexpr int nSpin = 4;
     static constexpr int nvector = nvector_;
+    static constexpr bool doublet = doublet_; // whether we applying the operator to a doublet
+    static constexpr int n_flavor = doublet ? 2 : 1;
     using Oprod = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO, 18>::type;
     using F = typename colorspinor_mapper<Float, nSpin, nColor>::type;
 
     Oprod oprod;
+    const unsigned int volume_4d_cb;
     F inA[nvector];
     F inB[nvector];
     array_2d<real, nvector, 2> coeff;
 
     CloverSigmaOprodArg(GaugeField &oprod, cvector_ref<const ColorSpinorField> &inA,
                         cvector_ref<const ColorSpinorField> &inB, const std::vector<array<double, 2>> &coeff_) :
-      kernel_param(dim3(oprod.VolumeCB(), 2, 6)), oprod(oprod)
+      kernel_param(dim3(oprod.VolumeCB(), 2, 6)), oprod(oprod), volume_4d_cb(inA[0].VolumeCB() / 2)
     {
       for (int i = 0; i < nvector; i++) {
         this->inA[i] = inA[i];
@@ -46,10 +48,14 @@ namespace quda
 
 #pragma unroll
     for (int i = 0; i < Arg::nvector; i++) {
-      const Spinor A = arg.inA[i](x_cb, parity);
-      const Spinor B = arg.inB[i](x_cb, parity);
-      Spinor C = A.sigma(nu, mu); // multiply by sigma_mu_nu
-      result += arg.coeff[i][parity] * outerProdSpinTrace(C, B);
+#pragma unroll
+      for (int flavor = 0; flavor < Arg::n_flavor; flavor++) {
+        const int flavor_offset_idx = flavor * (arg.volume_4d_cb);
+        const Spinor A = arg.inA[i](x_cb + flavor_offset_idx, parity);
+        const Spinor B = arg.inB[i](x_cb + flavor_offset_idx, parity);
+        Spinor C = A.sigma(nu, mu); // multiply by sigma_mu_nu
+        result += arg.coeff[i][parity] * outerProdSpinTrace(C, B);
+      }
     }
 
     result -= conj(result);
diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh
index 5261ea5b32..12fd8ee88d 100644
--- a/include/kernels/dslash_gamma_helper.cuh
+++ b/include/kernels/dslash_gamma_helper.cuh
@@ -20,7 +20,8 @@ namespace quda {
     const F in;           // input vector field
     const int d;          // which gamma matrix are we applying
     const int nParity;    // number of parities we're working on
-    bool doublet;         // whether we applying the operator to a doublet
+    const bool doublet;   // whether we applying the operator to a doublet
+    const int n_flavor;   // number of flavors
     const int volumeCB;   // checkerboarded volume
     real a;               // scale factor
     real b;               // chiral twist
@@ -31,6 +32,7 @@ namespace quda {
 	     bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID) :
       out(out), in(in), d(d), nParity(in.SiteSubset()),
       doublet(in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
+      n_flavor(doublet ? 2 : 1),
       volumeCB(doublet ? in.VolumeCB()/2 : in.VolumeCB()), a(0.0), b(0.0), c(0.0)
     {
       checkPrecision(out, in);
@@ -76,13 +78,15 @@ namespace quda {
 
     __device__ __host__ void operator()(int x_cb, int parity)
     {
-      ColorSpinor<typename Arg::real, Arg::nColor, 4> in = arg.in(x_cb, parity);
-      switch(arg.d) {
-      case 0: arg.out(x_cb, parity) = in.gamma(0); break;
-      case 1: arg.out(x_cb, parity) = in.gamma(1); break;
-      case 2: arg.out(x_cb, parity) = in.gamma(2); break;
-      case 3: arg.out(x_cb, parity) = in.gamma(3); break;
-      case 4: arg.out(x_cb, parity) = in.gamma(4); break;
+      for (int i = 0; i < arg.n_flavor; i++) {
+        ColorSpinor<typename Arg::real, Arg::nColor, 4> in = arg.in(x_cb + i * arg.volumeCB, parity);
+        switch(arg.d) {
+        case 0: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(0); break;
+        case 1: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(1); break;
+        case 2: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(2); break;
+        case 3: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(3); break;
+        case 4: arg.out(x_cb + i * arg.volumeCB, parity) = in.gamma(4); break;
+        }
       }
     }
   };
@@ -103,12 +107,78 @@ namespace quda {
         fermion_t in = arg.in(x_cb, parity);
         arg.out(x_cb, parity) = arg.a * (in + arg.b * in.igamma(d));
       } else {
-        fermion_t in_1 = arg.in(x_cb+0*arg.volumeCB, parity);
-        fermion_t in_2 = arg.in(x_cb+1*arg.volumeCB, parity);
+        fermion_t in_1 = arg.in(x_cb + 0 * arg.volumeCB, parity);
+        fermion_t in_2 = arg.in(x_cb + 1 * arg.volumeCB, parity);
         arg.out(x_cb + 0 * arg.volumeCB, parity) = arg.a * (in_1 + arg.b * in_1.igamma(d) + arg.c * in_2);
         arg.out(x_cb + 1 * arg.volumeCB, parity) = arg.a * (in_2 - arg.b * in_2.igamma(d) + arg.c * in_1);
       }
     }
   };
 
+  /**
+     @brief Parameter structure for driving the Tau operator
+   */
+  template <typename Float, int nColor_> struct TauArg : kernel_param<> {
+    using real = typename mapper<Float>::type;
+    constexpr static int nColor = nColor_;
+    typedef typename colorspinor_mapper<Float, 4, nColor>::type F;
+
+    F out;              // output vector field
+    const F in;         // input vector field
+    const int d;        // which gamma matrix are we applying
+    const int nParity;  // number of parities we're working on
+    bool doublet;       // whether we applying the operator to a doublet
+    const int volumeCB; // checkerboarded volume
+
+    TauArg(ColorSpinorField &out, const ColorSpinorField &in, int d) :
+      out(out),
+      in(in),
+      d(d),
+      nParity(in.SiteSubset()),
+      doublet(in.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
+      volumeCB(doublet ? in.VolumeCB() / 2 : in.VolumeCB())
+    {
+      checkPrecision(out, in);
+      checkLocation(out, in);
+      if (d < 1 || d > 3) errorQuda("Undefined tau matrix %d", d);
+      if (in.Nspin() != 4) errorQuda("Cannot apply tau to nSpin=%d field", in.Nspin());
+      if (!in.isNative() || !out.isNative())
+        errorQuda("Unsupported field order out=%d in=%d\n", out.FieldOrder(), in.FieldOrder());
+      if (!doublet) errorQuda("tau matrix can be applyed only to spinor doublet");
+
+      this->threads = dim3(doublet ? in.VolumeCB() / 2 : in.VolumeCB(), in.SiteSubset(), 1);
+    }
+  };
+  /**
+     @brief Application of Gamma matrix to a color spinor field
+  */
+  template <typename Arg> struct Tau {
+    using fermion_t = ColorSpinor<typename Arg::real, Arg::nColor, 4>;
+    const Arg &arg;
+    constexpr Tau(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+
+    __device__ __host__ void operator()(int x_cb, int parity)
+    {
+      fermion_t in_1 = arg.in(x_cb + 0 * arg.volumeCB, parity);
+      fermion_t in_2 = arg.in(x_cb + 1 * arg.volumeCB, parity);
+      const complex<typename Arg::real> j(0.0, 1.0);
+      const typename Arg::real m1(-1);
+
+      switch (arg.d) {
+      case 1:
+        arg.out(x_cb + 0 * arg.volumeCB, parity) = in_2;
+        arg.out(x_cb + 1 * arg.volumeCB, parity) = in_1;
+        break;
+      case 2:
+        arg.out(x_cb + 0 * arg.volumeCB, parity) = -j * in_2;
+        arg.out(x_cb + 1 * arg.volumeCB, parity) = j * in_1;
+        break;
+      case 3:
+        arg.out(x_cb + 0 * arg.volumeCB, parity) = in_1;
+        arg.out(x_cb + 1 * arg.volumeCB, parity) = m1 * in_2;
+        break;
+      }
+    }
+  };
 }
diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh
new file mode 100644
index 0000000000..cdd2ac9716
--- /dev/null
+++ b/include/kernels/evec_project.cuh
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <color_spinor_field_order.h>
+#include <index_helper.cuh>
+#include <fast_intdiv.h>
+#include <constant_kernel_arg.h>
+#include <reduce_helper.h>
+#include <reduction_kernel.h>
+
+namespace quda {
+  
+  using spinor_array = array<double, 8>;
+
+  constexpr unsigned long max_nx = 4;
+  constexpr unsigned long max_ny = 4;
+
+  template <typename Float, int nColor_>
+  struct EvecProjectionArg : public ReduceArg<spinor_array>
+  {
+    using real = typename mapper<Float>::type;
+    static constexpr int nColor = nColor_;
+    static constexpr int nSpinX = 4;
+    static constexpr int nSpinY = 1;
+
+    typedef typename colorspinor_mapper<Float, nSpinX, nColor, false, false, true>::type F4;
+    typedef typename colorspinor_mapper<Float, nSpinY, nColor, false, false, true>::type F1;
+    
+    static constexpr unsigned int max_n_batch_block = 8;
+    int_fastdiv nx;
+    int_fastdiv ny;
+    F4 x[max_nx];
+    F1 y[max_ny];
+
+    int_fastdiv X[4]; // grid dimensions
+    
+    EvecProjectionArg(cvector_ref<const ColorSpinorField> &x, cvector_ref<const ColorSpinorField> &y) :
+      ReduceArg<spinor_array>(dim3(x.Volume()/x[0].X()[3], 1, x.size() * y.size() * x[0].X()[3]),
+                              x.size() * y.size() * x[0].X()[3]),
+      nx(x.size()), ny(y.size())
+    {
+      if (x.size() > max_nx) errorQuda("Requested vector size %lu greater than max %lu", x.size(), max_nx);
+      if (y.size() > max_ny) errorQuda("Requested vector size %lu greater than max %lu", y.size(), max_ny);
+      for (int i = 0 ; i < 4; i++) X[i] = x[0].X()[i];
+      for (auto i = 0u; i < x.size(); i++) this->x[i] = x[i];
+      for (auto i = 0u; i < y.size(); i++) this->y[i] = y[i];
+    }
+    __device__ __host__ spinor_array init() const { return spinor_array(); }
+  };
+  
+ template <int reduction_dim, class T> __device__ int idx_from_t_xyz(int t, int xyz, T X[4])
+  {
+    int x[4];
+#pragma unroll
+    for (int d = 0; d < 4; d++) {
+      if (d != reduction_dim) {
+	x[d] = xyz % X[d];
+	xyz = xyz / X[d];
+      }
+    }    
+    x[reduction_dim] = t;    
+    return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]);
+  }
+  
+  template <typename Arg> struct EvecProjection : plus<spinor_array> {
+    using reduce_t = spinor_array;
+    using plus<reduce_t>::operator();    
+    static constexpr int reduce_block_dim = 1; // only doing a reduct in the x thread dimension
+    
+    const Arg &arg;
+    constexpr EvecProjection(const Arg &arg) : arg(arg) {}
+    static constexpr const char *filename() { return KERNEL_FILE; }
+    // overload comm_reduce to defer until the entire "tile" is complete
+    template <typename U> static inline void comm_reduce(U &) { }
+    
+    // Final param is unused in the MultiReduce functor in this use case.
+    __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int, int ijt)
+    {
+      int i = ijt % arg.nx;
+      int jt = ijt / arg.nx;
+      int j = jt % arg.ny;
+      int t = jt / arg.ny;
+
+      constexpr int nSpinX = Arg::nSpinX;
+      constexpr int nSpinY = Arg::nSpinY;
+      constexpr int nColor = Arg::nColor;
+      using real = typename Arg::real;
+      using Vector4 = ColorSpinor<real, nColor, nSpinX>;
+      using Vector1 = ColorSpinor<real, nColor, nSpinY>;
+
+      // Collect vector data
+      int parity = 0;
+      // Always use the <T> dim in this kernel
+      int idx = idx_from_t_xyz<3>(t, xyz, arg.X);
+
+      // This helper will change the value of 'parity' to the correct one, 
+      // and return the checkerboard index.
+      int idx_cb = getParityCBFromFull(parity, arg.X, idx);
+      Vector4 x = arg.x[i](idx_cb, parity);
+      Vector1 y = arg.y[j](idx_cb, parity);
+
+      // Compute the inner product over colour
+      reduce_t result_local;
+      for (int mu = 0; mu < nSpinX; mu++) {
+        complex<real> prod = innerProduct(y, x, 0, mu);
+        result_local[2 * mu + 0] = prod.real();
+        result_local[2 * mu + 1] = prod.imag();
+      }
+      
+      return plus::operator()(result_local, result);
+    }
+  };
+}
diff --git a/include/quda.h b/include/quda.h
index de88d1d0d2..cc62cefdb8 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -132,10 +132,12 @@ extern "C" {
     double mu;    /**< Twisted mass parameter */
     double tm_rho;  /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */
     double epsilon; /**< Twisted mass parameter */
+    double evmax;   /** maximum of the eigenvalues of the ndeg twisted mass operator needed for fermionic forces  **/
 
     QudaTwistFlavorType twist_flavor;  /**< Twisted mass flavor */
 
     int laplace3D; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */
+    int covdev_mu; /**< Apply forward/backward covariant derivative in direction mu(mu<=3)/mu-4(mu>3) */
 
     double tol;    /**< Solver tolerance in the L2 residual norm */
     double tol_restart;   /**< Solver tolerance in the L2 residual norm (used to restart InitCG) */
@@ -1797,6 +1799,24 @@ extern "C" {
    */
   void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param);
 
+  /**
+   * @brief Performs contractions between a set of quark fields and
+   * eigenvectors of the 3-d Laplace operator.
+   * @param[in,out] host_sinks An array representing the inner
+   * products between the quark fields and the eigen-vector fields.
+   * Ordered as [nQuark][nEv][Lt][nSpin][complexity].
+   * @param[in] host_quark Array of quark fields we are taking the inner over
+   * @param[in] n_quark Number of quark fields
+   * @param[in] tile_quark Tile size for quark fields (batch size)
+   * @param[in] host_evec Array of eigenvectors we are taking the inner over
+   * @param[in] n_evec Number of eigenvectors
+   * @param[in] tile_evec Tile size for eigenvectors (batch size)
+   * @param[in] inv_param Meta-data structure
+   * @param[in] X Lattice dimensions
+   */
+  void laphSinkProject(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark,
+                       void **host_evec, int nevec, int tile_evec, QudaInvertParam *inv_param, const int X[4]);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index 2692833f38..374fd725df 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -119,8 +119,8 @@
 #define GPU_STAGGERED_DIRAC
 #endif
 
-#cmakedefine QUDA_LAPLACE
-#ifdef QUDA_LAPLACE
+#cmakedefine QUDA_DIRAC_LAPLACE
+#ifdef QUDA_DIRAC_LAPLACE
 /**
  * @def   GPU_LAPLACE
  * @brief This macro is set when we have the Laplace operator enabled
diff --git a/include/quda_internal.h b/include/quda_internal.h
index dd8a6c8177..37337b0d4b 100644
--- a/include/quda_internal.h
+++ b/include/quda_internal.h
@@ -30,7 +30,7 @@
 #define NSPIN2
 #endif
 
-#if defined(GPU_STAGGERED_DIRAC)
+#if defined(GPU_STAGGERED_DIRAC) || defined(GPU_LAPLACE)
 #define NSPIN1
 #endif
 
diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h
index f567246949..5501e3cac6 100644
--- a/include/reference_wrapper_helper.h
+++ b/include/reference_wrapper_helper.h
@@ -295,6 +295,49 @@ namespace quda
       for (auto i = 0u; i < vector::size(); i++) odd.push_back(operator[](i).Odd());
       return odd;
     }
+
+    template <class U = T>
+    std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, QudaPrecision> Precision() const
+    {
+      for (auto i = 1u; i < vector::size(); i++)
+        if (operator[](i - 1).Precision() != operator[](i).Precision())
+          errorQuda("Precisions %d %d do not match", operator[](i - 1).Precision(), operator[](i).Precision());
+      return operator[](0).Precision();
+    }
+
+    template <class U = T>
+    std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, int> Ncolor() const
+    {
+      for (auto i = 1u; i < vector::size(); i++)
+        if (operator[](i - 1).Ncolor() != operator[](i).Ncolor())
+          errorQuda("Ncolors do not match %d != %d", operator[](i - 1).Ncolor(), operator[](i).Ncolor());
+      return operator[](0).Ncolor();
+    }
+
+    template <class U = T> std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, int> Nspin() const
+    {
+      for (auto i = 1u; i < vector::size(); i++)
+        if (operator[](i - 1).Nspin() != operator[](i).Nspin())
+          errorQuda("Nspins do not match %d != %d", operator[](i - 1).Nspin(), operator[](i).Nspin());
+      return operator[](0).Nspin();
+    }
+
+    template <class U = T>
+    std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, size_t> Volume() const
+    {
+      for (auto i = 1u; i < vector::size(); i++)
+        if (operator[](i - 1).Volume() != operator[](i).Volume())
+          errorQuda("Volumes do not match %lu != %lu", operator[](i - 1).Volume(), operator[](i).Volume());
+      return operator[](0).Volume();
+    }
+
+    template <class U = T>
+    std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, size_t> Bytes() const
+    {
+      size_t bytes = 0;
+      for (auto i = 0u; i < vector::size(); i++) bytes += operator[](i).Bytes();
+      return bytes;
+    }
   };
 
   template <class T> using cvector_ref = const vector_ref<T>;
diff --git a/include/timer.h b/include/timer.h
index 6c7891f103..a98928ec39 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -215,6 +215,8 @@ namespace quda {
 
     TimeProfile(std::string fname, bool use_global) : fname(fname), switchOff(false), use_global(use_global) { ; }
 
+    auto Name() const { return fname; }
+
     /**< Print out the profile information */
     void Print();
 
@@ -245,6 +247,7 @@ namespace quda {
     double &secs;
     double &gflops;
     uint64_t flops;
+    bool active = false;
     pushProfile(TimeProfile &profile, double &secs = secs_dummy, double &gflops = gflops_dummy);
     virtual ~pushProfile();
   };
diff --git a/include/tunable_reduction.h b/include/tunable_reduction.h
index 17241fcccc..c575cb8135 100644
--- a/include/tunable_reduction.h
+++ b/include/tunable_reduction.h
@@ -338,7 +338,7 @@ namespace quda
           param.shared_bytes = shared_bytes;
           return true;
         } else { // we have run off the end so let's reset
-          param.block.z = 1;
+          param.block.z = std::max(1u, (n_batch + device::max_grid_size(2) - 1) / device::max_grid_size(2));
           param.grid.z = (n_batch + param.block.z - 1) / param.block.z;
           return false;
         }
@@ -352,7 +352,8 @@ namespace quda
     void initTuneParam(TuneParam &param) const
     {
       TunableReduction2D::initTuneParam(param);
-      param.block = {param.block.x, param.block.y, 1};
+      param.block = {param.block.x, param.block.y,
+                     std::max(1u, (n_batch + device::max_grid_size(2) - 1) / device::max_grid_size(2))};
       param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z};
       setSharedBytes(param);
     }
@@ -364,7 +365,8 @@ namespace quda
     void defaultTuneParam(TuneParam &param) const
     {
       TunableReduction2D::defaultTuneParam(param);
-      param.block = {param.block.x, param.block.y, 1};
+      param.block = {param.block.x, param.block.y,
+                     std::max(1u, (n_batch + device::max_grid_size(2) - 1) / device::max_grid_size(2))};
       param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z};
       setSharedBytes(param);
     }
diff --git a/include/tune_quda.h b/include/tune_quda.h
index c2cc221d4e..f11763f068 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -96,7 +96,7 @@ namespace quda {
        by the autotuner.  This defaults to twice the number of
        processors on the GPU, since it's unlikely a large grid size
        will help (if a kernels needs more parallelism, the autotuner
-       will find this through increased block size.
+       will find this through increased block size).
      */
     virtual unsigned int maxGridSize() const { return 2 * device::processor_count(); }
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index e9b7945ea7..4ed829ff7c 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -36,6 +36,7 @@ set (QUDA_OBJS
   field_cache.cpp
   gauge_covdev.cpp dirac.cpp
   clover_field.cpp lattice_field.cpp gauge_field.cpp
+  evec_project.cu
   extract_gauge_ghost.cu
   gauge_norm.cu gauge_update_quda.cu
   max_clover.cu dirac_clover.cpp dirac_wilson.cpp dirac_staggered.cpp
diff --git a/lib/check_params.h b/lib/check_params.h
index cb5dd701f7..dffba68cab 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -365,9 +365,11 @@ void printQudaInvertParam(QudaInvertParam *param) {
   P(Ls, INVALID_INT);
   P(mu, INVALID_DOUBLE);
   P(epsilon, INVALID_DOUBLE);
+  P(evmax, INVALID_DOUBLE);
   P(tm_rho, 0.0);
   P(twist_flavor, QUDA_TWIST_INVALID);
   P(laplace3D, INVALID_INT);
+  P(covdev_mu, INVALID_INT);
 #else
   // asqtad and domain wall use mass parameterization
   if (param->dslash_type == QUDA_STAGGERED_DSLASH || param->dslash_type == QUDA_ASQTAD_DSLASH
@@ -389,6 +391,7 @@ void printQudaInvertParam(QudaInvertParam *param) {
   }
   if (param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { P(tm_rho, INVALID_DOUBLE); }
   if (param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { P(epsilon, INVALID_DOUBLE); }
+  if (param->dslash_type == QUDA_COVDEV_DSLASH) { P(covdev_mu, INVALID_INT); }
 #endif
 
   P(tol, INVALID_DOUBLE);
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index ceaa2a1ab7..6944f46d89 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -247,6 +247,11 @@ namespace quda {
       cloverInvert(clover_inverse, true);
       copy(clover_inverse, true);
       dynamic_inverse_copy = false;
+      if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+        getProfile().TPSTOP(QUDA_PROFILE_D2H);
+      } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+        getProfile().TPSTOP(QUDA_PROFILE_H2D);
+      }
       return;
     }
 
diff --git a/lib/clover_force.cpp b/lib/clover_force.cpp
index 7ab80e29a0..f342d54b60 100644
--- a/lib/clover_force.cpp
+++ b/lib/clover_force.cpp
@@ -44,24 +44,39 @@ namespace quda
 
     for (auto i = 0u; i < x.size(); i++) {
       gamma5(p[i][parity], x[i][parity]);
-
       if (dagger) dirac->Dagger(QUDA_DAG_YES);
       dirac->Dslash(x[i][other_parity], p[i][parity], other_parity);
       // want to apply \hat Q_{-} = \hat M_{+}^\dagger \gamma_5 to get Y_o
       dirac->M(p[i][parity], p[i][parity]); // this is the odd part of Y
       if (dagger) dirac->Dagger(QUDA_DAG_NO);
 
+      if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+        blas::ax(1.0 / inv_param.evmax, p[i][parity]);
+        ApplyTau(x[i][other_parity], x[i][other_parity], 1);
+        ApplyTau(p[i][parity], p[i][parity], 1);
+        Complex a(0.0, -inv_param.offset[i]);
+        blas::caxpy(a, x[i][parity], p[i][parity]);
+      }
+
       gamma5(x[i][other_parity], x[i][other_parity]);
-      if (detratio) blas::xpy(x0[i][parity], p[i][parity]);
+      if (detratio && inv_param.twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) blas::xpy(x0[i][parity], p[i][parity]);
 
-      if (not_dagger) dirac->Dagger(QUDA_DAG_YES);
+      if (not_dagger || inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) dirac->Dagger(QUDA_DAG_YES);
+      if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) gamma5(p[i][parity], p[i][parity]);
       dirac->Dslash(p[i][other_parity], p[i][parity], other_parity); // and now the even part of Y
-      if (not_dagger) dirac->Dagger(QUDA_DAG_NO);
-      // up to here x.odd match X.odd in tmLQCD and p.odd=-Y.odd of tmLQCD
-      // x.Even= X.Even.tmLQCD/kappa and p.Even=-Y.Even.tmLQCD/kappa
-
-      // the gamma5 application in tmLQCD is done inside deriv_Sb
-      gamma5(p[i], p[i]);
+      if (not_dagger || inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) dirac->Dagger(QUDA_DAG_NO);
+
+      if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+        ApplyTau(p[i][other_parity], p[i][other_parity], 1);
+        // up to here x.odd match X.odd in tmLQCD and p.odd=- gamma5 Y.odd of tmLQCD
+        // x.Even= X.Even.tmLQCD/kappa and p.Even=- gamma5 Y.Even.tmLQCD/kappa
+        // the gamma5 application in tmLQCD inside deriv_Sb is otimized away in here
+      } else {
+        // up to here x.odd match X.odd in tmLQCD and p.odd=-Y.odd of tmLQCD
+        // x.Even= X.Even.tmLQCD/kappa and p.Even=-Y.Even.tmLQCD/kappa
+        // the gamma5 application in tmLQCD is done inside deriv_Sb
+        gamma5(p[i], p[i]);
+      }
     }
 
     // derivative of the wilson operator it correspond to deriv_Sb(OE,...) plus  deriv_Sb(EO,...) in tmLQCD
@@ -71,6 +86,12 @@ namespace quda
 
     // derivative of pseudofermion sw term, first term term of (A12) in hep-lat/0112051,  sw_spinor_eo(EE,..) plus
     // sw_spinor_eo(OO,..)  in tmLQCD
+    if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+      for (auto i = 0u; i < x.size(); i++) {
+        ApplyTau(p[i][parity], p[i][parity], 1);
+        ApplyTau(p[i][other_parity], p[i][other_parity], 1);
+      }
+    }
     computeCloverSigmaOprod(oprod, inv_param.dagger == QUDA_DAG_YES ? p : x, inv_param.dagger == QUDA_DAG_YES ? x : p,
                             epsilon);
 
diff --git a/lib/clover_outer_product.cu b/lib/clover_outer_product.cu
index 01573951ad..f9040110e5 100644
--- a/lib/clover_outer_product.cu
+++ b/lib/clover_outer_product.cu
@@ -9,7 +9,7 @@ namespace quda {
 
   template <typename Float, int nColor, QudaReconstructType recon> class CloverForce : public TunableKernel1D {
     using real = typename mapper<Float>::type;
-    template <int dim = -1> using Arg = CloverForceArg<Float, nColor, recon, dim>;
+    template <int dim = -1, bool doublet = false> using Arg = CloverForceArg<Float, nColor, recon, dim, doublet>;
     GaugeField &force;
     const GaugeField &U;
     const ColorSpinorField &inA;
@@ -18,14 +18,18 @@ namespace quda {
     const ColorSpinorField &inD;
     const int parity;
     const real coeff;
+    const bool doublet; // whether we applying the operator to a doublet
+    const int n_flavor;
     OprodKernelType kernel;
     int dir;
-    unsigned int minThreads() const override { return kernel == INTERIOR ? inB.VolumeCB() : inB.GhostFaceCB()[dir]; }
+    unsigned int minThreads() const override
+    {
+      return (kernel == INTERIOR ? inB.VolumeCB() : inB.GhostFaceCB()[dir]) / n_flavor;
+    }
 
   public:
-    CloverForce(const GaugeField &U, GaugeField &force, const ColorSpinorField& inA,
-                const ColorSpinorField& inB, const ColorSpinorField& inC, const ColorSpinorField& inD,
-                int parity, double coeff) :
+    CloverForce(const GaugeField &U, GaugeField &force, const ColorSpinorField &inA, const ColorSpinorField &inB,
+                const ColorSpinorField &inC, const ColorSpinorField &inD, int parity, double coeff) :
       TunableKernel1D(force),
       force(force),
       U(U),
@@ -34,8 +38,11 @@ namespace quda {
       inC(inC),
       inD(inD),
       parity(parity),
-      coeff(static_cast<real>(coeff))
+      coeff(static_cast<real>(coeff)),
+      doublet(inA.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
+      n_flavor(doublet ? 2 : 1)
     {
+      if (doublet) strcat(aux, ",doublet");
       char aux2[TuneKey::aux_n];
       strcpy(aux2, aux);
       strcat(aux, ",interior");
@@ -58,13 +65,40 @@ namespace quda {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
 
       if (kernel == INTERIOR) {
-        launch<Interior>(tp, stream, Arg<>(force, U, inA, inB, inC, inD, parity, coeff));
+        if (doublet)
+          launch<Interior>(tp, stream, Arg<-1, true>(force, U, inA, inB, inC, inD, parity, coeff));
+        else
+          launch<Interior>(tp, stream, Arg<>(force, U, inA, inB, inC, inD, parity, coeff));
       } else if (kernel == EXTERIOR) {
         switch (dir) {
-        case 0: launch<Exterior>(tp, stream, Arg<0>(force, U, inA, inB, inC, inD, parity, coeff)); break;
-        case 1: launch<Exterior>(tp, stream, Arg<1>(force, U, inA, inB, inC, inD, parity, coeff)); break;
-        case 2: launch<Exterior>(tp, stream, Arg<2>(force, U, inA, inB, inC, inD, parity, coeff)); break;
-        case 3: launch<Exterior>(tp, stream, Arg<3>(force, U, inA, inB, inC, inD, parity, coeff)); break;
+        case 0: {
+          if (doublet)
+            launch<Exterior>(tp, stream, Arg<0, true>(force, U, inA, inB, inC, inD, parity, coeff));
+          else
+            launch<Exterior>(tp, stream, Arg<0>(force, U, inA, inB, inC, inD, parity, coeff));
+          break;
+        }
+        case 1: {
+          if (doublet)
+            launch<Exterior>(tp, stream, Arg<1, true>(force, U, inA, inB, inC, inD, parity, coeff));
+          else
+            launch<Exterior>(tp, stream, Arg<1>(force, U, inA, inB, inC, inD, parity, coeff));
+          break;
+        }
+        case 2: {
+          if (doublet)
+            launch<Exterior>(tp, stream, Arg<2, true>(force, U, inA, inB, inC, inD, parity, coeff));
+          else
+            launch<Exterior>(tp, stream, Arg<2>(force, U, inA, inB, inC, inD, parity, coeff));
+          break;
+        }
+        case 3: {
+          if (doublet)
+            launch<Exterior>(tp, stream, Arg<3, true>(force, U, inA, inB, inC, inD, parity, coeff));
+          else
+            launch<Exterior>(tp, stream, Arg<3>(force, U, inA, inB, inC, inD, parity, coeff));
+          break;
+        }
         default: errorQuda("Unexpected direction %d", dir);
         }
       }
@@ -74,14 +108,22 @@ namespace quda {
     void postTune() override { force.restore(); }
 
     // spin trace + multiply-add (ignore spin-project)
-    long long flops() const override { return minThreads() * (144 + 234) * (kernel == INTERIOR ? 4 : 1); }
+    long long flops() const override
+    {
+      int oprod_flops = nColor * nColor * (8 * inA.Nspin() - 2);
+      int gemm_flops = nColor * nColor * (8 * nColor - 2);
+      int mat_size = 2 * nColor * nColor;
+
+      return minThreads() * n_flavor * (2 * oprod_flops + gemm_flops + 3 * mat_size) * (kernel == INTERIOR ? 4 : 1);
+    }
 
     long long bytes() const override
     {
       if (kernel == INTERIOR) {
-	return inA.Bytes() + inC.Bytes() + 4*(inB.Bytes() + inD.Bytes()) + force.Bytes() + U.Bytes() / 2;
+        return inA.Bytes() + inC.Bytes() + 4 * (inB.Bytes() + inD.Bytes()) + force.Bytes() + U.Bytes() / 2;
       } else {
-	return minThreads() * (nColor * (4 * 2 + 2 * 2) + 2 * force.Reconstruct() + U.Reconstruct())
+        return minThreads() * n_flavor
+          * (nColor * (4 * inA.Nspin() + 2 * inA.Nspin() / 2) * 2 + 2 * force.Reconstruct() + U.Reconstruct())
           * sizeof(Float);
       }
     }
diff --git a/lib/clover_sigma_outer_product.cu b/lib/clover_sigma_outer_product.cu
index a6b9750e79..0cf740479a 100644
--- a/lib/clover_sigma_outer_product.cu
+++ b/lib/clover_sigma_outer_product.cu
@@ -12,18 +12,25 @@ namespace quda {
 
   template <typename Float, int nColor> class CloverSigmaOprod : public TunableKernel3D
   {
-    template <int nvector> using Arg = CloverSigmaOprodArg<Float, nColor, nvector>;
+    template <int nvector, bool doublet> using Arg = CloverSigmaOprodArg<Float, nColor, nvector, doublet>;
     GaugeField &oprod;
     cvector_ref<const ColorSpinorField> &inA;
     cvector_ref<const ColorSpinorField> &inB;
     const std::vector<array<double, 2>> &coeff;
+    const bool doublet; // whether we are applying the operator to a doublet
     unsigned int minThreads() const override { return oprod.VolumeCB(); }
 
   public:
     CloverSigmaOprod(GaugeField &oprod, cvector_ref<const ColorSpinorField> &inA,
                      cvector_ref<const ColorSpinorField> &inB, const std::vector<array<double, 2>> &coeff) :
-      TunableKernel3D(oprod, 2, 6), oprod(oprod), inA(inA), inB(inB), coeff(coeff)
+      TunableKernel3D(oprod, 2, 6),
+      oprod(oprod),
+      inA(inA),
+      inB(inB),
+      coeff(coeff),
+      doublet(inA[0].TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET)
     {
+      if (doublet) strcat(aux, ",doublet");
       char tmp[16];
       sprintf(tmp, ",nvector=%lu", inA.size());
       strcat(aux, tmp);
@@ -34,7 +41,12 @@ namespace quda {
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       switch (inA.size()) {
-      case 1: launch<SigmaOprod>(tp, stream, Arg<1>(oprod, inA, inB, coeff)); break;
+      case 1:
+        if (doublet)
+          launch<SigmaOprod>(tp, stream, Arg<1, true>(oprod, inA, inB, coeff));
+        else
+          launch<SigmaOprod>(tp, stream, Arg<1, false>(oprod, inA, inB, coeff));
+        break;
       default: errorQuda("Unsupported nvector = %lu\n", inA.size());
       }
     } // apply
@@ -44,7 +56,11 @@ namespace quda {
 
     long long flops() const override
     {
-      return ((144 + 18) * inA.size() + 18) * 6 * oprod.Volume(); // spin trace + multiply-add
+      int n_flavor = doublet ? 2 : 1;
+      int oprod_flops = inA.Ncolor() * inA.Ncolor() * (8 * inA.Nspin() - 2);
+      int mat_size = 2 * inA.Ncolor() * inA.Ncolor();
+      // ((spin trace + multiply-add) * n_flavor * n_vector + projection) * 6 dir * sites
+      return ((oprod_flops + 2 * mat_size) * n_flavor * inA.size() + mat_size) * 6 * oprod.Volume();
     }
     long long bytes() const override { return (inA[0].Bytes() + inB[0].Bytes()) * inA.size() * 6 + 2 * oprod.Bytes(); }
   }; // CloverSigmaOprod
@@ -53,7 +69,6 @@ namespace quda {
                                cvector_ref<const ColorSpinorField> &p, const std::vector<array<double, 2>> &coeff)
   {
     if constexpr (is_enabled_clover()) {
-      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       if (x.size() > MAX_NVECTOR) {
         // divide and conquer
         computeCloverSigmaOprod(oprod, cvector_ref<const ColorSpinorField> {x.begin(), x.begin() + x.size() / 2},
@@ -66,6 +81,7 @@ namespace quda {
         return;
       }
 
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<CloverSigmaOprod>(oprod, x, p, coeff);
       getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     } else {
diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu
index cc469500e9..620e9bbe85 100644
--- a/lib/copy_gauge_extended.cu
+++ b/lib/copy_gauge_extended.cu
@@ -12,6 +12,7 @@ namespace quda {
     FloatOut *Out;
     FloatIn *In;
 
+    bool advanceSharedBytes(TuneParam &) const { return false; } // Don't tune shared mem
     unsigned int minThreads() const { return in.VolumeCB() == out.VolumeCB() ? in.VolumeCB() : in.LocalVolumeCB(); }
 
   public:
diff --git a/lib/copy_gauge_helper.hpp b/lib/copy_gauge_helper.hpp
index f43fa15e2c..9703488e01 100644
--- a/lib/copy_gauge_helper.hpp
+++ b/lib/copy_gauge_helper.hpp
@@ -15,6 +15,7 @@ namespace quda
     GaugeField &out;
     const GaugeField &in;
 
+    bool advanceSharedBytes(TuneParam &) const override { return false; } // Don't tune shared mem
     unsigned int minThreads() const override { return size; }
 
   public:
diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp
index d41c5753d7..8e71a689a1 100644
--- a/lib/dirac_wilson.cpp
+++ b/lib/dirac_wilson.cpp
@@ -144,7 +144,7 @@ namespace quda {
     // we desire solution to full system
     for (auto i = 0u; i < b.size(); i++) {
       // src = b_e + k D_eo b_o
-      DslashXpay(x[i][other_parity], b[i][other_parity], this_parity, b[this_parity], kappa);
+      DslashXpay(x[i][other_parity], b[i][other_parity], this_parity, b[i][this_parity], kappa);
       src[i] = x[i][other_parity].create_alias();
       sol[i] = x[i][this_parity].create_alias();
     }
diff --git a/lib/dslash_domain_wall_4d_fused_m5.hpp b/lib/dslash_domain_wall_4d_fused_m5.hpp
index 3ba9e77c39..cb66d36470 100644
--- a/lib/dslash_domain_wall_4d_fused_m5.hpp
+++ b/lib/dslash_domain_wall_4d_fused_m5.hpp
@@ -36,6 +36,9 @@ namespace quda
       }
     }
 
+    int blockStep() const override { return 8; }
+    int blockMin() const override { return 8; }
+
   public:
     DomainWall4DFusedM5(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in) :
       Dslash(arg, out, in, get_app_base())
@@ -44,14 +47,14 @@ namespace quda
       TunableKernel3D::resizeStep(in.X(4), 1);
     }
 
-    void apply(const qudaStream_t &stream)
+    void apply(const qudaStream_t &stream) override
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash::setParam(tp);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
 
-    unsigned int sharedBytesPerThread() const
+    unsigned int sharedBytesPerThread() const override
     {
       // spin components in shared depend on inversion algorithm
       if (mobius_m5::use_half_vector()) {
@@ -61,10 +64,9 @@ namespace quda
       }
     }
 
-    void initTuneParam(TuneParam &param) const
+    void initTuneParam(TuneParam &param) const override
     {
       Dslash::initTuneParam(param);
-
       param.block.y = arg.Ls; // Ls must be contained in the block
       param.grid.y = 1;
       param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
@@ -95,7 +97,7 @@ namespace quda
       return (12ll * n * Ls) * in.Volume();
     }
 
-    long long flops() const
+    long long flops() const override
     {
       long long flops_ = 0;
       switch (Arg::dslash5_type) {
@@ -112,7 +114,7 @@ namespace quda
       return flops_ + Dslash::flops();
     }
 
-    long long bytes() const
+    long long bytes() const override
     {
       if (Arg::dslash5_type == Dslash5Type::M5_INV_MOBIUS_M5_INV_DAG) {
         return arg.y.Bytes() + Dslash::bytes();
@@ -121,7 +123,7 @@ namespace quda
       }
     }
 
-    void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }
+    void defaultTuneParam(TuneParam &param) const override { initTuneParam(param); }
   };
 
   template <Dslash5Type...> struct Dslash5TypeList {
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu
index 66c523df20..bab4aa23b9 100644
--- a/lib/dslash_gamma_helper.cu
+++ b/lib/dslash_gamma_helper.cu
@@ -10,7 +10,7 @@ namespace quda {
     ColorSpinorField &out;
     const ColorSpinorField &in;
     const int d;
-    unsigned int minThreads() const { return in.VolumeCB(); }
+    unsigned int minThreads() const { return in.VolumeCB() / (in.Ndim() == 5 ? in.X(4) : 1); }
 
   public:
     GammaApply(ColorSpinorField &out, const ColorSpinorField &in, int d) :
@@ -103,4 +103,39 @@ namespace quda {
   // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma)
   void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,4); }
 
+  template <typename Float, int nColor> class TauApply : public TunableKernel2D
+  {
+    ColorSpinorField &out;
+    const ColorSpinorField &in;
+    const int d;
+    unsigned int minThreads() const { return in.VolumeCB() / 2; }
+
+  public:
+    TauApply(ColorSpinorField &out, const ColorSpinorField &in, int d) :
+      TunableKernel2D(in, in.SiteSubset()), out(out), in(in), d(d)
+    {
+      apply(device::get_default_stream());
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      launch<Tau>(tp, stream, GammaArg<Float, nColor>(out, in, d));
+    }
+
+    void preTune() { out.backup(); }
+    void postTune() { out.restore(); }
+    long long bytes() const { return out.Bytes() + in.Bytes(); }
+  };
+
+  // Apply the tau1 matrix to a doublet colorspinor field
+  // out(x) = tau_1*in
+#ifdef GPU_TWISTED_MASS_DIRAC
+  void ApplyTau(ColorSpinorField &out, const ColorSpinorField &in, int d) { instantiate<TauApply>(out, in, d); }
+#else
+  void ApplyTau(ColorSpinorField &, const ColorSpinorField &, int)
+  {
+    errorQuda("Twisted mass dslash has not been built");
+  }
+#endif // GPU_TWISTED_MASS_DIRAC
 }
diff --git a/lib/evec_project.cu b/lib/evec_project.cu
new file mode 100644
index 0000000000..394148a9a7
--- /dev/null
+++ b/lib/evec_project.cu
@@ -0,0 +1,82 @@
+#include <color_spinor_field.h>
+#include <contract_quda.h>
+#include <kernels/evec_project.cuh>
+#include <tunable_reduction.h>
+#include <instantiate.h>
+
+namespace quda
+{
+
+  template <typename Float, int nColor> class EvecProjectLaplace3D : TunableMultiReduction
+  {
+    cvector_ref<const ColorSpinorField> &x;
+    cvector_ref<const ColorSpinorField> &y;
+    std::vector<double> &result;
+    bool tuneSharedBytes() const override { return false; }
+
+  public:
+    EvecProjectLaplace3D(cvector_ref<const ColorSpinorField> &x, cvector_ref<const ColorSpinorField> &y,
+                         std::vector<double> &result) :
+      TunableMultiReduction(x[0], 1, x.size() * y.size() * x[0].X()[3], 8), x(x), y(y), result(result)
+    {
+      strcat(aux, ",nx=");
+      char rhs_str[16];
+      i32toa(rhs_str, x.size());
+      strcat(aux, rhs_str);
+      strcat(aux, ",ny=");
+      i32toa(rhs_str, y.size());
+      strcat(aux, rhs_str);
+      apply(device::get_default_stream());
+    }
+
+    void apply(const qudaStream_t &stream) override
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      EvecProjectionArg<Float, nColor> arg(x, y);
+      launch<EvecProjection>(result, tp, stream, arg);
+    }
+
+    long long flops() const override { return 8 * x.size() * y.size() * x.Nspin() * x.Ncolor() * x.Volume(); }
+    long long bytes() const override { return x.Bytes() * y.size() + y.Bytes() * x.size(); }
+  };
+
+  void evecProjectLaplace3D(std::vector<Complex> &result, cvector_ref<const ColorSpinorField> &x,
+                            cvector_ref<const ColorSpinorField> &y)
+  {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+
+    checkNative(x[0], y[0]);
+    if (x.Nspin() != 4 || y.Nspin() != 1) errorQuda("Unexpected number of spins x=%d y=%d", x.Nspin(), y.Nspin());
+
+    // deploy as a tile computation
+    auto Lt = x[0].X()[3];
+
+    for (auto tx = 0u; tx < x.size(); tx += max_nx) {
+      for (auto ty = 0u; ty < y.size(); ty += max_ny) {
+        // compute remainder here
+        auto tile_x = std::min(max_nx, x.size() - tx);
+        auto tile_y = std::min(max_ny, y.size() - ty);
+        std::vector<double> result_tile(2 * x.Nspin() * Lt * tile_x * tile_y);
+
+        instantiate<EvecProjectLaplace3D>(cvector_ref<const ColorSpinorField> {x.begin() + tx, x.begin() + tx + tile_x},
+                                          cvector_ref<const ColorSpinorField> {y.begin() + ty, y.begin() + ty + tile_y},
+                                          result_tile);
+
+        for (auto i = 0u; i < tile_x; i++) {
+          for (auto j = 0u; j < tile_y; j++) {
+            for (auto t = 0; t < Lt; t++) {
+              for (auto s = 0u; s < 4; s++) {
+                result[(((tx + i) * y.size() + (ty + j)) * Lt + t) * 4 + s]
+                  = {result_tile[(((t * tile_y + j) * tile_x + i) * 4 + s) * 2 + 0],
+                     result_tile[(((t * tile_y + j) * tile_x + i) * 4 + s) * 2 + 1]};
+              }
+            }
+          }
+        }
+      }
+    }
+
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+  }
+
+} // namespace quda
diff --git a/lib/gauge_covdev.cpp b/lib/gauge_covdev.cpp
index 52c5f1c1b4..0ddcf0ac44 100644
--- a/lib/gauge_covdev.cpp
+++ b/lib/gauge_covdev.cpp
@@ -6,15 +6,16 @@
 
 namespace quda {
 
-  GaugeCovDev::GaugeCovDev(const DiracParam &param) :  Dirac(param) { }
+  GaugeCovDev::GaugeCovDev(const DiracParam &param) :  Dirac(param), covdev_mu(param.covdev_mu) { }
 
-  GaugeCovDev::GaugeCovDev(const GaugeCovDev &covDev) :  Dirac(covDev) { }
+  GaugeCovDev::GaugeCovDev(const GaugeCovDev &covDev) :  Dirac(covDev), covdev_mu(covDev.covdev_mu) { }
 
   GaugeCovDev::~GaugeCovDev() { }
 
   GaugeCovDev& GaugeCovDev::operator=(const GaugeCovDev &covDev)
   {
     if (&covDev != this) Dirac::operator=(covDev);
+    covdev_mu = covDev.covdev_mu;
     return *this;
   }
 
@@ -42,9 +43,9 @@ namespace quda {
     MCD(out, tmp, (mu+4)%8);
   }
 
-  void GaugeCovDev::Dslash(ColorSpinorField &, const ColorSpinorField &, const QudaParity) const
+  void GaugeCovDev::Dslash(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
   {
-    //do nothing
+    DslashCD(out, in, parity, covdev_mu);
   }
 
   void GaugeCovDev::DslashXpay(ColorSpinorField &, const ColorSpinorField &, const QudaParity, const ColorSpinorField &,
@@ -53,14 +54,14 @@ namespace quda {
     //do nothing
   }
 
-  void GaugeCovDev::M(ColorSpinorField &, const ColorSpinorField &) const
+  void GaugeCovDev::M(ColorSpinorField &out, const ColorSpinorField &in) const
   {
-    //do nothing
+    MCD(out, in, covdev_mu);
   }
 
-  void GaugeCovDev::MdagM(ColorSpinorField &, const ColorSpinorField &) const
+  void GaugeCovDev::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
   {
-    //do nothing
+    MdagMCD(out, in, covdev_mu);
   }
 
   void GaugeCovDev::prepare(cvector_ref<ColorSpinorField> &, cvector_ref<ColorSpinorField> &,
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 249315f9c3..b6f504f352 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -207,6 +207,9 @@ static TimeProfile profileCovDev("covDevQuda");
 //!< Profiler for momentum action
 static TimeProfile profileMomAction("momActionQuda");
 
+//!< Profiler for sink projection
+static TimeProfile profileSinkProject("sinkProjectQuda");
+
 //!< Profiler for endQuda
 static TimeProfile profileEnd("endQuda");
 
@@ -1393,6 +1396,7 @@ void endQuda(void)
     profileProject.Print();
     profilePhase.Print();
     profileMomAction.Print();
+    profileSinkProject.Print();
     profileEnd.Print();
 
     profileInit2End.Print();
@@ -1445,9 +1449,10 @@ namespace quda {
       }
       diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC;
       diracParam.Ls = inv_param->Ls;
-      if (sizeof(Complex) != sizeof(double _Complex)) {
-        errorQuda("Irreconcilable difference between interface and internal complex number conventions");
-      }
+      // check we are safe to cast into a Complex (= std::complex<double>)
+      static_assert(sizeof(Complex) == sizeof(double _Complex),
+                    "Irreconcilable difference between interface and internal complex number conventions");
+
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
       memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
       diracParam.eofa_shift = inv_param->eofa_shift;
@@ -1505,6 +1510,7 @@ namespace quda {
       break;
     case QUDA_COVDEV_DSLASH:
       diracParam.type = QUDA_GAUGE_COVDEV_DIRAC;
+      diracParam.covdev_mu = inv_param->covdev_mu;
       break;
     default:
       errorQuda("Unsupported dslash_type %d", inv_param->dslash_type);
@@ -3895,8 +3901,8 @@ void createCloverQuda(QudaInvertParam* invertParam)
   // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
   lat_dim_t R;
   for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
-  GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident :
-                                              createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
+  GaugeField *gauge
+    = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, getProfile(), false, recon);
 
   GaugeField *ex = gauge;
   if (gauge->Precision() < cloverPrecise->Precision()) {
@@ -4439,7 +4445,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
   // Make sure extendedGaugeResident has the correct R
   if (extendedGaugeResident) delete extendedGaugeResident;
-  extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileCloverForce);
+  extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, getProfile());
   GaugeField &gaugeEx = *extendedGaugeResident;
 
   computeCloverForce(cudaMom, gaugeEx, *gaugePrecise, *cloverPrecise, x, x0, force_coeff, ferm_epsilon,
@@ -4499,7 +4505,7 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
     ColorSpinorField cpuQuarkX(cpuParam);
     x[i][parity] = cpuQuarkX; // in tmLQCD-parlance this is the odd part of X
 
-    if (detratio) {
+    if (detratio && inv_param->twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) {
       x0[i] = ColorSpinorField(qParam);
       ColorSpinorParam cpuParam0(h_x0[i], *inv_param, gParamMom.x, true, inv_param->input_location);
       ColorSpinorField cpuQuarkX0(cpuParam0);
@@ -5205,3 +5211,99 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param)
 
   gaugeObservables(*gauge, *param);
 }
+
+static void check_param(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec,
+                        int n_evec, int tile_evec, QudaInvertParam *inv_param, const int X[4])
+{
+  if (host_sinks == nullptr) errorQuda("Invalid host_sink ptr");
+  if (host_quark == nullptr) errorQuda("Invalid host_quark ptr");
+  for (auto i = 0; i < n_quark; i++)
+    if (host_quark[i] == nullptr) errorQuda("Invalid host_quark[%d] ptr", i);
+  if (tile_quark < 1) errorQuda("Invalid tiling parameter %d (must be positive)", tile_quark);
+  if (host_evec == nullptr) errorQuda("Invalid host_evec ptr");
+  for (auto i = 0; i < n_evec; i++)
+    if (host_evec[i] == nullptr) errorQuda("Invalid host_evec[%d] ptr", i);
+  if (tile_evec < 1) errorQuda("Invalid tiling parameter %d (must be positive)", tile_evec);
+  if (inv_param == nullptr) errorQuda("Invalid QudaInvertParam ptr");
+  for (int i = 0; i < 4; i++)
+    if (X[i] < 1 || X[i] > 512) errorQuda("Invalid lattice dimension %d", i);
+}
+
+void laphSinkProject(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec,
+                     int n_evec, int tile_evec, QudaInvertParam *inv_param, const int X[4])
+{
+  auto profile = pushProfile(profileSinkProject, inv_param->secs, inv_param->gflops);
+
+  // check parameters are valid
+  check_param(host_sinks, host_quark, n_quark, tile_quark, host_evec, n_evec, tile_evec, inv_param, X);
+
+  // Parameter object describing the sources and smeared quarks
+  lat_dim_t x = {X[0], X[1], X[2], X[3]};
+  ColorSpinorParam cpu_quark_param(host_quark, *inv_param, x, false, QUDA_CPU_FIELD_LOCATION);
+  cpu_quark_param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+
+  // QUDA style wrapper around the host data
+  std::vector<ColorSpinorField> quark(n_quark);
+  for (auto i = 0; i < n_quark; i++) {
+    cpu_quark_param.v = host_quark[i];
+    quark[i] = ColorSpinorField(cpu_quark_param);
+  }
+
+  // Parameter object describing evecs
+  ColorSpinorParam cpu_evec_param(host_evec, *inv_param, x, false, QUDA_CPU_FIELD_LOCATION);
+  // Switch to spin 1
+  cpu_evec_param.nSpin = 1;
+  // QUDA style wrapper around the host data
+  std::vector<ColorSpinorField> evec(n_evec);
+  for (auto i = 0; i < n_evec; i++) {
+    cpu_evec_param.v = host_evec[i];
+    evec[i] = ColorSpinorField(cpu_evec_param);
+  }
+
+  // Create device vectors
+  ColorSpinorParam quda_quark_param(cpu_quark_param, *inv_param, QUDA_CUDA_FIELD_LOCATION);
+  quda_quark_param.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+  std::vector<ColorSpinorField> quda_quark(tile_quark, quda_quark_param);
+
+  // Create device vectors for evecs
+  ColorSpinorParam quda_evec_param(cpu_evec_param, *inv_param, QUDA_CUDA_FIELD_LOCATION);
+  std::vector<ColorSpinorField> quda_evec(tile_evec, quda_evec_param);
+
+  auto Lt = x[3] * comm_dim(3);
+  std::vector<Complex> hostSink(n_quark * n_evec * Lt * 4);
+
+  for (auto i = 0; i < n_quark; i += tile_quark) {                       // iterate over all quarks
+    auto tile_i = std::min(tile_quark, n_quark - i);                     // handle remainder here
+    for (auto tq = 0; tq < tile_i; tq++) quda_quark[tq] = quark[i + tq]; // download quarks
+
+    for (auto j = 0; j < n_evec; j += tile_evec) {                       // iterate over all EV
+      auto tile_j = std::min(tile_evec, n_evec - j);                     // handle remainder here
+      for (auto te = 0; te < tile_j; te++) quda_evec[te] = evec[j + te]; // download evecs
+
+      std::vector<Complex> tmp(tile_i * tile_j * x[3] * 4);
+
+      // We now perform the projection onto the eigenspace. The data
+      // is placed in host_sinks in  T, spin order
+      evecProjectLaplace3D(tmp, {quda_quark.begin(), quda_quark.begin() + tile_i},
+                           {quda_evec.begin(), quda_evec.begin() + tile_j});
+
+      for (auto tq = 0; tq < tile_i; tq++) {
+        for (auto te = 0; te < tile_j; te++) {
+          for (auto t = 0; t < x[3]; t++) {
+            auto t_global = X[3] * comm_coord(3) + t;
+            for (auto s = 0; s < 4; s++) {
+              hostSink[(((i + tq) * n_evec + (j + te)) * Lt + t_global) * 4 + s]
+                = tmp[((tq * tile_j + te) * x[3] + t) * 4 + s];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  comm_allreduce_sum(hostSink);
+
+  for (auto i = 0; i < n_quark * n_evec * Lt * 4; i++) { // iterate over all quarks
+    reinterpret_cast<std::complex<double> *>(host_sinks)[i] = hostSink[i];
+  }
+}
diff --git a/lib/laplace.cu b/lib/laplace.cu
index a0f18e4dd4..c476aed6d2 100644
--- a/lib/laplace.cu
+++ b/lib/laplace.cu
@@ -50,8 +50,7 @@ namespace quda
     long long flops() const override
     {
       int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
-      int num_mv_multiply = in.Nspin() == 4 ? 2 : 1;
-      int ghost_flops = (num_mv_multiply * mv_flops + 2 * in.Ncolor() * in.Nspin());
+      int ghost_flops = (in.Nspin() * mv_flops + 2 * in.Ncolor() * in.Nspin());
       int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
       int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);      // 3D or 4D operator
 
@@ -73,8 +72,7 @@ namespace quda
       case UBER_KERNEL:
       case KERNEL_POLICY: {
         long long sites = in.Volume();
-        flops_ = (num_dir * (in.Nspin() / 4) * in.Ncolor() * in.Nspin() + // spin project (=0 for staggered)
-                  num_dir * num_mv_multiply * mv_flops +                  // SU(3) matrix-vector multiplies
+        flops_ = (num_dir * in.Nspin() * mv_flops +                  // SU(3) matrix-vector multiplies
                   ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
           * sites; // accumulation
         if (arg.xpay) flops_ += xpay_flops * sites;
@@ -144,41 +142,20 @@ namespace quda
 
   template <typename Float, int nColor, QudaReconstructType recon> struct LaplaceApply {
 
-#if (defined(GPU_STAGGERED_DIRAC) || defined(GPU_WILSON_DIRAC)) && defined(GPU_LAPLACE)
-    inline LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir,
-                        double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override,
-                        TimeProfile &profile)
-#else
-    inline LaplaceApply(ColorSpinorField &, const ColorSpinorField &in, const GaugeField &, int,
-                        double, double, const ColorSpinorField &, int, bool, const int *, TimeProfile &)
-#endif
+    LaplaceApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir,
+                 double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int * comm_override, TimeProfile &profile)
     {
-      if (in.Nspin() == 1) {
-#if defined(GPU_STAGGERED_DIRAC) && defined(GPU_LAPLACE)
-        constexpr int nDim = 4;
-        constexpr int nSpin = 1;
-        LaplaceArg<Float, nSpin, nColor, nDim, recon> arg(out, in, U, dir, a, b, x, parity, dagger, comm_override);
-        Laplace<decltype(arg)> laplace(arg, out, in);
-
-        dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, in.VolumeCB(),
-          in.GhostFaceCB(), profile);
-#else
-        errorQuda("nSpin=%d Laplace operator required staggered dslash and laplace to be enabled", in.Nspin());
-#endif
-      } else if (in.Nspin() == 4) {
-#if defined(GPU_WILSON_DIRAC) && defined(GPU_LAPLACE)
-        constexpr int nDim = 4;
-        constexpr int nSpin = 4;
-        LaplaceArg<Float, nSpin, nColor, nDim, recon> arg(out, in, U, dir, a, b, x, parity, dagger, comm_override);
-        Laplace<decltype(arg)> laplace(arg, out, in);
-
-        dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, in.VolumeCB(),
-          in.GhostFaceCB(), profile);
-#else
-        errorQuda("nSpin=%d Laplace operator required wilson dslash and laplace to be enabled", in.Nspin());
-#endif
-      } else {
-        errorQuda("Unsupported nSpin= %d", in.Nspin());
+      if constexpr (is_enabled_laplace()) {
+        if (in.Nspin() == 1) {
+          constexpr int nDim = 4;
+          constexpr int nSpin = 1;
+          LaplaceArg<Float, nSpin, nColor, nDim, recon> arg(out, in, U, dir, a, b, x, parity, dagger, comm_override);
+          Laplace<decltype(arg)> laplace(arg, out, in);
+
+          dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, in.VolumeCB(), in.GhostFaceCB(), profile);
+        } else {
+          errorQuda("Unsupported nSpin= %d", in.Nspin());
+        }
       }
     }
   };
diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90
index 01a792558a..23372a3374 100644
--- a/lib/quda_fortran.F90
+++ b/lib/quda_fortran.F90
@@ -114,10 +114,12 @@ module quda_fortran
      real(8) :: mu    ! Chiral twisted mass parameter
      real(8) :: tm_rho ! Chiral twisted mass shift used for Hasenbusch mass preconditioning for twisted clover
      real(8) :: epsilon ! Flavor twisted mass parameter
+     real(8) :: evmax ! Maximum of the eigenvalues of the ndeg twisted mass operator needed for fermionic forces
      QudaTwistFlavorType :: twist_flavor  ! Twisted mass flavor
 
      integer(4) :: laplace3D    ! direction to omit in Laplace
-     
+     integer(4) :: covdev_mu    ! Apply forward/backward covariant derivative in direction mu(mu<=3)/mu-4(mu>3)
+
      real(8) :: tol ! Requested L2 residual norm
      real(8) :: tol_restart ! Solver tolerance in the L2 residual norm (used to restart InitCG)
      real(8) :: tol_hq ! Requested heavy quark residual norm
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 576cc5c5d8..6ba7dcc2e9 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -17,8 +17,8 @@ endif()
 
 set(QUDA_GPU_ARCH
     ${QUDA_DEFAULT_GPU_ARCH}
-    CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80)")
-set_property(CACHE QUDA_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80)
+    CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80 sm_90)")
+set_property(CACHE QUDA_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
 set(QUDA_GPU_ARCH_SUFFIX
     ""
     CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
diff --git a/lib/targets/hip/malloc.cpp b/lib/targets/hip/malloc.cpp
index 6fd0986b94..f46c44c8d0 100644
--- a/lib/targets/hip/malloc.cpp
+++ b/lib/targets/hip/malloc.cpp
@@ -528,15 +528,21 @@ namespace quda
       errorQuda("hipPointerGetAttributes returned error: %s\n", hipGetErrorString(error));
     }
 
-    switch (attr.type) {
 #if HIP_VERSION_MAJOR >= 6
+    switch (attr.type) {
     case hipMemoryTypeUnregistered: return QUDA_CPU_FIELD_LOCATION;
+#else
+    switch (attr.memoryType) {
 #endif  // HIP_VERSION_MAJOR >= 6
     case hipMemoryTypeHost: return QUDA_CPU_FIELD_LOCATION;
     case hipMemoryTypeDevice: return QUDA_CUDA_FIELD_LOCATION;
     case hipMemoryTypeArray: return QUDA_CUDA_FIELD_LOCATION;
     case hipMemoryTypeUnified: return QUDA_CUDA_FIELD_LOCATION; ///< Not used currently
+#if HIP_VERSION_MAJOR >= 6
     default: errorQuda("Unknown memory type %d\n", attr.type); return QUDA_INVALID_FIELD_LOCATION;
+#else
+    default: errorQuda("Unknown memory type %d\n", attr.memoryType); return QUDA_INVALID_FIELD_LOCATION;
+#endif
     }
   }
 
diff --git a/lib/timer.cpp b/lib/timer.cpp
index 542b4ab9d6..4729a88406 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -245,20 +245,26 @@ namespace quda {
     profile(profile), secs(secs), gflops(gflops), flops(Tunable::flops_global())
 
   {
-    profile.TPSTART(QUDA_PROFILE_TOTAL);
-    tp_stack.push(&profile);
+    if (profile.Name() != getProfile().Name()) {
+      // only push to stack if this profile not already the active one
+      profile.TPSTART(QUDA_PROFILE_TOTAL);
+      tp_stack.push(&profile);
+      active = true;
+    }
   }
 
   pushProfile::~pushProfile()
   {
-    if (tp_stack.empty()) errorQuda("popProfile() called with empty stack");
-    auto &profile = *(tp_stack.top());
-    if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one");
-    tp_stack.pop();
-    profile.TPSTOP(QUDA_PROFILE_TOTAL);
-    secs = profile.Last(QUDA_PROFILE_TOTAL);
-    gflops = (Tunable::flops_global() - flops) * 1e-9;
-    if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops);
+    if (active == true) {
+      if (tp_stack.empty()) errorQuda("popProfile() called with empty stack");
+      auto &profile = *(tp_stack.top());
+      if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one");
+      tp_stack.pop();
+      profile.TPSTOP(QUDA_PROFILE_TOTAL);
+      secs = profile.Last(QUDA_PROFILE_TOTAL);
+      gflops = (Tunable::flops_global() - flops) * 1e-9;
+      if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops);
+    }
   }
 
   TimeProfile &getProfile()
diff --git a/lib/vector_io.cpp b/lib/vector_io.cpp
index ed469750f7..505b26c45a 100644
--- a/lib/vector_io.cpp
+++ b/lib/vector_io.cpp
@@ -55,12 +55,16 @@ namespace quda
         for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data<char *>() + j * stride; }
       }
 
+      // if we're loading inflated vectors, we need to grab the spinor size from `tmp` instead of `v0`.
+      auto spinor_X = (create_tmp && parity_inflate) ? tmp[0].X() : v0.X();
+      auto spinor_site_subset = (create_tmp && parity_inflate) ? tmp[0].SiteSubset() : v0.SiteSubset();
+
       // time loading
       quda::host_timer_t host_timer;
       host_timer.start(); // start the timer
 
-      read_spinor_field(filename.c_str(), V.data(), v0.Precision(), v0.X(), v0.SiteSubset(),
-                        spinor_parity, v0.Ncolor(), v0.Nspin(), Nvec * Ls, 0, nullptr);
+      read_spinor_field(filename.c_str(), V.data(), v0.Precision(), spinor_X, spinor_site_subset, spinor_parity,
+                        v0.Ncolor(), v0.Nspin(), Nvec * Ls, 0, nullptr);
 
       host_timer.stop(); // stop the timer
       logQuda(QUDA_SUMMARIZE, "Time spent loading vectors from %s = %g secs\n", filename.c_str(), host_timer.last());
@@ -140,12 +144,16 @@ namespace quda
         for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data<const char *>() + j * stride; }
       }
 
+      // if we performed parity inflation, we need to grab the spinor size from `tmp` instead of `v0`.
+      auto spinor_X = (create_tmp && parity_inflate) ? tmp[0].X() : v0.X();
+      auto spinor_site_subset = (create_tmp && parity_inflate) ? tmp[0].SiteSubset() : v0.SiteSubset();
+
       // time saving
       quda::host_timer_t host_timer;
       host_timer.start(); // start the timer
 
-      write_spinor_field(filename.c_str(), V.data(), save_prec, v0.X(), v0.SiteSubset(), spinor_parity, v0.Ncolor(),
-                         v0.Nspin(), Nvec * Ls, 0, nullptr, partfile);
+      write_spinor_field(filename.c_str(), V.data(), save_prec, spinor_X, spinor_site_subset, spinor_parity,
+                         v0.Ncolor(), v0.Nspin(), Nvec * Ls, 0, nullptr, partfile);
 
       host_timer.stop(); // stop the timer
       logQuda(QUDA_SUMMARIZE, "Time spent saving vectors to %s = %g secs\n", filename.c_str(), host_timer.last());
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7a45a72a08..6136864249 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -89,6 +89,13 @@ if(QUDA_DIRAC_WILSON
   target_link_libraries(eigensolve_test ${TEST_LIBS})
   quda_checkbuildtest(eigensolve_test QUDA_BUILD_ALL_TESTS)
   install(TARGETS eigensolve_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+  if(QUDA_DIRAC_LAPLACE)
+    add_executable(laph_test laph_test.cpp)
+    target_link_libraries(laph_test ${TEST_LIBS})
+    quda_checkbuildtest(laph_test QUDA_BUILD_ALL_TESTS)
+    install(TARGETS laph_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
+  endif()
 endif()
 
 if(QUDA_DIRAC_WILSON
@@ -301,13 +308,13 @@ if(QUDA_DIRAC_WILSON
    OR QUDA_DIRAC_DOMAIN_WALL)
   add_test(NAME blas_test_parity_wilson
            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:blas_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8 
+                   --dim 2 4 6 8  --niter 1
                    --nsrc 8 --msrc 9
                    --solve-type direct-pc
                    --gtest_output=xml:blas_test_parity.xml)
   add_test(NAME blas_test_full_wilson
            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:blas_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8
+                   --dim 2 4 6 8 --niter 1
                    --nsrc 8 --msrc 9
                    --solve-type direct
                    --gtest_output=xml:blas_test_full.xml)
@@ -316,14 +323,14 @@ endif()
 if(QUDA_DIRAC_STAGGERED)
   add_test(NAME blas_test_parity_staggered
            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:blas_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8
+                   --dim 2 4 6 8 --niter 1
                    --nsrc 8 --msrc 9
                    --dslash-type staggered
                    --solve-type direct-pc
                    --gtest_output=xml:blas_test_parity.xml)
   add_test(NAME blas_test_full_staggered
            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:blas_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8
+                   --dim 2 4 6 8 --niter 1
                    --nsrc 8 --msrc 9
                    --dslash-type staggered
                    --solve-type direct
@@ -974,179 +981,170 @@ elseif(single_prec)
 endif()
 
 # Wilson-type Inversions
-foreach(prec IN LISTS TEST_PRECS)
-
-  if(${prec} STREQUAL "double")
-    set(tol 1e-12)
-  elseif(${prec} STREQUAL "single")
-    set(tol 1e-6)
-  endif()
+if(QUDA_DIRAC_WILSON)
+  add_test(NAME invert_test_wilson
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type wilson --ngcrkrylov 8
+    --dim 2 4 6 8 --niter 1000
+    --enable-testing true
+    --gtest_output=xml:invert_test_wilson.xml)
+
+  if(DEFINED ENV{QUDA_ENABLE_TUNING})
+    if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
+      add_test(NAME invert_test_splitgrid_wilson
+        COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+        --dslash-type wilson --ngcrkrylov 8
+        --dim 2 4 6 8 --niter 1000
+        --nsrc ${QUDA_TEST_NUM_PROCS}
+        --enable-testing true
+        --gtest_output=xml:invert_test_splitgrid_wilson.xml)
 
-  if(QUDA_DIRAC_WILSON)    
-    add_test(NAME invert_test_wilson_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type wilson --ngcrkrylov 8
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-      --enable-testing true
-      --gtest_output=xml:invert_test_wilson_${prec}.xml)
-
-      if(DEFINED ENV{QUDA_ENABLE_TUNING})
-        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
-          add_test(NAME invert_test_splitgrid_wilson_${prec}
-            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-            --dslash-type wilson --ngcrkrylov 8
-            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
-            --nsrc ${QUDA_TEST_NUM_PROCS}
-            --enable-testing true
-            --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml)
-
-          set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
-        endif()
-      endif()
+      set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+    endif()
   endif()
+endif()
   
-  if(QUDA_DIRAC_TWISTED_MASS)
-    add_test(NAME invert_test_twisted_mass_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even --enable-testing true
-      --gtest_output=xml:invert_test_twisted_mass_sym_${prec}.xml)
-
-    add_test(NAME invert_test_twisted_mass_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym --enable-testing true
-      --gtest_output=xml:invert_test_twisted_mass_asym_${prec}.xml)    
-  endif()
+if(QUDA_DIRAC_TWISTED_MASS)
+  add_test(NAME invert_test_twisted_mass_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:invert_test_twisted_mass_sym.xml)
+
+  add_test(NAME invert_test_twisted_mass_asym}
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:invert_test_twisted_mass_asym.xml)
+endif()
 
-  if(QUDA_DIRAC_NDEG_TWISTED_MASS)
-    add_test(NAME invert_test_ndeg_twisted_mass_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even --flavor nondeg-doublet
-      --enable-testing true
-      --gtest_output=xml:invert_test_ndeg_twisted_mass_sym_${prec}.xml)
+if(QUDA_DIRAC_NDEG_TWISTED_MASS)
+  add_test(NAME invert_test_ndeg_twisted_mass_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even --flavor nondeg-doublet
+    --enable-testing true
+    --gtest_output=xml:invert_test_ndeg_twisted_mass_sym.xml)
 
-    add_test(NAME invert_test_ndeg_twisted_mass_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym --flavor nondeg-doublet
-      --enable-testing true
-      --gtest_output=xml:invert_test_ndeg_twisted_mass_asym_${prec}.xml)
-  endif()
+  add_test(NAME invert_test_ndeg_twisted_mass_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym --flavor nondeg-doublet
+    --enable-testing true
+    --gtest_output=xml:invert_test_ndeg_twisted_mass_asym.xml)
+endif()
 
-  if(QUDA_DIRAC_CLOVER)
-    add_test(NAME invert_test_clover_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even --enable-testing true
-      --gtest_output=xml:invert_test_clover_sym_${prec}.xml)
-
-    add_test(NAME invert_test_clover_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym --enable-testing true
-      --gtest_output=xml:invert_test_clover_asym_${prec}.xml)
-  endif()
+if(QUDA_DIRAC_CLOVER)
+  add_test(NAME invert_test_clover_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type clover --compute-clover true
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:invert_test_clover_sym.xml)
+
+  add_test(NAME invert_test_clover_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type clover --compute-clover true
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:invert_test_clover_asym.xml)
+endif()
 
-  if(QUDA_DIRAC_TWISTED_CLOVER)
-    add_test(NAME invert_test_twisted_clover_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even --enable-testing true
-      --gtest_output=xml:invert_test_twisted_clover_sym_${prec}.xml)
-
-    add_test(NAME invert_test_twisted_clover_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym --enable-testing true
-      --gtest_output=xml:invert_test_twisted_clover_asym_${prec}.xml)
-  endif()
+if(QUDA_DIRAC_TWISTED_CLOVER)
+  add_test(NAME invert_test_twisted_clover_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:invert_test_twisted_clover_sym.xml)
+
+  add_test(NAME invert_test_twisted_clover_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:invert_test_twisted_clover_asym.xml)
+endif()
 
-  if(QUDA_DIRAC_NDEG_TWISTED_CLOVER)
-    add_test(NAME invert_test_ndeg_twisted_clover_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even --flavor nondeg-doublet
-      --enable-testing true
-      --gtest_output=xml:invert_test_ndeg_twisted_clover_sym_${prec}.xml)
+if(QUDA_DIRAC_NDEG_TWISTED_CLOVER)
+  add_test(NAME invert_test_ndeg_twisted_clover_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even --flavor nondeg-doublet
+    --enable-testing true
+    --gtest_output=xml:invert_test_ndeg_twisted_clover_sym.xml)
 
-    add_test(NAME invert_test_ndeg_twisted_clover_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym --flavor nondeg-doublet
-      --enable-testing true
-      --gtest_output=xml:invert_test_ndeg_twisted_clover_asym_${prec}.xml)
-  endif()
+  add_test(NAME invert_test_ndeg_twisted_clover_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym --flavor nondeg-doublet
+    --enable-testing true
+    --gtest_output=xml:invert_test_ndeg_twisted_clover_asym.xml)
+endif()
 
-  if(QUDA_DIRAC_DOMAIN_WALL)
-    add_test(NAME invert_test_domain_wall_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type domain-wall
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even
-      --enable-testing true
-      --gtest_output=xml:invert_test_domain_wall_${prec}.xml)
+if(QUDA_DIRAC_DOMAIN_WALL)
+  add_test(NAME invert_test_domain_wall
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type domain-wall
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even
+    --enable-testing true
+    --gtest_output=xml:invert_test_domain_wall.xml)
 
-    add_test(NAME invert_test_domain_wall_4d_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type domain-wall-4d
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even
-      --enable-testing true
-      --gtest_output=xml:invert_test_domain_wall_4d_sym_${prec}.xml)
+  add_test(NAME invert_test_domain_wall_4d_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type domain-wall-4d
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even
+    --enable-testing true
+    --gtest_output=xml:invert_test_domain_wall_4d_sym.xml)
 
-    add_test(NAME invert_test_domain_wall_4d_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type domain-wall-4d
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym
-      --enable-testing true
-      --gtest_output=xml:invert_test_domain_wall_4d_asym_${prec}.xml)
+  add_test(NAME invert_test_domain_wall_4d_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type domain-wall-4d
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym
+    --enable-testing true
+    --gtest_output=xml:invert_test_domain_wall_4d_asym.xml)
 
-    add_test(NAME invert_test_mobius_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even
-      --enable-testing true
-      --gtest_output=xml:invert_test_mobius_sym_${prec}.xml)
+  add_test(NAME invert_test_mobius_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even
+    --enable-testing true
+    --gtest_output=xml:invert_test_mobius_sym.xml)
 
-    add_test(NAME invert_test_mobius_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym
-      --enable-testing true
-      --gtest_output=xml:invert_test_mobius_asym_${prec}.xml)
+  add_test(NAME invert_test_mobius_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym
+    --enable-testing true
+    --gtest_output=xml:invert_test_mobius_asym.xml)
 
-    add_test(NAME invert_test_mobius_eofa_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius-eofa
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even
-      --enable-testing true
-      --gtest_output=xml:invert_test_mobius_eofa_sym_${prec}.xml)
+  add_test(NAME invert_test_mobius_eofa_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius-eofa
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even
+    --enable-testing true
+    --gtest_output=xml:invert_test_mobius_eofa_sym.xml)
 
-    add_test(NAME invert_test_mobius_eofa_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius-eofa
-      --dim 2 4 6 8 --Lsdim 4 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --ngcrkrylov 8
-      --matpc even-even-asym
-      --enable-testing true
-      --gtest_output=xml:invert_test_mobius_eofa_asym_${prec}.xml)
-  endif()
-endforeach(prec)
+  add_test(NAME invert_test_mobius_eofa_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius-eofa
+    --dim 2 4 6 8 --Lsdim 4 --niter 1000 --ngcrkrylov 8
+    --matpc even-even-asym
+    --enable-testing true
+    --gtest_output=xml:invert_test_mobius_eofa_asym.xml)
+endif()
 
 # Staggered-type Inversions
 foreach(prec IN LISTS TEST_PRECS)
@@ -1215,171 +1213,162 @@ foreach(prec IN LISTS TEST_PRECS)
 endforeach(prec)
 
 # Wilson-type eigensolves
-foreach(prec IN LISTS TEST_PRECS)
-
-  if(${prec} STREQUAL "double")
-    set(tol 1e-12)
-  elseif(${prec} STREQUAL "single")
-    set(tol 1e-6)
-  endif()
-
-  if(QUDA_DIRAC_WILSON)    
-    add_test(NAME eigensolve_test_wilson_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type wilson --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
-      --enable-testing true 
-      --gtest_output=xml:eigensolve_test_wilson_${prec}.xml)
-  endif()
+if(QUDA_DIRAC_WILSON)    
+  add_test(NAME eigensolve_test_wilson
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type wilson --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000
+    --enable-testing true 
+    --gtest_output=xml:eigensolve_test_wilson.xml)
+endif()
   
-  if(QUDA_DIRAC_TWISTED_MASS)
-    add_test(NAME eigensolve_test_twisted_mass_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --enable-testing true
-      --gtest_output=xml:eigensolve_test_twisted_mass_sym_${prec}.xml)
+if(QUDA_DIRAC_TWISTED_MASS)
+  add_test(NAME eigensolve_test_twisted_mass_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:eigensolve_test_twisted_mass_sym.xml)
     
-    add_test(NAME eigensolve_test_twisted_mass_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000      
-      --matpc even-even-asym --enable-testing true 
-      --gtest_output=xml:eigensolve_test_twisted_mass_asym_${prec}.xml)
-  endif()
+  add_test(NAME eigensolve_test_twisted_mass_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000      
+    --matpc even-even-asym --enable-testing true 
+    --gtest_output=xml:eigensolve_test_twisted_mass_asym.xml)
+endif()
   
-  if(QUDA_DIRAC_NDEG_TWISTED_MASS)
-    add_test(NAME eigensolve_test_ndeg_twisted_mass_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --flavor nondeg-doublet 
-      --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol}
-      --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_sym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_ndeg_twisted_mass_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --dim 2 4 6 8 --prec ${prec} --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --flavor nondeg-doublet
-      --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol}
-      --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_asym_${prec}.xml)
-  endif()
+if(QUDA_DIRAC_NDEG_TWISTED_MASS)
+  add_test(NAME eigensolve_test_ndeg_twisted_mass_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --flavor nondeg-doublet 
+    --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol}
+    --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_sym.xml)
+
+  add_test(NAME eigensolve_test_ndeg_twisted_mass_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-mass --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --dim 2 4 6 8 --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --flavor nondeg-doublet
+    --enable-testing true --eig-use-eigen-qr false --eig-qr-tol ${tol}
+    --gtest_output=xml:eigensolve_test_ndeg_twisted_mass_asym.xml)
+endif()
   
-  if(QUDA_DIRAC_CLOVER)
-    add_test(NAME eigensolve_test_clover_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type clover --compute-clover true      
-      --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000 
-      --matpc even-even --enable-testing true 
-      --gtest_output=xml:eigensolve_test_clover_sym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_clover_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type clover --compute-clover true      
-      --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --enable-testing true
-      --gtest_output=xml:eigensolve_test_clover_asym_${prec}.xml)
-  endif()
+if(QUDA_DIRAC_CLOVER)
+  add_test(NAME eigensolve_test_clover_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type clover --compute-clover true      
+    --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000 
+    --matpc even-even --enable-testing true 
+    --gtest_output=xml:eigensolve_test_clover_sym.xml)
+
+  add_test(NAME eigensolve_test_clover_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type clover --compute-clover true      
+    --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:eigensolve_test_clover_asym.xml)
+endif()
 
-  if(QUDA_DIRAC_TWISTED_CLOVER)
-    add_test(NAME eigensolve_test_twisted_clover_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --enable-testing true 
-      --gtest_output=xml:eigensolve_test_twisted_clover_sym_${prec}.xml)
+if(QUDA_DIRAC_TWISTED_CLOVER)
+  add_test(NAME eigensolve_test_twisted_clover_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --enable-testing true 
+    --gtest_output=xml:eigensolve_test_twisted_clover_sym.xml)
     
-    add_test(NAME eigensolve_test_twisted_clover_asym_${prec}      
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --enable-testing true 
-      --gtest_output=xml:eigensolve_test_twisted_clover_asym_${prec}.xml)
-  endif()
+  add_test(NAME eigensolve_test_twisted_clover_asym      
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --enable-testing true 
+    --gtest_output=xml:eigensolve_test_twisted_clover_asym.xml)
+endif()
   
-  if(QUDA_DIRAC_NDEG_TWISTED_CLOVER)
-    add_test(NAME eigensolve_test_ndeg_twisted_clover_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --flavor nondeg-doublet
-      --enable-testing true 
-      --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_sym_${prec}.xml)
+if(QUDA_DIRAC_NDEG_TWISTED_CLOVER)
+  add_test(NAME eigensolve_test_ndeg_twisted_clover_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --flavor nondeg-doublet
+    --enable-testing true 
+    --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_sym.xml)
     
-    add_test(NAME eigensolve_test_ndeg_twisted_clover_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type twisted-clover --compute-clover true
-      --dim 2 4 6 8 --prec ${prec} --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --flavor nondeg-doublet
-      --enable-testing true 
-      --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_asym_${prec}.xml)
-  endif()
-  
-  if(QUDA_DIRAC_DOMAIN_WALL)
-    add_test(NAME eigensolve_test_domain_wall_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type domain-wall --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --enable-testing true 
-      --gtest_output=xml:eigensolve_test_domain_wall_${prec}.xml)
-
-    add_test(NAME eigensolve_test_domain_wall_4d_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type domain-wall-4d --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --enable-testing true 
-      --gtest_output=xml:eigensolve_test_domain_wall_4d_sym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_domain_wall_4d_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type domain-wall-4d --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --enable-testing true 
-      --gtest_output=xml:eigensolve_test_domain_wall_4d_asym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_mobius_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --enable-testing true 
-      --gtest_output=xml:eigensolve_test_mobius_sym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_mobius_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --enable-testing true 
-      --gtest_output=xml:eigensolve_test_mobius_asym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_mobius_eofa_sym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius-eofa --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even --enable-testing true 
-      --gtest_output=xml:eigensolve_test_mobius_eofa_sym_${prec}.xml)
-
-    add_test(NAME eigensolve_test_mobius_eofa_asym_${prec}
-      COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
-      --dslash-type mobius-eofa --prec ${prec}
-      --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
-      --eig-tol ${tol} --eig-max-restarts 1000
-      --matpc even-even-asym --enable-testing true 
-      --gtest_output=xml:eigensolve_test_mobius_eofa_asym_${prec}.xml)
-  endif()
-endforeach(prec)
+  add_test(NAME eigensolve_test_ndeg_twisted_clover_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type twisted-clover --compute-clover true
+    --dim 2 4 6 8 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --flavor nondeg-doublet
+    --enable-testing true 
+    --gtest_output=xml:eigensolve_test_ndeg_twisted_clover_asym.xml)
+endif()
+
+if(QUDA_DIRAC_DOMAIN_WALL)
+  add_test(NAME eigensolve_test_domain_wall
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type domain-wall
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:eigensolve_test_domain_wall.xml)
+
+  add_test(NAME eigensolve_test_domain_wall_4d_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type domain-wall-4d
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:eigensolve_test_domain_wall_4d_sym.xml)
+
+  add_test(NAME eigensolve_test_domain_wall_4d_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type domain-wall-4d
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:eigensolve_test_domain_wall_4d_asym.xml)
+
+  add_test(NAME eigensolve_test_mobius_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:eigensolve_test_mobius_sym.xml)
+
+  add_test(NAME eigensolve_test_mobius_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:eigensolve_test_mobius_asym.xml)
+
+  add_test(NAME eigensolve_test_mobius_eofa_sym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius-eofa
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even --enable-testing true
+    --gtest_output=xml:eigensolve_test_mobius_eofa_sym.xml)
+
+  add_test(NAME eigensolve_test_mobius_eofa_asym
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:eigensolve_test> ${MPIEXEC_POSTFLAGS}
+    --dslash-type mobius-eofa
+    --dim 2 4 6 8 --Lsdim 4 --eig-n-conv 24 --eig-n-ev 24 --eig-n-kr 128
+    --eig-tol ${tol} --eig-max-restarts 1000
+    --matpc even-even-asym --enable-testing true
+    --gtest_output=xml:eigensolve_test_mobius_eofa_asym.xml)
+endif()
 
 # Staggered-type eigensolves
 foreach(prec IN LISTS TEST_PRECS)
@@ -1425,6 +1414,14 @@ foreach(prec IN LISTS TEST_PRECS)
   endif()
 endforeach(prec)
 
+if (QUDA_DIRAC_LAPLACE AND QUDA_DIRAC_WILSON)
+  add_test(NAME laph_test
+           COMMAND  ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:laph_test> ${MPIEXEC_POSTFLAGS}
+           --dim 2 4 6 8
+           --enable-testing true
+           --gtest_output=xml:laph_test.xml)
+endif()
+
 
 if(QUDA_DIRAC_STAGGERED)
   add_test(NAME hisq_stencil
@@ -1433,35 +1430,23 @@ if(QUDA_DIRAC_STAGGERED)
                    --gtest_output=xml:hisq_stencil_test.xml)
 endif()
 
-foreach(prec IN LISTS TEST_PRECS)
-
-  add_test(NAME gauge_path_${prec}
-           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:gauge_path_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8 --prec ${prec}
-                   --gtest_output=xml:gauge_path_test_${prec}.xml)
+if(QUDA_DIRAC_CLOVER OR QUDA_DIRAC_TWISTED_CLOVER OR QUDA_DIRAC_NDEG_TWISTED_CLOVER)
+  add_test(NAME clover_force_test
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:clover_force_test> ${MPIEXEC_POSTFLAGS}
+    --dim 2 4 6 8
+    --compute-clover true
+    --solution-type mat-pc-dag-mat-pc
+    --matpc odd-odd-asym --dagger
+    --enable-testing true --niter 1
+    --gtest_output=xml:clover_force_test.xml)
+endif()
 
+add_test(NAME gauge_path
+  COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:gauge_path_test> ${MPIEXEC_POSTFLAGS}
+  --dim 2 4 6 8 --enable-testing true --niter 1
+  --gtest_output=xml:gauge_path_test.xml)
 
-  if(QUDA_DIRAC_CLOVER)
-    add_test(NAME clover_force_test_${prec}
-           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:clover_force_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8 --prec ${prec}
-                   --dslash-type clover --compute-clover true
-                   --solution-type mat-pc-dag-mat-pc
-                   --matpc odd-odd-asym --dagger
-                   --enable-testing true
-                   --gtest_output=xml:clover_force_test_${prec}.xml)
-   endif()
-
-  if(QUDA_DIRAC_TWISTED_CLOVER)
-    add_test(NAME tmc_clover_force_test_${prec}
-           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:clover_force_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 2 4 6 8 --prec ${prec}
-                   --dslash-type twisted-clover --compute-clover true
-                   --solution-type mat-pc-dag-mat-pc
-                   --matpc odd-odd-asym --dagger
-                   --enable-testing true
-                   --gtest_output=xml:tmc_clover_force_test_${prec}.xml)
-   endif()
+foreach(prec IN LISTS TEST_PRECS)
 
   if(QUDA_DIRAC_STAGGERED)
     add_test(NAME unitarize_link_${prec}
@@ -1480,18 +1465,6 @@ foreach(prec IN LISTS TEST_PRECS)
                      --gtest_output=xml:hisq_unitarize_force_test_${prec}.xml)
   endif()
 
-  add_test(NAME gauge_alg_${prec}
-           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:gauge_alg_test> ${MPIEXEC_POSTFLAGS}
-                   --dim 4 6 8 10 --prec ${prec}
-                   --gtest_output=xml:gauge_alg_test_${prec}.xml)
-
-  if (TARGET dilution_test)
-    add_test(NAME dilution_test_${prec}
-             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dilution_test> ${MPIEXEC_POSTFLAGS}
-                     --dim 4 6 8 10 --dilution-block-size 4 6 4 5 --prec ${prec}
-                     --gtest_output=xml:dilution_test_${prec}.xml)
-  endif()
-
   if(QUDA_SMEAR_GAUSS_TWOLINK)
     set(KERNEL_TYPE TwoLink GaussianSmear)
     foreach(kerneltp IN LISTS KERNEL_TYPE)
@@ -1514,6 +1487,18 @@ foreach(prec IN LISTS TEST_PRECS)
 
 endforeach(prec)
 
+add_test(NAME gauge_alg
+  COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:gauge_alg_test> ${MPIEXEC_POSTFLAGS}
+  --dim 4 6 8 10
+  --gtest_output=xml:gauge_alg_test.xml)
+
+if (TARGET dilution_test)
+  add_test(NAME dilution_test
+    COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dilution_test> ${MPIEXEC_POSTFLAGS}
+    --dim 4 6 8 10 --dilution-block-size 4 6 4 5
+    --gtest_output=xml:dilution_test.xml)
+endif()
+
 if(QUDA_QIO)
   add_test(NAME io_test
            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:io_test> ${MPIEXEC_POSTFLAGS}
diff --git a/tests/blas_interface_test_gtest.hpp b/tests/blas_interface_test_gtest.hpp
index cc6421306a..3ba10e1101 100644
--- a/tests/blas_interface_test_gtest.hpp
+++ b/tests/blas_interface_test_gtest.hpp
@@ -71,7 +71,7 @@ TEST_P(BLASTest, verify)
   switch (test_type) {
   case QUDA_BLAS_GEMM: {
     auto deviation_gemm = gemm_test(param);
-    decltype(deviation_gemm) tol_gemm;
+    decltype(deviation_gemm) tol_gemm = 0.0; // initialize to suppress warning
     switch (data_type) {
     case QUDA_BLAS_DATATYPE_S:
     case QUDA_BLAS_DATATYPE_C: tol_gemm = 10 * std::numeric_limits<float>::epsilon(); break;
@@ -84,7 +84,7 @@ TEST_P(BLASTest, verify)
   }
   case QUDA_BLAS_LU_INV: {
     auto deviation_lu_inv = lu_inv_test(param);
-    decltype(deviation_lu_inv) tol_lu_inv;
+    decltype(deviation_lu_inv) tol_lu_inv = 0.0; // initialize to suppress warning
     // We allow a factor of 5000 (500x more than the gemm tolerance factor)
     // due to variations in algorithmic implementation, order of arithmetic
     // operations, and possible near singular eigenvalues or degeneracies.
diff --git a/tests/clover_force_test.cpp b/tests/clover_force_test.cpp
index d1cbd089ce..149e064003 100644
--- a/tests/clover_force_test.cpp
+++ b/tests/clover_force_test.cpp
@@ -5,14 +5,15 @@
 #include "clover_force_reference.h"
 #include "misc.h"
 #include <color_spinor_field.h> // convenient quark field container
+#include <clover_field.h>
 #include <command_line_params.h>
 #include <gauge_field.h>
 #include <host_utils.h>
 #include <quda.h>
+#include <instantiate.h>
 #include <gtest/gtest.h>
 
 static int force_check;
-static int path_check;
 static double force_deviation;
 QudaGaugeParam gauge_param;
 QudaInvertParam inv_param;
@@ -22,6 +23,10 @@ quda::GaugeField mom_ref;
 std::vector<char> clover;
 std::vector<char> clover_inv;
 
+QudaPrecision last_prec = QUDA_INVALID_PRECISION;
+QudaDslashType last_dslash = QUDA_INVALID_DSLASH;
+QudaTwistFlavorType last_twist_flavor = QUDA_TWIST_INVALID;
+
 void init(int argc, char **argv)
 {
   // Set QUDA's internal parameters
@@ -38,12 +43,6 @@ void init(int argc, char **argv)
   param.order = QUDA_QDP_GAUGE_ORDER;
   gauge = quda::GaugeField(param);
 
-  printfQuda("Randomizing gauge fields... ");
-  constructHostGaugeField(gauge, gauge_param, argc, argv);
-
-  printfQuda("Sending gauge field to GPU\n");
-  loadGaugeQuda(gauge.raw_pointer(), &gauge_param);
-
   param.order = QUDA_MILC_GAUGE_ORDER;
   param.link_type = QUDA_ASQTAD_MOM_LINKS;
   param.reconstruct = QUDA_RECONSTRUCT_10;
@@ -51,16 +50,25 @@ void init(int argc, char **argv)
   mom = quda::GaugeField(param);
   mom_ref = quda::GaugeField(param);
 
-  // Allocate host side memory for clover terms if needed.
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-    clover.resize(V * clover_site_size * host_clover_data_type_size);
-    clover_inv.resize(V * clover_site_size * host_spinor_data_type_size);
-    compute_clover = true;
-    constructHostCloverField(clover.data(), clover_inv.data(), inv_param);
-    // Load the clover terms to the device
-    loadCloverQuda(clover.data(), clover_inv.data(), &inv_param);
-  } else {
-    errorQuda("dslash type ( dslash_type = %d ) must have the clover", dslash_type);
+  printfQuda("Randomizing gauge fields... ");
+  constructHostGaugeField(gauge, gauge_param, argc, argv);
+
+  clover.resize(V * clover_site_size * host_clover_data_type_size);
+  clover_inv.resize(V * clover_site_size * host_spinor_data_type_size);
+
+  if (!enable_testing) {
+    printfQuda("Sending gauge field to GPU\n");
+    loadGaugeQuda(gauge.raw_pointer(), &gauge_param);
+
+    // Allocate host side memory for clover terms if needed.
+    if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+      compute_clover = true;
+      constructHostCloverField(clover.data(), clover_inv.data(), inv_param);
+      // Load the clover terms to the device
+      loadCloverQuda(clover.data(), clover_inv.data(), &inv_param);
+    } else {
+      errorQuda("dslash type ( dslash_type = %d ) must have the clover", dslash_type);
+    }
   }
 }
 
@@ -71,12 +79,23 @@ void destroy()
   mom_ref = {};
 }
 
-using test_t = ::testing::tuple<bool, int>;
+using test_t = ::testing::tuple<QudaPrecision, QudaDslashType, bool, int, QudaTwistFlavorType>;
 
 std::tuple<int, double> clover_force_test(test_t param)
 {
-  bool detratio = ::testing::get<0>(param);
-  int nvector = ::testing::get<1>(param);
+  int nvector = ::testing::get<3>(param);
+  gauge_param.cuda_prec = ::testing::get<0>(param);
+  inv_param.cuda_prec = ::testing::get<0>(param);
+  inv_param.dslash_type = ::testing::get<1>(param);
+  inv_param.twist_flavor = ::testing::get<4>(param);
+  if (inv_param.dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    inv_param.epsilon = epsilon;
+    inv_param.evmax = evmax;
+    if (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+      for (int i = 0; i < nvector; i++) inv_param.offset[i] = 0.06 + i * i * 0.1;
+    }
+  }
+  bool detratio = ::testing::get<2>(param);
 
   std::vector<quda::ColorSpinorField> out_nvector(nvector);
   std::vector<void *> in(nvector);
@@ -123,16 +142,15 @@ std::tuple<int, double> clover_force_test(test_t param)
     gflops += inv_param.gflops;
   }
 
-  int *check_out = true ? &force_check : &path_check;
+  int *check_out = &force_check;
   std::array<void *, 4> u = {gauge.data(0), gauge.data(1), gauge.data(2), gauge.data(3)};
   if (verify_results) {
     gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
     mom_ref.zero();
     TMCloverForce_reference(mom_ref.data(), in.data(), in0.data(), coeff.data(), nvector, u, clover, clover_inv,
                             &gauge_param, &inv_param, detratio);
-    *check_out
-      = compare_floats(mom.data(), mom_ref.data(), 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
-    // if (compute_force)
+    *check_out = compare_floats(mom.data(), mom_ref.data(), 4 * V * mom_site_size, getTolerance(gauge_param.cuda_prec),
+                                gauge_param.cpu_prec);
     strong_check_mom(mom.data(), mom_ref.data(), 4 * V, gauge_param.cpu_prec);
   }
 
@@ -155,15 +173,58 @@ class CloverForceTest : public ::testing::TestWithParam<test_t>
 protected:
   test_t param;
 
+  bool skip()
+  {
+    if (!quda::is_enabled(::testing::get<0>(param))) return true; // skip if precision is not enabled
+    // check requested dslash is enabled
+    if ((::testing::get<1>(param) == QUDA_CLOVER_WILSON_DSLASH && !quda::is_enabled_clover())
+        || (::testing::get<1>(param) == QUDA_TWISTED_CLOVER_DSLASH && !quda::is_enabled_twisted_clover()))
+      return true;
+
+    return false;
+  }
+
 public:
   CloverForceTest() : param(GetParam()) { }
+
+  virtual void SetUp()
+  {
+    if (skip()) GTEST_SKIP();
+    // check if precision has changed and update if it has
+    if (::testing::get<0>(param) != last_prec) {
+      if (last_prec != QUDA_INVALID_PRECISION) freeGaugeQuda();
+      gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
+      gauge_param.cuda_prec = ::testing::get<0>(param);
+      loadGaugeQuda(gauge.raw_pointer(), &gauge_param);
+    }
+
+    if (::testing::get<0>(param) != last_prec || ::testing::get<1>(param) != last_dslash
+        || ::testing::get<4>(param) != last_twist_flavor) {
+      if (last_prec != QUDA_INVALID_PRECISION) freeCloverQuda();
+      // Load the clover terms to the device
+      inv_param.clover_cuda_prec = ::testing::get<0>(param);
+      inv_param.dslash_type = ::testing::get<1>(param);
+      if (inv_param.dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+        inv_param.twist_flavor = ::testing::get<4>(param);
+        inv_param.epsilon = epsilon;
+        inv_param.evmax = evmax;
+      }
+      constructHostCloverField(clover.data(), clover_inv.data(), inv_param);
+      loadCloverQuda(clover.data(), clover_inv.data(), &inv_param);
+    }
+    last_twist_flavor = ::testing::get<4>(param);
+    last_prec = ::testing::get<0>(param);
+    last_dslash = ::testing::get<1>(param);
+  }
 };
 
 TEST_P(CloverForceTest, verify)
 {
+  if (skip()) GTEST_SKIP();
+
   auto deviation = clover_force_test(GetParam());
   ASSERT_EQ(std::get<0>(deviation), 1) << "CPU and QUDA force implementations do not agree";
-  ASSERT_LE(std::get<1>(deviation), getTolerance(cuda_prec))
+  ASSERT_LE(std::get<1>(deviation), getTolerance(::testing::get<0>(param)))
     << "CPU and QUDA momentum action implementations do not agree";
 }
 
@@ -185,8 +246,11 @@ static void display_test_info()
 std::string gettestname(::testing::TestParamInfo<test_t> param)
 {
   std::string name;
-  if (::testing::get<0>(param.param)) name += std::string("ratio_");
-  name += std::string("nvector_") + std::to_string(::testing::get<1>(param.param));
+  name += std::string(get_prec_str(::testing::get<0>(param.param))) + "_";
+  name += std::string(get_dslash_str(::testing::get<1>(param.param))) + "_";
+  name += std::string(get_TwistFlavor_str(::testing::get<4>(param.param)));
+  if (::testing::get<2>(param.param)) name += std::string("ratio_");
+  name += std::string("nvector_") + std::to_string(::testing::get<3>(param.param));
   return name;
 }
 
@@ -197,6 +261,9 @@ int main(int argc, char **argv)
   // return code for google test
   int test_rc = 0;
 
+  // set default dslash to clover
+  dslash_type = QUDA_CLOVER_WILSON_DSLASH;
+
   // command line options
   auto app = make_app();
   add_clover_force_option_group(app);
@@ -220,7 +287,7 @@ int main(int argc, char **argv)
 
     test_rc = RUN_ALL_TESTS();
   } else {
-    clover_force_test({detratio, Nsrc});
+    clover_force_test({prec, dslash_type, detratio, Nsrc, twist_flavor});
   }
 
   destroy();
@@ -229,5 +296,28 @@ int main(int argc, char **argv)
   return test_rc;
 }
 
+using ::testing::Combine;
+using ::testing::Values;
+
+#ifdef GPU_CLOVER_DIRAC
 INSTANTIATE_TEST_SUITE_P(CloverForceTest, CloverForceTest,
-                         ::testing::Combine(::testing::Values(false, true), ::testing::Values(1, 8)), gettestname);
+                         Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(QUDA_CLOVER_WILSON_DSLASH),
+                                 Values(false, true), Values(1, 8), Values(QUDA_TWIST_NO)),
+                         gettestname);
+#endif
+
+#ifdef GPU_TWISTED_CLOVER_DIRAC
+INSTANTIATE_TEST_SUITE_P(CloverForceTest_TwistedClover, CloverForceTest,
+                         Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(QUDA_TWISTED_CLOVER_DSLASH),
+                                 Values(false, true), Values(1, 8), Values(QUDA_TWIST_SINGLET)),
+                         gettestname);
+
+#endif
+
+#ifdef GPU_NDEG_TWISTED_CLOVER_DIRAC
+// twisted clover non degenerate doublet
+INSTANTIATE_TEST_SUITE_P(CloverForceTest_ndeg_TwistedClover, CloverForceTest,
+                         Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(QUDA_TWISTED_CLOVER_DSLASH),
+                                 Values(false, true), Values(1, 8), Values(QUDA_TWIST_NONDEG_DOUBLET)),
+                         gettestname);
+#endif
diff --git a/tests/dilution_test.cpp b/tests/dilution_test.cpp
index 71bfa57aa7..429692f446 100644
--- a/tests/dilution_test.cpp
+++ b/tests/dilution_test.cpp
@@ -9,20 +9,22 @@
 #include <misc.h>
 #include <dslash_reference.h>
 
-using test_t = ::testing::tuple<QudaSiteSubset, QudaDilutionType, int>;
+using test_t = ::testing::tuple<QudaPrecision, QudaSiteSubset, QudaDilutionType, int>;
 
 class DilutionTest : public ::testing::TestWithParam<test_t>
 {
 protected:
+  QudaPrecision precision;
   QudaSiteSubset site_subset;
   QudaDilutionType dilution_type;
   int nSpin;
 
 public:
   DilutionTest() :
-    site_subset(::testing::get<0>(GetParam())),
-    dilution_type(testing::get<1>(GetParam())),
-    nSpin(testing::get<2>(GetParam()))
+    precision(::testing::get<0>(GetParam())),
+    site_subset(::testing::get<1>(GetParam())),
+    dilution_type(testing::get<2>(GetParam())),
+    nSpin(testing::get<3>(GetParam()))
   {
   }
 };
@@ -31,7 +33,7 @@ TEST_P(DilutionTest, verify)
 {
   using namespace quda;
 
-  if (!is_enabled_spin(nSpin)) GTEST_SKIP();
+  if (!is_enabled_spin(nSpin) || !is_enabled(precision)) GTEST_SKIP();
 
   // Set some parameters
   QudaGaugeParam gauge_param = newQudaGaugeParam();
@@ -44,7 +46,7 @@ TEST_P(DilutionTest, verify)
   param.siteSubset = site_subset;
   if (site_subset == QUDA_PARITY_SITE_SUBSET) param.x[0] /= 2;
   param.nSpin = nSpin;
-  param.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true); // change order to native order
+  param.setPrecision(precision, precision, true); // change order to native order
   param.location = QUDA_CUDA_FIELD_LOCATION;
   param.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField src(param);
@@ -97,57 +99,58 @@ TEST_P(DilutionTest, verify)
 }
 
 using ::testing::Combine;
+using ::testing::get;
 using ::testing::Values;
 
+auto test_str = [](testing::TestParamInfo<test_t> param) {
+  return std::string(get_prec_str(get<0>(param.param))) + "_" + get_dilution_type_str(get<2>(param.param));
+};
+
+auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION);
+
 INSTANTIATE_TEST_SUITE_P(WilsonFull, DilutionTest,
-                         Combine(Values(QUDA_FULL_SITE_SUBSET),
+                         Combine(precisions, Values(QUDA_FULL_SITE_SUBSET),
                                  Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR,
                                         QUDA_DILUTION_SPIN_COLOR_EVEN_ODD, QUDA_DILUTION_BLOCK),
                                  Values(4)),
-                         [](testing::TestParamInfo<test_t> param) {
-                           return get_dilution_type_str(::testing::get<1>(param.param));
-                         });
+                         test_str);
 
 INSTANTIATE_TEST_SUITE_P(
   WilsonParity, DilutionTest,
-  Combine(Values(QUDA_PARITY_SITE_SUBSET),
+  Combine(precisions, Values(QUDA_PARITY_SITE_SUBSET),
           Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_BLOCK), Values(4)),
-  [](testing::TestParamInfo<test_t> param) { return get_dilution_type_str(::testing::get<1>(param.param)); });
+  test_str);
 
-INSTANTIATE_TEST_SUITE_P(
-  CoarseFull, DilutionTest,
-  Combine(Values(QUDA_FULL_SITE_SUBSET),
-          Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD),
-          Values(2)),
-  [](testing::TestParamInfo<test_t> param) { return get_dilution_type_str(::testing::get<1>(param.param)); });
+INSTANTIATE_TEST_SUITE_P(CoarseFull, DilutionTest,
+                         Combine(precisions, Values(QUDA_FULL_SITE_SUBSET),
+                                 Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR,
+                                        QUDA_DILUTION_SPIN_COLOR_EVEN_ODD),
+                                 Values(2)),
+                         test_str);
 
 INSTANTIATE_TEST_SUITE_P(CoarseParity, DilutionTest,
-                         Combine(Values(QUDA_PARITY_SITE_SUBSET),
+                         Combine(precisions, Values(QUDA_PARITY_SITE_SUBSET),
                                  Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(2)),
-                         [](testing::TestParamInfo<test_t> param) {
-                           return get_dilution_type_str(::testing::get<1>(param.param));
-                         });
+                         test_str);
 
-INSTANTIATE_TEST_SUITE_P(
-  StaggeredFull, DilutionTest,
-  Combine(Values(QUDA_FULL_SITE_SUBSET),
-          Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD),
-          Values(1)),
-  [](testing::TestParamInfo<test_t> param) { return get_dilution_type_str(::testing::get<1>(param.param)); });
+INSTANTIATE_TEST_SUITE_P(StaggeredFull, DilutionTest,
+                         Combine(precisions, Values(QUDA_FULL_SITE_SUBSET),
+                                 Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR,
+                                        QUDA_DILUTION_SPIN_COLOR_EVEN_ODD),
+                                 Values(1)),
+                         test_str);
 
 INSTANTIATE_TEST_SUITE_P(StaggeredParity, DilutionTest,
-                         Combine(Values(QUDA_PARITY_SITE_SUBSET),
+                         Combine(precisions, Values(QUDA_PARITY_SITE_SUBSET),
                                  Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(1)),
-                         [](testing::TestParamInfo<test_t> param) {
-                           return get_dilution_type_str(::testing::get<1>(param.param));
-                         });
+                         test_str);
 
 struct dilution_test : quda_test {
   void display_info() const override
   {
     quda_test::display_info();
-    printfQuda("prec    S_dimension T_dimension Ls_dimension\n");
-    printfQuda("%6s   %3d/%3d/%3d     %3d         %2d\n", get_prec_str(prec), xdim, ydim, zdim, tdim, Lsdim);
+    printfQuda("S_dimension T_dimension Ls_dimension\n");
+    printfQuda("%3d/%3d/%3d     %3d         %2d\n", xdim, ydim, zdim, tdim, Lsdim);
   }
 
   dilution_test(int argc, char **argv) : quda_test("Dilution Test", argc, argv) { }
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index d26f8ca662..f5d2d6e4e8 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -91,13 +91,17 @@ struct DslashTestWrapper {
 
   void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon)
   {
-    cuda_prec = getPrecision(precision);
-
-    gauge_param = newQudaGaugeParam();
-    inv_param = newQudaInvertParam();
-    setWilsonGaugeParam(gauge_param);
-    setInvertParam(inv_param);
+    static bool first_time = true;
+    if (first_time) {
+      gauge_param = newQudaGaugeParam();
+      setWilsonGaugeParam(gauge_param);
+      inv_param = newQudaInvertParam();
+      setInvertParam(inv_param);
+      init_host(argc, argv);
+      first_time = false;
+    }
 
+    cuda_prec = getPrecision(precision);
     gauge_param.cuda_prec = cuda_prec;
     gauge_param.cuda_prec_sloppy = cuda_prec;
     gauge_param.cuda_prec_precondition = cuda_prec;
@@ -109,24 +113,22 @@ struct DslashTestWrapper {
     gauge_param.reconstruct_sloppy = link_recon;
 
     inv_param.cuda_prec = cuda_prec;
+    inv_param.clover_cuda_prec = cuda_prec;
+    inv_param.clover_cuda_prec_sloppy = cuda_prec;
+    inv_param.clover_cuda_prec_precondition = cuda_prec;
+    inv_param.clover_cuda_prec_refinement_sloppy = cuda_prec;
 
-    static bool first_time = true;
-    if (first_time) {
-      init_host(argc, argv);
-      first_time = false;
-    }
     init();
   }
 
   void init_test(int argc, char **argv)
   {
-    gauge_param = newQudaGaugeParam();
-    inv_param = newQudaInvertParam();
-    setWilsonGaugeParam(gauge_param);
-    setInvertParam(inv_param);
-
     static bool first_time = true;
     if (first_time) {
+      gauge_param = newQudaGaugeParam();
+      setWilsonGaugeParam(gauge_param);
+      inv_param = newQudaInvertParam();
+      setInvertParam(inv_param);
       init_host(argc, argv);
       first_time = false;
     }
diff --git a/tests/eigensolve_test.cpp b/tests/eigensolve_test.cpp
index 7c17540a60..6f5114a182 100644
--- a/tests/eigensolve_test.cpp
+++ b/tests/eigensolve_test.cpp
@@ -21,6 +21,12 @@ QudaGaugeParam gauge_param;
 QudaInvertParam eig_inv_param;
 QudaEigParam eig_param;
 
+std::vector<char> gauge_;
+std::array<void *, 4> gauge;
+std::vector<char> clover;
+std::vector<char> clover_inv;
+QudaPrecision last_prec = QUDA_INVALID_PRECISION;
+
 // if "--enable-testing true" is passed, we run the tests defined in here
 #include <eigensolve_test_gtest.hpp>
 
@@ -28,10 +34,9 @@ void display_test_info(QudaEigParam &param)
 {
   printfQuda("running the following test:\n");
 
-  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon S_dimension T_dimension Ls_dimension\n");
-  printfQuda("%s   %s             %s            %s            %d/%d/%d          %d         %d\n", get_prec_str(prec),
-             get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim,
-             tdim, Lsdim);
+  printfQuda("prec    link_recon  sloppy_link_recon S_dimension T_dimension Ls_dimension\n");
+  printfQuda("%s      %s            %s            %d/%d/%d          %d         %d\n", get_prec_str(param.cuda_prec_ritz),
+             get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim, tdim, Lsdim);
 
   printfQuda("\n   Eigensolver parameters\n");
   printfQuda(" - solver mode %s\n", get_eig_type_str(param.eig_type));
@@ -65,11 +70,6 @@ void display_test_info(QudaEigParam &param)
              dimPartitioned(3));
 }
 
-std::vector<char> gauge_;
-std::array<void *, 4> gauge;
-std::vector<char> clover;
-std::vector<char> clover_inv;
-
 void init(int argc, char **argv)
 {
   // Construct QUDA param structures to define the problem
@@ -116,16 +116,19 @@ void init(int argc, char **argv)
     // Load the clover terms to the device
     loadCloverQuda(clover.data(), clover_inv.data(), &eig_inv_param);
   }
+  last_prec = gauge_param.cuda_prec;
 }
 
 std::vector<double> eigensolve(test_t test_param)
 {
   // Collect testing parameters from gtest
-  eig_param.eig_type = ::testing::get<0>(test_param);
-  eig_param.use_norm_op = ::testing::get<1>(test_param);
-  eig_param.use_pc = ::testing::get<2>(test_param);
-  eig_param.compute_svd = ::testing::get<3>(test_param);
-  eig_param.spectrum = ::testing::get<4>(test_param);
+  eig_inv_param.cuda_prec = ::testing::get<0>(test_param);
+  eig_inv_param.cuda_prec_eigensolver = ::testing::get<0>(test_param);
+  eig_param.eig_type = ::testing::get<1>(test_param);
+  eig_param.use_norm_op = ::testing::get<2>(test_param);
+  eig_param.use_pc = ::testing::get<3>(test_param);
+  eig_param.compute_svd = ::testing::get<4>(test_param);
+  eig_param.spectrum = ::testing::get<5>(test_param);
 
   // For preconditioned matrices, spinors are only half the normal length.
   // QUDA constructs spinors based on the "solution type" which is
@@ -198,6 +201,7 @@ std::vector<double> eigensolve(test_t test_param)
   if (eig_param.arpack_check && !(eig_inv_param.cpu_prec == QUDA_DOUBLE_PRECISION)) {
     errorQuda("ARPACK check only available in double precision");
   }
+
   eigensolveQuda(host_evecs_ptr.data(), evals.data(), &eig_param);
   host_timer.stop();
   printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last());
@@ -249,8 +253,7 @@ int main(int argc, char **argv)
       && dslash_type != QUDA_TWISTED_MASS_DSLASH && dslash_type != QUDA_TWISTED_CLOVER_DSLASH
       && dslash_type != QUDA_MOBIUS_DWF_DSLASH && dslash_type != QUDA_MOBIUS_DWF_EOFA_DSLASH
       && dslash_type != QUDA_DOMAIN_WALL_DSLASH && dslash_type != QUDA_DOMAIN_WALL_4D_DSLASH) {
-    printfQuda("dslash_type %d not supported\n", dslash_type);
-    exit(0);
+    errorQuda("dslash_type %d not supported", dslash_type);
   }
 
   // Initialize the QUDA library
@@ -270,16 +273,13 @@ int main(int argc, char **argv)
     if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
     result = RUN_ALL_TESTS();
   } else {
-    eigensolve(
-      test_t {eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd, eig_param.spectrum});
+    eigensolve(test_t {prec, eig_param.eig_type, eig_param.use_norm_op, eig_param.use_pc, eig_param.compute_svd,
+                       eig_param.spectrum});
   }
 
   // Memory clean-up
   freeGaugeQuda();
-  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-    freeCloverQuda();
-  }
-
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda();
   // Finalize the QUDA library
   endQuda();
   finalizeComms();
diff --git a/tests/eigensolve_test_gtest.hpp b/tests/eigensolve_test_gtest.hpp
index a872963413..41b2443932 100644
--- a/tests/eigensolve_test_gtest.hpp
+++ b/tests/eigensolve_test_gtest.hpp
@@ -1,6 +1,6 @@
 #include <gtest/gtest.h>
 
-using test_t = ::testing::tuple<QudaEigType, QudaBoolean, QudaBoolean, QudaBoolean, QudaEigSpectrumType>;
+using test_t = ::testing::tuple<QudaPrecision, QudaEigType, QudaBoolean, QudaBoolean, QudaBoolean, QudaEigSpectrumType>;
 
 class EigensolveTest : public ::testing::TestWithParam<test_t>
 {
@@ -9,12 +9,37 @@ class EigensolveTest : public ::testing::TestWithParam<test_t>
 
 public:
   EigensolveTest() : param(GetParam()) { }
+
+  virtual void SetUp()
+  {
+    // check if outer precision has changed and update if it has
+    if (::testing::get<0>(param) != last_prec) {
+      if (last_prec != QUDA_INVALID_PRECISION) {
+        freeGaugeQuda();
+        if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda();
+      }
+
+      // Load the gauge field to the device
+      gauge_param.cuda_prec = ::testing::get<0>(param);
+      loadGaugeQuda(gauge.data(), &gauge_param);
+
+      if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+        // Load the clover terms to the device
+        eig_inv_param.clover_cuda_prec = ::testing::get<0>(param);
+        loadCloverQuda(clover.data(), clover_inv.data(), &eig_inv_param);
+      }
+      last_prec = ::testing::get<0>(param);
+    }
+  }
 };
 
 bool skip_test(test_t param)
 {
+  auto prec = ::testing::get<0>(param);
+  if (!(QUDA_PRECISION & prec)) return true; // precision not enabled so skip it
+
   // dwf-style solves must use a normal solver
-  if (is_chiral(dslash_type) && (::testing::get<1>(param) == QUDA_BOOLEAN_FALSE)) return true;
+  if (is_chiral(dslash_type) && (::testing::get<2>(param) == QUDA_BOOLEAN_FALSE)) return true;
   return false;
 }
 
@@ -23,29 +48,32 @@ std::vector<double> eigensolve(test_t test_param);
 TEST_P(EigensolveTest, verify)
 {
   if (skip_test(GetParam())) GTEST_SKIP();
-  double factor = 1.0;
+
+  auto tol = ::testing::get<0>(GetParam()) == QUDA_SINGLE_PRECISION ? 1e-6 : 1e-12;
+  eig_param.tol = tol;
+
   // The IRAM eigensolver will sometimes report convergence with tolerances slightly
   // higher than requested. The same phenomenon occurs in ARPACK. This factor
   // prevents failure when IRAM has solved to say 2e-6 when 1e-6 is requested.
   // The solution to avoid this is to use a Krylov space (eig-n-kr) about 3-4 times the
   // size of the search space (eig-n-ev), or use a well chosen Chebyshev polynomial,
   // or use a tighter than necessary tolerance.
-  if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10;
-  auto tol = factor * eig_param.tol;
+  if (eig_param.eig_type == QUDA_EIG_IR_ARNOLDI || eig_param.eig_type == QUDA_EIG_BLK_IR_ARNOLDI) tol *= 10;
   for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol);
 }
 
 std::string gettestname(::testing::TestParamInfo<test_t> param)
 {
   std::string name;
-  name += get_eig_type_str(::testing::get<0>(param.param)) + std::string("_");
-  name += (::testing::get<1>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("normop") : std::string("direct"))
+  name += get_prec_str(::testing::get<0>(param.param)) + std::string("_");
+  name += get_eig_type_str(::testing::get<1>(param.param)) + std::string("_");
+  name += (::testing::get<2>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("normop") : std::string("direct"))
     + std::string("_");
-  name += (::testing::get<2>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("evenodd") : std::string("full"))
+  name += (::testing::get<3>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("evenodd") : std::string("full"))
     + std::string("_");
-  name += (::testing::get<3>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("withSVD") : std::string("noSVD"))
+  name += (::testing::get<4>(param.param) == QUDA_BOOLEAN_TRUE ? std::string("withSVD") : std::string("noSVD"))
     + std::string("_");
-  name += get_eig_spectrum_str(::testing::get<4>(param.param));
+  name += get_eig_spectrum_str(::testing::get<5>(param.param));
   return name;
 }
 
@@ -63,25 +91,28 @@ auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG);
 auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG,
                                      QUDA_SPECTRUM_SM_EIG, QUDA_SPECTRUM_LI_EIG, QUDA_SPECTRUM_SI_EIG);
 
+auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION);
+
 // preconditioned normal solves
 INSTANTIATE_TEST_SUITE_P(NormalEvenOdd, EigensolveTest,
-                         ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE),
-                                            Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum),
+                         ::testing::Combine(precisions, hermitian_solvers, Values(QUDA_BOOLEAN_TRUE),
+                                            Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum),
                          gettestname);
 
 // full system normal solve
 INSTANTIATE_TEST_SUITE_P(NormalFull, EigensolveTest,
-                         ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE),
-                                            Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum),
+                         ::testing::Combine(precisions, hermitian_solvers, Values(QUDA_BOOLEAN_TRUE),
+                                            Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_TRUE), hermitian_spectrum),
                          gettestname);
 
 // preconditioned direct solves
 INSTANTIATE_TEST_SUITE_P(DirectEvenOdd, EigensolveTest,
-                         ::testing::Combine(non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_TRUE),
-                                            Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum),
+                         ::testing::Combine(precisions, non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE),
+                                            Values(QUDA_BOOLEAN_TRUE), Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum),
                          gettestname);
 // full system direct solve
 INSTANTIATE_TEST_SUITE_P(DirectFull, EigensolveTest,
-                         ::testing::Combine(non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE),
-                                            Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum),
+                         ::testing::Combine(precisions, non_hermitian_solvers, Values(QUDA_BOOLEAN_FALSE),
+                                            Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE),
+                                            non_hermitian_spectrum),
                          gettestname);
diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp
index 00ba1f3689..c5197af83c 100644
--- a/tests/gauge_alg_test.cpp
+++ b/tests/gauge_alg_test.cpp
@@ -5,6 +5,7 @@
 #include <quda.h>
 #include <quda_internal.h>
 #include <gauge_field.h>
+#include <instantiate.h>
 
 #include <misc.h>
 #include <timer.h>
@@ -34,14 +35,17 @@ bool gauge_store;
 
 std::array<std::vector<char>, 4> host_gauge;
 
-class GaugeAlgTest : public ::testing::Test
-{
-protected:
+using test_t = ::testing::tuple<QudaPrecision>;
+
+struct GaugeAlgTest : public ::testing::TestWithParam<test_t> {
+  QudaPrecision precision;
   QudaGaugeParam param;
   Timer<false> a0, a1;
   GaugeField *U;
   double3 plaq;
 
+  GaugeAlgTest() : precision(::testing::get<0>(GetParam())) { }
+
   void SetReunitarizationConsts()
   {
     const double unitarize_eps = 1e-14;
@@ -59,14 +63,14 @@ class GaugeAlgTest : public ::testing::Test
     auto a1 = std::abs(a.y - b.y);
     auto a2 = std::abs(a.z - b.z);
     double prec_val = 1.0e-5;
-    if (prec == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2;
+    if (precision == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2;
     return ((a0 < prec_val) && (a1 < prec_val) && (a2 < prec_val));
   }
 
   bool CheckDeterminant(double2 detu)
   {
     double prec_val = 5e-8;
-    if (prec == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2;
+    if (precision == QUDA_DOUBLE_PRECISION) prec_val = gf_tolerance * 1e2;
     return (std::abs(1.0 - detu.x) < prec_val && std::abs(detu.y) < prec_val);
   }
 
@@ -80,12 +84,18 @@ class GaugeAlgTest : public ::testing::Test
       GTEST_SKIP();
     }
 #endif
+    if (!is_enabled(precision)) {
+      execute = false;
+      GTEST_SKIP();
+    }
+
     if (execute) {
       setVerbosity(verbosity);
       param = newQudaGaugeParam();
 
       // Setup gauge container.
       setWilsonGaugeParam(param);
+      param.cuda_prec = precision;
       param.t_boundary = QUDA_PERIODIC_T;
 
       // Reunitarization setup
@@ -102,7 +112,7 @@ class GaugeAlgTest : public ::testing::Test
         gParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
         gParam.create = QUDA_NULL_FIELD_CREATE;
         gParam.reconstruct = link_recon;
-        gParam.setPrecision(prec, true);
+        gParam.setPrecision(precision, true);
         for (int d = 0; d < 4; d++) {
           if (comm_dim_partitioned(d)) gParam.r[d] = 2;
           gParam.x[d] += 2 * gParam.r[d];
@@ -280,7 +290,7 @@ class GaugeAlgTest : public ::testing::Test
   }
 };
 
-TEST_F(GaugeAlgTest, Generation)
+TEST_P(GaugeAlgTest, Generation)
 {
   if (execute && !gauge_load) {
     auto detu = getLinkDeterminant(*U);
@@ -288,7 +298,7 @@ TEST_F(GaugeAlgTest, Generation)
   }
 }
 
-TEST_F(GaugeAlgTest, Landau_Overrelaxation)
+TEST_P(GaugeAlgTest, Landau_Overrelaxation)
 {
   if (execute) {
     printfQuda("Landau gauge fixing with overrelaxation\n");
@@ -301,7 +311,7 @@ TEST_F(GaugeAlgTest, Landau_Overrelaxation)
   }
 }
 
-TEST_F(GaugeAlgTest, Coulomb_Overrelaxation)
+TEST_P(GaugeAlgTest, Coulomb_Overrelaxation)
 {
   if (execute) {
     printfQuda("Coulomb gauge fixing with overrelaxation\n");
@@ -314,7 +324,7 @@ TEST_F(GaugeAlgTest, Coulomb_Overrelaxation)
   }
 }
 
-TEST_F(GaugeAlgTest, Landau_FFT)
+TEST_P(GaugeAlgTest, Landau_FFT)
 {
   if (execute) {
     if (!comm_partitioned()) {
@@ -329,7 +339,7 @@ TEST_F(GaugeAlgTest, Landau_FFT)
   }
 }
 
-TEST_F(GaugeAlgTest, Coulomb_FFT)
+TEST_P(GaugeAlgTest, Coulomb_FFT)
 {
   if (execute) {
     if (!comm_partitioned()) {
@@ -377,6 +387,12 @@ struct gauge_alg_test : quda_test {
   gauge_alg_test(int argc, char **argv) : quda_test("Gauge Alg Test", argc, argv) { }
 };
 
+INSTANTIATE_TEST_SUITE_P(GaugeAlgTest, GaugeAlgTest,
+                         testing::Combine(testing::Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION)),
+                         [](testing::TestParamInfo<test_t> param) {
+                           return std::string(get_prec_str(testing::get<0>(param.param)));
+                         });
+
 int main(int argc, char **argv)
 {
   gauge_alg_test test(argc, argv);
diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp
index 370470fc50..fee31cb53c 100644
--- a/tests/gauge_path_test.cpp
+++ b/tests/gauge_path_test.cpp
@@ -6,6 +6,7 @@
 #include <host_utils.h>
 #include <command_line_params.h>
 #include <gauge_field.h>
+#include <instantiate.h>
 #include "misc.h"
 #include "gauge_force_reference.h"
 #include <gauge_path_quda.h>
@@ -81,15 +82,20 @@ static double force_deviation;
 static double loop_deviation;
 static double plaq_deviation;
 
+using force_test_t = ::testing::tuple<QudaPrecision, bool>;
+
 // The same function is used to test computePath.
 // If compute_force is false then a path is computed
-void gauge_force_test(bool compute_force = true)
+void gauge_force_test(force_test_t test_param)
 {
   int max_length = 6;
 
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   setGaugeParam(gauge_param);
 
+  gauge_param.cuda_prec = ::testing::get<0>(test_param);
+  bool compute_force = ::testing::get<1>(test_param);
+
   gauge_param.gauge_order = gauge_order;
   gauge_param.t_boundary = QUDA_PERIODIC_T;
 
@@ -234,15 +240,26 @@ void gauge_force_test(bool compute_force = true)
     for (int i = 0; i < num_paths; i++) host_free(input_path_buf[dir][i]);
     host_free(input_path_buf[dir]);
   }
+
+  if (compute_force) {
+    ASSERT_EQ(force_check, 1) << "CPU and QUDA force implementations do not agree";
+    ASSERT_LE(force_deviation, getTolerance(cuda_prec)) << "CPU and QUDA momentum action implementations do not agree";
+  } else {
+    // path check
+    ASSERT_EQ(path_check, 1) << "CPU and QUDA path implementations do not agree";
+  }
 }
 
-void gauge_loop_test()
+using loop_test_t = ::testing::tuple<QudaPrecision>;
+
+void gauge_loop_test(loop_test_t loop_param)
 {
   int max_length = 6;
 
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   setWilsonGaugeParam(gauge_param);
 
+  gauge_param.cuda_prec = ::testing::get<0>(loop_param);
   gauge_param.gauge_order = gauge_order;
   gauge_param.t_boundary = QUDA_PERIODIC_T;
 
@@ -368,28 +385,53 @@ void gauge_loop_test()
 
   for (int i = 0; i < num_paths; i++) delete[] trace_path_p[i];
   delete[] trace_path_p;
+
+  ASSERT_LE(loop_deviation, getTolerance(cuda_prec)) << "CPU and QUDA loop trace implementations do not agree";
+  ASSERT_LE(plaq_deviation, getTolerance(cuda_prec))
+    << "Plaquette from QUDA loop trace and QUDA dedicated plaquette function do not agree";
 }
 
-TEST(force, verify) { ASSERT_EQ(force_check, 1) << "CPU and QUDA force implementations do not agree"; }
+struct GaugePathTest : public ::testing::TestWithParam<force_test_t>
+{
+  force_test_t param;
+  GaugePathTest() : param(GetParam()) { }
+};
 
-TEST(action, verify)
+TEST_P(GaugePathTest, verify)
 {
-  ASSERT_LE(force_deviation, getTolerance(cuda_prec)) << "CPU and QUDA momentum action implementations do not agree";
+  QudaPrecision prec= ::testing::get<0>(param);
+  if (!quda::is_enabled(prec)) GTEST_SKIP();
+  gauge_force_test(param);
 }
 
-TEST(path, verify) { ASSERT_EQ(path_check, 1) << "CPU and QUDA path implementations do not agree"; }
-
-TEST(loop_traces, verify)
+struct GaugeLoopTest : public ::testing::TestWithParam<loop_test_t>
 {
-  ASSERT_LE(loop_deviation, getTolerance(cuda_prec)) << "CPU and QUDA loop trace implementations do not agree";
-}
+  loop_test_t param;
+  GaugeLoopTest() : param(GetParam()) { }
+};
 
-TEST(plaquette, verify)
+TEST_P(GaugeLoopTest, verify)
 {
-  ASSERT_LE(plaq_deviation, getTolerance(cuda_prec))
-    << "Plaquette from QUDA loop trace and QUDA dedicated plaquette function do not agree";
+  QudaPrecision prec= ::testing::get<0>(param);
+  if (!quda::is_enabled(prec)) GTEST_SKIP();
+  gauge_loop_test(param);
 }
 
+using ::testing::Combine;
+using ::testing::Values;
+
+INSTANTIATE_TEST_SUITE_P(GaugePathTest, GaugePathTest,
+                         Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION),
+                                 Values(true, false)),
+                         [](testing::TestParamInfo<force_test_t> param) {
+                           return std::string(get_prec_str(testing::get<0>(param.param))) +
+                             "_" + (::testing::get<1>(param.param) ? "force" : "path"); });
+
+INSTANTIATE_TEST_SUITE_P(GaugeLoopTest, GaugeLoopTest,
+                         Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION)),
+                         [](testing::TestParamInfo<loop_test_t> param)
+                         { return std::string(get_prec_str(testing::get<0>(param.param))); });
+
 static void display_test_info()
 {
   printfQuda("running the following test:\n");
@@ -414,6 +456,7 @@ int main(int argc, char **argv)
   CLI::TransformPairs<QudaGaugeFieldOrder> gauge_order_map {{"milc", QUDA_MILC_GAUGE_ORDER},
                                                             {"qdp", QUDA_QDP_GAUGE_ORDER}};
   app->add_option("--gauge-order", gauge_order, "")->transform(CLI::QUDACheckedTransformer(gauge_order_map));
+  add_testing_option_group(app);
   try {
     app->parse(argc, argv);
   } catch (const CLI::ParseError &e) {
@@ -421,27 +464,23 @@ int main(int argc, char **argv)
   }
 
   initComms(argc, argv, gridsize_from_cmdline);
-
   initQuda(device_ordinal);
 
   setVerbosity(verbosity);
 
   display_test_info();
 
-  gauge_force_test();
-
-  // The same test is also used for gauge path (compute_force=false)
-  gauge_force_test(false);
-
-  gauge_loop_test();
-
-  if (verify_results) {
+  if (enable_testing) {
     // Ensure gtest prints only from rank 0
     ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
     if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
 
     test_rc = RUN_ALL_TESTS();
     if (test_rc != 0) warningQuda("Tests failed");
+  } else {
+    gauge_force_test({prec, true});
+    gauge_force_test({prec, false});
+    gauge_loop_test({prec});
   }
 
   endQuda();
diff --git a/tests/host_reference/clover_force_reference.cpp b/tests/host_reference/clover_force_reference.cpp
index 3d4c7e4e44..536ce9f4b0 100644
--- a/tests/host_reference/clover_force_reference.cpp
+++ b/tests/host_reference/clover_force_reference.cpp
@@ -225,6 +225,27 @@ void CloverForce_reference(void *h_mom, std::array<void *, 4> gauge, std::vector
                            std::vector<quda::ColorSpinorField> &p, std::vector<double> force_coeff)
 {
   int dag = 1;
+  // Get spinor ghost fields
+  // First wrap the input spinor into a ColorSpinorField
+  quda::ColorSpinorParam csParam[4];
+  for (int i = 0; i < 4; ++i) {
+    csParam[i].location = QUDA_CPU_FIELD_LOCATION;
+    // csParam[i].v = in;
+    csParam[i].nColor = 3;
+    csParam[i].nSpin = 4;
+    csParam[i].nDim = 4;
+    for (int d = 0; d < 4; d++) csParam[i].x[d] = Z[d];
+    csParam[i].setPrecision(QUDA_DOUBLE_PRECISION);
+    csParam[i].pad = 0;
+    csParam[i].siteSubset = QUDA_PARITY_SITE_SUBSET;
+    csParam[i].x[0] /= 2;
+    csParam[i].siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+    csParam[i].fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+    csParam[i].gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+    csParam[i].create = QUDA_REFERENCE_FIELD_CREATE;
+    csParam[i].pc_type = QUDA_4D_PC;
+  }
+
   for (auto i = 0u; i < x.size(); i++) {
     for (int parity = 0; parity < 2; parity++) {
       quda::ColorSpinorField &inA = (parity & 1) ? x[i].Odd() : x[i].Even();
@@ -238,6 +259,24 @@ void CloverForce_reference(void *h_mom, std::array<void *, 4> gauge, std::vector
       CloverForce_kernel_host<double, double>(gauge, h_mom, inA, inB, 1, parity, force_coeff[i]);
       inD.exchangeGhost((QudaParity)(1 - parity), nFace, 1 - dag);
       CloverForce_kernel_host<double, double>(gauge, h_mom, inC, inD, -1, parity, force_coeff[i]);
+
+      if (x[0].TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET) {
+        csParam[0].v = inA.data();
+        csParam[1].v = inB.data();
+        csParam[2].v = inC.data();
+        csParam[3].v = inD.data();
+
+        for (int j = 0; j < 4; ++j) { csParam[j].v = (char *)csParam[j].v + Vh * spinor_site_size * sizeof(double); }
+        quda::ColorSpinorField in2A(csParam[0]);
+        quda::ColorSpinorField in2B(csParam[1]);
+        quda::ColorSpinorField in2C(csParam[2]);
+        quda::ColorSpinorField in2D(csParam[3]);
+
+        in2B.exchangeGhost((QudaParity)(1 - parity), nFace, dag);
+        CloverForce_kernel_host<double, double>(gauge, h_mom, in2A, in2B, 1, parity, force_coeff[i]);
+        in2D.exchangeGhost((QudaParity)(1 - parity), nFace, 1 - dag);
+        CloverForce_kernel_host<double, double>(gauge, h_mom, in2C, in2D, -1, parity, force_coeff[i]);
+      }
     }
   }
 }
@@ -826,40 +865,44 @@ void CloverSigmaOprod_reference(void *oprod_, quda::ColorSpinorField &inp, quda:
   sFloat *x = (sFloat *)inx.data();
   sFloat *p = (sFloat *)inp.data();
 
+  int flavors = inx.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET ? inx.TwistFlavor() : 1;
+
   for (int parity = 0; parity < 2; parity++) {
 #pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
       for (int mu = 1; mu < 4; mu++) {
         for (int nu = 0; nu < mu; nu++) {
-
-          sFloat temp[spinor_site_size], temp_munu[spinor_site_size], temp_numu[spinor_site_size];
-          multiplySpinorByDiracgamma(temp, nu, &p[spinor_site_size * (i + Vh * parity)]);
-          multiplySpinorByDiracgamma(temp_munu, mu, temp);
-
-          multiplySpinorByDiracgamma(temp, mu, &p[spinor_site_size * (i + Vh * parity)]);
-          multiplySpinorByDiracgamma(temp_numu, nu, temp);
-          for (int s = 0; s < 4; s++) {
-            for (int t = 0; t < 3; t++) {
-              temp[s * (3 * 2) + t * (2) + 0]
-                = -temp_munu[s * (3 * 2) + t * (2) + 0] + temp_numu[s * (3 * 2) + t * (2) + 0];
-              temp[s * (3 * 2) + t * (2) + 1]
-                = -temp_munu[s * (3 * 2) + t * (2) + 1] + temp_numu[s * (3 * 2) + t * (2) + 1];
+          for (int flavor = 0; flavor < flavors; ++flavor) {
+
+            sFloat temp[spinor_site_size], temp_munu[spinor_site_size], temp_numu[spinor_site_size];
+            multiplySpinorByDiracgamma(temp, nu, &p[spinor_site_size * (i + Vh * flavor + Vh * flavors * parity)]);
+            multiplySpinorByDiracgamma(temp_munu, mu, temp);
+
+            multiplySpinorByDiracgamma(temp, mu, &p[spinor_site_size * (i + Vh * flavor + Vh * flavors * parity)]);
+            multiplySpinorByDiracgamma(temp_numu, nu, temp);
+            for (int s = 0; s < 4; s++) {
+              for (int t = 0; t < 3; t++) {
+                temp[s * (3 * 2) + t * (2) + 0]
+                  = -temp_munu[s * (3 * 2) + t * (2) + 0] + temp_numu[s * (3 * 2) + t * (2) + 0];
+                temp[s * (3 * 2) + t * (2) + 1]
+                  = -temp_munu[s * (3 * 2) + t * (2) + 1] + temp_numu[s * (3 * 2) + t * (2) + 1];
+              }
             }
-          }
 
-          gFloat oprod_f[gauge_site_size];
-          gFloat oprod_imx2[gauge_site_size];
-          outerProdSpinTrace(oprod_f, temp, &x[spinor_site_size * (i + Vh * parity)]);
-          su3_imagx2(oprod_imx2, oprod_f);
+            gFloat oprod_f[gauge_site_size];
+            gFloat oprod_imx2[gauge_site_size];
+            outerProdSpinTrace(oprod_f, temp, &x[spinor_site_size * (i + Vh * flavor + Vh * flavors * parity)]);
+            su3_imagx2(oprod_imx2, oprod_f);
 
-          int munu = (mu - 1) * mu / 2 + nu;
+            int munu = (mu - 1) * mu / 2 + nu;
 
-          for (int ci = 0; ci < nColor; ci++) {   // row
-            for (int cj = 0; cj < nColor; cj++) { // col
-              int color = ci * nColor + cj;
-              int id = 2 * (i + Vh * (color + 9 * (munu + parity * 6)));
-              oprod[id + 0] += coeff[parity] * oprod_imx2[color * 2 + 0] / 2.0;
-              oprod[id + 1] += coeff[parity] * oprod_imx2[color * 2 + 1] / 2.0;
+            for (int ci = 0; ci < nColor; ci++) {   // row
+              for (int cj = 0; cj < nColor; cj++) { // col
+                int color = ci * nColor + cj;
+                int id = 2 * (i + Vh * (color + 9 * (munu + parity * 6)));
+                oprod[id + 0] += coeff[parity] * oprod_imx2[color * 2 + 0] / 2.0;
+                oprod[id + 1] += coeff[parity] * oprod_imx2[color * 2 + 1] / 2.0;
+              }
             }
           }
         }
@@ -895,12 +938,36 @@ void Gamma5_host(double *out, double *in, const int V)
     }
   }
 }
+
+void tau1_host(double *out, double *in, const int V)
+{
+  int V2 = V * 24 / 2;
+#pragma omp parallel for
+  for (int i = 0; i < V2; i++) {
+    double up = in[i];
+    double down = in[i + V2];
+    out[i] = down;
+    out[i + V2] = up;
+  }
+}
+
 void axpbyz_host(double a, double *x, double b, double *y, double *z, const int V)
 {
 #pragma omp parallel for
   for (int i = 0; i < V * 24; i++) { z[i] = a * x[i] + b * y[i]; }
 }
 
+void caxpy_host(double a_re, double a_im, double *x, double *y, const int V)
+{
+#pragma omp parallel for
+  for (int i = 0; i < V * 12; i++) {
+    double re = a_re * x[i * 2 + 0] - a_im * x[i * 2 + 1] + y[i * 2 + 0];
+    double im = a_re * x[i * 2 + 1] + a_im * x[i * 2 + 0] + y[i * 2 + 1];
+    y[i * 2 + 0] = re;
+    y[i * 2 + 1] = im;
+  }
+}
+
 void Gamma5_host_UKQCD(double *out, double *in, const int V)
 {
 #pragma omp parallel for
@@ -968,20 +1035,30 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff
     if (myMatPCType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || myMatPCType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
 
       if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-        tmc_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
-                   inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec,
-                   *gauge_param);
+        if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) {
+          tmc_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
+                     inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec,
+                     *gauge_param);
+        } else if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+          tmc_ndeg_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(),
+                          inv_param->kappa, inv_param->mu, inv_param->epsilon, parity, myMatPCType, QUDA_DAG_YES,
+                          inv_param->cpu_prec, *gauge_param);
+        }
       } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
         clover_dslash(x[i].Even().data(), gauge.data(), clover_inv.data(), tmp.data(), parity, QUDA_DAG_YES,
                       inv_param->cpu_prec, *gauge_param);
       } else {
         errorQuda("TMCloverForce_reference: dslash_type not supported\n");
       }
-      Gamma5_host(x[i].Even().data<double *>(), x[i].Even().data<double *>(), x[i].Even().VolumeCB());
 
       if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-        tmc_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
-                  inv_param->mu, inv_param->twist_flavor, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
+        if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) {
+          tmc_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
+                    inv_param->mu, inv_param->twist_flavor, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
+        } else if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+          tmc_ndeg_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
+                         inv_param->mu, inv_param->epsilon, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
+        }
       } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
         clover_matpc(p[i].Odd().data(), gauge.data(), clover.data(), clover_inv.data(), tmp.data(), inv_param->kappa,
                      myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
@@ -989,7 +1066,18 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff
         errorQuda("TMCloverForce_reference: dslash_type not supported\n");
       }
 
-      if (detratio) {
+      if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+        axpbyz_host(1.0 / inv_param->evmax, p[i].Odd().data<double *>(), 0, p[i].Odd().data<double *>(),
+                    p[i].Odd().data<double *>(), p[i].Odd().VolumeCB());
+        tau1_host(x[i].Even().data<double *>(), x[i].Even().data<double *>(), x[i].Even().VolumeCB());
+        tau1_host(p[i].Odd().data<double *>(), p[i].Odd().data<double *>(), p[i].Odd().VolumeCB());
+        caxpy_host(0.0, -inv_param->offset[i], x[i].Odd().data<double *>(), p[i].Odd().data<double *>(),
+                   p[i].Odd().VolumeCB());
+      }
+
+      Gamma5_host(x[i].Even().data<double *>(), x[i].Even().data<double *>(), x[i].Even().VolumeCB());
+
+      if (detratio && inv_param->twist_flavor != QUDA_TWIST_NONDEG_DOUBLET) {
         qParam.v = h_x0[i];
         quda::ColorSpinorField load_half(qParam);
         x0[i].Odd() = load_half;
@@ -997,10 +1085,19 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff
                     p[i].Odd().VolumeCB());
       }
 
+      if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+        Gamma5_host(p[i].Odd().data<double *>(), p[i].Odd().data<double *>(), p[i].Odd().VolumeCB());
+      }
+
       if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-        tmc_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(),
-                   inv_param->kappa, inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_NO,
-                   inv_param->cpu_prec, *gauge_param);
+        if (inv_param->twist_flavor == QUDA_TWIST_SINGLET)
+          tmc_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(),
+                     inv_param->kappa, inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_NO,
+                     inv_param->cpu_prec, *gauge_param);
+        else if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET)
+          tmc_ndeg_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(),
+                          inv_param->kappa, inv_param->mu, inv_param->epsilon, parity, myMatPCType, QUDA_DAG_YES,
+                          inv_param->cpu_prec, *gauge_param);
       } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
         clover_dslash(p[i].Even().data(), gauge.data(), clover_inv.data(), p[i].Odd().data(), parity, QUDA_DAG_NO,
                       inv_param->cpu_prec, *gauge_param);
@@ -1008,6 +1105,12 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff
         errorQuda("TMCloverForce_reference: dslash_type not supported\n");
       }
 
+      if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+        Gamma5_host(p[i].Odd().data<double *>(), p[i].Odd().data<double *>(), p[i].Odd().VolumeCB());
+        Gamma5_host(p[i].Even().data<double *>(), p[i].Even().data<double *>(), p[i].Even().VolumeCB());
+        tau1_host(p[i].Even().data<double *>(), p[i].Even().data<double *>(), p[i].Even().VolumeCB());
+      }
+
     } else {
       errorQuda("TMCloverForce_reference: MATPC type not supported\n");
     }
@@ -1058,6 +1161,10 @@ void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff
     ferm_epsilon[i].reserve(2);
     ferm_epsilon[i][0] = k_csw_ov_8 * coeff[i];
     ferm_epsilon[i][1] = k_csw_ov_8 * coeff[i] / (inv_param->kappa * inv_param->kappa);
+    if (inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
+      tau1_host(p[i].Even().data<double *>(), p[i].Even().data<double *>(), p[i].Even().VolumeCB());
+      tau1_host(p[i].Odd().data<double *>(), p[i].Odd().data<double *>(), p[i].Odd().VolumeCB());
+    }
   }
   // derivative of pseudofermion sw term, first term term of (A12) in hep-lat/0112051,  sw_spinor_eo(EE,..) plus
   // sw_spinor_eo(OO,..)  in tmLQCD
diff --git a/tests/host_reference/clover_reference.cpp b/tests/host_reference/clover_reference.cpp
index 1a89ead931..085d7fe4bd 100644
--- a/tests/host_reference/clover_reference.cpp
+++ b/tests/host_reference/clover_reference.cpp
@@ -503,13 +503,15 @@ void tmc_ndeg_dslash(void *out, void **gauge, void *in, void *clover, void *cInv
     if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
       void *tmptmp1 = safe_malloc(Vh * spinor_site_size * precision);
       void *tmptmp2 = safe_malloc(Vh * spinor_site_size * precision);
-      wil_dslash(tmptmp1, gauge, tmp1, oddBit, daggerBit, precision, gauge_param);
-      wil_dslash(tmptmp2, gauge, tmp2, oddBit, daggerBit, precision, gauge_param);
+      wil_dslash(tmptmp1, gauge, in1, oddBit, daggerBit, precision, gauge_param);
+      wil_dslash(tmptmp2, gauge, in2, oddBit, daggerBit, precision, gauge_param);
       ndegTwistCloverGamma5(out1, out2, tmptmp1, tmptmp2, clover, cInv, daggerBit, kappa, mu, epsilon, oddBit,
                             QUDA_TWIST_GAMMA5_INVERSE, precision);
       host_free(tmptmp1);
       host_free(tmptmp2);
     } else {
+      ndegTwistCloverGamma5(tmp1, tmp2, in1, in2, clover, cInv, daggerBit, kappa, mu, epsilon, 1 - oddBit,
+                            QUDA_TWIST_GAMMA5_INVERSE, precision);
       wil_dslash(out1, gauge, tmp1, oddBit, daggerBit, precision, gauge_param);
       wil_dslash(out2, gauge, tmp2, oddBit, daggerBit, precision, gauge_param);
     }
diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index eda5c778a7..a1215f051a 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -22,6 +22,12 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
 QudaEigParam eig_param;
 bool use_split_grid = false;
 
+std::vector<char> gauge_;
+std::array<void *, 4> gauge;
+std::vector<char> clover;
+std::vector<char> clover_inv;
+QudaPrecision last_prec = QUDA_INVALID_PRECISION;
+
 // if --enable-testing true is passed, we run the tests defined in here
 #include <invert_test_gtest.hpp>
 
@@ -107,11 +113,6 @@ void display_test_info()
              dimPartitioned(3));
 }
 
-std::vector<char> gauge_;
-std::array<void *, 4> gauge;
-std::vector<char> clover;
-std::vector<char> clover_inv;
-
 void init(int argc, char **argv)
 {
   // Set QUDA's internal parameters
@@ -163,8 +164,6 @@ void init(int argc, char **argv)
   gauge_.resize(4 * V * gauge_site_size * host_gauge_data_type_size);
   for (int i = 0; i < 4; i++) gauge[i] = gauge_.data() + i * V * gauge_site_size * host_gauge_data_type_size;
   constructHostGaugeField(gauge.data(), gauge_param, argc, argv);
-  // Load the gauge field to the device
-  loadGaugeQuda(gauge.data(), &gauge_param);
 
   // Allocate host side memory for clover terms if needed.
   //----------------------------------------------------------------------------
@@ -173,36 +172,48 @@ void init(int argc, char **argv)
     clover.resize(V * clover_site_size * host_clover_data_type_size);
     clover_inv.resize(V * clover_site_size * host_spinor_data_type_size);
     constructHostCloverField(clover.data(), clover_inv.data(), inv_param);
+  }
+
+  // Load the gauge field to the device
+  loadGaugeQuda(gauge.data(), &gauge_param);
+
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
     // Load the clover terms to the device
     loadCloverQuda(clover.data(), clover_inv.data(), &inv_param);
   }
+  last_prec = gauge_param.cuda_prec;
 }
 
 std::vector<std::array<double, 2>> solve(test_t param)
 {
-  inv_param.inv_type = ::testing::get<0>(param);
-  inv_param.solution_type = ::testing::get<1>(param);
-  inv_param.solve_type = ::testing::get<2>(param);
-  inv_param.cuda_prec_sloppy = ::testing::get<3>(param);
-  inv_param.clover_cuda_prec_sloppy = ::testing::get<3>(param);
-  multishift = ::testing::get<4>(param);
-  inv_param.solution_accumulator_pipeline = ::testing::get<5>(param);
+  inv_param.cuda_prec = ::testing::get<0>(param);
+  inv_param.clover_cuda_prec = ::testing::get<0>(param);
+  inv_param.cuda_prec_sloppy = ::testing::get<1>(param);
+  inv_param.cuda_prec_refinement_sloppy = ::testing::get<1>(param);
+  inv_param.clover_cuda_prec_sloppy = ::testing::get<1>(param);
+  inv_param.clover_cuda_prec_refinement_sloppy = ::testing::get<1>(param);
+  inv_param.inv_type = ::testing::get<2>(param);
+  inv_param.solution_type = ::testing::get<3>(param);
+  inv_param.solve_type = ::testing::get<4>(param);
+  multishift = ::testing::get<5>(param);
+  inv_param.solution_accumulator_pipeline = ::testing::get<6>(param);
 
   // schwarz parameters
-  auto schwarz_param = ::testing::get<6>(param);
+  auto schwarz_param = ::testing::get<7>(param);
   inv_param.schwarz_type           = ::testing::get<0>(schwarz_param);
   inv_param.inv_type_precondition  = ::testing::get<1>(schwarz_param);
   inv_param.cuda_prec_precondition = ::testing::get<2>(schwarz_param);
   inv_param.clover_cuda_prec_precondition = ::testing::get<2>(schwarz_param);
 
-  inv_param.residual_type = ::testing::get<7>(param);
+  inv_param.residual_type = ::testing::get<8>(param);
 
   // reset lambda_max if we're doing a testing loop to ensure correct lambma_max
   if (enable_testing) inv_param.ca_lambda_max = -1.0;
 
-  logQuda(QUDA_SUMMARIZE, "Solution = %s, Solve = %s, Solver = %s, Sloppy precision = %s\n",
+  logQuda(QUDA_SUMMARIZE, "Solution = %s, Solve = %s, Solver = %s, Precision = %s, Sloppy precision = %s\n",
           get_solution_str(inv_param.solution_type), get_solve_str(inv_param.solve_type),
-          get_solver_str(inv_param.inv_type), get_prec_str(inv_param.cuda_prec_sloppy));
+          get_solver_str(inv_param.inv_type), get_prec_str(inv_param.cuda_prec),
+          get_prec_str(inv_param.cuda_prec_sloppy));
 
   // params corresponds to split grid
   for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
@@ -407,7 +418,6 @@ int main(int argc, char **argv)
   // All parameters have been set. Display the parameters via stdout
   display_test_info();
 
-  // Initialize the QUDA library
   initQuda(device_ordinal);
 
   init(argc, argv);
@@ -423,7 +433,7 @@ int main(int argc, char **argv)
     if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
     result = RUN_ALL_TESTS();
   } else {
-    solve(test_t {inv_type, solution_type, solve_type, prec_sloppy, multishift, solution_accumulator_pipeline,
+    solve(test_t {prec, prec_sloppy, inv_type, solution_type, solve_type, multishift, solution_accumulator_pipeline,
                   schwarz_t {precon_schwarz_type, inv_multigrid ? QUDA_MG_INVERTER : precon_type, prec_precondition},
                   inv_param.residual_type});
   }
diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index ecc23ed12d..39e7ce8f68 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -5,8 +5,8 @@
 // tuple containing parameters for Schwarz solver
 using schwarz_t = ::testing::tuple<QudaSchwarzType, QudaInverterType, QudaPrecision>;
 
-using test_t
-  = ::testing::tuple<QudaInverterType, QudaSolutionType, QudaSolveType, QudaPrecision, int, int, schwarz_t, QudaResidualType>;
+using test_t = ::testing::tuple<QudaPrecision, QudaPrecision, QudaInverterType, QudaSolutionType, QudaSolveType, int,
+                                int, schwarz_t, QudaResidualType>;
 
 class InvertTest : public ::testing::TestWithParam<test_t>
 {
@@ -15,20 +15,44 @@ class InvertTest : public ::testing::TestWithParam<test_t>
 
 public:
   InvertTest() : param(GetParam()) { }
+
+  virtual void SetUp()
+  {
+    // check if outer precision has changed and update if it has
+    if (::testing::get<0>(param) != last_prec) {
+      if (last_prec != QUDA_INVALID_PRECISION) {
+        freeGaugeQuda();
+        if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda();
+      }
+
+      // Load the gauge field to the device
+      gauge_param.cuda_prec = ::testing::get<0>(param);
+      loadGaugeQuda(gauge.data(), &gauge_param);
+
+      if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+        // Load the clover terms to the device
+        inv_param.clover_cuda_prec = ::testing::get<0>(param);
+        loadCloverQuda(clover.data(), clover_inv.data(), &inv_param);
+      }
+      last_prec = ::testing::get<0>(param);
+    }
+  }
 };
 
 bool skip_test(test_t param)
 {
-  auto inverter_type = ::testing::get<0>(param);
-  auto solution_type = ::testing::get<1>(param);
-  auto solve_type = ::testing::get<2>(param);
-  auto prec_sloppy = ::testing::get<3>(param);
-  auto multishift = ::testing::get<4>(param);
-  auto solution_accumulator_pipeline = ::testing::get<5>(param);
-  auto schwarz_param = ::testing::get<6>(param);
+  auto prec = ::testing::get<0>(param);
+  auto prec_sloppy = ::testing::get<1>(param);
+  auto inverter_type = ::testing::get<2>(param);
+  auto solution_type = ::testing::get<3>(param);
+  auto solve_type = ::testing::get<4>(param);
+  auto multishift = ::testing::get<5>(param);
+  auto solution_accumulator_pipeline = ::testing::get<6>(param);
+  auto schwarz_param = ::testing::get<7>(param);
   auto prec_precondition = ::testing::get<2>(schwarz_param);
 
   if (prec < prec_sloppy) return true;              // outer precision >= sloppy precision
+  if (!(QUDA_PRECISION & prec)) return true;        // precision not enabled so skip it
   if (!(QUDA_PRECISION & prec_sloppy)) return true; // precision not enabled so skip it
   if (!(QUDA_PRECISION & prec_precondition) && prec_precondition != QUDA_INVALID_PRECISION)
     return true; // precision not enabled so skip it
@@ -66,18 +90,19 @@ TEST_P(InvertTest, verify)
 {
   if (skip_test(GetParam())) GTEST_SKIP();
 
+  auto tol = ::testing::get<0>(GetParam()) == QUDA_SINGLE_PRECISION ? 1e-6 : 1e-12;
+  auto tol_hq = ::testing::get<0>(GetParam()) == QUDA_SINGLE_PRECISION ? 1e-6 : 1e-12;
   inv_param.tol = 0.0;
   inv_param.tol_hq = 0.0;
-  auto res_t = ::testing::get<7>(GetParam());
+  auto res_t = ::testing::get<8>(GetParam());
   if (res_t & QUDA_L2_RELATIVE_RESIDUAL) inv_param.tol = tol;
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) inv_param.tol_hq = tol_hq;
 
-  auto tol = inv_param.tol;
   if (is_chiral(inv_param.dslash_type)) { tol *= std::sqrt(static_cast<double>(inv_param.Ls)); }
   // FIXME eventually we should build in refinement to the *NR solvers to remove the need for this
-  if (is_normal_residual(::testing::get<0>(GetParam()))) tol *= 50;
+  if (is_normal_residual(::testing::get<2>(GetParam()))) tol *= 50;
   // Slight loss of precision possible when reconstructing full solution
-  if (is_full_solution(::testing::get<1>(GetParam())) && is_preconditioned_solve(::testing::get<2>(GetParam())))
+  if (is_full_solution(::testing::get<3>(GetParam())) && is_preconditioned_solve(::testing::get<4>(GetParam())))
     tol *= 10;
 
   for (auto rsd : solve(GetParam())) {
@@ -89,21 +114,22 @@ TEST_P(InvertTest, verify)
 std::string gettestname(::testing::TestParamInfo<test_t> param)
 {
   std::string name;
-  name += get_solver_str(::testing::get<0>(param.param)) + std::string("_");
-  name += get_solution_str(::testing::get<1>(param.param)) + std::string("_");
-  name += get_solve_str(::testing::get<2>(param.param)) + std::string("_");
-  name += get_prec_str(::testing::get<3>(param.param));
-  if (::testing::get<4>(param.param) > 1)
-    name += std::string("_shift") + std::to_string(::testing::get<4>(param.param));
+  name += get_prec_str(::testing::get<0>(param.param)) + std::string("_");
+  name += get_prec_str(::testing::get<1>(param.param)) + std::string("_");
+  name += get_solver_str(::testing::get<2>(param.param)) + std::string("_");
+  name += get_solution_str(::testing::get<3>(param.param)) + std::string("_");
+  name += get_solve_str(::testing::get<4>(param.param));
   if (::testing::get<5>(param.param) > 1)
-    name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<5>(param.param));
-  auto &schwarz_param = ::testing::get<6>(param.param);
+    name += std::string("_shift") + std::to_string(::testing::get<5>(param.param));
+  if (::testing::get<6>(param.param) > 1)
+    name += std::string("_solution_accumulator_pipeline") + std::to_string(::testing::get<6>(param.param));
+  auto &schwarz_param = ::testing::get<7>(param.param);
   if (::testing::get<0>(schwarz_param) != QUDA_INVALID_SCHWARZ) {
     name += std::string("_") + get_schwarz_str(::testing::get<0>(schwarz_param));
     name += std::string("_") + get_solver_str(::testing::get<1>(schwarz_param));
     name += std::string("_") + get_prec_str(::testing::get<2>(schwarz_param));
   }
-  auto res_t = ::testing::get<7>(param.param);
+  auto res_t = ::testing::get<8>(param.param);
   if (res_t & QUDA_L2_RELATIVE_RESIDUAL) name += std::string("_l2");
   if (res_t & QUDA_HEAVY_QUARK_RESIDUAL) name += std::string("_heavy_quark");
   return name;
@@ -118,6 +144,8 @@ auto direct_solvers = Values(QUDA_CGNE_INVERTER, QUDA_CGNR_INVERTER, QUDA_CA_CGN
                              QUDA_CG3NE_INVERTER, QUDA_CG3NR_INVERTER, QUDA_GCR_INVERTER, QUDA_CA_GCR_INVERTER,
                              QUDA_BICGSTAB_INVERTER, QUDA_BICGSTABL_INVERTER, QUDA_MR_INVERTER);
 
+auto precisions = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION);
+
 auto sloppy_precisions
   = Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION);
 
@@ -129,41 +157,43 @@ auto no_heavy_quark = Values(QUDA_L2_RELATIVE_RESIDUAL);
 
 // preconditioned normal solves
 INSTANTIATE_TEST_SUITE_P(NormalEvenOdd, InvertTest,
-                         Combine(normal_solvers, Values(QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MAT_SOLUTION),
-                                 Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1),
-                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, normal_solvers,
+                                 Values(QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE),
+                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
 // full system normal solve
 INSTANTIATE_TEST_SUITE_P(NormalFull, InvertTest,
-                         Combine(normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION), Values(QUDA_NORMOP_SOLVE),
-                                 sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, normal_solvers, Values(QUDA_MATDAG_MAT_SOLUTION),
+                                 Values(QUDA_NORMOP_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz,
+                                 no_heavy_quark),
                          gettestname);
 
 // preconditioned direct solves
 INSTANTIATE_TEST_SUITE_P(EvenOdd, InvertTest,
-                         Combine(direct_solvers, Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION),
-                                 Values(QUDA_DIRECT_PC_SOLVE), sloppy_precisions, Values(1),
-                                 solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, direct_solvers,
+                                 Values(QUDA_MATPC_SOLUTION, QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
+                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
 // full system direct solve
 INSTANTIATE_TEST_SUITE_P(Full, InvertTest,
-                         Combine(direct_solvers, Values(QUDA_MAT_SOLUTION), Values(QUDA_DIRECT_SOLVE), sloppy_precisions,
-                                 Values(1), solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
+                         Combine(precisions, sloppy_precisions, direct_solvers, Values(QUDA_MAT_SOLUTION),
+                                 Values(QUDA_DIRECT_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz,
+                                 no_heavy_quark),
                          gettestname);
 
 // preconditioned multi-shift solves
 INSTANTIATE_TEST_SUITE_P(MultiShiftEvenOdd, InvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION),
-                                 Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(10),
+                         Combine(precisions, sloppy_precisions, Values(QUDA_CG_INVERTER),
+                                 Values(QUDA_MATPCDAG_MATPC_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE), Values(10),
                                  solution_accumulator_pipelines, no_schwarz, no_heavy_quark),
                          gettestname);
 
 // Schwarz-preconditioned normal solves
 INSTANTIATE_TEST_SUITE_P(SchwarzNormal, InvertTest,
-                         Combine(Values(QUDA_PCG_INVERTER), Values(QUDA_MATPCDAG_MATPC_SOLUTION),
-                                 Values(QUDA_NORMOP_PC_SOLVE), sloppy_precisions, Values(1),
+                         Combine(precisions, sloppy_precisions, Values(QUDA_PCG_INVERTER),
+                                 Values(QUDA_MATPCDAG_MATPC_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE), Values(1),
                                  solution_accumulator_pipelines,
                                  Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_CG_INVERTER, QUDA_CA_CG_INVERTER),
                                          Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)),
@@ -172,8 +202,8 @@ INSTANTIATE_TEST_SUITE_P(SchwarzNormal, InvertTest,
 
 // Schwarz-preconditioned direct solves
 INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, InvertTest,
-                         Combine(Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_DIRECT_PC_SOLVE),
-                                 sloppy_precisions, Values(1), solution_accumulator_pipelines,
+                         Combine(precisions, sloppy_precisions, Values(QUDA_GCR_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_DIRECT_PC_SOLVE), Values(1), solution_accumulator_pipelines,
                                  Combine(Values(QUDA_ADDITIVE_SCHWARZ), Values(QUDA_MR_INVERTER, QUDA_CA_GCR_INVERTER),
                                          Values(QUDA_HALF_PRECISION, QUDA_QUARTER_PRECISION)),
                                  no_heavy_quark),
@@ -181,7 +211,7 @@ INSTANTIATE_TEST_SUITE_P(SchwarzEvenOdd, InvertTest,
 
 // Heavy-Quark preconditioned solves
 INSTANTIATE_TEST_SUITE_P(HeavyQuarkEvenOdd, InvertTest,
-                         Combine(Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION), Values(QUDA_NORMOP_PC_SOLVE),
-                                 sloppy_precisions, Values(1), solution_accumulator_pipelines, no_schwarz,
+                         Combine(precisions, sloppy_precisions, Values(QUDA_CG_INVERTER), Values(QUDA_MATPC_SOLUTION),
+                                 Values(QUDA_NORMOP_PC_SOLVE), Values(1), solution_accumulator_pipelines, no_schwarz,
                                  Values(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL, QUDA_HEAVY_QUARK_RESIDUAL)),
                          gettestname);
diff --git a/tests/io_test.cpp b/tests/io_test.cpp
index af708baf3b..2618b0e479 100644
--- a/tests/io_test.cpp
+++ b/tests/io_test.cpp
@@ -71,12 +71,15 @@ TEST_P(GaugeIOTest, verify)
   for (int dir = 0; dir < 4; dir++) { host_free(gauge[dir]); }
 }
 
-using cs_test_t = ::testing::tuple<QudaSiteSubset, bool, QudaPrecision, QudaPrecision, int, bool, QudaFieldLocation>;
+using cs_test_t = ::testing::tuple<QudaSiteSubset, QudaSiteSubset, QudaParity, bool, QudaPrecision, QudaPrecision, int,
+                                   bool, QudaFieldLocation>;
 
 class ColorSpinorIOTest : public ::testing::TestWithParam<cs_test_t>
 {
 protected:
-  QudaSiteSubset site_subset;
+  QudaSiteSubset site_subset_saved;
+  QudaSiteSubset site_subset_loaded;
+  QudaParity parity;
   bool inflate;
   QudaPrecision prec;
   QudaPrecision prec_io;
@@ -86,13 +89,15 @@ class ColorSpinorIOTest : public ::testing::TestWithParam<cs_test_t>
 
 public:
   ColorSpinorIOTest() :
-    site_subset(::testing::get<0>(GetParam())),
-    inflate(::testing::get<1>(GetParam())),
-    prec(::testing::get<2>(GetParam())),
-    prec_io(::testing::get<3>(GetParam())),
-    nSpin(::testing::get<4>(GetParam())),
-    partfile(::testing::get<5>(GetParam())),
-    location(::testing::get<6>(GetParam()))
+    site_subset_saved(::testing::get<0>(GetParam())),
+    site_subset_loaded(::testing::get<1>(GetParam())),
+    parity(::testing::get<2>(GetParam())),
+    inflate(::testing::get<3>(GetParam())),
+    prec(::testing::get<4>(GetParam())),
+    prec_io(::testing::get<5>(GetParam())),
+    nSpin(::testing::get<6>(GetParam())),
+    partfile(::testing::get<7>(GetParam())),
+    location(::testing::get<8>(GetParam()))
   {
   }
 };
@@ -117,37 +122,79 @@ TEST_P(ColorSpinorIOTest, verify)
       || (prec < QUDA_SINGLE_PRECISION && nSpin == 2))
     GTEST_SKIP();
 
+  // There's no meaningful way to save a full parity spinor field and load a
+  // single-parity field
+  if (site_subset_saved == QUDA_FULL_SITE_SUBSET && site_subset_loaded == QUDA_PARITY_SITE_SUBSET) GTEST_SKIP();
+
+  // Site subsets must be the same except for inflation tests
+  if (site_subset_saved != site_subset_loaded && !inflate) GTEST_SKIP();
+
+  // Unique test: saving an inflated single-parity field and loading the full-parity field
+  bool mixed_inflated_full_test
+    = (site_subset_saved == QUDA_PARITY_SITE_SUBSET && site_subset_loaded == QUDA_FULL_SITE_SUBSET && inflate);
+
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   QudaInvertParam inv_param = newQudaInvertParam();
-  ColorSpinorParam param;
   setWilsonGaugeParam(gauge_param);
   setInvertParam(inv_param);
-  constructWilsonTestSpinorParam(&param, &inv_param, &gauge_param);
-  param.siteSubset = site_subset;
-  param.suggested_parity = QUDA_EVEN_PARITY;
-  param.nSpin = nSpin;
-  param.setPrecision(prec, prec, true); // change order to native order
-  param.location = location;
-  if (location == QUDA_CPU_FIELD_LOCATION) param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
-  param.create = QUDA_NULL_FIELD_CREATE;
+
+  // We create a separate parameter struct for saved and loaded ColorSpinorFields
+  ColorSpinorParam param_save, param_load;
+
+  // Saved
+  constructWilsonTestSpinorParam(&param_save, &inv_param, &gauge_param);
+  param_save.siteSubset = site_subset_saved;
+  param_save.suggested_parity = parity; // ignored for full parity
+  param_save.nSpin = nSpin;
+  param_save.setPrecision(prec, prec, true); // change order to native order
+  param_save.location = location;
+  if (location == QUDA_CPU_FIELD_LOCATION) param_save.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+  param_save.create = QUDA_NULL_FIELD_CREATE;
+
+  // Loaded
+  param_load = param_save;
+  param_load.siteSubset = site_subset_loaded;
+
+  // Update ColorSpinorField size appropriately depending on the site subset
+  if (site_subset_saved == QUDA_PARITY_SITE_SUBSET) param_save.x[0] /= 2;
+  if (site_subset_loaded == QUDA_PARITY_SITE_SUBSET) param_load.x[0] /= 2;
 
   // create some random vectors
   auto n_vector = 1;
-  std::vector<ColorSpinorField> v(n_vector, param);
-  std::vector<ColorSpinorField> u(n_vector, param);
+  std::vector<ColorSpinorField> v(n_vector, param_save);
+  std::vector<ColorSpinorField> u(n_vector, param_load);
 
   RNG rng(v[0], 1234);
   for (auto &vi : v) spinorNoise(vi, rng, QUDA_NOISE_GAUSS);
 
   auto file = "dummy.cs";
 
-  VectorIO io(file, inflate, partfile);
-
-  io.save({v.begin(), v.end()}, prec_io, n_vector);
-  io.load(u);
+  // create a separate VectorIO for saving and loading
+  // this lets us test saving a single-parity field with inflation
+  // then loading the full parity field.
+  VectorIO io_save(file, inflate, partfile);
+  VectorIO io_load(file, inflate, partfile);
+
+  io_save.save({v.begin(), v.end()}, prec_io, n_vector);
+  io_load.load(u);
+
+  // This lambda simplifies returning the correct subset of `u` by reference if we're doing
+  // tests where we save an inflated single-parity vector and load a full-parity vector.
+  auto get_u_subset = [&](int i) -> const ColorSpinorField & {
+    if (mixed_inflated_full_test)
+      if (this->parity == QUDA_EVEN_PARITY) // lambdas don't automatically capture `this`
+        return u[i].Even();
+      else
+        return u[i].Odd();
+    else
+      return u[i];
+  };
 
   for (auto i = 0u; i < v.size(); i++) {
-    auto dev = blas::max_deviation(u[i], v[i]);
+    // grab the correct subset of u if we're doing mixed parity/full tests
+    const ColorSpinorField &u_subset = get_u_subset(i);
+
+    auto dev = blas::max_deviation(u_subset, v[i]);
     if (prec == prec_io)
       EXPECT_EQ(dev[0], 0.0);
     else
@@ -185,33 +232,38 @@ INSTANTIATE_TEST_SUITE_P(Gauge, GaugeIOTest, Combine(Values(QUDA_DOUBLE_PRECISIO
 
 // colorspinor full field IO test
 INSTANTIATE_TEST_SUITE_P(Full, ColorSpinorIOTest,
-                         Combine(Values(QUDA_FULL_SITE_SUBSET), Values(false),
+                         Combine(Values(QUDA_FULL_SITE_SUBSET), Values(QUDA_FULL_SITE_SUBSET),
+                                 Values(QUDA_INVALID_PARITY), Values(false),
                                  Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION),
                                  Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), Values(1, 2, 4),
                                  Values(false, true), Values(QUDA_CUDA_FIELD_LOCATION, QUDA_CPU_FIELD_LOCATION)),
                          [](testing::TestParamInfo<cs_test_t> param) {
                            std::string name;
-                           name += get_prec_str(::testing::get<2>(param.param)) + std::string("_");
-                           name += get_prec_str(::testing::get<3>(param.param)) + std::string("_");
-                           name += std::string("spin") + std::to_string(::testing::get<4>(param.param));
-                           name += ::testing::get<5>(param.param) ? "_singlefile" : "_partfile";
-                           name += ::testing::get<6>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host";
+                           name += get_prec_str(::testing::get<4>(param.param)) + std::string("_");
+                           name += get_prec_str(::testing::get<5>(param.param)) + std::string("_");
+                           name += std::string("spin") + std::to_string(::testing::get<6>(param.param));
+                           name += ::testing::get<7>(param.param) ? "_singlefile" : "_partfile";
+                           name += ::testing::get<8>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host";
                            return name;
                          });
 
 // colorspinor parity field IO test
 INSTANTIATE_TEST_SUITE_P(Parity, ColorSpinorIOTest,
-                         Combine(Values(QUDA_PARITY_SITE_SUBSET), Values(false, true),
+                         Combine(Values(QUDA_PARITY_SITE_SUBSET), Values(QUDA_PARITY_SITE_SUBSET, QUDA_FULL_SITE_SUBSET),
+                                 Values(QUDA_EVEN_PARITY, QUDA_ODD_PARITY), Values(false, true),
                                  Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION, QUDA_HALF_PRECISION),
                                  Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION), Values(1, 2, 4),
                                  Values(false, true), Values(QUDA_CUDA_FIELD_LOCATION, QUDA_CPU_FIELD_LOCATION)),
                          [](testing::TestParamInfo<cs_test_t> param) {
                            std::string name;
-                           if (::testing::get<1>(param.param)) name += std::string("inflate_");
-                           name += get_prec_str(::testing::get<2>(param.param)) + std::string("_");
-                           name += get_prec_str(::testing::get<3>(param.param)) + std::string("_");
-                           name += std::string("spin") + std::to_string(::testing::get<4>(param.param));
-                           name += ::testing::get<5>(param.param) ? "_singlefile" : "_partfile";
-                           name += ::testing::get<6>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host";
+                           name += ::testing::get<1>(param.param) == QUDA_PARITY_SITE_SUBSET ? "load_parity_" :
+                                                                                               "load_full_";
+                           name += ::testing::get<2>(param.param) == QUDA_EVEN_PARITY ? "even_" : "odd_";
+                           if (::testing::get<3>(param.param)) name += std::string("inflate_");
+                           name += get_prec_str(::testing::get<4>(param.param)) + std::string("_");
+                           name += get_prec_str(::testing::get<5>(param.param)) + std::string("_");
+                           name += std::string("spin") + std::to_string(::testing::get<6>(param.param));
+                           name += ::testing::get<7>(param.param) ? "_singlefile" : "_partfile";
+                           name += ::testing::get<8>(param.param) == QUDA_CUDA_FIELD_LOCATION ? "_device" : "_host";
                            return name;
                          });
diff --git a/tests/laph_test.cpp b/tests/laph_test.cpp
new file mode 100644
index 0000000000..1ef7f3c883
--- /dev/null
+++ b/tests/laph_test.cpp
@@ -0,0 +1,163 @@
+#include <cstdlib>
+#include <algorithm>
+#include <vector>
+
+#include "instantiate.h"
+#include "test.h"
+#include "misc.h"
+
+/*
+  This test will perhaps eventually evolve into a full test for 3-d
+  Laplace eigenvector generation and sink projection.  For now the
+  focus is eigen-vector projection
+ */
+
+using test_t = std::tuple<QudaPrecision, int, int, int, int>;
+
+using ::testing::Combine;
+using ::testing::get;
+using ::testing::Values;
+
+struct LaphTest : ::testing::TestWithParam<test_t> {
+  test_t param;
+  LaphTest() : param(GetParam()) { }
+};
+
+auto laph_test(test_t param)
+{
+  using namespace quda;
+
+  QudaPrecision precision = get<0>(param);
+  int nSink = get<1>(param);
+  int nEv = get<2>(param);
+  int tileSink = get<3>(param);
+  int tileEv = get<4>(param);
+
+  constexpr int nSpin = 4;
+  constexpr int nColor = 3;
+
+  QudaInvertParam invParam = newQudaInvertParam();
+  invParam.cpu_prec = QUDA_DOUBLE_PRECISION;
+  invParam.cuda_prec = precision;
+  invParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+  invParam.dirac_order = QUDA_DIRAC_ORDER;
+
+  ColorSpinorParam cs_param;
+  cs_param.nColor = nColor;
+  cs_param.nSpin = nSpin;
+  cs_param.x = {xdim, ydim, zdim, tdim};
+  cs_param.siteSubset = QUDA_FULL_SITE_SUBSET;
+  cs_param.setPrecision(invParam.cpu_prec);
+  cs_param.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+  cs_param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+  cs_param.gammaBasis = invParam.gamma_basis;
+  cs_param.pc_type = QUDA_4D_PC;
+  cs_param.location = QUDA_CPU_FIELD_LOCATION;
+  cs_param.create = QUDA_NULL_FIELD_CREATE;
+
+  ColorSpinorField rngDummy(cs_param);
+  RNG rng(rngDummy, 1234);
+
+  // initialize quark sinks
+  std::vector<ColorSpinorField> sinkList(nSink);
+  for (auto &s : sinkList) {
+    s = ColorSpinorField(cs_param);
+    spinorNoise(s, rng, QUDA_NOISE_GAUSS);
+  }
+
+  // initialize EVs
+  cs_param.nSpin = 1;
+  std::vector<ColorSpinorField> evList(nEv);
+  for (auto &e : evList) {
+    e = ColorSpinorField(cs_param);
+    spinorNoise(e, rng, QUDA_NOISE_GAUSS);
+  }
+
+  // host reference [nSink][nEv][Lt][nSpin][complexity]
+  auto Lt = tdim * comm_dim(3);
+  std::vector<Complex> hostRes(nSink * nEv * Lt * nSpin, 0.);
+
+#pragma omp parallel for collapse(4)
+  for (int iEv = 0; iEv < nEv; ++iEv) {
+    for (int iSink = 0; iSink < nSink; ++iSink) {
+      for (int iSpin = 0; iSpin < nSpin; ++iSpin) {
+        for (int iT = 0; iT < tdim; ++iT) {
+          int globT = comm_coord(3) * tdim + iT;
+          for (int iZ = 0; iZ < zdim; ++iZ) {
+            for (int iY = 0; iY < ydim; ++iY) {
+              for (int iX = 0; iX < xdim; ++iX) {
+                int coord[4] = {iX, iY, iZ, iT};
+                int linInd;
+                evList[iEv].OffsetIndex(linInd, coord);
+
+                for (int iCol = 0; iCol < nColor; ++iCol)
+                  hostRes[iSpin + nSpin * (globT + Lt * (iEv + nEv * iSink))]
+                    += conj(evList[iEv].data<Complex *>()[iCol + nColor * linInd])
+                    * sinkList[iSink].data<Complex *>()[iCol + nColor * (iSpin + nSpin * linInd)];
+              }
+            }
+          }
+        } // volume loops
+      }   // spin loop
+    }     // sink loop
+  }       // ev loop
+
+  comm_allreduce_sum(hostRes);
+
+  // QUDA proper
+  void *snkPtr[nSink];
+  for (int iSink = 0; iSink < nSink; ++iSink) snkPtr[iSink] = sinkList[iSink].data();
+
+  void *evPtr[nEv];
+  for (int iEv = 0; iEv < nEv; ++iEv) evPtr[iEv] = evList[iEv].data();
+
+  std::vector<Complex> qudaRes(nSink * nEv * Lt * nSpin, 0.);
+
+  int X[4] = {xdim, ydim, zdim, tdim};
+  laphSinkProject((__complex__ double *)qudaRes.data(), (void **)snkPtr, nSink, tileSink,
+                  (void **)evPtr, nEv, tileEv, &invParam, X);
+  printfQuda("laphSinkProject Done: %g secs, %g Gflops\n", invParam.secs, invParam.gflops / invParam.secs);
+
+  auto tol = getTolerance(cuda_prec);
+  int rtn = 0;
+  for (unsigned int i = 0; i < qudaRes.size(); i++) {
+    auto deviation = abs(qudaRes[i] - hostRes[i]) / abs(hostRes[i]);
+    if (deviation > tol) {
+      printfQuda("EV projection test failed at iEl=%d: (%f,%f) [QUDA], (%f,%f) [host]\n", i, qudaRes[i].real(),
+                 qudaRes[i].imag(), hostRes[i].real(), hostRes[i].imag());
+      EXPECT_LE(deviation, tol);
+      rtn = 1;
+    }
+  }
+  return rtn;
+}
+
+TEST_P(LaphTest, verify)
+{
+  if (!quda::is_enabled(get<0>(GetParam()))) GTEST_SKIP();
+  laph_test(GetParam());
+}
+
+INSTANTIATE_TEST_SUITE_P(LaphTest, LaphTest,
+                         Combine(Values(QUDA_SINGLE_PRECISION, QUDA_DOUBLE_PRECISION), Values(1, 4, 7), Values(2),
+                                 Values(4), Values(2)),
+                         [](testing::TestParamInfo<test_t> param) {
+                           return std::string(get_prec_str(get<0>(param.param))) + "_"
+                             + std::to_string(get<1>(param.param)) + "_" + std::to_string(get<2>(param.param)) + "_"
+                             + std::to_string(get<3>(param.param)) + "_" + std::to_string(get<4>(param.param));
+                         });
+
+int main(int argc, char **argv)
+{
+  quda_test test("laph_test", argc, argv);
+  test.init();
+
+  int result = 0;
+  if (enable_testing) {
+    result = test.execute();
+  } else {
+    result = laph_test({cuda_prec, Msrc, Nsrc, Msrc_tile, Nsrc_tile});
+  }
+
+  return result;
+}
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index 8cfb639c2f..4da9b24b01 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -50,6 +50,8 @@ double gaussian_sigma = 0.2;
 std::string gauge_outfile;
 int Nsrc = 1;
 int Msrc = 1;
+int Nsrc_tile = 1;
+int Msrc_tile = 1;
 int niter = 100;
 int maxiter_precondition = 10;
 QudaVerbosity verbosity_precondition = QUDA_SUMMARIZE;
@@ -100,6 +102,7 @@ double mass = 0.1;
 double kappa = -1.0;
 double mu = 0.1;
 double epsilon = 0.01;
+double evmax = 0.1;
 double m5 = -1.5;
 double b5 = 1.5;
 double c5 = 0.5;
@@ -481,6 +484,8 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
     ->transform(CLI::QUDACheckedTransformer(dslash_type_map));
 
   quda_app->add_option("--epsilon", epsilon, "Twisted-Mass flavor twist of Dirac operator (default 0.01)");
+  quda_app->add_option(
+    "--evmax", evmax, "Twisted-Mass non-degenerate of Dirac operator max eigenvector to scale the force (default 0.01)");
   quda_app->add_option("--epsilon-naik", eps_naik, "Epsilon factor on Naik term (default 0.0, suggested non-zero -0.1)");
 
   quda_app->add_option("--flavor", twist_flavor, "Set the twisted mass flavor type (singlet (default), nondeg-doublet)")
@@ -510,6 +515,7 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
     ->transform(CLI::QUDACheckedTransformer(matpc_type_map));
   quda_app->add_option("--msrc", Msrc,
                        "Used for testing non-square block blas routines where nsrc defines the other dimension");
+  quda_app->add_option("--msrc-tile", Msrc_tile, "Set the Msrc tile size (where applicable)");
   quda_app->add_option("--mu", mu, "Twisted-Mass chiral twist of Dirac operator (default 0.1)");
   quda_app->add_option("--m5", m5, "Mass of shift of five-dimensional Dirac operators (default -1.5)");
   quda_app->add_option("--b5", b5, "Mobius b5 parameter (default 1.5)");
@@ -547,6 +553,7 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
     ->transform(CLI::QUDACheckedTransformer(verbosity_map));
   quda_app->add_option("--nsrc", Nsrc,
                        "How many spinors to apply the dslash to simultaneusly (experimental for staggered only)");
+  quda_app->add_option("--nsrc-tile", Nsrc_tile, "Set the Nsrc tile size (where applicable)");
 
   quda_app->add_option("--pipeline", pipeline,
                        "The pipeline length for fused operations in GCR, BiCGstab-l (default 0, no pipelining)");
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index 4e88d9313e..fa1c21905f 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -188,6 +188,8 @@ extern double gaussian_sigma;
 extern std::string gauge_outfile;
 extern int Nsrc;
 extern int Msrc;
+extern int Nsrc_tile;
+extern int Msrc_tile;
 extern int niter;
 extern int maxiter_precondition;
 extern QudaVerbosity verbosity_precondition;
@@ -238,6 +240,7 @@ extern double mass;
 extern double kappa;
 extern double mu;
 extern double epsilon;
+extern double evmax;
 extern double m5;
 extern double b5;
 extern double c5;
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 629f70e51b..140eccd8a8 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -243,6 +243,7 @@ void constructWilsonTestSpinorParam(quda::ColorSpinorParam *cs_param, const Quda
   } else {
     cs_param->nDim = 4;
   }
+  cs_param->twistFlavor = inv_param->twist_flavor;
   cs_param->pc_type = inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC;
   for (int d = 0; d < 4; d++) cs_param->x[d] = gauge_param->X[d];
   bool pc = is_pc_solution(inv_param->solution_type);
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 24a8668e7d..d001fd6eea 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -43,7 +43,7 @@ extern QudaPrecision &cuda_prec_ritz;
 // Determine if the Laplace operator has been defined
 constexpr bool is_enabled_laplace()
 {
-#ifdef QUDA_LAPLACE
+#ifdef QUDA_DIRAC_LAPLACE
   return true;
 #else
   return false;
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index 196777d574..29a448d7a2 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -372,3 +372,17 @@ const char *get_blas_type_str(QudaBLASType type)
   }
   return s;
 }
+
+const char *get_TwistFlavor_str(QudaTwistFlavorType type)
+{
+  const char *ret;
+
+  switch (type) {
+  case QUDA_TWIST_SINGLET: ret = "singlet_"; break;
+  case QUDA_TWIST_NONDEG_DOUBLET: ret = "ndeg_doublet_"; break;
+  case QUDA_TWIST_NO: ret = ""; break;
+  default: ret = ""; break;
+  }
+
+  return ret;
+}
diff --git a/tests/utils/misc.h b/tests/utils/misc.h
index bf9a8d3039..625cb2ca01 100644
--- a/tests/utils/misc.h
+++ b/tests/utils/misc.h
@@ -25,6 +25,7 @@ const char *get_contract_str(QudaContractType type);
 const char *get_gauge_smear_str(QudaGaugeSmearType type);
 std::string get_dilution_type_str(QudaDilutionType type);
 const char *get_blas_type_str(QudaBLASType type);
+const char *get_TwistFlavor_str(QudaTwistFlavorType type);
 
 #define XUP 0
 #define YUP 1