diff --git a/docs/source/modules/linalg/block.rst b/docs/source/modules/linalg/block.rst
index 2874201c5..b6feaefa2 100644
--- a/docs/source/modules/linalg/block.rst
+++ b/docs/source/modules/linalg/block.rst
@@ -4,7 +4,6 @@ linalg.block
    * :ref:`BlockVectorSpace <blockvectorspace>`
    * :ref:`BlockVector <blockvector>`
    * :ref:`BlockLinearOperator <blocklinearoperator>`
-   * :ref:`BlockDiagonalSolver <blockdiagonalsolver>`  
 
 .. inheritance-diagram:: psydac.linalg.block
 
@@ -31,11 +30,3 @@ BlockLinearOperator
 
 .. autoclass:: psydac.linalg.block.BlockLinearOperator
    :members:
-
-.. _blockdiagonalsolver:
-
-BlockDiagonalSolver
--------------------
-
-.. autoclass:: psydac.linalg.block.BlockDiagonalSolver
-   :members:
diff --git a/docs/source/modules/linalg/direct_solvers.rst b/docs/source/modules/linalg/direct_solvers.rst
index 52451d3e2..452362851 100644
--- a/docs/source/modules/linalg/direct_solvers.rst
+++ b/docs/source/modules/linalg/direct_solvers.rst
@@ -1,20 +1,11 @@
 linalg.direct_solvers
 =====================
 
-   * :ref:`DirectSolver <directsolver>`
    * :ref:`BandedSolver <bandedsolver>`
    * :ref:`SparseSolver <sparsesolver>`
 
 .. inheritance-diagram:: psydac.linalg.direct_solvers
 
-.. _directsolver:
-
-DirectSolver
-------------
-
-.. autoclass:: psydac.linalg.direct_solvers.DirectSolver
-   :members:
-
 .. _bandedsolver:
 
 BandedSolver
diff --git a/psydac/feec/global_projectors.py b/psydac/feec/global_projectors.py
index 7ecb67a7d..9106a9cbb 100644
--- a/psydac/feec/global_projectors.py
+++ b/psydac/feec/global_projectors.py
@@ -4,7 +4,7 @@
 
 from psydac.linalg.kron           import KroneckerLinearSolver
 from psydac.linalg.stencil        import StencilVector
-from psydac.linalg.block          import BlockDiagonalSolver, BlockVector
+from psydac.linalg.block          import BlockLinearOperator, BlockVector
 from psydac.core.bsplines         import quadrature_grid
 from psydac.utilities.quadratures import gauss_legendre
 from psydac.fem.basic             import FemField
@@ -168,7 +168,7 @@ def __init__(self, space, nquads = None):
             self._grid_x += [block_x]
             self._grid_w += [block_w]
 
-            solverblocks += [KroneckerLinearSolver(tensorspaces[i].vector_space, solvercells)]
+            solverblocks += [KroneckerLinearSolver(tensorspaces[i].vector_space, tensorspaces[i].vector_space, solvercells)]
 
             dataslice = tuple(slice(p, -p) for p in tensorspaces[i].vector_space.pads)
             dofs[i] = rhsblocks[i]._data[dataslice]
@@ -177,11 +177,13 @@ def __init__(self, space, nquads = None):
         args = (*intp_x, *quad_x, *quad_w, *dofs)
         self._func = lambda *fun: func(*args, *fun)
 
-        # build a BlockDiagonalSolver, if necessary
+        # build a BlockLinearOperator, if necessary
         if len(solverblocks) == 1:
             self._solver = solverblocks[0]
         else:
-            self._solver = BlockDiagonalSolver(self._space.vector_space, blocks=solverblocks)
+            domain = codomain = self._space.vector_space
+            blocks = {(i, i): B_i for i, B_i in enumerate(solverblocks)}
+            self._solver = BlockLinearOperator(domain, codomain, blocks)
     
     @property
     def space(self):
@@ -307,7 +309,7 @@ def __call__(self, fun):
         else:
             self._func(fun)
 
-        coeffs = self._solver.solve(self._rhs)
+        coeffs = self._solver.dot(self._rhs)
 
         return FemField(self._space, coeffs=coeffs)
 
diff --git a/psydac/linalg/basic.py b/psydac/linalg/basic.py
index 6a545c601..dc86957b4 100644
--- a/psydac/linalg/basic.py
+++ b/psydac/linalg/basic.py
@@ -1005,5 +1005,14 @@ def space(self):
         pass
 
     @abstractmethod
-    def solve(self, rhs, out=None, transposed=False):
+    def transpose(self):
+        """Return the transpose of the LinearSolver."""
         pass
+
+    @abstractmethod
+    def solve(self, rhs, out=None):
+        pass
+
+    @property
+    def T(self):
+        return self.transpose()
diff --git a/psydac/linalg/block.py b/psydac/linalg/block.py
index fb2737d56..8075c07eb 100644
--- a/psydac/linalg/block.py
+++ b/psydac/linalg/block.py
@@ -7,12 +7,12 @@
 from types import MappingProxyType
 from scipy.sparse import bmat, lil_matrix
 
-from psydac.linalg.basic  import VectorSpace, Vector, LinearOperator, LinearSolver, ZeroOperator
-from psydac.ddm.cart      import InterfaceCartDecomposition
-from psydac.ddm.utilities import get_data_exchanger
-from psydac.linalg.stencil import StencilVector, StencilMatrix
+from psydac.linalg.basic    import VectorSpace, Vector, LinearOperator
+from psydac.linalg.stencil  import StencilMatrix
+from psydac.ddm.cart        import InterfaceCartDecomposition
+from psydac.ddm.utilities   import get_data_exchanger
 
-__all__ = ('BlockVectorSpace', 'BlockVector', 'BlockLinearOperator', 'BlockDiagonalSolver')
+__all__ = ('BlockVectorSpace', 'BlockVector', 'BlockLinearOperator')
 
 #===============================================================================
 class BlockVectorSpace(VectorSpace):
@@ -1321,138 +1321,3 @@ def func(blocks, v, out, **args):
 
         self._func    = func
         self._backend = backend
-
-#===============================================================================
-class BlockDiagonalSolver( LinearSolver ):
-    """
-    A LinearSolver that can be written as blocks of other LinearSolvers,
-    i.e. it can be seen as a solver for linear equations with block-diagonal matrices.
-
-    The space of this solver has to be of the type BlockVectorSpace.
-
-    Parameters
-    ----------
-    V : psydac.linalg.block.BlockVectorSpace
-        Space of the new blocked linear solver.
-
-    blocks : dict | list | tuple
-        LinearSolver objects (optional).
-
-        a) 'blocks' can be dictionary with
-            . key   = integer i >= 0
-            . value = corresponding LinearSolver Lii
-
-        b) 'blocks' can be list of LinearSolvers (or tuple of these) where blocks[i]
-            is the LinearSolver Lii (if None, we assume null operator)
-
-    """
-    def __init__( self, V, blocks=None ):
-
-        assert isinstance( V, BlockVectorSpace )
-
-        self._space   = V
-        self._nblocks = V.n_blocks
-
-        # Store blocks in list (hence, they can be manually changed later)
-        self._blocks   = [None] * self._nblocks
-
-        if blocks:
-
-            if isinstance( blocks, dict ):
-                for i, L in blocks.items():
-                    self[i] = L
-
-            elif isinstance( blocks, (list, tuple) ):
-                for i, L in enumerate( blocks ):
-                    self[i] = L
-
-            else:
-                raise ValueError( "Blocks can only be given as dict or 1D list/tuple." )
-
-    #--------------------------------------
-    # Abstract interface
-    #--------------------------------------
-    @property
-    def space( self ):
-        """
-        The space this BlockDiagonalSolver works on.
-        """
-        return self._space
-
-    # ...
-    def solve( self, rhs, out=None, transposed=False ):
-        """
-        Solves the linear system for the given right-hand side rhs.
-        An out vector can be supplied, otherwise a new vector will be allocated.
-
-        This operation supports in-place operations, given that the underlying solvers
-        do as well.
-
-        Parameters
-        ----------
-        rhs : BlockVector
-            The input right-hand side.
-        
-        out : BlockVector | NoneType
-            The output vector, or None.
-        
-        transposed : Bool
-            If true, and supported by the underlying solvers,
-            rhs is solved against the transposed right-hand sides.
-        
-        Returns
-        -------
-        out : BlockVector
-            Either `out`, if given as input parameter, or a newly-allocated vector.
-            In all cases, it holds the result of the computation.
-        """
-        assert isinstance(rhs, BlockVector)
-        assert rhs.space is self.space
-        if out is None:
-            out = self.space.zeros()
-        
-        assert isinstance(out, BlockVector)
-        assert out.space is self.space
-
-        rhs.update_ghost_regions()
-
-        for i, L in enumerate(self._blocks):
-            if L is None:
-                raise NotImplementedError('All solvers have to be defined.')
-            L.solve(rhs[i], out=out[i], transposed=transposed)
-
-        return out
-
-    #--------------------------------------
-    # Other properties/methods
-    #--------------------------------------
-    @property
-    def blocks( self ):
-        """ Immutable 1D view (tuple) of the linear solvers,
-            including the empty blocks as 'None' objects.
-        """
-        return tuple(self._blocks)
-    # ...
-    @property
-    def n_blocks( self ):
-        """
-        The number of blocks in the matrix.
-        """
-        return self._nblocks
-
-    # ...
-    def __getitem__( self, key ):
-        assert 0 <= key < self._nblocks
-
-        return self._blocks.get( key, None )
-
-    # ...
-    def __setitem__( self, key, value ):
-        assert 0 <= key < self._nblocks
-
-        assert isinstance( value, LinearSolver )
-
-        # Check domain of rhs
-        assert value.space is self.space[key]
-
-        self._blocks[key] = value
diff --git a/psydac/linalg/direct_solvers.py b/psydac/linalg/direct_solvers.py
index db6a49c2a..2d57c0f06 100644
--- a/psydac/linalg/direct_solvers.py
+++ b/psydac/linalg/direct_solvers.py
@@ -9,29 +9,10 @@
 
 from psydac.linalg.basic    import LinearSolver
 
-__all__ = ('DirectSolver', 'BandedSolver', 'SparseSolver')
+__all__ = ('BandedSolver', 'SparseSolver')
 
 #===============================================================================
-class DirectSolver( LinearSolver ):
-    """
-    Abstract class for direct linear solvers.
-
-    """
-
-    #-------------------------------------
-    # Deferred methods
-    #-------------------------------------
-    @property
-    @abstractmethod
-    def space( self ):
-        pass
-
-    @abstractmethod
-    def solve( self, rhs, out=None, transposed=False ):
-        pass
-
-#===============================================================================
-class BandedSolver ( DirectSolver ):
+class BandedSolver(LinearSolver):
     """
     Solve the equation Ax = b for x, assuming A is banded matrix.
 
@@ -47,10 +28,11 @@ class BandedSolver ( DirectSolver ):
         Banded matrix.
 
     """
-    def __init__( self, u, l, bmat ):
+    def __init__(self, u, l, bmat, transposed=False):
 
         self._u    = u
         self._l    = l
+        self._transposed = transposed
 
         # ... LU factorization
         if bmat.dtype == np.float32:
@@ -66,7 +48,7 @@ def __init__( self, u, l, bmat ):
             self._factor_function = zgbtrf
             self._solver_function = zgbtrs
         else:
-            msg = f'Cannot create a DirectSolver for bmat.dtype = {bmat.dtype}'
+            msg = f'Cannot create a BandedSolver for bmat.dtype = {bmat.dtype}'
             raise NotImplementedError(msg)
 
         self._bmat, self._ipiv, self._finfo = self._factor_function(bmat, l, u)
@@ -77,22 +59,40 @@ def __init__( self, u, l, bmat ):
         self._dtype = bmat.dtype
 
     @property
-    def finfo( self ):
+    def finfo(self):
         return self._finfo
 
     @property
-    def sinfo( self ):
+    def sinfo(self):
         return self._sinfo
 
     #--------------------------------------
     # Abstract interface
     #--------------------------------------
     @property
-    def space( self ):
+    def space(self):
         return self._space
 
+    def transpose(self):
+        cls = type(self)
+        obj = super().__new__(cls)
+
+        obj._u = self._l
+        obj._l = self._u
+        obj._bmat = self._bmat
+        obj._ipiv = self._ipiv
+        obj._finfo = self._finfo
+        obj._factor_function = self._factor_function
+        obj._solver_function = self._solver_function
+        obj._sinfo = None
+        obj._space = self._space
+        obj._dtype = self._dtype
+        obj._transposed = not self._transposed
+
+        return obj
+
     #...
-    def solve( self, rhs, out=None, transposed=False ):
+    def solve(self, rhs, out=None):
         """
         Solves for the given right-hand side.
 
@@ -106,12 +106,11 @@ def solve( self, rhs, out=None, transposed=False ):
         
         out : ndarray | NoneType
             Output vector. If given, it has to have the same shape and datatype as rhs.
-        
-        transposed : bool
-            If and only if set to true, we solve against the transposed matrix. (supported by the underlying solver)
         """
         assert rhs.T.shape[0] == self._bmat.shape[1]
 
+        transposed = self._transposed
+
         if out is None:
             preout, self._sinfo = self._solver_function(self._bmat, self._l, self._u, rhs.T, self._ipiv,
                                                         trans=transposed)
@@ -134,7 +133,7 @@ def solve( self, rhs, out=None, transposed=False ):
         return out
 
 #===============================================================================
-class SparseSolver ( DirectSolver ):
+class SparseSolver (LinearSolver):
     """
     Solve the equation Ax = b for x, assuming A is scipy sparse matrix.
 
@@ -144,22 +143,33 @@ class SparseSolver ( DirectSolver ):
         Generic sparse matrix.
 
     """
-    def __init__( self, spmat ):
+    def __init__(self, spmat, transposed=False):
 
-        assert isinstance( spmat, spmatrix )
+        assert isinstance(spmat, spmatrix)
 
         self._space = np.ndarray
-        self._splu  = splu( spmat.tocsc() )
+        self._splu  = splu(spmat.tocsc())
+        self._transposed = transposed
 
     #--------------------------------------
     # Abstract interface
     #--------------------------------------
     @property
-    def space( self ):
+    def space(self):
         return self._space
 
+    def transpose(self):
+        cls = type(self)
+        obj = super().__new__(cls)
+
+        obj._space = self._space
+        obj._splu = self._splu
+        obj._transposed = not self._transposed
+
+        return obj
+
     #...
-    def solve( self, rhs, out=None, transposed=False ):
+    def solve(self, rhs, out=None):
         """
         Solves for the given right-hand side.
 
@@ -173,21 +183,19 @@ def solve( self, rhs, out=None, transposed=False ):
         
         out : ndarray | NoneType
             Output vector. If given, it has to have the same shape and datatype as rhs.
-        
-        transposed : bool
-            If and only if set to true, we solve against the transposed matrix. (supported by the underlying solver)
         """
         
         assert rhs.T.shape[0] == self._splu.shape[1]
+        transposed = self._transposed
 
         if out is None:
-            out = self._splu.solve( rhs.T, trans='T' if transposed else 'N' ).T
+            out = self._splu.solve(rhs.T, trans='T' if transposed else 'N').T
 
         else:
             assert out.shape == rhs.shape
             assert out.dtype == rhs.dtype
 
             # currently no in-place solve exposed
-            out[:] = self._splu.solve( rhs.T, trans='T' if transposed else 'N' ).T
+            out[:] = self._splu.solve(rhs.T, trans='T' if transposed else 'N').T
 
         return out
diff --git a/psydac/linalg/fft.py b/psydac/linalg/fft.py
index 6cd566353..5d67d178b 100644
--- a/psydac/linalg/fft.py
+++ b/psydac/linalg/fft.py
@@ -48,8 +48,11 @@ def __init__(self, function):
         @property
         def space(self):
             return np.ndarray
+
+        def transpose(self):
+            raise NotImplementedError('transpose() is not implemented for OneDimSolvers')
         
-        def solve(self, rhs, out=None, transposed=False):
+        def solve(self, rhs, out=None):
             if out is None:
                 out = np.empty_like(rhs)
             
@@ -67,7 +70,7 @@ def __init__(self, space, functions):
         else:
             onedimsolver = DistributedFFTBase.OneDimSolver(functions)
             solvers = [onedimsolver] * space.ndim
-        self._isolver = KroneckerLinearSolver(space, solvers)
+        self._isolver = KroneckerLinearSolver(space, space, solvers)
 
     # ...
     @property
diff --git a/psydac/linalg/kron.py b/psydac/linalg/kron.py
index 0186f119f..2e2f8bb10 100644
--- a/psydac/linalg/kron.py
+++ b/psydac/linalg/kron.py
@@ -371,7 +371,7 @@ def exchange_assembly_data( self ):
     def set_backend(self, backend):
         pass
 #==============================================================================
-class KroneckerLinearSolver(LinearSolver):
+class KroneckerLinearSolver(LinearOperator):
     """
     A solver for Ax=b, where A is a Kronecker matrix from arbirary dimension d,
     defined by d solvers. We also need information about the space of b.
@@ -380,34 +380,45 @@ class KroneckerLinearSolver(LinearSolver):
     ----------
     V : StencilVectorSpace
         The space b will live in; i.e. which gives us information about
-        the distribution of the right-hand sides.
+        the distribution of the right-hand side b.
+
+    W : StencilVectorSpace
+        The space x will live in; i.e. which gives us information about
+        the distribution of the unknown vector x.
     
     solvers : list of LinearSolver
         The components of A in each dimension.
     
     Attributes
     ----------
-    space : StencilVectorSpace
-        The space our vectors to solve live in.
+    domain : StencilVectorSpace
+        The space of the rhs vector b.
+
+    codomain : StencilVectorSpace
+        The space of the unknown vector x.
     """
-    def __init__(self, V, solvers):
+    def __init__(self, V, W, solvers):
         assert isinstance(V, StencilVectorSpace)
+        assert isinstance(W, StencilVectorSpace)
         assert hasattr( solvers, '__iter__' )
         for solver in solvers:
             assert isinstance(solver, LinearSolver)
 
         assert V.ndim == len(solvers)
+        assert W.ndim == len(solvers)
+        assert V.npts == W.npts
 
         # general arguments
-        self._space = V
+        self._domain = V
+        self._codomain = W
         self._solvers = solvers
-        self._parallel = self._space.parallel
-        self._dtype = self._space._dtype
+        self._parallel = self._domain.parallel
+        self._dtype = self._codomain._dtype
         if self._parallel:
-            self._mpi_type = V._mpi_type
+            self._mpi_type = self._domain._mpi_type
         else:
             self._mpi_type = None
-        self._ndim = self._space.ndim
+        self._ndim = self._codomain.ndim
 
         # compute and setup solver arguments
         self._setup_solvers()
@@ -424,12 +435,12 @@ def _setup_solvers(self):
         (which potentially utilize MPI).
         """
         # slice sizes
-        starts = np.array(self._space.starts)
-        ends = np.array(self._space.ends) + 1
+        starts = np.array(self._domain.starts)
+        ends = np.array(self._domain.ends) + 1
         self._slice = tuple([slice(s, e) for s,e in zip(starts, ends)])
 
         # local and global sizes
-        nglobals = self._space.npts
+        nglobals = self._domain.npts
         nlocals = ends - starts
         self._localsize = np.product(nlocals)
         mglobals = self._localsize // nlocals
@@ -446,15 +457,15 @@ def _setup_solvers(self):
             # useful e.g. if we have little data in some directions
             # (and thus no data distributed there)
 
-            if not self._parallel or self._space.cart.subcomm[i].size <= 1:
+            if not self._parallel or self._domain.cart.subcomm[i].size <= 1:
                 # serial solve
                 solver_passes[i] = KroneckerLinearSolver.KroneckerSolverSerialPass(
                         self._solvers[i], nglobals[i], mglobals[i])
             else:
                 # for the parallel case, use Alltoallv
                 solver_passes[i] = KroneckerLinearSolver.KroneckerSolverParallelPass(
-                        self._solvers[i], self._space._mpi_type, i,
-                        self._space.cart, mglobals[i], nglobals[i], nlocals[i], self._localsize)
+                        self._solvers[i], self._domain._mpi_type, i,
+                        self._domain.cart, mglobals[i], nglobals[i], nlocals[i], self._localsize)
 
                 # we have a parallel solve pass now, so we are not completely local any more
                 self._allserial = False
@@ -499,14 +510,33 @@ def _allocate_temps(self):
         else:
             temp2 = np.empty((self._tempsize,), dtype=self._dtype)
         return temp1, temp2
+
+    @property
+    def domain(self):
+        return self._domain
     
     @property
-    def space(self):
-        """
-        Returns the space associated to this solver (i.e. where the information
-        about the cartesian distribution is taken from).
-        """
-        return self._space
+    def codomain(self):
+        return self._codomain
+    
+    @property
+    def dtype(self):
+        return None
+    
+    def toarray(self):
+        raise NotImplementedError('toarray() is not defined for KroneckerLinearSolvers.')
+    
+    def tosparse(self):
+        raise NotImplementedError('tosparse() is not defined for KroneckerLinearSolvers.')
+
+    def transpose(self, conjugate=False):
+        new_domain = self._codomain
+        new_codomain = self._domain
+        new_solvers = [solver.transpose() for solver in self._solvers]
+        return KroneckerLinearSolver(new_domain, new_codomain, new_solvers)
+
+    def dot(self, v, out=None):
+        return self.solve(v, out=out)
 
     @property
     def solvers(self):
@@ -515,18 +545,18 @@ def solvers(self):
         """
         return tuple(self._solvers)
 
-    def solve(self, rhs, out=None, transposed=False):
+    def solve(self, rhs, out=None):
         """
         Solves Ax=b where A is a Kronecker product matrix (and represented as such),
         and b is a suitable vector.
         """
 
         # type checks
-        assert rhs.space is self._space
+        assert rhs.space is self._domain
 
         if out is not None:
             assert isinstance( out, StencilVector )
-            assert out.space is self._space
+            assert out.space is self._codomain
         else:
             out = StencilVector( rhs.space )
         
@@ -534,12 +564,12 @@ def solve(self, rhs, out=None, transposed=False):
         outslice = out[self._slice]
 
         # call the actual kernel
-        self._solve_nd(inslice, outslice, transposed)
+        self._solve_nd(inslice, outslice)
         
         out.update_ghost_regions()
         return out
  
-    def _solve_nd(self, inslice, outslice, transposed):
+    def _solve_nd(self, inslice, outslice):
         """
         The internal solve loop. Can handle arbitrary dimensions.
         """
@@ -552,14 +582,14 @@ def _solve_nd(self, inslice, outslice, transposed):
         # internal passes
         for i in range(self._ndim - 1):
             # solve direction
-            self._solver_passes[i].solve_pass(temp1, temp2, transposed)
+            self._solver_passes[i].solve_pass(temp1, temp2)
 
             # reorder and swap
             self._reorder_temp_to_temp(temp1, temp2, i)
             temp1, temp2 = temp2, temp1
         
         # last pass
-        self._solver_passes[-1].solve_pass(temp1, temp2, transposed)
+        self._solver_passes[-1].solve_pass(temp1, temp2)
 
         # copy to output
         self._reorder_temp_to_outslice(temp1, outslice)
@@ -604,7 +634,7 @@ class KroneckerSolverSerialPass:
 
         Parameters
         ----------
-        solver : DirectSolver
+        solver : BandedSolver or SparseSolver
             The internally used solver class.
         
         nglobal : int
@@ -629,7 +659,7 @@ def required_memory(self):
             """
             return self._datasize
 
-        def solve_pass(self, workmem, tempmem, transposed):
+        def solve_pass(self, workmem, tempmem):
             """
             Solves the data available in workmem, assuming that all data is available locally.
 
@@ -642,16 +672,13 @@ def solve_pass(self, workmem, tempmem, transposed):
             
             tempmem : ndarray
                 Ignored, it exists for compatibility with the parallel solver.
-            
-            transposed : bool
-                True, if and only if we want to solve against the transposed matrix instead.
             """
             # reshape necessary memory in column-major
             view = workmem[:self._datasize]
             view.shape = (self._numrhs,self._dimrhs)
 
             # call solver in in-place mode
-            self._solver.solve(view, out=view, transposed=transposed)
+            self._solver.solve(view, out=view)
 
     class KroneckerSolverParallelPass:
         """
@@ -671,7 +698,7 @@ class KroneckerSolverParallelPass:
 
         Parameters
         ----------
-        solver : DirectSolver
+        solver : BandedSolver or SparseSolver
             The internally used solver class.
         
         mpi_type : MPI type
@@ -816,7 +843,7 @@ def _contiguous_to_blocked(self, blocked, contiguous):
                 contiguouspart.shape = (self._mlocal,end-start)
                 contiguouspart[:] = blocked_view[:,start:end]
 
-        def solve_pass(self, workmem, tempmem, transposed):
+        def solve_pass(self, workmem, tempmem):
             """
             Solves the data available in workmem in a distributed manner, using MPI_Alltoallv.
 
@@ -829,9 +856,6 @@ def solve_pass(self, workmem, tempmem, transposed):
             
             tempmem : ndarray
                 Temporary array of the same minimum size as workmem.
-            
-            transposed : bool
-                True, if and only if we want to solve against the transposed matrix instead.
             """
             # preparation
             sourceargs = [workmem[:self._localsize], self._source_transfer, self._mpi_type]
@@ -844,7 +868,7 @@ def solve_pass(self, workmem, tempmem, transposed):
             self._blocked_to_contiguous(workmem, tempmem)
 
             # actual solve (source contains the data)
-            self._serialsolver.solve_pass(workmem, tempmem, transposed)
+            self._serialsolver.solve_pass(workmem, tempmem)
 
             # ordered stripes -> blocked stripes
             self._contiguous_to_blocked(workmem, tempmem)
@@ -853,7 +877,7 @@ def solve_pass(self, workmem, tempmem, transposed):
             self._comm.Alltoallv(targetargs, sourceargs)
 
 #==============================================================================
-def kronecker_solve(solvers, rhs, out=None, transposed=False):
+def kronecker_solve(solvers, rhs, out=None):
     """
     Solve linear system Ax=b with A=kron( A_n, A_{n-1}, ..., A_2, A_1 ), given
     $n$ separate linear solvers $L_n$ for the 1D problems $A_n x_n = b_n$:
@@ -883,5 +907,5 @@ def kronecker_solve(solvers, rhs, out=None, transposed=False):
     else:
         out = StencilVector(rhs.space)
 
-    kronsolver = KroneckerLinearSolver(rhs.space, solvers)
-    return kronsolver.solve(rhs, out=out, transposed=transposed)
+    kronsolver = KroneckerLinearSolver(rhs.space, rhs.space, solvers)
+    return kronsolver.solve(rhs, out=out)
diff --git a/psydac/linalg/tests/test_block.py b/psydac/linalg/tests/test_block.py
index c67f49262..00aa98b0d 100644
--- a/psydac/linalg/tests/test_block.py
+++ b/psydac/linalg/tests/test_block.py
@@ -8,7 +8,7 @@
 from psydac.linalg.direct_solvers import SparseSolver
 from psydac.linalg.stencil        import StencilVectorSpace, StencilVector, StencilMatrix
 from psydac.linalg.block          import BlockVectorSpace, BlockVector
-from psydac.linalg.block          import BlockLinearOperator, BlockDiagonalSolver
+from psydac.linalg.block          import BlockLinearOperator
 from psydac.linalg.utilities      import array_to_psydac
 from psydac.linalg.kron           import KroneckerLinearSolver
 from psydac.api.settings          import PSYDAC_BACKEND_GPYCCEL
@@ -192,114 +192,6 @@ def test_2D_block_linear_operator_serial_init( dtype, n1, n2, p1, p2, P1, P2  ):
     assert np.array_equal( coo1.data, coo3.data )
 #===============================================================================
 @pytest.mark.parametrize( 'dtype', [float, complex] )
-@pytest.mark.parametrize( 'n1', [8, 16] )
-@pytest.mark.parametrize( 'n2', [8, 12] )
-@pytest.mark.parametrize( 'p1', [1, 2] )
-@pytest.mark.parametrize( 'p2', [1, 3] )
-@pytest.mark.parametrize( 'P1', [True, False] )
-@pytest.mark.parametrize( 'P2', [True] )
-def test_2D_block_diagonal_solver_serial_init( dtype, n1, n2, p1, p2, P1, P2  ):
-    # set seed for reproducibility
-    seed(n1*n2*p1*p2)
-
-    D = DomainDecomposition([n1,n2], periods=[P1,P2])
-
-    # Partition the points
-    npts = [n1,n2]
-    global_starts, global_ends = compute_global_starts_ends(D, npts)
-
-    cart = CartDecomposition(D, npts, global_starts, global_ends, pads=[p1,p2], shifts=[1,1])
-
-    # Create vector spaces, stencil matrices, and stencil vectors
-    V = StencilVectorSpace( cart, dtype=dtype)
-
-    # Fill in stencil matrices based on diagonal index
-    if dtype==complex:
-        f1=lambda k1,k2: 10j*k1+k2
-    else:
-        f1=lambda k1,k2: 10*k1+k2
-
-    m11 = np.zeros((n1, n1), dtype=dtype)
-    m12 = np.zeros((n2, n2), dtype=dtype)
-    for j in range(n1):
-        for i in range(-p1,p1+1):
-            m11[j, max(0, min(n1-1, j+i))] = f1(j,i)
-    for j in range(n2):
-        for i in range(-p2,p2+1):
-            m12[j, max(0, min(n2-1, j+i))] = f1(j,5*i)+2.
-
-
-    if dtype==complex:
-        f2=lambda k1,k2: 10j*k1**2+k2**3
-    else:
-        f2=lambda k1,k2: 10*k1**2+k2**3
-
-    m21 = np.zeros((n1, n1), dtype=dtype)
-    m22 = np.zeros((n2, n2), dtype=dtype)
-
-    for j in range(n1):
-        for i in range(-p1,p1+1):
-            m21[j, max(0, min(n1-1, j+i))] = f2(j,i)
-    for j in range(n2):
-        for i in range(-p2,p2+1):
-            m22[j, max(0, min(n2-1, j+i))] = f2(j,2*i)+2.
-
-    M11 = SparseSolver( spa.csc_matrix(m11) )
-    M12 = SparseSolver( spa.csc_matrix(m12) )
-    M21 = SparseSolver( spa.csc_matrix(m21) )
-    M22 = SparseSolver( spa.csc_matrix(m22) )
-    M1 = KroneckerLinearSolver(V, [M11,M12])
-    M2 = KroneckerLinearSolver(V, [M21,M22])
-    x1 = StencilVector( V )
-    x2 = StencilVector( V )
-
-    W = BlockVectorSpace(V, V)
-
-    # Fill in vector with random values, then update ghost regions
-    for i1 in range(n1):
-        for i2 in range(n2):
-            x1[i1,i2] = 2.0*random() - 1.0
-            x2[i1,i2] = 5.0*random() - 1.0
-    x1.update_ghost_regions()
-    x2.update_ghost_regions()
-
-    # Construct a BlockVector object containing x1 and x2
-    #     |x1|
-    # X = |  |
-    #     |x2|
-
-    X = BlockVector(W)
-    X[0] = x1
-    X[1] = x2
-
-    # Construct a BlockDiagonalSolver object containing M1, M2 using 3 ways
-    #     |M1  0 |
-    # L = |      |
-    #     |0   M2|
-
-    dict_blocks = {0:M1, 1:M2}
-    list_blocks = [M1, M2]
-
-    L1 = BlockDiagonalSolver( W, blocks=dict_blocks )
-    L2 = BlockDiagonalSolver( W, blocks=list_blocks )
-
-    L3 = BlockDiagonalSolver( W )
-
-    # Test for not allowing undefinedness
-    errresult = False
-    try:
-        L3.solve(X)
-    except NotImplementedError:
-        errresult = True
-    assert errresult
-
-    L3[0] = M1
-    L3[1] = M2
-    assert L3.space == W
-    assert L3.blocks == (M1, M2)
-    assert L3.n_blocks == 2
-#===============================================================================
-@pytest.mark.parametrize( 'dtype', [float, complex] )
 @pytest.mark.parametrize( 'ndim', [1, 2, 3] )
 @pytest.mark.parametrize( 'p', [1, 2] )
 @pytest.mark.parametrize( 'P1', [True, False] )
@@ -787,155 +679,6 @@ def test_block_linear_operator_serial_dot( dtype, n1, n2, p1, p2, P1, P2  ):
 @pytest.mark.parametrize( 'p2', [1, 3] )
 @pytest.mark.parametrize( 'P1', [True, False] )
 @pytest.mark.parametrize( 'P2', [True] )
-def test_block_diagonal_solver_serial_dot( dtype, n1, n2, p1, p2, P1, P2 ):
-    # set seed for reproducibility
-    seed(n1*n2*p1*p2)
-
-    D = DomainDecomposition([n1,n2], periods=[P1,P2])
-
-    # Partition the points
-    npts = [n1,n2]
-    global_starts, global_ends = compute_global_starts_ends(D, npts)
-
-    cart = CartDecomposition(D, npts, global_starts, global_ends, pads=[p1,p2], shifts=[1,1])
-
-    # Create vector spaces, stencil matrices, and stencil vectors
-    V = StencilVectorSpace( cart, dtype=dtype)
-
-    # Fill in stencil matrices based on diagonal index
-    if dtype==complex:
-        f1=lambda k1,k2: 10j*k1+k2
-    else:
-        f1=lambda k1,k2: 10*k1+k2
-
-    m11 = np.zeros((n1, n1), dtype=dtype)
-    m12 = np.zeros((n2, n2), dtype=dtype)
-    for j in range(n1):
-        for i in range(-p1,p1+1):
-            m11[j, max(0, min(n1-1, j+i))] = f1(j,i)
-    for j in range(n2):
-        for i in range(-p2,p2+1):
-            m12[j, max(0, min(n2-1, j+i))] = f1(j,5*i)+2.
-
-
-    if dtype==complex:
-        f2=lambda k1,k2: 10j*k1**2+k2**3
-    else:
-        f2=lambda k1,k2: 10*k1**2+k2**3
-
-    m21 = np.zeros((n1, n1), dtype=dtype)
-    m22 = np.zeros((n2, n2), dtype=dtype)
-    for j in range(n1):
-        for i in range(-p1,p1+1):
-            m21[j, max(0, min(n1-1, j+i))] = f2(j,i)
-    for j in range(n2):
-        for i in range(-p2,p2+1):
-            m22[j, max(0, min(n2-1, j+i))] = f2(j,2*i)+2.
-    
-    M11 = SparseSolver( spa.csc_matrix(m11) )
-    M12 = SparseSolver( spa.csc_matrix(m12) )
-    M21 = SparseSolver( spa.csc_matrix(m21) )
-    M22 = SparseSolver( spa.csc_matrix(m22) )
-    M1 = KroneckerLinearSolver(V, [M11,M12])
-    M2 = KroneckerLinearSolver(V, [M21,M22])
-    x1 = StencilVector( V )
-    x2 = StencilVector( V )
-
-    W = BlockVectorSpace(V, V)
-
-    # Fill in vector with random values, then update ghost regions
-    for i1 in range(n1):
-        for i2 in range(n2):
-            x1[i1,i2] = 2.0*random() - 1.0
-            x2[i1,i2] = 5.0*random() - 1.0
-    x1.update_ghost_regions()
-    x2.update_ghost_regions()
-
-    # Construct a BlockVector object containing x1 and x2
-    #     |x1|
-    # X = |  |
-    #     |x2|
-
-    X = BlockVector(W)
-    X[0] = x1
-    X[1] = x2
-
-    # Construct a BlockDiagonalSolver object containing M1, M2 using 3 ways
-    #     |M1  0 |
-    # L = |      |
-    #     |0   M2|
-
-    dict_blocks = {0:M1, 1:M2}
-    list_blocks = [M1, M2]
-
-    L1 = BlockDiagonalSolver( W, blocks=dict_blocks )
-    L2 = BlockDiagonalSolver( W, blocks=list_blocks )
-
-    L3 = BlockDiagonalSolver( W )
-
-    # Test for not allowing undefinedness
-    errresult = False
-    try:
-        L3.solve(X)
-    except NotImplementedError:
-        errresult = True
-    assert errresult
-
-    L3[0] = M1
-    L3[1] = M2
-
-    # Compute BlockDiagonalSolver product
-    Y1 = L1.solve(X)
-    Y2 = L2.solve(X)
-    Y3 = L3.solve(X)
-
-    # Transposed
-    Yt = L1.solve(X, transposed=True)
-
-    # Test other in/out methods
-    Y4a = W.zeros()
-    Y4b = L1.solve(X, out=Y4a)
-    assert Y4b is Y4a
-
-    Y5a = W.zeros()
-    Y5a[0] = x1.copy()
-    Y5a[1] = x2.copy()
-    Y5b = L1.solve(Y5a, out=Y5a)
-    assert Y5b is Y5a
-
-    # Solve linear equations for each block
-    y1 = M1.solve(x1)
-    y2 = M2.solve(x2)
-
-    y1t = M1.solve(x1, transposed=True)
-    y2t = M2.solve(x2, transposed=True)
-
-    # Check data in 1D array
-    assert np.allclose( Y1.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y1.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y2.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y2.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y3.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y3.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y4a.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y4a.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y5a.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y5a.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Yt.blocks[0].toarray(), y1t.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Yt.blocks[1].toarray(), y2t.toarray(), rtol=1e-14, atol=1e-14 )
-#===============================================================================
-@pytest.mark.parametrize( 'dtype', [float, complex] )
-@pytest.mark.parametrize( 'n1', [8, 16] )
-@pytest.mark.parametrize( 'n2', [8, 12] )
-@pytest.mark.parametrize( 'p1', [1, 2] )
-@pytest.mark.parametrize( 'p2', [1, 3] )
-@pytest.mark.parametrize( 'P1', [True, False] )
-@pytest.mark.parametrize( 'P2', [True] )
 
 def test_block_2d_array_to_psydac_1( dtype, n1, n2, p1, p2, P1, P2 ):
     #Define a factor for the data
@@ -1196,155 +939,6 @@ def test_block_linear_operator_parallel_dot( dtype, n1, n2, p1, p2, P1, P2 ):
     assert np.allclose( Y.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
 #===============================================================================
 @pytest.mark.parametrize( 'dtype', [float, complex] )
-@pytest.mark.parametrize( 'n1', [8,16] )
-@pytest.mark.parametrize( 'n2', [8,32] )
-@pytest.mark.parametrize( 'p1', [1,3] )
-@pytest.mark.parametrize( 'p2', [2] )
-@pytest.mark.parametrize( 'P1', [True, False] )
-@pytest.mark.parametrize( 'P2', [True] )
-@pytest.mark.parallel
-def test_block_diagonal_solver_parallel_dot( dtype, n1, n2, p1, p2, P1, P2  ):
-    # Define a factor for the data
-    if dtype == complex:
-        factor = 1j
-    else:
-        factor = 1
-    # set seed for reproducibility
-    seed(n1*n2*p1*p2)
-
-    from mpi4py       import MPI
-    comm = MPI.COMM_WORLD
-    D = DomainDecomposition([n1,n2], periods=[P1,P2], comm=comm)
-
-    # Partition the points
-    npts = [n1,n2]
-    global_starts, global_ends = compute_global_starts_ends(D, npts)
-
-    cart = CartDecomposition(D, npts, global_starts, global_ends, pads=[p1,p2], shifts=[1,1])
-
-    # Create vector spaces, stencil matrices, and stencil vectors
-    V = StencilVectorSpace( cart, dtype=dtype )
-
-    s1,s2 = V.starts
-    e1,e2 = V.ends
-
-    # Fill in stencil matrices based on diagonal index
-    m11 = np.zeros((n1, n1),dtype=dtype)
-    m12 = np.zeros((n2, n2),dtype=dtype)
-    for j in range(n1):
-        for i in range(-p1,p1+1):
-            m11[j, max(0, min(n1-1, j+i))] = 10*factor*j+i
-    for j in range(n2):
-        for i in range(-p2,p2+1):
-            m12[j, max(0, min(n2-1, j+i))] = 20*factor*j+5*i+2.
-    
-    m21 = np.zeros((n1, n1),dtype=dtype)
-    m22 = np.zeros((n2, n2),dtype=dtype)
-    for j in range(n1):
-        for i in range(-p1,p1+1):
-            m21[j, max(0, min(n1-1, j+i))] = 10*factor*j**2+i**3
-    for j in range(n2):
-        for i in range(-p2,p2+1):
-            m22[j, max(0, min(n2-1, j+i))] = 20*factor*j**2+i**3+2.
-    
-    M11 = SparseSolver( spa.csc_matrix(m11) )
-    M12 = SparseSolver( spa.csc_matrix(m12) )
-    M21 = SparseSolver( spa.csc_matrix(m21) )
-    M22 = SparseSolver( spa.csc_matrix(m22) )
-    M1 = KroneckerLinearSolver(V, [M11,M12])
-    M2 = KroneckerLinearSolver(V, [M21,M22])
-    x1 = StencilVector( V )
-    x2 = StencilVector( V )
-
-    W = BlockVectorSpace(V, V)
-
-    # Fill in vector with random values, then update ghost regions
-    for i1 in range(s1,e1+1):
-        for i2 in range(s2,e2+1):
-            x1[i1,i2] = 2.0*factor*random() - 1.0
-            x2[i1,i2] = 5.0*factor*random() - 1.0
-    x1.update_ghost_regions()
-    x2.update_ghost_regions()
-
-    # Construct a BlockVector object containing x1 and x2
-    #     |x1|
-    # X = |  |
-    #     |x2|
-
-    X = BlockVector(W)
-    X[0] = x1
-    X[1] = x2
-
-    # Construct a BlockDiagonalSolver object containing M1, M2 using 3 ways
-    #     |M1  0 |
-    # L = |      |
-    #     |0   M2|
-
-    dict_blocks = {0:M1, 1:M2}
-    list_blocks = [M1, M2]
-
-    L1 = BlockDiagonalSolver( W, blocks=dict_blocks )
-    L2 = BlockDiagonalSolver( W, blocks=list_blocks )
-
-    L3 = BlockDiagonalSolver( W )
-
-    # Test for not allowing undefinedness
-    errresult = False
-    try:
-        L3.solve(X)
-    except NotImplementedError:
-        errresult = True
-    assert errresult
-
-    L3[0] = M1
-    L3[1] = M2
-
-    # Compute BlockDiagonalSolver product
-    Y1 = L1.solve(X)
-    Y2 = L2.solve(X)
-    Y3 = L3.solve(X)
-
-    # Transposed
-    Yt = L1.solve(X, transposed=True)
-
-    # Test other in/out methods
-    Y4a = W.zeros()
-    Y4b = L1.solve(X, out=Y4a)
-    assert Y4b is Y4a
-
-    Y5a = W.zeros()
-    Y5a[0] = x1.copy()
-    Y5a[1] = x2.copy()
-    Y5b = L1.solve(Y5a, out=Y5a)
-    assert Y5b is Y5a
-
-    # Solve linear equations for each block
-    y1 = M1.solve(x1)
-    y2 = M2.solve(x2)
-
-    y1t = M1.solve(x1, transposed=True)
-    y2t = M2.solve(x2, transposed=True)
-
-    # Check data in 1D array
-    assert np.allclose( Y1.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y1.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y2.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y2.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y3.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y3.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y4a.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y4a.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Y5a.blocks[0].toarray(), y1.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Y5a.blocks[1].toarray(), y2.toarray(), rtol=1e-14, atol=1e-14 )
-
-    assert np.allclose( Yt.blocks[0].toarray(), y1t.toarray(), rtol=1e-14, atol=1e-14 )
-    assert np.allclose( Yt.blocks[1].toarray(), y2t.toarray(), rtol=1e-14, atol=1e-14 )
-#===============================================================================
-@pytest.mark.parametrize( 'dtype', [float, complex] )
 @pytest.mark.parametrize( 'n1', [8, 16] )
 @pytest.mark.parametrize( 'n2', [8, 32] )
 @pytest.mark.parametrize( 'p1', [1, 3] )
diff --git a/psydac/linalg/tests/test_kron_direct_solver.py b/psydac/linalg/tests/test_kron_direct_solver.py
index 836b0b1b3..873500b5a 100644
--- a/psydac/linalg/tests/test_kron_direct_solver.py
+++ b/psydac/linalg/tests/test_kron_direct_solver.py
@@ -3,13 +3,26 @@
 import pytest
 import time
 import numpy as np
-from mpi4py                     import MPI
-from psydac.ddm.cart               import DomainDecomposition, CartDecomposition
+from mpi4py             import MPI
+
+
 from scipy.sparse               import csc_matrix, dia_matrix, kron
 from scipy.sparse.linalg        import splu
-from psydac.linalg.stencil         import StencilVectorSpace, StencilVector, StencilMatrix
-from psydac.linalg.kron            import KroneckerLinearSolver
+
+from sympde.calculus import dot
+from sympde.expr     import BilinearForm, integral
+from sympde.topology import Line
+from sympde.topology import Cube, Square
+from sympde.topology import Derham
+from sympde.topology import elements_of
+
+from psydac.api.discretization     import discretize
+from psydac.ddm.cart               import DomainDecomposition, CartDecomposition
+from psydac.linalg.block           import BlockLinearOperator
 from psydac.linalg.direct_solvers  import SparseSolver, BandedSolver
+from psydac.linalg.kron            import KroneckerLinearSolver
+from psydac.linalg.solvers         import inverse
+from psydac.linalg.stencil         import StencilVectorSpace, StencilVector, StencilMatrix
 
 #===============================================================================
 def compute_global_starts_ends(domain_decomposition, npts):
@@ -70,10 +83,10 @@ def matrix_to_sparse(A):
     A.remove_spurious_entries()
     return SparseSolver(A.tosparse())
 
-def random_matrix(seed, space):
-    A = StencilMatrix(space, space)
-    p = space.pads[0]
-    dtype = space.dtype
+def random_matrix(seed, domain, codomain):
+    A = StencilMatrix(domain, codomain)
+    p = domain.pads[0] # by definition of the StencilMatrix, domain.pads == codomain.pads
+    dtype = domain.dtype # by definition of the StencilMatrix, domain.dtype == codomain.dtype
 
     # for now, take matrices like this (as in the other tests)
     if dtype==complex:
@@ -107,31 +120,41 @@ def compare_solve(seed, comm, npts, pads, periods, direct_solver, dtype=float, t
     # vector spaces
     comm = MPI.COMM_WORLD
     D = DomainDecomposition(npts, periods=periods, comm=comm)
+    D2 = DomainDecomposition(npts, periods=periods, comm=comm)
 
     # Partition the points
     global_starts, global_ends = compute_global_starts_ends(D, npts)
+    global_starts2, global_ends2 = compute_global_starts_ends(D2, npts)
 
     cart = CartDecomposition(D, npts, global_starts, global_ends, pads=pads, shifts=[1]*len(pads))
+    cart2 = CartDecomposition(D2, npts, global_starts2, global_ends2, pads=pads, shifts=[1]*len(pads))
 
     V = StencilVectorSpace(cart, dtype=dtype)
+    W = StencilVectorSpace(cart2, dtype=dtype)
 
     Ds    = [DomainDecomposition([n], periods=[P]) for n,P in zip(npts, periods)]
     carts = [CartDecomposition(Di, [n],  *compute_global_starts_ends(Di, [n]), pads=[p], shifts=[1]) for Di,n,p in zip(Ds, npts, pads)]
     Vs = [StencilVectorSpace(carti, dtype=dtype) for carti in carts]
+    Ds2    = [DomainDecomposition([n], periods=[P]) for n,P in zip(npts, periods)]
+    carts2 = [CartDecomposition(Di, [n],  *compute_global_starts_ends(Di, [n]), pads=[p], shifts=[1]) for Di,n,p in zip(Ds2, npts, pads)]
+    Ws = [StencilVectorSpace(carti, dtype=dtype) for carti in carts2]
     localslice = tuple([slice(s, e+1) for s, e in zip(V.starts, V.ends)])
 
     if verbose:
         print(f'[{rank}] Vector spaces built', flush=True)
 
     # bulid matrices (A)
-    A = [random_matrix(seed+i+1, Vi) for i,Vi in enumerate(Vs)]
+    A = [random_matrix(seed+i+1, Vi, Wi) for i,(Vi, Wi) in enumerate(zip(Vs, Ws))]
     solvers = [direct_solver(Ai) for Ai in A]
 
     if verbose:
         print(f'[{rank}] Matrices built', flush=True)
 
     # vector to solve for (Y)
-    Y = StencilVector(V)
+    if transposed:
+        Y = StencilVector(W)
+    else:
+        Y = StencilVector(V)
     Y_glob = random_vectordata(seed, npts, dtype=dtype)
     Y[localslice] = Y_glob[localslice]
     Y.update_ghost_regions()
@@ -141,9 +164,16 @@ def compare_solve(seed, comm, npts, pads, periods, direct_solver, dtype=float, t
 
     # solve in two different ways
     X_glob = kron_solve_seq_ref(Y_glob, A, transposed)
-    Xout = StencilVector(V)
+    if transposed:
+        Xout = StencilVector(V)
+    else:
+        Xout = StencilVector(W)
 
-    X = KroneckerLinearSolver(V, solvers).solve(Y, out=Xout, transposed=transposed)
+    solver = KroneckerLinearSolver(V, W, solvers)
+    if transposed:
+        solver = solver.T
+    
+    X = solver.solve(Y, out=Xout)
     assert X is Xout
 
     if verbose:
@@ -162,6 +192,165 @@ def compare_solve(seed, comm, npts, pads, periods, direct_solver, dtype=float, t
     # compare for equality
     assert np.allclose( X[localslice], X_glob[localslice], rtol=1e-8, atol=1e-8 )
 
+def get_M1_block_kron_solver(V1, ncells, degree, periodic):
+    """
+    Given a 3D DeRham sequenece (V0 = H(grad) --grad--> V1 = H(curl) --curl--> V2 = H(div) --div--> V3 = L2)
+    discretized using ncells, degree and periodic,
+
+        domain = Cube('C', bounds1=(0, 1), bounds2=(0, 1), bounds3=(0, 1))
+        derham = Derham(domain)
+        domain_h = discretize(domain, ncells=ncells, periodic=periodic, comm=comm)
+        derham_h = discretize(derham, domain_h, degree=degree),
+
+    returns the inverse of the mass matrix M1 as a BlockLinearOperator consisting of three KroneckerLinearSolvers on the diagonal.
+    """
+    # assert 3D
+    assert len(ncells) == 3
+    assert len(degree) == 3
+    assert len(periodic) == 3
+
+    # 1D domain to be discreticed using the respective values of ncells, degree, periodic
+    domain_1d = Line('L', bounds=(0,1))
+    derham_1d = Derham(domain_1d)
+
+    # storage for the 1D mass matrices
+    M0_matrices = []
+    M1_matrices = []
+
+    # assembly of the 1D mass matrices
+    for (n, p, P) in zip(ncells, degree, periodic):
+
+        domain_1d_h = discretize(domain_1d, ncells=[n], periodic=[P])
+        derham_1d_h = discretize(derham_1d, domain_1d_h, degree=[p])
+
+        u_1d_0, v_1d_0 = elements_of(derham_1d.V0, names='u_1d_0, v_1d_0')
+        u_1d_1, v_1d_1 = elements_of(derham_1d.V1, names='u_1d_1, v_1d_1')
+
+        a_1d_0 = BilinearForm((u_1d_0, v_1d_0), integral(domain_1d, u_1d_0 * v_1d_0))
+        a_1d_1 = BilinearForm((u_1d_1, v_1d_1), integral(domain_1d, u_1d_1 * v_1d_1))
+
+        a_1d_0_h = discretize(a_1d_0, domain_1d_h, (derham_1d_h.V0, derham_1d_h.V0))
+        a_1d_1_h = discretize(a_1d_1, domain_1d_h, (derham_1d_h.V1, derham_1d_h.V1))
+
+        M_1d_0 = a_1d_0_h.assemble()
+        M_1d_1 = a_1d_1_h.assemble()
+
+        M0_matrices.append(M_1d_0)
+        M1_matrices.append(M_1d_1)
+
+    V1_1 = V1[0]
+    V1_2 = V1[1]
+    V1_3 = V1[2]
+
+    B1_mat = [M1_matrices[0], M0_matrices[1], M0_matrices[2]]
+    B2_mat = [M0_matrices[0], M1_matrices[1], M0_matrices[2]]
+    B3_mat = [M0_matrices[0], M0_matrices[1], M1_matrices[2]]
+
+    B1_solvers = [matrix_to_bandsolver(Ai) for Ai in B1_mat]
+    B2_solvers = [matrix_to_bandsolver(Ai) for Ai in B2_mat]
+    B3_solvers = [matrix_to_bandsolver(Ai) for Ai in B3_mat]
+
+    B1_kron_inv = KroneckerLinearSolver(V1_1, V1_1, B1_solvers)
+    B2_kron_inv = KroneckerLinearSolver(V1_2, V1_2, B2_solvers)
+    B3_kron_inv = KroneckerLinearSolver(V1_3, V1_3, B3_solvers)
+
+    M1_block_kron_solver = BlockLinearOperator(V1, V1, ((B1_kron_inv, None, None), 
+                                                        (None, B2_kron_inv, None), 
+                                                        (None, None, B3_kron_inv)))
+
+    return M1_block_kron_solver
+
+
+
+def get_inverse_mass_matrices(derham_h, domain_h):
+    """
+    Given a 2D DeRham sequence (V0 = H(grad) --curl--> V1 = H(div) --div--> V2 = L2)
+    and its discrete domain, which shall be rectangular,
+    returns the inverse of the mass matrices for all three spaces using kronecker solvers.
+    """
+    # assert 2D
+    # Maybe should add more assert regarding the types and the domain form in case this get used in general context.
+    
+    V0h   = derham_h.V0.vector_space
+    V1h   = derham_h.V1.vector_space
+    V2h   = derham_h.V2.vector_space
+    
+    bounds1 = domain_h.domain.bounds1
+    bounds2 = domain_h.domain.bounds2
+        
+    ncells = domain_h.ncells[domain_h.domain.name]
+    degree = derham_h.V0.degree
+    periodic = domain_h.periodic[domain_h.domain.name]
+    
+    assert len(ncells) == 2
+    assert len(degree) == 2
+    assert len(periodic) == 2
+
+    # 1D domain to be discreticed using the respective values of ncells, degree, periodic
+    list_domain_1d = [Line('L1', bounds=bounds1), Line('L2', bounds=bounds2)]
+    list_derham_1d = [Derham(domain_1d) for domain_1d in list_domain_1d]
+
+    # storage for the 1D mass matrices
+    M0_matrices = []
+    M1_matrices = []
+
+    # assembly of the 1D mass matrices
+    for (n, p, P, domain_1d, derham_1d) in zip(ncells, degree, periodic, list_domain_1d, list_derham_1d):
+
+        domain_1d_h = discretize(domain_1d, ncells=[n], periodic=[P])
+        derham_1d_h = discretize(derham_1d, domain_1d_h, degree=[p])
+
+        u_1d_0, v_1d_0 = elements_of(derham_1d.V0, names='u_1d_0, v_1d_0')
+        u_1d_1, v_1d_1 = elements_of(derham_1d.V1, names='u_1d_1, v_1d_1')
+
+        a_1d_0 = BilinearForm((u_1d_0, v_1d_0), integral(domain_1d, u_1d_0 * v_1d_0))
+        a_1d_1 = BilinearForm((u_1d_1, v_1d_1), integral(domain_1d, u_1d_1 * v_1d_1))
+
+        a_1d_0_h = discretize(a_1d_0, domain_1d_h, (derham_1d_h.V0, derham_1d_h.V0))
+        a_1d_1_h = discretize(a_1d_1, domain_1d_h, (derham_1d_h.V1, derham_1d_h.V1))
+
+        M_1d_0 = a_1d_0_h.assemble()
+        M_1d_1 = a_1d_1_h.assemble()
+
+        M0_matrices.append(M_1d_0)
+        M1_matrices.append(M_1d_1)
+
+    #V0 H1 space
+    
+    B_mat_V0 = [M0_matrices[0], M0_matrices[1]]
+    
+    B_solvers_V0 = [matrix_to_bandsolver(Ai) for Ai in B_mat_V0]
+    
+    M0_kron_solver = KroneckerLinearSolver(V0h, V0h, B_solvers_V0)
+
+
+    #V1 Hdiv space
+    V1_1 = V1h[0]
+    V1_2 = V1h[1]
+
+    B1_mat_V1 = [M0_matrices[0], M1_matrices[1]]
+    B2_mat_V1 = [M1_matrices[0], M0_matrices[1]]
+
+    B1_solvers_V1 = [matrix_to_bandsolver(Ai) for Ai in B1_mat_V1]
+    B2_solvers_V1 = [matrix_to_bandsolver(Ai) for Ai in B2_mat_V1]
+
+    B1_kron_inv_V1 = KroneckerLinearSolver(V1_1, V1_1, B1_solvers_V1)
+    B2_kron_inv_V1 = KroneckerLinearSolver(V1_2, V1_2, B2_solvers_V1)
+
+    M1_block_kron_solver = BlockLinearOperator(V1h, V1h, ((B1_kron_inv_V1, None), 
+                                                          (None, B2_kron_inv_V1)))
+    
+    #V2 L2 space
+    
+    B_mat_V2 = [M1_matrices[0], M1_matrices[1]]
+    
+    B_solvers_V2 = [matrix_to_bandsolver(Ai) for Ai in B_mat_V2]
+    
+    M2_kron_solver = KroneckerLinearSolver(V2h, V2h, B_solvers_V2)
+    
+    
+    return M0_kron_solver, M1_block_kron_solver, M2_kron_solver
+
 #===============================================================================
 # tests of the direct solvers
 @pytest.mark.parametrize( 'dtype', [float, complex] )
@@ -177,12 +366,14 @@ def test_direct_solvers(dtype, seed, n, p, P, nrhs, direct_solver, transposed):
     D    = DomainDecomposition([n], periods=[P])
     cart = CartDecomposition(D, [n],  *compute_global_starts_ends(D, [n]), pads=[p], shifts=[1])
 
-    # space (V)
+    # domain V and codomain W
     V = StencilVectorSpace( cart, dtype=dtype )
 
     # bulid matrices (A)
-    A = random_matrix(seed+1, V)
+    A = random_matrix(seed+1, V, V)
     solver = direct_solver(A)
+    if transposed:
+        solver = solver.T
 
     # vector to solve for (Y)
     Y_glob = np.stack([random_vectordata(seed + i, [n], dtype) for i in range(nrhs)], axis=0)
@@ -198,15 +389,15 @@ def test_direct_solvers(dtype, seed, n, p, P, nrhs, direct_solver, transposed):
     X_glob = C_op.solve(Y_glob.T).T
 
     # new vector allocation
-    X_glob2 = solver.solve(Y_glob, transposed=transposed)
+    X_glob2 = solver.solve(Y_glob)
 
     # solve with out vector
     X_glob3 = Y_glob.copy()
-    X_glob4 = solver.solve(Y_glob, out=X_glob3, transposed=transposed)
+    X_glob4 = solver.solve(Y_glob, out=X_glob3)
 
     # solve in-place
     X_glob5 = Y_glob.copy()
-    X_glob6 = solver.solve(X_glob5, out=X_glob5, transposed=transposed)
+    X_glob6 = solver.solve(X_glob5, out=X_glob5)
 
     # compare results
     assert X_glob4 is X_glob3
@@ -363,6 +554,221 @@ def test_kron_solver_nd_par(seed, dim, dtype):
 
     npts_base = 4
     compare_solve(seed, MPI.COMM_WORLD, [npts_base]*dim, [1]*dim, [False]*dim, matrix_to_sparse, dtype=dtype, transposed=False, verbose=False)
+
+#===============================================================================
+
+# test Kronecker solver of the M1 mass matrix of our 3D DeRham sequence, as described in the get_M1_block_kron_solver method
+@pytest.mark.parametrize( 'ncells', [[8, 8, 8], [8, 16, 8]] )
+@pytest.mark.parametrize( 'degree', [[2, 2, 2]] )
+@pytest.mark.parametrize( 'periodic', [[True, True, True]] )
+@pytest.mark.parallel
+def test_3d_m1_solver(ncells, degree, periodic):
+
+    comm = MPI.COMM_WORLD
+    domain = Cube('C', bounds1=(0, 1), bounds2=(0, 1), bounds3=(0, 1))
+    derham = Derham(domain)
+    domain_h = discretize(domain, ncells=ncells, periodic=periodic, comm=comm)
+    derham_h = discretize(derham, domain_h, degree=degree)
+    V1 = derham_h.V1.vector_space
+    P0, P1, P2, P3  = derham_h.projectors()
+
+    # obtain an iterative M1 solver the usual way
+    u1, v1 = elements_of(derham.V1, names='u1, v1')
+    a1 = BilinearForm((u1, v1), integral(domain, dot(u1, v1)))
+    a1_h = discretize(a1, domain_h, (derham_h.V1, derham_h.V1))
+    M1 = a1_h.assemble()
+    tol = 1e-12
+    maxiter = 1000
+    M1_iterative_solver = inverse(M1, 'cg', tol = tol, maxiter=maxiter)
+
+    # obtain a direct M1 solver utilizing the Block-Kronecker structure of M1
+    M1_direct_solver = get_M1_block_kron_solver(V1, ncells, degree, periodic)
+
+    # obtain x and rhs = M1 @ x, both elements of derham_h.V1
+    def get_A_fun(n=1, m=1, A0=1e04):
+        """Get the tuple A = (A1, A2, A3), where each entry is a function taking x,y,z as input."""
+
+        mu_tilde = np.sqrt(m**2 + n**2)  
+
+        eta = lambda x, y, z: x**2 * (1-x)**2 * y**2 * (1-y)**2 * z**2 * (1-z)**2
+
+        u1  = lambda x, y, z:  A0 * (n/mu_tilde) * np.sin(np.pi * m * x) * np.cos(np.pi * n * y)
+        u2  = lambda x, y, z: -A0 * (m/mu_tilde) * np.cos(np.pi * m * x) * np.sin(np.pi * n * y)
+        u3  = lambda x, y, z:  A0 * np.sin(np.pi * m * x) * np.sin(np.pi * n * y)
+
+        A1 = lambda x, y, z: eta(x, y, z) * u1(x, y, z)
+        A2 = lambda x, y, z: eta(x, y, z) * u2(x, y, z)
+        A3 = lambda x, y, z: eta(x, y, z) * u3(x, y, z)
+
+        A = (A1, A2, A3)
+        return A
+    x = P1(get_A_fun()).coeffs
+    rhs = M1 @ x
+
+    # solve M1 @ x = rhs for x two ways
+    # pass -s to see timings
+    # on my local machine, executing 
+    # mpirun -n 4 python -m pytest test_kron_direct_solver.py::test_3d_m1_solver -s
+    # I can report the following data:
+
+    ### 4 processes, test case 1 (ncells=[8, 8, 8]):
+
+    # Solving for x using the iterative solver: 23.73982548713684 seconds
+    # Solving for x using the iterative solver: 23.820897102355957 seconds
+    # Solving for x using the iterative solver: 23.783425092697144 seconds
+    # Solving for x using the iterative solver: 23.71373987197876 seconds
+    # Solving for x using the direct solver: 0.3333120346069336 seconds
+    # Solving for x using the direct solver: 0.3369138240814209 seconds
+    # Solving for x using the direct solver: 0.33652329444885254 seconds
+    # Solving for x using the direct solver: 0.34088802337646484 seconds
+
+    ###4 processes, test case 2 (ncells=[8, 16, 8]):
+    # Solving for x using the iterative solver: 82.10541296005249 seconds
+    # Solving for x using the iterative solver: 81.88263297080994 seconds
+    # Solving for x using the iterative solver: 82.07102465629578 seconds
+    # Solving for x using the iterative solver: 82.00282955169678 seconds
+    # Solving for x using the direct solver: 0.1675126552581787 seconds
+    # Solving for x using the direct solver: 0.17473626136779785 seconds
+    # Solving for x using the direct solver: 0.15992450714111328 seconds
+    # Solving for x using the direct solver: 0.17931437492370605 seconds
+
+    # Note that on consecutive solves, with only a slightly changing rhs and recycle=True, the iterative solver won't perform as bad anymore.
+
+    start = time.time()
+    x_iterative = M1_iterative_solver @ rhs
+    stop = time.time()
+    print(f"Solving for x using the iterative solver: {stop-start} seconds")
+
+    start = time.time()
+    x_direct = M1_direct_solver @ rhs
+    stop = time.time()
+    print(f"Solving for x using the direct solver: {stop-start} seconds")
+
+    # assert rhs_iterative is within the tolerance close to rhs, and so is rhs_direct
+    rhs_iterative = M1 @ x_iterative
+    rhs_direct = M1 @ x_direct
+    assert np.linalg.norm((rhs-rhs_iterative).toarray()) < tol
+    assert np.linalg.norm((rhs-rhs_direct).toarray()) < tol
+    
+    
+#===============================================================================
+
+# test Kronecker solver of the mass matrices of our 2D (H1,Hdiv,L2) DeRham sequence, using get_inverse_mass_matrices function
+
+@pytest.mark.parametrize( 'ncells', [[8, 8], [8, 16]] )
+@pytest.mark.parametrize( 'degree', [[2, 2], [2,3]] )
+@pytest.mark.parametrize( 'bounds', [[(0,1), (0,1)], [(0,0.5), (0,2.)]] )
+@pytest.mark.parametrize( 'periodic', [[True, True], [False,False]] )
+@pytest.mark.parallel
+def test_2d_mass_solver(ncells, degree, bounds, periodic):
+
+    comm = MPI.COMM_WORLD
+    domain = Square('Omega', bounds1=bounds[0], bounds2=bounds[1])
+    derham = Derham(domain, ["H1", "Hdiv", "L2"])
+    domain_h = discretize(domain, ncells=ncells, periodic=periodic, comm=comm)
+    derham_h = discretize(derham, domain_h, degree=degree)
+
+    P0, P1, P2 = derham_h.projectors()
+    
+    # obtain an iterative M0 solver the usual way
+    u0, v0 = elements_of(derham.V0, names='u0, v0')
+    a0 = BilinearForm((u0, v0), integral(domain, u0*v0))
+    a0_h = discretize(a0, domain_h, (derham_h.V0, derham_h.V0))
+    M0 = a0_h.assemble()
+    tol = 1e-10
+    maxiter = 1000
+    M0_iterative_solver = inverse(M0, 'cg', tol = tol, maxiter=maxiter)
+
+    # obtain an iterative M1 solver the usual way
+    u1, v1 = elements_of(derham.V1, names='u1, v1')
+    a1 = BilinearForm((u1, v1), integral(domain, dot(u1, v1)))
+    a1_h = discretize(a1, domain_h, (derham_h.V1, derham_h.V1))
+    M1 = a1_h.assemble()
+    tol = 1e-10
+    maxiter = 1000
+    M1_iterative_solver = inverse(M1, 'cg', tol = tol, maxiter=maxiter)
+    
+    # obtain an iterative M2 solver the usual way
+    u2, v2 = elements_of(derham.V2, names='u0, v0')
+    a2 = BilinearForm((u2, v2), integral(domain, u2*v2))
+    a2_h = discretize(a2, domain_h, (derham_h.V2, derham_h.V2))
+    M2 = a2_h.assemble()
+    tol = 1e-10
+    maxiter = 1000
+    M2_iterative_solver = inverse(M2, 'cg', tol = tol, maxiter=maxiter)
+
+    # obtain a direct M1 solver utilizing the Block-Kronecker structure of M1
+    M0_direct_solver, M1_direct_solver, M2_direct_solver = get_inverse_mass_matrices(derham_h, domain_h)
+
+    # obtain x and rhs = M1 @ x, both elements of derham_h.V1
+    def get_A_fun_vec(n=1, m=1, A0=1e04):
+        """Get the tuple A = (A1, A2), where each entry is a function taking x,y,z as input."""
+
+        mu_tilde = np.sqrt(m**2 + n**2)  
+
+        eta = lambda x, y: x**2 * (1-x)**2 * y**2 * (1-y)**2
+
+        u1  = lambda x, y:  A0 * (n/mu_tilde) * np.sin(np.pi * m * x) * np.cos(np.pi * n * y)
+        u2  = lambda x, y: -A0 * (m/mu_tilde) * np.cos(np.pi * m * x) * np.sin(np.pi * n * y)
+
+        A1 = lambda x, y: eta(x, y) * u1(x, y)
+        A2 = lambda x, y: eta(x, y) * u2(x, y)
+
+        A = (A1, A2)
+        return A
+    
+    def get_A_fun_scalar(n=1, m=1, A0=1e04):
+        """Get the tuple A = (A1, A2), where each entry is a function taking x,y,z as input."""
+
+        mu_tilde = np.sqrt(m**2 + n**2)  
+
+        eta = lambda x, y: x**2 * (1-x)**2 * y**2 * (1-y)**2
+
+        u  = lambda x, y:  A0 * (n/mu_tilde) * np.sin(np.pi * m * x) * np.cos(np.pi * n * y)
+
+        A = lambda x, y: eta(x, y) * u(x, y)
+        return A
+    
+    
+    x0 = P0(get_A_fun_scalar()).coeffs
+    rhs = M0 @ x0
+
+    # solve M0 @ x9 = rhs for x two ways
+    x_iterative = M0_iterative_solver @ rhs
+    x_direct = M0_direct_solver @ rhs
+
+    # assert rhs_iterative is within the tolerance close to rhs, and so is rhs_direct
+    rhs_iterative = M0 @ x_iterative
+    rhs_direct = M0 @ x_direct
+    assert np.linalg.norm((rhs-rhs_iterative).toarray()) < tol
+    assert np.linalg.norm((rhs-rhs_direct).toarray()) < tol
+    
+    x1 = P1(get_A_fun_vec()).coeffs
+    rhs = M1 @ x1
+
+    # solve M1 @ x = rhs for x two ways
+    x_iterative = M1_iterative_solver @ rhs
+    x_direct = M1_direct_solver @ rhs
+
+    # assert rhs_iterative is within the tolerance close to rhs, and so is rhs_direct
+    rhs_iterative = M1 @ x_iterative
+    rhs_direct = M1 @ x_direct
+    assert np.linalg.norm((rhs-rhs_iterative).toarray()) < tol
+    assert np.linalg.norm((rhs-rhs_direct).toarray()) < tol
+    
+    x2 = P2(get_A_fun_scalar()).coeffs
+    rhs = M2 @ x2
+
+    # solve M2 @ x = rhs for x two ways
+    x_iterative = M2_iterative_solver @ rhs
+    x_direct = M2_direct_solver @ rhs
+
+    # assert rhs_iterative is within the tolerance close to rhs, and so is rhs_direct
+    rhs_iterative = M2 @ x_iterative
+    rhs_direct = M2 @ x_direct
+    assert np.linalg.norm((rhs-rhs_iterative).toarray()) < tol
+    assert np.linalg.norm((rhs-rhs_direct).toarray()) < tol
+    
 #===============================================================================
 
 if __name__ == '__main__':