diff --git a/src/compressed_tensors/compressors/sparse_compressors/__init__.py b/src/compressed_tensors/compressors/sparse_compressors/__init__.py
index de4fd887..871079ac 100644
--- a/src/compressed_tensors/compressors/sparse_compressors/__init__.py
+++ b/src/compressed_tensors/compressors/sparse_compressors/__init__.py
@@ -15,4 +15,5 @@
 from .base import *
 from .dense import *
+from .sparse_24_bitmask import *
 from .sparse_bitmask import *
diff --git a/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py b/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py
new file mode 100644
index 00000000..54ea7200
--- /dev/null
+++ b/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple, Union
+
+import torch
+from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors.base import BaseSparseCompressor
+from compressed_tensors.config import CompressionFormat, SparsityStructure
+from compressed_tensors.quantization import FP8_DTYPE
+from compressed_tensors.utils import merge_names, pack_into_bitmasks, unpack_bitmasks
+from torch import Tensor
+
+
+__all__ = [
+    "Sparse24BitMaskCompressor",
+    "Sparse24BitMaskTensor",
+    "sparse24_bitmask_compress",
+    "sparse24_bitmask_decompress",
+    "get_24_bytemasks",
+]
+
+
+@BaseCompressor.register(name=CompressionFormat.sparse_24_bitmask.value)
+class Sparse24BitMaskCompressor(BaseSparseCompressor):
+    """
+    Compression for 2:4 sparse models using bitmasks. Non-zero weights are
+    stored in a 2d values tensor, with their locations stored in a 2d bitmask.
+    """
+
+    COMPRESSION_PARAM_NAMES = [
+        "shape",
+        "compressed",
+        "bitmask",
+    ]
+
+    def compress_weight(self, name, value):
+        bitmask_tensor = Sparse24BitMaskTensor.from_dense(
+            value, self.config.sparsity_structure
+        )
+        bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu")
+        return bitmask_dict
+
+    def decompress_weight(self, weight_data):
+        data = Sparse24BitMaskTensor(**weight_data)
+        decompressed = data.decompress()
+        return decompressed
+
+
+class Sparse24BitMaskTensor:
+    """
+    Owns compression and decompression for a single 2:4 sparse,
+    bitmask-compressed tensor.
+
+    :param shape: shape of the dense tensor
+    :param compressed: 2d tensor of non-zero values
+    :param bitmask: 2d bitmask of non-zero values
+    """
+
+    def __init__(
+        self,
+        shape: Union[torch.Size, List],
+        compressed: Tensor,
+        bitmask: Tensor,
+    ):
+        self.shape = list(shape)
+        self.compressed = compressed
+        self.bitmask = bitmask
+
+    @staticmethod
+    def from_dense(
+        tensor: Tensor,
+        sparsity_structure: Union[SparsityStructure, str] = SparsityStructure.TWO_FOUR,
+    ) -> "Sparse24BitMaskTensor":
+        """
+        :param tensor: dense tensor to compress
+        :param sparsity_structure: structure of sparsity in the tensor; only
+            `2:4` is supported
+        :return: instantiated compressed tensor
+        """
+        shape = tensor.shape
+        compressed, bitmask = sparse24_bitmask_compress(
+            tensor.cpu(), sparsity_structure=sparsity_structure
+        )
+        return Sparse24BitMaskTensor(
+            shape=shape,
+            compressed=compressed,
+            bitmask=bitmask,
+        )
+
+    def decompress(self) -> Tensor:
+        """
+        :return: reconstructed dense tensor
+        """
+        return sparse24_bitmask_decompress(self.compressed, self.bitmask, self.shape)
+
+    def curr_memory_size_bytes(self):
+        """
+        :return: size in bytes required to store the compressed tensor on disk
+        """
+
+        def sizeof_tensor(a):
+            return a.element_size() * a.nelement()
+
+        return sizeof_tensor(self.compressed) + sizeof_tensor(self.bitmask)
+
+    def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]:
+        """
+        :param name_prefix: name of the original tensor to store the
+            compressed weight as
+        :param device: device to move the compressed data to
+        :return: dict of compressed data for the stored weight
+        """
+        if name_prefix.endswith(".weight"):
+            name_prefix = name_prefix[: -len(".weight")]
+        return {
+            merge_names(name_prefix, "shape"): torch.tensor(
+                self.shape, device=device
+            ).reshape(-1, 1),
+            merge_names(name_prefix, "compressed"): self.compressed.to(device),
+            merge_names(name_prefix, "bitmask"): self.bitmask.to(device),
+        }
+
+    def __repr__(self):
+        return f"Sparse24BitMaskTensor(shape={self.shape}, compressed=True)"
+
+
+def sparse24_bitmask_compress(
+    tensor: Tensor,
+    sparsity_structure: Union[SparsityStructure, str] = SparsityStructure.TWO_FOUR,
+) -> Tuple[Tensor, Tensor]:
+    """
+    Compresses a dense tensor using bitmask compression
+
+    :param tensor: dense 2D tensor to compress
+    :param sparsity_structure: structure of sparsity in the tensor; defaults
+        to `2:4`, which is currently the only supported structure
+    :return: tuple of the compressed values and the packed bitmask
+    """
+    assert len(tensor.shape) == 2, "Only 2D tensors are supported"
+    assert (
+        SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
+    ), "Only 2:4 sparsity is supported"
+
+    bytemasks = get_24_bytemasks(tensor=tensor)
+
+    if tensor.dtype == FP8_DTYPE:
+        # access the raw bytes of the tensor through an int8 view, since
+        # boolean mask indexing is not supported for the FP8 dtype
+        tensor_view = tensor.view(torch.int8)
+        values = tensor_view[bytemasks]
+        values = values.view(FP8_DTYPE)
+    else:
+        values = tensor[bytemasks]
+
+    num_rows, num_cols = tensor.shape
+    compressed_values = values.reshape(num_rows, num_cols // 2)
+    bitmasks_packed = pack_into_bitmasks(bytemasks)
+    return compressed_values, bitmasks_packed
+
+
+def sparse24_bitmask_decompress(
+    values: Tensor, bitmasks: Tensor, original_shape: torch.Size
+) -> Tensor:
+    """
+    Reconstructs a dense tensor from a compressed one
+
+    :param values: 2d tensor of non-zero values (flattened during
+        decompression)
+    :param bitmasks: 2d int8 tensor flagging locations of non-zero values in
+        the tensor's original shape
+    :param original_shape: shape of the dense tensor
+    :return: decompressed dense tensor
+    """
+    bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape)
+
+    decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype)
+    decompressed_tensor = decompressed_tensor.to(values.device)
+    values = values.flatten()
+    decompressed_tensor[bytemasks_unpacked] = values
+    if decompressed_tensor.dtype == FP8_DTYPE:
+        # FP8 tensors are moved to the GPU after decompression
+        decompressed_tensor = decompressed_tensor.cuda()
+    return decompressed_tensor
+
+
+def get_24_bytemasks(tensor: Tensor) -> Tensor:
+    """
+    Generate a 2:4 sparsity mask for the given tensor.
+
+    This function creates a mask where exactly 2 out of every 4 elements are
+    preserved based on their magnitudes. The preserved elements are the ones
+    with the highest absolute values in each group of 4 elements.
+
+    :param tensor: The input tensor for which the 2:4 sparsity mask is to be
+        created. The tensor can be of any shape, but its total number of
+        elements must be a multiple of 4.
+    :return: A boolean tensor of the same shape as the input tensor, where
+        `True` indicates the preserved elements and `False` indicates the
+        pruned elements.
+    :raises ValueError: If the total number of elements in the tensor is not a
+        multiple of 4.
+    """
+    original_dtype = tensor.dtype
+    if tensor.dtype == FP8_DTYPE:
+        # operate on an int8 view of the raw FP8 bytes
+        tensor = tensor.view(torch.int8)
+    original_shape = tensor.shape
+    num_elements = tensor.numel()
+
+    if num_elements % 4 != 0:
+        raise ValueError("Tensor size must be a multiple of 4 for TWO_FOUR sparsity")
+
+    reshaped_tensor = tensor.view(-1, 4)
+    abs_tensor = reshaped_tensor.abs()
+    topk_indices = abs_tensor.topk(2, dim=1).indices
+    mask = torch.zeros_like(reshaped_tensor, dtype=torch.bool)
+    mask.scatter_(1, topk_indices, True)
+    mask = mask.view(original_shape)
+    tensor = tensor.view(original_dtype)
+
+    return mask
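As a sanity check on the new API, here is a minimal round-trip sketch. It assumes only the classes added above, a recent `torch`, and the top-level re-export used by the tests (`from compressed_tensors import Sparse24BitMaskTensor`); shapes and values are illustrative:

```python
import torch
from compressed_tensors import Sparse24BitMaskTensor

# a 4x8 fp16 tensor; get_24_bytemasks keeps the 2 largest-magnitude
# elements in every contiguous group of 4, so any dense input is accepted
dense = torch.randn(4, 8, dtype=torch.float16)

compressed = Sparse24BitMaskTensor.from_dense(dense, sparsity_structure="2:4")
print(compressed.compressed.shape)  # torch.Size([4, 4]): half the columns survive
print(compressed.bitmask.shape)     # torch.Size([4, 1]): 8 mask bits pack into 1 byte

# decompression zero-fills the pruned positions
restored = compressed.decompress()
assert restored.shape == torch.Size([4, 8])
assert (restored != 0).sum() == dense.numel() // 2
```

Note that `decompress()` zero-fills the pruned positions, so the round trip is exact for inputs that are already 2:4 sparse; for a truly dense input, `get_24_bytemasks` first drops the two smallest-magnitude elements in each group of four.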
diff --git a/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py b/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py
index 0434499d..9c2e10ae 100644
--- a/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py
+++ b/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py
@@ -14,13 +14,12 @@
 from typing import Dict, List, Tuple, Union
 
-import numpy
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.compressors.sparse_compressors.base import BaseSparseCompressor
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import FP8_DTYPE
-from compressed_tensors.utils import merge_names
+from compressed_tensors.utils import merge_names, pack_into_bitmasks, unpack_bitmasks
 from torch import Tensor
 
@@ -29,8 +28,6 @@
     "BitmaskTensor",
     "bitmask_compress",
     "bitmask_decompress",
-    "pack_bitmasks",
-    "unpack_bitmasks",
 ]
 
@@ -142,7 +139,7 @@ def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
         values = values.view(FP8_DTYPE)
     else:
         values = tensor[bytemasks]
-    bitmasks_packed = pack_bitmasks(bytemasks)
+    bitmasks_packed = pack_into_bitmasks(bytemasks)
     return values, bitmasks_packed, row_offsets
 
@@ -164,37 +161,3 @@ def bitmask_decompress(
     decompressed_tensor[bytemasks_unpacked] = values
 
     return decompressed_tensor
-
-
-def pack_bitmasks(bytemasks: Tensor) -> Tensor:
-    """
-    Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be
-    compressed to R x ceil(C/8)
-    :param bytemasks: mask tensor where each byte corresponds to a weight
-    :return: mask tensor where each bit corresounds to a weight
-    """
-    packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
-    packed_bits_torch = torch.from_numpy(packed_bits_numpy)
-
-    return packed_bits_torch
-
-
-def unpack_bitmasks(packed_bitmasks: Tensor, original_shape: torch.Size) -> Tensor:
-    """
-    Converts a bitmask tensor back to a bytemask tensor for use during decompression
-
-    :param packed_bitmasks: mask tensor where each bit corresponds to a weight
-    :param original_shape: dense shape to decompress to
-    :return: boolean mask of weights in the original dense shape
-    """
-    # Unpack the bits
-    unpacked_bits = numpy.unpackbits(
-        packed_bitmasks.numpy(), axis=-1, count=original_shape[-1], bitorder="little"
-    )
-
-    # Reshape to match the original shape
-    unpacked_bitmasks_torch = torch.from_numpy(
-        unpacked_bits.reshape(original_shape).astype(bool)
-    )
-
-    return unpacked_bitmasks_torch
diff --git a/src/compressed_tensors/config/__init__.py b/src/compressed_tensors/config/__init__.py
index ff83f5af..582b8a9e 100644
--- a/src/compressed_tensors/config/__init__.py
+++ b/src/compressed_tensors/config/__init__.py
@@ -15,4 +15,5 @@
 # flake8: noqa
 from .base import *
 from .dense import *
+from .sparse_24_bitmask import *
 from .sparse_bitmask import *
diff --git a/src/compressed_tensors/config/base.py b/src/compressed_tensors/config/base.py
index 79a4fcdd..9ca6f2cf 100644
--- a/src/compressed_tensors/config/base.py
+++ b/src/compressed_tensors/config/base.py
@@ -26,6 +26,7 @@ class CompressionFormat(Enum):
     dense = "dense"
     sparse_bitmask = "sparse-bitmask"
+    sparse_24_bitmask = "sparse-24-bitmask"
     int_quantized = "int-quantized"
     float_quantized = "float-quantized"
     naive_quantized = "naive-quantized"
diff --git a/src/compressed_tensors/config/sparse_24_bitmask.py b/src/compressed_tensors/config/sparse_24_bitmask.py
new file mode 100644
index 00000000..7aae2dbe
--- /dev/null
+++ b/src/compressed_tensors/config/sparse_24_bitmask.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from compressed_tensors.config import (
+    CompressionFormat,
+    SparsityCompressionConfig,
+    SparsityStructure,
+)
+
+
+__all__ = ["Sparse24BitMaskConfig"]
+
+
+@SparsityCompressionConfig.register(name=CompressionFormat.sparse_24_bitmask.value)
+class Sparse24BitMaskConfig(SparsityCompressionConfig):
+    """
+    Configuration for storing a 2:4 sparse model using
+    bitmask compression
+
+    :param global_sparsity: average sparsity of the entire model
+    :param sparsity_structure: structure of the sparsity, should always be
+        "2:4" for this compression format
+    """
+
+    format: str = CompressionFormat.sparse_24_bitmask.value
+    global_sparsity: Optional[float] = 0.0
+    sparsity_structure: Optional[str] = SparsityStructure.TWO_FOUR.value
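A short sketch of how the new enum value and config class fit together; the field values below are illustrative, and the config import relies on the `from .sparse_24_bitmask import *` re-export added above:

```python
from compressed_tensors.config import CompressionFormat, Sparse24BitMaskConfig

# the enum round-trips from the string used in serialized configs
assert CompressionFormat("sparse-24-bitmask") is CompressionFormat.sparse_24_bitmask

# defaults come from the class definition above
config = Sparse24BitMaskConfig(global_sparsity=0.5)
assert config.format == "sparse-24-bitmask"
assert config.sparsity_structure == "2:4"
```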
+ """ + try: + # Attempt regular concatenation + return torch.cat(shards, dim=dim) + except RuntimeError as e: + # Handle unsupported concatenation + if all(shard.dtype == torch.float8_e4m3fn for shard in shards): + total_shape = list(shards[0].shape) + total_shape[dim] = sum(shard.shape[dim] for shard in shards) + combined = torch.zeros( + total_shape, dtype=shards[0].dtype, device=shards[0].device + ) + + shard_offset = 0 + for shard in shards: + shard_size = shard.shape[dim] + combined.narrow(dim, shard_offset, shard_size).copy_(shard) + shard_offset += shard_size + + return combined + else: + # Re-raise unexpected errors + raise e + + +def pack_into_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor: + """ + Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be + compressed to R x ceil(C/8) + + :param bytemasks: mask tensor where each byte corresponds to a weight + :return: mask tensor where each bit corresounds to a weight + """ + packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little") + packed_bits_torch = torch.from_numpy(packed_bits_numpy) + + return packed_bits_torch + + +def unpack_bitmasks( + packed_bitmasks: torch.Tensor, original_shape: torch.Size +) -> torch.Tensor: + """ + Converts a bitmask tensor back to a bytemask tensor for use during decompression + + :param packed_bitmasks: mask tensor where each bit corresponds to a weight + :param original_shape: dense shape to decompress to + :return: boolean mask of weights in the original dense shape + """ + # Unpack the bits + unpacked_bits = numpy.unpackbits( + packed_bitmasks.cpu().numpy(), + axis=-1, + count=original_shape[-1], + bitorder="little", + ) + + # Reshape to match the original shape + unpacked_bitmasks_torch = torch.from_numpy( + unpacked_bits.reshape(original_shape).astype(bool) + ) + + return unpacked_bitmasks_torch diff --git a/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py b/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py new file mode 100644 index 00000000..4e1b5c47 --- /dev/null +++ b/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py @@ -0,0 +1,142 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py b/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py
new file mode 100644
index 00000000..4e1b5c47
--- /dev/null
+++ b/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+import torch
+from compressed_tensors import Sparse24BitMaskTensor
+from compressed_tensors.quantization import FP8_DTYPE
+from compressed_tensors.utils import combine_shards, shard_tensor
+from tests.testing_utils import generate_pruned_semi_structured_mat
+
+
+@pytest.fixture
+def dense_matrix_fixture():
+    def _generate_dense_matrix(M, K, dtype):
+        return generate_pruned_semi_structured_mat(M, K, dtype)
+
+    return _generate_dense_matrix
+
+
+@pytest.fixture
+def shard_validation():
+    def _validate_shard_shapes(sharded_values, sharded_bitmask, expected_shapes):
+        for shard_values, shard_bitmask, expected_shape in zip(
+            sharded_values, sharded_bitmask, expected_shapes
+        ):
+            assert (
+                shard_values.shape == expected_shape["compressed"]
+            ), f"Shape mismatch: {shard_values.shape} != {expected_shape['compressed']}"
+            assert (
+                shard_bitmask.shape == expected_shape["bitmask"]
+            ), f"Shape mismatch: {shard_bitmask.shape} != {expected_shape['bitmask']}"
+
+    return _validate_shard_shapes
+
+
+@pytest.mark.parametrize("dtype", [FP8_DTYPE])
+def test_bitmask_compress_decompress_fp8(dense_matrix_fixture, dtype):
+    M, K = 1024, 1024
+    dense_matrix = dense_matrix_fixture(M, K, dtype)
+
+    bitmask_tensor = Sparse24BitMaskTensor.from_dense(
+        dense_matrix, sparsity_structure="2:4"
+    )
+    decompressed_tensor = bitmask_tensor.decompress()
+
+    dense_matrix = dense_matrix.to(decompressed_tensor.device)
+
+    assert dense_matrix.dtype == decompressed_tensor.dtype, "Dtype mismatch"
+    assert dense_matrix.shape == decompressed_tensor.shape, "Shape mismatch"
+    assert torch.equal(dense_matrix, decompressed_tensor), "Decompression failed"
+
+
+@pytest.mark.parametrize(
+    "dtype, M, K, shard_sizes, shard_dim, expected_shapes",
+    [
+        (
+            FP8_DTYPE,
+            2560,
+            2048,
+            [2048, 256, 256],
+            0,
+            [
+                {"compressed": (2048, 1024), "bitmask": (2048, 2048 // 8)},
+                {"compressed": (256, 1024), "bitmask": (256, 2048 // 8)},
+                {"compressed": (256, 1024), "bitmask": (256, 2048 // 8)},
+            ],
+        ),
+        (
+            FP8_DTYPE,
+            2048,
+            2048,
+            [1024, 1024],
+            1,
+            [
+                {"compressed": (2048, 512), "bitmask": (2048, 2048 // 8 // 2)},
+                {"compressed": (2048, 512), "bitmask": (2048, 2048 // 8 // 2)},
+            ],
+        ),
+    ],
+)
+def test_bitmask_compress_decompress_sharded(
+    dense_matrix_fixture,
+    shard_validation,
+    dtype,
+    M,
+    K,
+    shard_sizes,
+    shard_dim,
+    expected_shapes,
+):
+    dense_matrix = dense_matrix_fixture(M, K, dtype)
+
+    bitmask_tensor = Sparse24BitMaskTensor.from_dense(dense_matrix)
+    compressed_values = bitmask_tensor.compressed
+    compressed_bitmask = bitmask_tensor.bitmask
+
+    if shard_dim == 1:
+        # compressed values keep 2 of every 4 columns and the bitmask packs
+        # 8 columns per byte, so column shard sizes scale accordingly
+        compressed_shard_sizes = [size // 2 for size in shard_sizes]
+        bitmask_shard_sizes = [size // 8 for size in shard_sizes]
+    else:
+        compressed_shard_sizes = shard_sizes
+        bitmask_shard_sizes = shard_sizes
+
+    sharded_compressed_values = shard_tensor(
+        compressed_values, compressed_shard_sizes, dim=shard_dim
+    )
+    sharded_compressed_bitmask = shard_tensor(
+        compressed_bitmask, bitmask_shard_sizes, dim=shard_dim
+    )
+
+    shard_validation(
+        sharded_compressed_values, sharded_compressed_bitmask, expected_shapes
+    )
+
+    decompressed_shards = [
+        Sparse24BitMaskTensor(
+            shape=(expected_shape["bitmask"][0], expected_shape["bitmask"][1] * 8),
+            compressed=shard_values,
+            bitmask=shard_bitmask,
+        ).decompress()
+        for shard_values, shard_bitmask, expected_shape in zip(
+            sharded_compressed_values, sharded_compressed_bitmask, expected_shapes
+        )
+    ]
+
+    decompressed_combined = combine_shards(decompressed_shards, dim=shard_dim)
+
+    dense_matrix = dense_matrix.to(decompressed_combined.device)
+
+    assert dense_matrix.dtype == decompressed_combined.dtype, "Dtype mismatch"
+    assert dense_matrix.shape == decompressed_combined.shape, "Shape mismatch"
+    assert torch.equal(dense_matrix, decompressed_combined), "Decompression failed"
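For a rough sense of the on-disk savings, a back-of-the-envelope check using `curr_memory_size_bytes` from the API above (pure arithmetic, not a benchmark): a 2:4-compressed fp16 weight keeps half its values (0.5x the dense bytes) plus a 1-bit-per-element bitmask (0.0625x), or 0.5625x the dense size; for 1-byte FP8 weights the ratio is 0.5 + 0.125 = 0.625x.

```python
import torch
from compressed_tensors import Sparse24BitMaskTensor

dense = torch.randn(1024, 1024, dtype=torch.float16)  # stand-in dense weight
compressed = Sparse24BitMaskTensor.from_dense(dense)

dense_bytes = dense.element_size() * dense.nelement()
print(compressed.curr_memory_size_bytes() / dense_bytes)  # 0.5625
```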