From b559b6a3fb1f65cb3e89378109c68374dc7cf355 Mon Sep 17 00:00:00 2001
From: Faraz Shahsavan <faraz.shahsavan@gmail.com>
Date: Fri, 13 Dec 2024 07:45:06 +0000
Subject: [PATCH] Push activations and output transposes into CUTLASS code

---
 benchmarks/benchmark_throughput.py            |   3 +-
 .../cutlass_benchmarks/sparse_benchmarks.py   |  20 +++
 benchmarks/cutlass_benchmarks/utils.py        |   9 +-
 csrc/ops.h                                    |   4 +-
 csrc/sparse/cutlass/sparse_compressor.cu      |   7 +-
 csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu   | 152 +++++++++---------
 csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh  |  77 ++++-----
 csrc/sparse/cutlass/sparse_scaled_mm_entry.cu |  24 +--
 csrc/torch_bindings.cpp                       |   8 +-
 tests/kernels/test_semi_structured.py         |   2 +-
 vllm/_custom_ops.py                           |  89 +++++++---
 .../schemes/compressed_tensors_24.py          |   6 +-
 12 files changed, 236 insertions(+), 165 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index e92b5d00dc9f5..1e5967bd9bf8b 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -361,8 +361,7 @@ def main(args: argparse.Namespace):
         # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length.
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
-          f"{total_output_tokens / elapsed_time:.2f} output tokens/s, "
-          f"{total_num_tokens=} | {total_output_tokens=}")
+          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
 
     # Output JSON results if specified
     if args.output_json:
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 0bbed68a71e67..eec6e6134a0cf 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -46,6 +46,16 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
+    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, torch.bfloat16)
+    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+    if not torch.allclose(out, out_ref):
+        print("Incorrect results")
+        print(out)
+        print(out_ref)
+    else:
+        print("Correct results")
+
     timers = []
     # pytorch impl - bfloat16
     timers.append(
@@ -95,6 +105,16 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
+    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, torch.bfloat16)
+    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
+
+    if not torch.allclose(out, out_ref):
+        print("Incorrect results")
+        print(out)
+        print(out_ref)
+    else:
+        print("Correct results")
+
     timers = []
 
     # pytorch impl w. bf16
diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py
index 84937d1c81bb2..c53cee52642f4 100644
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -62,8 +62,11 @@ def prune_to_2_4(tensor):
 
 def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
                              k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
+    # a = torch.randn((m, k), device='cuda') * 5
+    # b = torch.randn((n, k), device='cuda').t() * 5
+
+    a = torch.ones((m, k), device='cuda')
+    b = torch.ones((n, k), device='cuda').t()
 
     b = prune_to_2_4(b.t()).t()
 
@@ -78,7 +81,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
     else:
         raise ValueError("unsupported dtype")
 
-    b_compressed, e = ops.cutlass_compress_entry(b.t())
+    b_compressed, e = ops.cutlass_sparse_compress(b.t())
 
     # Compressed B, Metadata, Original A, B
     return b_compressed, e, a, b
diff --git a/csrc/ops.h b/csrc/ops.h
index 363ddec3d0729..d43f495aabd80 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -156,12 +156,12 @@ void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            c10::optional<torch::Tensor> const& bias);
 
 void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
-                              torch::Tensor const& e, torch::Tensor const& b,
+                              torch::Tensor const& b, torch::Tensor const& e,
                               torch::Tensor const& a_scales,
                               torch::Tensor const& b_scales,
                               c10::optional<torch::Tensor> const& bias);
 
-bool cutlass_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e,
+bool cutlass_sparse_compress(torch::Tensor& a_compressed, torch::Tensor& e,
                             torch::Tensor const& a);
 #endif
 
diff --git a/csrc/sparse/cutlass/sparse_compressor.cu b/csrc/sparse/cutlass/sparse_compressor.cu
index ebb1c975121ac..30b78054f300e 100644
--- a/csrc/sparse/cutlass/sparse_compressor.cu
+++ b/csrc/sparse/cutlass/sparse_compressor.cu
@@ -73,9 +73,6 @@ bool sparsify_and_compress(torch::Tensor& a_compressed, torch::Tensor& e,
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
-  // Just a dummy value
-  int32_t n = 128;
-
   int64_t lda = a.stride(0);
 
   using StrideA = Stride<int64_t, Int<1>, int64_t>;
@@ -85,7 +82,7 @@ bool sparsify_and_compress(torch::Tensor& a_compressed, torch::Tensor& e,
   StrideA a_stride{lda, Int<1>{}, 0};
 
   using GemmKernel = typename Gemm::GemmKernel;
-  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
+  typename GemmKernel::ProblemShape prob_shape{m, 1, k, 1};
 
   using LayoutA = typename GemmKernel::CollectiveMainloop::LayoutA;
   using LayoutE = typename GemmKernel::CollectiveMainloop::LayoutE;
@@ -155,7 +152,7 @@ bool sparsify_and_compress(torch::Tensor& a_compressed, torch::Tensor& e,
   return true;
 }
 
-bool cutlass_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e,
+bool cutlass_sparse_compress(torch::Tensor& a_compressed, torch::Tensor& e,
                             torch::Tensor const& a) {
   if (a.dtype() == torch::kBFloat16) {
     return sparsify_and_compress<cutlass::bfloat16_t>(a_compressed, e, a);
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
index 8d36ece6d79c9..4537d31c54eb1 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -36,13 +36,13 @@ template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                    torch::Tensor const& e,
-                                    torch::Tensor const& b,
+                                    torch::Tensor const& bt_nzs,
+                                    torch::Tensor const& bt_meta,
                                     EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(e.dtype() == torch::kUInt8);
-  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kFloat8_e4m3fn);
 
   using Cutlass3xGemmDefault =
       typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
@@ -72,68 +72,68 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
   using Cutlass3xGemm8 =
       typename sm90_fp8_config_8<InType, OutType, Epilogue>::Cutlass3xGemm;
 
-  uint32_t const n = b.size(1);  // Batch size
-  uint32_t const m = a.size(0);
-  uint32_t const np2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(n));  // next power of 2
+  uint32_t const n = bt_nzs.size(0);
+  uint32_t const m = a.size(0);  // Batch size
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
 
-  if (np2 <= 64) {
-    if (m == 28672) {
+  if (mp2 <= 64) {
+    if (n == 28672) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm2>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
-    } else if (m == 4096 || m == 6144) {
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 4096 || n == 6144) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm1>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
     }
-  } else if (np2 <= 128) {
-    if (m == 4096) {
+  } else if (mp2 <= 128) {
+    if (n == 4096) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm3>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
-    } else if (m == 28672) {
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 28672) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm5>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
-    } else if (m == 6144) {
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 6144) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm4>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
     }
-  } else if (np2 <= 256) {
-    if (m == 4096) {
+  } else if (mp2 <= 256) {
+    if (n == 4096) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm6>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
-    } else if (m == 28672) {
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 28672) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm8>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
-    } else if (m == 6144) {
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 6144) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm7>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
     }
   } else {
-    if (m == 6144 || m == 28672) {
+    if (n == 6144 || n == 28672) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm8>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
-    } else if (m == 4096) {
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+    } else if (n == 4096) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemm7>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
     }
   }
 
   // Otherwise the default heuristic
-  if (np2 <= 64) {
+  if (mp2 <= 64) {
-    // n in [1, 64]
+    // m in [1, 64]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM64>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
-  } else if (np2 <= 128) {
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
-    // n in (64, 128]
+    // m in (64, 128]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM128>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
-  } else if (np2 <= 256) {
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 256) {
-    // n in (128, 256]
+    // m in (128, 256]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM256>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
   } else {
-    // n in (256, inf)
+    // m in (256, inf)
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM512>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
   }
 }
 
@@ -141,53 +141,53 @@ template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_gemm_sm90_fp16_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& e,
-                                     torch::Tensor const& b,
+                                     torch::Tensor const& bt_nzs,
+                                     torch::Tensor const& bt_meta,
                                      EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::half_t>());
   TORCH_CHECK(a.dtype() == torch::kFloat16);
-  TORCH_CHECK(e.dtype() == torch::kUInt8);
-  TORCH_CHECK(b.dtype() == torch::kFloat16);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kFloat16);
 
   using Cutlass3xGemmDefault =
       typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
 
   // m in (128, inf)
   return cutlass_sparse_gemm_caller<Cutlass3xGemmDefault>(
-      out, a, e, b, std::forward<EpilogueArgs>(args)...);
+      out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
 }
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_gemm_sm90_bf16_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& e,
-                                     torch::Tensor const& b,
+                                     torch::Tensor const& bt_nzs,
+                                     torch::Tensor const& bt_meta,
                                      EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::bfloat16_t>());
   TORCH_CHECK(a.dtype() == torch::kBFloat16);
-  TORCH_CHECK(e.dtype() == torch::kUInt8);
-  TORCH_CHECK(b.dtype() == torch::kBFloat16);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kBFloat16);
 
   using Cutlass3xGemmDefault =
       typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
 
   // m in (128, inf)
   return cutlass_sparse_gemm_caller<Cutlass3xGemmDefault>(
-      out, a, e, b, std::forward<EpilogueArgs>(args)...);
+      out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
 }
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& e,
-                                     torch::Tensor const& b,
+                                     torch::Tensor const& bt_nzs,
+                                     torch::Tensor const& bt_meta,
                                      EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, int8_t>());
   TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(e.dtype() == torch::kUInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_nzs.dtype() == torch::kInt8);
 
   using Cutlass3xGemmDefault =
       typename sm90_config_default<InType, OutType, Epilogue>::Cutlass3xGemm;
@@ -213,23 +213,23 @@ void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
     // m in [1, 32]
     if (is_small_n) {
       return cutlass_sparse_gemm_caller<Cutlass3xGemmM32NSmall>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
     } else {
       return cutlass_sparse_gemm_caller<Cutlass3xGemmM32NBig>(
-          out, a, e, b, std::forward<EpilogueArgs>(args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
     }
   } else if (mp2 <= 64) {
     // m in (32, 64]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM64>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
   } else if (mp2 <= 128) {
     // m in (64, 128]
     return cutlass_sparse_gemm_caller<Cutlass3xGemmM128>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
   } else {
     // m in (128, inf)
     return cutlass_sparse_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, e, b, std::forward<EpilogueArgs>(args)...);
+        out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(args)...);
   }
 }
 
@@ -237,68 +237,68 @@ template <template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
 void cutlass_scaled_sparse_mm_sm90_epilogue(torch::Tensor& out,
                                             torch::Tensor const& a,
-                                            torch::Tensor const& e,
-                                            torch::Tensor const& b,
+                                            torch::Tensor const& bt_nzs,
+                                            torch::Tensor const& bt_meta,
                                             EpilogueArgs&&... epilogue_args) {
-  TORCH_CHECK(e.dtype() == torch::kUInt8);
+  TORCH_CHECK(bt_meta.dtype() == torch::kUInt8);
   if (a.dtype() == torch::kInt8) {
-    TORCH_CHECK(b.dtype() == torch::kInt8);
+    TORCH_CHECK(bt_nzs.dtype() == torch::kInt8);
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
                                              Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
       return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   } else if (a.dtype() == torch::kFloat8_e4m3fn) {
-    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+    TORCH_CHECK(bt_nzs.dtype() == torch::kFloat8_e4m3fn);
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
                                             cutlass::bfloat16_t, Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
       return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
                                             cutlass::half_t, Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   } else if (a.dtype() == torch::kFloat16) {
-    TORCH_CHECK(b.dtype() == torch::kFloat16);
+    TORCH_CHECK(bt_nzs.dtype() == torch::kFloat16);
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_gemm_sm90_fp16_dispatch<cutlass::half_t,
                                              cutlass::bfloat16_t, Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
       return cutlass_gemm_sm90_fp16_dispatch<cutlass::half_t, cutlass::half_t,
                                              Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   } else {  // a.dtype() == torch::kBFloat16
     TORCH_CHECK(a.dtype() == torch::kBFloat16);
-    TORCH_CHECK(b.dtype() == torch::kBFloat16);
+    TORCH_CHECK(bt_nzs.dtype() == torch::kBFloat16);
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_gemm_sm90_bf16_dispatch<cutlass::bfloat16_t,
                                              cutlass::bfloat16_t, Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
       return cutlass_gemm_sm90_bf16_dispatch<cutlass::bfloat16_t,
                                              cutlass::half_t, Epilogue>(
-          out, a, e, b, std::forward<EpilogueArgs>(epilogue_args)...);
+          out, a, bt_nzs, bt_meta, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   }
 }
 
 void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
-                                   torch::Tensor const& e,
-                                   torch::Tensor const& b,
+                                   torch::Tensor const& bt_nzs,
+                                   torch::Tensor const& bt_meta,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales,
                                    c10::optional<torch::Tensor> const& bias) {
@@ -308,10 +308,10 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
     TORCH_CHECK(bias->dtype() == out.dtype(),
                 "currently bias dtype must match output dtype ", out.dtype());
     return cutlass_scaled_sparse_mm_sm90_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, e, b, a_scales, b_scales, *bias);
+        out, a, bt_nzs, bt_meta, a_scales, b_scales, *bias);
   } else {
     return cutlass_scaled_sparse_mm_sm90_epilogue<c3x::ScaledEpilogue>(
-        out, a, e, b, a_scales, b_scales);
+        out, a, bt_nzs, bt_meta, a_scales, b_scales);
   }
 }
 
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
index aa90388295492..1cef3d9c0de8e 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -20,20 +20,16 @@
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
 
+#include "cutlass_extensions/cute_utils.cuh"
 #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
 #include "cutlass_extensions/common.hpp"
+#include "cutlass_extensions/torch_utils.hpp"
 
 using namespace cute;
 
 /*
    This file defines sparse quantized GEMM operations using the CUTLASS 3.x API,
    for NVIDIA GPUs with sm90a (Hopper) or later.
-
-   Epilogue functions can be defined to post-process the output before it is
-   written to GPU memory.
-   Epilogues must contain a public type named EVTCompute of type Sm90EVT,
-   as well as a static prepare_args function that constructs an
-   EVTCompute::Arguments struct.
 */
 
 namespace {
@@ -74,9 +70,16 @@ struct cutlass_sparse_3x_gemm {
 
   using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
 
-  using StrideD = Stride<Int<1>, int64_t, Int<0>>;
   using ElementC = void;
-  using StrideC = StrideD;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = LayoutC;
+  using StrideC = cutlass::detail::TagToStrideA_t<LayoutC>;
+  using StrideD = cutlass::detail::TagToStrideA_t<LayoutD>;
+
+  using LayoutC_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutC>::type;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
 
   using EVTCompute = typename Epilogue::EVTCompute;
 
@@ -91,8 +94,8 @@ struct cutlass_sparse_3x_gemm {
       typename cutlass::epilogue::collective::CollectiveBuilder<
           cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
           ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
-          ElementAcc, ElementAcc, ElementC, StrideC, AlignmentCD, ElementD,
-          StrideD, AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+          ElementAcc, ElementAcc, ElementC, LayoutC_Transpose, AlignmentCD, ElementD,
+          LayoutD_Transpose, AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
 
   static constexpr size_t CEStorageSize =
       sizeof(typename CollectiveEpilogue::SharedStorage);
@@ -118,49 +121,49 @@ struct cutlass_sparse_3x_gemm {
 };
 
 template <typename Gemm, typename... EpilogueArgs>
-void cutlass_sparse_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
-                                torch::Tensor const& e, torch::Tensor const& b,
+void cutlass_sparse_gemm_caller(torch::Tensor& out,
+                                torch::Tensor const& a,
+                                torch::Tensor const& bt_nzs,
+                                torch::Tensor const& bt_meta,
                                 EpilogueArgs&&... epilogue_params) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
-  int32_t m = a.size(0);
-  int32_t n = b.size(1);
-  int32_t k = b.size(0);
+  // Row-major layout expected of argument `a`; it is passed to the kernel as the second (B) operand of the transposed problem
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutA;
+  using LayoutE = typename Gemm::GemmKernel::CollectiveMainloop::LayoutE;
+  using LayoutD = cutlass::layout::RowMajor;
 
-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(1);
+  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
+  using StrideD = cutlass::detail::TagToStrideA_t<LayoutD>;
 
-  using LayoutA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutA;
-  using LayoutE = typename Gemm::GemmKernel::CollectiveMainloop::LayoutE;
-  using StrideB = typename Gemm::GemmKernel::StrideB;
-  using StrideC = typename Gemm::GemmKernel::StrideC;
-  using StrideD = typename Gemm::GemmKernel::StrideD;
+  auto layout_A = make_cute_layout<StrideA>(a, "A");
+  auto layout_D = make_cute_layout<StrideD>(out, "D");
 
-  StrideB b_stride{ldb, Int<1>{}, 0};
-  StrideC c_stride{Int<1>{}, ldc, Int<0>{}};
+  auto stride_At = layout_A.stride();
+  auto stride_Dt = permute_layout<1, 0, 2>(layout_D).stride();
 
   using GemmKernel = typename Gemm::GemmKernel;
-  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
+  typename GemmKernel::ProblemShape prob_shape{(int) bt_nzs.size(0), (int) size<0>(layout_A), (int) size<1>(layout_A), 1};
 
   using ElementE = typename GemmKernel::CollectiveMainloop::ElementE;
   using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig;
 
-  LayoutA a_layout = SparseConfig::fill_layoutA(prob_shape);
+  LayoutB b_layout = SparseConfig::fill_layoutA(prob_shape);
   LayoutE e_layout = SparseConfig::fill_layoutE(prob_shape);
 
   auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  auto e_ptr = static_cast<ElementE*>(e.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(bt_nzs.data_ptr());
+  auto e_ptr = static_cast<ElementE*>(bt_meta.data_ptr());
   typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr, a_layout, b_ptr, b_stride, e_ptr, e_layout};
+      b_ptr, b_layout, a_ptr, stride_At, e_ptr, e_layout};
 
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
       Gemm::Epilogue::prepare_args(
           std::forward<EpilogueArgs>(epilogue_params)...),
-      c_ptr, c_stride, c_ptr, c_stride};
+      c_ptr, stride_Dt, c_ptr, stride_Dt};
 
   typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
                                       prob_shape, mainloop_args, epilogue_args};
@@ -195,7 +198,7 @@ struct sm90_config_default<half_t, OutType, Epilogue> {
   using ClusterShape = Shape<_2, _1, _1>;
   using Cutlass3xGemm =
       cutlass_sparse_3x_gemm<half_t, OutType, Epilogue, TileShape, ClusterShape,
-                             KernelSchedule, EpilogueSchedule, float>;
+                      KernelSchedule, EpilogueSchedule, float>;
 };
 
 template <typename OutType,
@@ -207,9 +210,8 @@ struct sm90_config_default<cutlass::bfloat16_t, OutType, Epilogue> {
   using TileShape = Shape<_128, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
   using Cutlass3xGemm =
-      cutlass_sparse_3x_gemm<cutlass::bfloat16_t, OutType, Epilogue, TileShape,
-                             ClusterShape, KernelSchedule, EpilogueSchedule,
-                             float>;
+      cutlass_sparse_3x_gemm<cutlass::bfloat16_t, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
 };
 
 //////////////////////// Cherry-Picking Kernels ////////////////////////
@@ -335,9 +337,8 @@ struct sm90_config_default<cutlass::float_e4m3_t, OutType, Epilogue> {
   using TileShape = Shape<_128, _128, _128>;
   using ClusterShape = Shape<_1, _2, _1>;
   using Cutlass3xGemm =
-      cutlass_sparse_3x_gemm<cutlass::float_e4m3_t, OutType, Epilogue,
-                             TileShape, ClusterShape, KernelSchedule,
-                             EpilogueSchedule, float>;
+      cutlass_sparse_3x_gemm<cutlass::float_e4m3_t, OutType, Epilogue, TileShape, ClusterShape,
+                             KernelSchedule, EpilogueSchedule, float>;
 };
 
 template <typename InType, typename OutType,
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
index a451298d5a6cd..8017eb1c93897 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -5,8 +5,8 @@
 
 #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
 void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
-                                   torch::Tensor const& e,
                                    torch::Tensor const& b,
+                                   torch::Tensor const& e,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales,
                                    c10::optional<torch::Tensor> const& bias);
@@ -23,26 +23,26 @@ int32_t test_get_sm_version_num() {
 }
 
 void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
-                              torch::Tensor const& e, torch::Tensor const& b,
+                              torch::Tensor const& bt_nzs,
+                              torch::Tensor const& bt_meta,
                               torch::Tensor const& a_scales,
                               torch::Tensor const& b_scales,
                               c10::optional<torch::Tensor> const& bias) {
   // Checks for conformality
-  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
-  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) * 2 == b.size(0) &&
-              b.size(1) == c.size(1));
+  TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&
+              a.size(0) == c.size(0));
   TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
-  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == bt_nzs.size(0));
 
   // Check for strides and alignment
-  TORCH_CHECK(a.stride(1) == 1);                      // Row-major
-  TORCH_CHECK(b.stride(0) == 1 && c.stride(0) == 1);  // Column-major
-  TORCH_CHECK(c.stride(1) % 16 == 0);                 // 16 Byte Alignment
-  TORCH_CHECK(b.stride(1) % 16 == 0);                 // 16 Byte Alignment
+  TORCH_CHECK(a.stride(1) == 1 && bt_nzs.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(c.stride(0) % 16 == 0);                      // 16 Byte Alignment
+  TORCH_CHECK(bt_nzs.stride(0) % 16 == 0);                 // 16 Byte Alignment
   TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
 
   if (bias) {
-    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+    TORCH_CHECK(bias->numel() == bt_nzs.size(0) && bias->is_contiguous() &&
                 bias->dim() == 1);
   }
 
@@ -52,7 +52,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
   // Guard against compilation issues for sm90 kernels
 #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
   if (version_num >= 90) {
-    cutlass_scaled_sparse_mm_sm90(c, a, e, b, a_scales, b_scales, bias);
+    cutlass_scaled_sparse_mm_sm90(c, a, bt_nzs, bt_meta, a_scales, b_scales, bias);
     return;
   }
 #endif
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 0378c5ad0036c..546d01e0d9025 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -317,16 +317,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // quantization, as well as bias
   ops.def(
       "cutlass_scaled_sparse_mm(Tensor! out, Tensor a,"
-      "                         Tensor e,"
-      "                         Tensor b, Tensor a_scales,"
+      "                         Tensor b,"
+      "                         Tensor e, Tensor a_scales,"
       "                         Tensor b_scales, Tensor? bias) -> ()");
   ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm);
 
   // CUTLASS sparse matrix compressor
   ops.def(
-      "cutlass_compress_entry(Tensor! a_compressed, Tensor! e,"
+      "cutlass_sparse_compress(Tensor! a_compressed, Tensor! e,"
       " Tensor a) -> bool");
-  ops.impl("cutlass_compress_entry", &cutlass_compress_entry);
+  ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);
 
   // Mamba selective scan kernel
   ops.def(
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py
index 14ef2438daf4f..dd9f444ed504f 100644
--- a/tests/kernels/test_semi_structured.py
+++ b/tests/kernels/test_semi_structured.py
@@ -81,7 +81,7 @@ def make_rand_sparse_tensors(
     else:
         raise ValueError("unsupported dtype")
 
-    b_compressed, e = ops.cutlass_compress_entry(b.t())
+    b_compressed, e = ops.cutlass_sparse_compress(b.t())
 
     # Compressed B, Metadata, Original A, B
     return b_compressed, e, a, b
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 034b3e9493736..a14b1f3bbf45b 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -532,51 +532,102 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
     return out
 
 
-def cutlass_compress_entry(a: torch.Tensor) \
+def cutlass_sparse_compress(a: torch.Tensor) \
     -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compresses a sparse matrix for use with Cutlass sparse operations.
+
+    This function takes a dense tensor and compresses it into two components:
+    non-zero elements and metadata. The compressed representation is compatible
+    with Cutlass sparse kernels.
+
+    Args:
+        a (torch.Tensor): 
+            The input tensor to be compressed. Must have one of the following data types:
+            - `torch.int8`
+            - `torch.float8_e4m3fn`
+            - `torch.bfloat16`
+            - `torch.float16`
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: 
+            A tuple containing:
+            - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
+            - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
+
+    Raises:
+        ValueError: If the compression operation fails.
+
+    Notes:
+        - The `a_meta` tensor has a data type of `torch.uint8`.
+        - Each `uint8` metadata element holds 2 bits for each of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
+        - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
+        - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
+    """
     assert (a.dtype in [
         torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16
     ])
 
-    # e.dtype: torch.uint8 so elemsPerElemE = 8b / 2b_per_nz = 4
-    elemsPerElemE = 4
+    # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
+    elemsPerMetaElem = 4
 
     m = a.shape[0]
     k = a.shape[1]
-    a_compressed = torch.empty((m, k // 2), dtype=a.dtype, device=a.device)
-    e = torch.empty((m, k // 2 // elemsPerElemE),
+    a_nzs = torch.empty((m, k // 2), dtype=a.dtype, device=a.device)
+    a_meta = torch.empty((m, k // 2 // elemsPerMetaElem),
                     dtype=torch.uint8,
                     device=a.device)
 
-    if not (torch.ops._C.cutlass_compress_entry(a_compressed, e, a)):
+    if not (torch.ops._C.cutlass_sparse_compress(a_nzs, a_meta, a)):
         raise ValueError
 
-    return a_compressed, e
+    return a_nzs, a_meta
 
 
 def cutlass_scaled_sparse_mm(
-        a: torch.Tensor,  # row-major activations
-        b: torch.Tensor,  # row-major weight matrix
-        e: torch.Tensor,
+        a: torch.Tensor,
+        bt_nzs: torch.Tensor,
+        bt_meta: torch.Tensor,
         scale_a: torch.Tensor,
         scale_b: torch.Tensor,
         out_dtype: torch.dtype,
         bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    # assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    """
+    Performs a scaled sparse matrix multiplication using Cutlass.
+
+    Example usage:
+    1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
+    `a = torch.randn((m, k), device='cuda')`.
+
+    2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
+    `b = torch.randn((k, n), device='cuda')`.
+
+    3. Prune matrix `b` to 2:4 sparsity along dimension 0 (the `k` dimension):
+    `b = prune_to_2_4(b, dim=0)`.
+
+    4. Compress the transposed sparse matrix `b.t()`:
+    `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
+
+    5. Perform sparse matrix multiplication using the compressed matrix,
+    applying the scaling factors for `a` and `b` and requesting the output data type:
+    `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
+
+    Returns:
+    - The result of the scaled sparse matrix multiplication: an `(m, n)` tensor of dtype `out_dtype`.
+    """
+    assert (bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0)
     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
-    assert bias is None or bias.shape[0] == a.shape[0] \
+    assert bias is None or bias.shape[0] == bt_nzs.shape[0] \
         and bias.dtype == out_dtype
 
-    a_t = a.t()
-
-    m = b.shape[0]
-    n = a_t.shape[1]
-    out = torch.empty((n, m), dtype=out_dtype, device=a.device).t()
+    m = a.shape[0]
+    n = bt_nzs.shape[0]
+    out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
-    torch.ops._C.cutlass_scaled_sparse_mm(out, b, e, a_t, scale_b, scale_a,
+    torch.ops._C.cutlass_scaled_sparse_mm(out, a, bt_nzs, bt_meta, scale_a, scale_b,
                                           bias)
 
-    return out.t()
+    return out
 
 
 # aqlm
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index 5cd0059a4df89..6dfc590ea4328 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -122,7 +122,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 layer.weight_scale = torch.nn.Parameter(
                     layer.weight_scale.data, requires_grad=False)
 
-        w_compressed, meta = ops.cutlass_compress_entry(layer.weight.data)
+        w_compressed, meta = ops.cutlass_sparse_compress(layer.weight.data)
         layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
         layer.meta = torch.nn.Parameter(meta, requires_grad=False)
 
@@ -164,8 +164,8 @@ def apply_weights(self,
             q_input = x
 
         out = ops.cutlass_scaled_sparse_mm(a=q_input,
-                                           b=layer.weight,
-                                           e=layer.meta,
+                                           bt_nzs=layer.weight,
+                                           bt_meta=layer.meta,
                                            scale_a=input_scale,
                                            scale_b=layer.weight_scale,
                                            out_dtype=self.output_dtype,