Commit 6dfc5c9

fix int8 so that it works; applying format'
dsikka committed Dec 8, 2024
1 parent fbbd469 commit 6dfc5c9
Showing 24 changed files with 933 additions and 802 deletions.
3 changes: 1 addition & 2 deletions benchmarks/benchmark_throughput.py
@@ -362,8 +362,7 @@ def main(args: argparse.Namespace):
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s, "
-          f"{total_num_tokens=} | {total_output_tokens=}"
-          )
+          f"{total_num_tokens=} | {total_output_tokens=}")
 
     # Output JSON results if specified
     if args.output_json:
97 changes: 55 additions & 42 deletions benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
@@ -5,7 +5,7 @@
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
-from utils import make_rand_sparse_tensors, to_fp16, to_bf16
+from utils import make_rand_sparse_tensors, to_bf16
 
 import vllm._custom_ops as ops

@@ -41,7 +41,8 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
     out_ref = ops.cutlass_scaled_mm(a, bT, scale_a, scale_b, torch.bfloat16)
 
     if not torch.allclose(out.t(), out_ref):
@@ -65,26 +66,26 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
 
     # cutlass with bias: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.bfloat16,
-                 bias))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16, bias))
 
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     # cutlass with bias: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16,
-                 bias.to(dtype=torch.float16)))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16, bias.to(dtype=torch.float16)))
 
     return timers
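The timing entries above go through the benchmark's bench_fn helper, which is defined elsewhere in bench_v1.py and is not part of this diff. A minimal sketch of such a helper, assuming it simply wraps torch.utils.benchmark.Timer with the label/sub_label/description arguments seen in these calls (the real implementation may differ):

import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement


# Hypothetical sketch of the helper used by the calls above.
def bench_fn(label: str, sub_label: str, description: str, fn,
             *args) -> TMeasurement:
    # Time fn(*args) and report it under the given label/sub_label/description.
    return TBenchmark.Timer(
        stmt="fn(*args)",
        globals={"fn": fn, "args": args},
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=1)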

@@ -94,14 +95,16 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     assert dtype == torch.float8_e4m3fn
 
     # Create tensors
-    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
+    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
+                                                     k)
     aT = a.t()
     bT = b
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+    # bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
     out_ref = ops.cutlass_scaled_mm(a, bT, scale_a, scale_b, torch.bfloat16)
 
     if not torch.allclose(out, out_ref, rtol=1e-2, atol=1e-2):
@@ -169,18 +172,19 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     return timers
 
 
 def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+               sub_label: str) -> Iterable[TMeasurement]:
     assert dtype == torch.float16
 
     m, k, n = 1, 128, 256
@@ -191,10 +195,11 @@ def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     bT = b.t()
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+    # bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
-    out_ref = to_bf16(a@bT)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
+    out_ref = to_bf16(a @ bT)
 
     if not torch.allclose(out.t(), out_ref, rtol=1e-2, atol=1e-2):
         print("Incorrect result")
@@ -239,31 +244,35 @@ def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
 
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     # # cutlass impl: bf16 output, with bias
     # timers.append(
-    #     bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias",
-    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.bfloat16,
-    #              bias))
+    #     bench_fn(label, sub_label,
+    #              "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias",
+    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT,
+    #              scale_b, scale_a, torch.bfloat16, bias))
 
     # # cutlass impl: fp16 output, with bias
     # timers.append(
-    #     bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias",
-    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16,
+    #     bench_fn(label, sub_label,
+    #              "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias",
+    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+    #              scale_a, torch.float16,
     #              bias.to(dtype=torch.float16)))
 
     return timers
 
 
 def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+               sub_label: str) -> Iterable[TMeasurement]:
     assert dtype == torch.bfloat16
 
     # Create tensors
@@ -274,8 +283,9 @@ def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
-    out_ref = to_bf16(a@bT)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
+    out_ref = to_bf16(a @ bT)
 
     if not torch.allclose(out.t(), out_ref):
         print("Incorrect result")
@@ -316,31 +326,34 @@ def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
 
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     # cutlass impl: bf16 output, with bias
     timers.append(
-        bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.bfloat16,
-                 bias))
+        bench_fn(label, sub_label,
+                 "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16, bias))
 
     # cutlass impl: fp16 output, with bias
     timers.append(
-        bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16,
-                 bias.to(dtype=torch.float16)))
+        bench_fn(label, sub_label,
+                 "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16, bias.to(dtype=torch.float16)))
 
     return timers
 
 
 def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-            sub_label: str) -> Iterable[TMeasurement]:
+             sub_label: str) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label)
     if dtype == torch.float8_e4m3fn:
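Outside the benchmark harness, the fixed int8 path can be sanity-checked with the same pattern the functions above use: build a compressed sparse weight plus metadata, run the sparse kernel, and compare the transposed result against the dense cutlass_scaled_mm reference. A minimal sketch, assuming the utils module from benchmarks/cutlass_benchmarks/sparse_mm is importable and the argument order matches the calls in this file; the shapes are illustrative:

import torch

import vllm._custom_ops as ops
from utils import make_rand_sparse_tensors  # benchmarks/cutlass_benchmarks/sparse_mm/utils.py

m, k, n = 32, 256, 512
# b_compressed/e are the compressed weight and its sparsity metadata; a/b stay dense.
b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

# The sparse kernel takes the compressed B operand first, so its output is
# transposed relative to the dense reference and is compared via out.t().
out = ops.cutlass_scaled_sparse_mm(b_compressed, e, a.t(), scale_b, scale_a,
                                   torch.bfloat16)
out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
assert torch.allclose(out.t(), out_ref), "sparse int8 result does not match dense reference"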
