Commit 6dfc5c9

fix int8 so that it works; applying format'
dsikka committed Dec 8, 2024
1 parent fbbd469 commit 6dfc5c9
Showing 24 changed files with 933 additions and 802 deletions.
3 changes: 1 addition & 2 deletions benchmarks/benchmark_throughput.py
@@ -362,8 +362,7 @@ def main(args: argparse.Namespace):
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s, "
-          f"{total_num_tokens=} | {total_output_tokens=}"
-          )
+          f"{total_num_tokens=} | {total_output_tokens=}")
 
     # Output JSON results if specified
     if args.output_json:
97 changes: 55 additions & 42 deletions benchmarks/cutlass_benchmarks/sparse_mm/bench_v1.py
@@ -5,7 +5,7 @@
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
-from utils import make_rand_sparse_tensors, to_fp16, to_bf16
+from utils import make_rand_sparse_tensors, to_bf16
 
 import vllm._custom_ops as ops

@@ -41,7 +41,8 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
     out_ref = ops.cutlass_scaled_mm(a, bT, scale_a, scale_b, torch.bfloat16)
 
     if not torch.allclose(out.t(), out_ref):
@@ -65,26 +66,26 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
 
     # cutlass with bias: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.bfloat16,
-                 bias))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16, bias))
 
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     # cutlass with bias: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_i8_i8_fp16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16,
-                 bias.to(dtype=torch.float16)))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16, bias.to(dtype=torch.float16)))
 
     return timers
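The timing entries above go through the benchmark's bench_fn helper, which is defined elsewhere in bench_v1.py and is not part of this diff. A minimal sketch of such a helper, assuming it simply wraps torch.utils.benchmark.Timer with the label/sub_label/description arguments seen in these calls (the real implementation may differ):

import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement


# Hypothetical sketch of the helper used by the calls above.
def bench_fn(label: str, sub_label: str, description: str, fn,
             *args) -> TMeasurement:
    # Time fn(*args) and report it under the given label/sub_label/description.
    return TBenchmark.Timer(
        stmt="fn(*args)",
        globals={"fn": fn, "args": args},
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=1)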

@@ -94,14 +95,16 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     assert dtype == torch.float8_e4m3fn
 
     # Create tensors
-    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
+    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
+                                                     k)
     aT = a.t()
     bT = b
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+    # bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
     out_ref = ops.cutlass_scaled_mm(a, bT, scale_a, scale_b, torch.bfloat16)
 
     if not torch.allclose(out, out_ref, rtol=1e-2, atol=1e-2):
@@ -169,18 +172,19 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     return timers
 
 
 def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+               sub_label: str) -> Iterable[TMeasurement]:
     assert dtype == torch.float16
 
     m, k, n = 1, 128, 256
@@ -191,10 +195,11 @@ def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     bT = b.t()
     scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
+    # bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
-    out_ref = to_bf16(a@bT)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
+    out_ref = to_bf16(a @ bT)
 
     if not torch.allclose(out.t(), out_ref, rtol=1e-2, atol=1e-2):
         print("Incorrect result")
@@ -239,31 +244,35 @@ def bench_fp16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
 
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     # # cutlass impl: bf16 output, with bias
     # timers.append(
-    #     bench_fn(label, sub_label, "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias",
-    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.bfloat16,
-    #              bias))
+    #     bench_fn(label, sub_label,
+    #              "cutlass_fp16_fp16_bf16_scaled_sparse_mm_bias",
+    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT,
+    #              scale_b, scale_a, torch.bfloat16, bias))
 
     # # cutlass impl: fp16 output, with bias
     # timers.append(
-    #     bench_fn(label, sub_label, "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias",
-    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16,
+    #     bench_fn(label, sub_label,
+    #              "cutlass_fp16_fp16_fp16_scaled_sparse_mm_bias",
+    #              ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+    #              scale_a, torch.float16,
     #              bias.to(dtype=torch.float16)))
 
     return timers
 
 
 def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+               sub_label: str) -> Iterable[TMeasurement]:
     assert dtype == torch.bfloat16
 
     # Create tensors
@@ -274,8 +283,9 @@ def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
     bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
 
-    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a, torch.bfloat16)
-    out_ref = to_bf16(a@bT)
+    out = ops.cutlass_scaled_sparse_mm(b_compressed, e, aT, scale_b, scale_a,
+                                       torch.bfloat16)
+    out_ref = to_bf16(a @ bT)
 
     if not torch.allclose(out.t(), out_ref):
         print("Incorrect result")
@@ -316,31 +326,34 @@ def bench_bf16(dtype: torch.dtype, m: int, k: int, n: int, label: str,
     # cutlass impl: bf16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a,
-                 torch.bfloat16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16))
 
     # cutlass impl: fp16 output
     timers.append(
         bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16))
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16))
 
     # cutlass impl: bf16 output, with bias
     timers.append(
-        bench_fn(label, sub_label, "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.bfloat16,
-                 bias))
+        bench_fn(label, sub_label,
+                 "cutlass_bf16_bf16_bf16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.bfloat16, bias))
 
     # cutlass impl: fp16 output, with bias
     timers.append(
-        bench_fn(label, sub_label, "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias",
-                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b, scale_a, torch.float16,
-                 bias.to(dtype=torch.float16)))
+        bench_fn(label, sub_label,
+                 "cutlass_bf16_bf16_fp16_scaled_sparse_mm_bias",
+                 ops.cutlass_scaled_sparse_mm, b_compressed, e, aT, scale_b,
+                 scale_a, torch.float16, bias.to(dtype=torch.float16)))
 
     return timers
 
 
 def bench_v1(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-            sub_label: str) -> Iterable[TMeasurement]:
+             sub_label: str) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label)
     if dtype == torch.float8_e4m3fn:
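Outside the benchmark harness, the fixed int8 path can be sanity-checked with the same pattern the functions above use: build a compressed sparse weight plus metadata, run the sparse kernel, and compare the transposed result against the dense cutlass_scaled_mm reference. A minimal sketch, assuming the utils module from benchmarks/cutlass_benchmarks/sparse_mm is importable and the argument order matches the calls in this file; the shapes are illustrative:

import torch

import vllm._custom_ops as ops
from utils import make_rand_sparse_tensors  # benchmarks/cutlass_benchmarks/sparse_mm/utils.py

m, k, n = 32, 256, 512
# b_compressed/e are the compressed weight and its sparsity metadata; a/b stay dense.
b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

# The sparse kernel takes the compressed B operand first, so its output is
# transposed relative to the dense reference and is compared via out.t().
out = ops.cutlass_scaled_sparse_mm(b_compressed, e, a.t(), scale_b, scale_a,
                                   torch.bfloat16)
out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
assert torch.allclose(out.t(), out_ref), "sparse int8 result does not match dense reference"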
