From 72d6cd3ba4c0066ad922efd8e0207eae791ba083 Mon Sep 17 00:00:00 2001
From: ilmarkov
Date: Fri, 15 Nov 2024 16:53:10 +0000
Subject: [PATCH] Minor test and benchmarks updates

---
 benchmarks/cusparseLt_benchmarks/benchmark_24.py |  6 +++---
 tests/kernels/test_semi_structured.py            | 15 ++++++++++-----
 .../layers/sparsity/utils/cusparse_2_4_utils.py  |  2 ++
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/benchmarks/cusparseLt_benchmarks/benchmark_24.py b/benchmarks/cusparseLt_benchmarks/benchmark_24.py
index cb861c6634c88..15381de006d12 100644
--- a/benchmarks/cusparseLt_benchmarks/benchmark_24.py
+++ b/benchmarks/cusparseLt_benchmarks/benchmark_24.py
@@ -24,8 +24,8 @@
 
 # helpers
 def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                       k: int) -> Tuple[torch.Tensor, torch.Tensor]:
-    a = get_random_mat(m, k, dtype)
-    b = get_random_mat(n, k, dtype).t()
+    a = get_random_mat(n, k, dtype)
+    b = get_random_mat(m, k, dtype).t()
     return a, b
 
@@ -213,7 +213,7 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
     KNs = model_shapes(model, tp_size)
     MKNs = []
     for m in Ms:
-        assert m % 32 == 0, "Batch size has to be a multiple of 32"
+        assert m % 16 == 0, "Batch size has to be a multiple of 16"
         for k, n in KNs:
             if k % 32 or n % 32:
                 continue
diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py
index bc339d1ac9c32..2aba096d30a73 100644
--- a/tests/kernels/test_semi_structured.py
+++ b/tests/kernels/test_semi_structured.py
@@ -138,7 +138,10 @@ def test_torch_semi_structured_sparse_dense_T_fp8_matmul():
     # Cached version
     B = torch.full((N, K), .25, device='cuda', dtype=dtype).t()
     C = dense_matmul(A_pruned, B, dtype=dtype).to(torch.float32)
-    C_sparse = semi_structured_sparse_dense_gemm(A, B).to(torch.float32)
+    C_sparse = semi_structured_sparse_dense_gemm(A,
+                                                 B,
+                                                 out_dtype=torch.bfloat16).to(
+                                                     torch.float32)
     torch.testing.assert_close(C, C_sparse, rtol=1e-1, atol=1e-1)
 
     # Noncached version
@@ -174,8 +177,9 @@ def test_torch_semi_structured_dense_sparse_T_matmul(mnk, dtype):
     not is_semi_structured_supported()
     or not is_quant_method_supported("modelopt"),
     reason="Semi structured fp8 matmul is not supported on this GPU type.")
-def test_torch_semi_structured_dense_sparse_T_fp8_matmul():
-    M, N, K = (32, 64, 32)
+@pytest.mark.parametrize("mnk", MNK)
+def test_torch_semi_structured_dense_sparse_T_fp8_matmul(mnk):
+    M, N, K = mnk
     dtype = torch.float8_e4m3fn
     B_T_pruned = generate_pruned_semi_structured_mat(N, K, dtype=dtype)
     B_T = compress_to_torch_sparse_semi_structured_mat(B_T_pruned)
@@ -290,9 +294,10 @@ def test_torch_semi_structured_dense_sparse_T_fp8_scaled_matmul():
 @pytest.mark.skipif(
     not is_semi_structured_supported(),
     reason="Semi structured matmul is not supported on this GPU type.")
-def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul():
+@pytest.mark.parametrize("mnk", MNK)
+def test_torch_semi_structured_sparse_dense_t_int8_scaled_matmul(mnk):
     dtype = torch.int8
-    M, N, K = (32, 64, 32)
+    M, N, K = mnk
     A_pruned = generate_pruned_semi_structured_mat(M, K, dtype)
     A = compress_to_torch_sparse_semi_structured_mat(A_pruned)
     B = get_random_mat(N, K, dtype)
diff --git a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py
index 5a42dd03f9170..7dceadb5d2686 100644
--- a/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py
+++ b/vllm/model_executor/layers/sparsity/utils/cusparse_2_4_utils.py
@@ -234,6 +234,8 @@ def matmul_(a, b, **kwargs):
     scale = scale_a * scale_b
     if a_packed.dtype == torch.float8_e4m3fn:
+        if not (per_tensor_activations and per_tensor_weights):
+            scale = scale[:, None]
         result = matmul_(a_packed.packed, b_dense, out_dtype=torch.float32)
         result = torch.narrow(result, 1, 0, col)
         result = result * scale
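
For context, not part of the patch itself: the new scale[:, None] lines in the
last hunk make a row-wise (per-token) scale broadcast down the rows of the GEMM
output rather than across its columns. Below is a minimal standalone sketch of
that broadcasting rule; the shapes and names (M, N, result, row_scale) are
illustrative assumptions, not taken from vLLM's code.

import torch

# Illustrative shapes: M output rows (tokens), N output columns.
M, N = 4, 8
result = torch.randn(M, N)  # stand-in for the fp32 GEMM output

# A per-tensor scale is a 0-dim tensor and broadcasts over (M, N) as-is.
per_tensor_scale = torch.tensor(0.5)
assert (result * per_tensor_scale).shape == (M, N)

# A per-token scale has one entry per row, i.e. shape (M,). PyTorch
# broadcasting aligns trailing dimensions, so `result * row_scale` would
# try to match M against N and fail (or, if M happened to equal N,
# silently scale columns instead of rows). Reshaping to (M, 1) -- the
# scale[:, None] step in the hunk above -- broadcasts each scale down
# its own row.
row_scale = torch.rand(M)
scaled = result * row_scale[:, None]
assert scaled.shape == (M, N)
assert torch.allclose(scaled[2], result[2] * row_scale[2])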