diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 8ddf764..f65e8cb 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -91,8 +91,12 @@ jobs:
 
       - name: Free up disk space
         if: ${{ runner.os == 'Linux' }}
+        # Removal list adapted from https://github.com/easimon/maximize-build-space/blob/master/action.yml
+        # See also: https://github.com/easimon/maximize-build-space/tree/test-report
         run: |
           sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
 
       - name: Install CUDA ${{ matrix.cuda-version }}
         if: ${{ matrix.cuda-version != 'cpu' }}
diff --git a/csrc/cutlass b/csrc/cutlass
index c4f6b8c..6f47420 160000
--- a/csrc/cutlass
+++ b/csrc/cutlass
@@ -1 +1 @@
-Subproject commit c4f6b8c6bc94ff69048492fb34df0dfaf1983933
+Subproject commit 6f47420213f757831fae65c686aa471749fa8d60
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index b179694..b4265f7 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.0.6.post3"
+__version__ = "2.0.6.post4"
 
 from flash_attn.flash_attn_interface import flash_attn_func
 from flash_attn.flash_attn_interface import flash_attn_kvpacked_func
diff --git a/setup.py b/setup.py
index 43481da..d087813 100644
--- a/setup.py
+++ b/setup.py
@@ -91,9 +91,9 @@ def raise_if_cuda_home_none(global_option: str) -> None:
 
 
 def append_nvcc_threads(nvcc_extra_args):
-    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
-    if bare_metal_version >= Version("11.2"):
-        return nvcc_extra_args + ["--threads", "4"]
+    # _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
+    # if bare_metal_version >= Version("11.2"):
+    #     return nvcc_extra_args + ["--threads", "4"]
     return nvcc_extra_args