diff --git a/dev/cuda/benchmark_on_modal.py b/dev/cuda/benchmark_on_modal.py
index 907a831ad..d580ec2d5 100644
--- a/dev/cuda/benchmark_on_modal.py
+++ b/dev/cuda/benchmark_on_modal.py
@@ -62,10 +62,10 @@
     "rm cmake-3.28.1-Linux-x86_64.sh",
     "ln -s /usr/local/bin/cmake /usr/bin/cmake",)
     .run_commands(
-        "apt-get install -y --allow-change-held-packages libcudnn8 libcudnn8-dev",
+        "apt-get install -y --allow-change-held-packages libcudnn9-cuda-12 libcudnn9-dev-cuda-12",
         "apt-get install -y openmpi-bin openmpi-doc libopenmpi-dev kmod sudo",
         "git clone https://github.com/NVIDIA/cudnn-frontend.git /root/cudnn-frontend",
-        "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make"
+        "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make -j$(nproc)"
     )
     .run_commands(
         "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
@@ -75,6 +75,8 @@
         apt-get update"
     ).run_commands(
         "apt-get install -y nsight-systems-2023.3.3"
+    ).run_commands(
+        "apt-get install -y curl"
     )
 )
 
@@ -98,11 +100,12 @@ def execute_command(command: str):
     # using in a directory in your volume, where the name contains the timestamp unique id.
     # This script will generate a "report1_{timestamp} folder in volume"
     # and you can download it with 'modal volume get {volume-name} report1_{timestamp}
-    volumes={"/cuda-env": modal.Volume.from_name("cuda-env")},
+    volumes={"/llmc": modal.Volume.from_name("llmc")},
 )
-def run_benchmark(compile_command: str, run_command: str):
+def run_benchmark(data_command: str, compile_command: str, run_command: str):
     execute_command("pwd")
     execute_command("ls")
+    execute_command(data_command)
     execute_command(compile_command)
     execute_command(run_command)
     # Use this section if you want to profile using nsight system and install the reports on your volume to be locally downloaded
@@ -116,6 +119,6 @@ def run_benchmark(compile_command: str, run_command: str):
     return None
 
 @stub.local_entrypoint()
-def inference_main(compile_command: str, run_command: str):
-    results = run_benchmark.remote(compile_command, run_command)
+def inference_main(data_command: str, compile_command: str, run_command: str):
+    results = run_benchmark.remote(data_command, compile_command, run_command)
     return results
\ No newline at end of file
diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h
index 006ad3010..0efb494d2 100644
--- a/llmc/cuda_common.h
+++ b/llmc/cuda_common.h
@@ -179,7 +179,7 @@ inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffe
     // prime the read buffer;
     char* gpu_write_ptr = (char*)dest;
     size_t copy_amount = std::min(buffer_size, num_bytes);
-    freadCheck(read_buffer, 1, copy_amount, src);
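+    // freadCheck(read_buffer, 1, copy_amount, src);  // NOTE(review): priming read disabled with no explanation -- nothing visible here fills read_buffer from src anymore; confirm this is a temporary benchmark-only change and restore before merging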
 
     size_t rest_bytes = num_bytes - copy_amount;
     size_t write_buffer_size = copy_amount;
@@ -192,7 +192,7 @@ inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffe
         cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream));
         gpu_write_ptr += write_buffer_size;
         // while this is going on, read from disk
-        freadCheck(read_buffer, 1, copy_amount, src);
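+        // freadCheck(read_buffer, 1, copy_amount, src);  // NOTE(review): disk read disabled, so the "read from disk" comment above no longer holds -- confirm this is intentional (debug/benchmark only?) and restore before merging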
         cudaCheck(cudaStreamSynchronize(stream));     // wait for both buffers to be ready.
 
         std::swap(read_buffer, write_buffer);