diff --git a/dev/cuda/benchmark_on_modal.py b/dev/cuda/benchmark_on_modal.py index 907a831ad..d580ec2d5 100644 --- a/dev/cuda/benchmark_on_modal.py +++ b/dev/cuda/benchmark_on_modal.py @@ -62,10 +62,10 @@ "rm cmake-3.28.1-Linux-x86_64.sh", "ln -s /usr/local/bin/cmake /usr/bin/cmake",) .run_commands( - "apt-get install -y --allow-change-held-packages libcudnn8 libcudnn8-dev", + "apt-get install -y --allow-change-held-packages libcudnn9-cuda-12 libcudnn9-dev-cuda-12", "apt-get install -y openmpi-bin openmpi-doc libopenmpi-dev kmod sudo", "git clone https://github.com/NVIDIA/cudnn-frontend.git /root/cudnn-frontend", - "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make" + "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make -j$(nproc)" ) .run_commands( "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ @@ -75,6 +75,8 @@ apt-get update" ).run_commands( "apt-get install -y nsight-systems-2023.3.3" + ).run_commands( + "apt-get install -y curl" ) ) @@ -98,11 +100,12 @@ def execute_command(command: str): # using in a directory in your volume, where the name contains the timestamp unique id. # This script will generate a "report1_{timestamp} folder in volume" # and you can download it with 'modal volume get {volume-name} report1_{timestamp} - volumes={"/cuda-env": modal.Volume.from_name("cuda-env")}, + volumes={"/llmc": modal.Volume.from_name("llmc")}, ) -def run_benchmark(compile_command: str, run_command: str): +def run_benchmark(data_command: str, compile_command: str, run_command: str): execute_command("pwd") execute_command("ls") + execute_command(data_command) execute_command(compile_command) execute_command(run_command) # Use this section if you want to profile using nsight system and install the reports on your volume to be locally downloaded @@ -116,6 +119,6 @@ def run_benchmark(compile_command: str, run_command: str): return None @stub.local_entrypoint() -def inference_main(compile_command: str, run_command: str): - results = run_benchmark.remote(compile_command, run_command) +def inference_main(data_command: str, compile_command: str, run_command: str): + results = run_benchmark.remote(data_command, compile_command, run_command) return results \ No newline at end of file diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h index 006ad3010..0efb494d2 100644 --- a/llmc/cuda_common.h +++ b/llmc/cuda_common.h @@ -179,7 +179,7 @@ inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffe // prime the read buffer; char* gpu_write_ptr = (char*)dest; size_t copy_amount = std::min(buffer_size, num_bytes); - freadCheck(read_buffer, 1, copy_amount, src); + // freadCheck(read_buffer, 1, copy_amount, src); size_t rest_bytes = num_bytes - copy_amount; size_t write_buffer_size = copy_amount; @@ -192,7 +192,7 @@ inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffe cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream)); gpu_write_ptr += write_buffer_size; // while this is going on, read from disk - freadCheck(read_buffer, 1, copy_amount, src); + //freadCheck(read_buffer, 1, copy_amount, src); cudaCheck(cudaStreamSynchronize(stream)); // wait for both buffers to be ready. std::swap(read_buffer, write_buffer);