From c7aad65a044e8dff336695cf9ddb5fbad94110a1 Mon Sep 17 00:00:00 2001
From: Rickard Hallerbäck
Date: Sun, 21 Apr 2024 23:35:30 +0200
Subject: [PATCH] Add a workflow that builds with and without CUDA and
 OpenMP, and runs the CPU tests on Ubuntu and macOS

---
 .github/workflows/ci.yml | 62 ++++++++++++++++++++++++++++++++++++++++
 Makefile                 | 55 +++++++++++++++++++----------------
 train_gpt2.py            |  9 ++++--
 3 files changed, 99 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 000000000..86494db4d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,62 @@
+name: Build and test
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build-and-test-cpu:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install OpenMP
+      run: |
+        if [ "${{ runner.os }}" == "Linux" ]; then
+          sudo apt-get update && sudo apt-get install -y libomp-dev
+        elif [ "${{ runner.os }}" == "macOS" ]; then
+          brew install libomp
+        fi
+
+    - name: Install dependencies
+      run: pip install -r requirements.txt
+
+    - name: Run preprocessing
+      run: python prepro_tinyshakespeare.py
+
+    - name: Train model
+      run: python train_gpt2.py
+
+    - name: Compile training and testing program
+      run: make test_gpt2 train_gpt2
+
+    - name: Execute testing program (With OpenMP)
+      run: OMP_NUM_THREADS=8 ./test_gpt2
+
+    - name: Compile training and testing program without OpenMP
+      run: NO_OMP=1 make test_gpt2 train_gpt2
+
+    - name: Execute testing program (No OpenMP)
+      run: ./test_gpt2
+
+  build-with-cuda:
+    runs-on: ubuntu-latest # Host OS, Docker will run on top of this
+    container:
+      image: nvidia/cuda:11.2.2-devel-ubuntu20.04 # Example CUDA development image with nvcc
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Build project
+      run: make train_gpt2cu test_gpt2cu
diff --git a/Makefile b/Makefile
index 0a089218d..06f7d960b 100644
--- a/Makefile
+++ b/Makefile
@@ -29,32 +29,39 @@ $(foreach flag,$(CFLAGS_COND),$(eval $(call check_and_add_flag,$(flag))))
 # e.g. on MacOS: brew install libomp
 # e.g. on Ubuntu: sudo apt-get install libomp-dev
 # later, run the program by prepending the number of threads, e.g.: OMP_NUM_THREADS=8 ./gpt2
-ifeq ($(shell uname), Darwin)
-  # Check if the libomp directory exists
-  ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists)
-    # macOS with Homebrew and directory exists
-    CFLAGS += -Xclang -fopenmp -DOMP
-    LDFLAGS += -L/opt/homebrew/opt/libomp/lib
-    LDLIBS += -lomp
-    INCLUDES += -I/opt/homebrew/opt/libomp/include
-    $(info OpenMP found, compiling with OpenMP support)
-  else ifeq ($(shell [ -d /usr/local/opt/libomp/lib ] && echo "exists"), exists)
-    CFLAGS += -Xclang -fopenmp -DOMP
-    LDFLAGS += -L/usr/local/opt/libomp/lib
-    LDLIBS += -lomp
-    INCLUDES += -I/usr/local/opt/libomp/include
-    $(info OpenMP found, compiling with OpenMP support)
-  else
-    $(warning OpenMP not found, skipping OpenMP support)
-  endif
+# First, check if NO_OMP is set to 1, if not, proceed with the OpenMP checks
+ifeq ($(NO_OMP), 1)
+  $(info OpenMP is manually disabled)
 else
-  ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0)
-    # Ubuntu or other Linux distributions
-    CFLAGS += -fopenmp -DOMP
-    LDLIBS += -lgomp
-    $(info OpenMP found, compiling with OpenMP support)
+  # Detect if running on macOS or Linux
+  ifeq ($(shell uname), Darwin)
+    # Check for Homebrew's libomp installation in different common directories
+    ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists)
+      # macOS with Homebrew on ARM (Apple Silicon)
+      CFLAGS += -Xclang -fopenmp -DOMP
+      LDFLAGS += -L/opt/homebrew/opt/libomp/lib
+      LDLIBS += -lomp
+      INCLUDES += -I/opt/homebrew/opt/libomp/include
+      $(info OpenMP found, compiling with OpenMP support)
+    else ifeq ($(shell [ -d /usr/local/opt/libomp/lib ] && echo "exists"), exists)
+      # macOS with Homebrew on Intel
+      CFLAGS += -Xclang -fopenmp -DOMP
+      LDFLAGS += -L/usr/local/opt/libomp/lib
+      LDLIBS += -lomp
+      INCLUDES += -I/usr/local/opt/libomp/include
+      $(info OpenMP found, compiling with OpenMP support)
+    else
+      $(warning OpenMP not found, skipping OpenMP support)
+    endif
   else
-    $(warning OpenMP not found, skipping OpenMP support)
+    # Check for OpenMP support in GCC or Clang on Linux
+    ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0)
+      CFLAGS += -fopenmp -DOMP
+      LDLIBS += -lgomp
+      $(info OpenMP found, compiling with OpenMP support)
+    else
+      $(warning OpenMP not found, skipping OpenMP support)
+    endif
   endif
 endif
 
diff --git a/train_gpt2.py b/train_gpt2.py
index bbb62dec2..0e93b8c1f 100644
--- a/train_gpt2.py
+++ b/train_gpt2.py
@@ -214,7 +214,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
 
 # a few utilities for saving params/grads/activations to files for loading in C
 def write_fp32(tensor, file):
-    file.write(tensor.detach().numpy().astype("float32").tobytes())
+    file.write(tensor.detach().cpu().numpy().astype("float32").tobytes())
 
 def write_tensors(model_tensors, L, file):
     write_fp32(model_tensors["transformer.wte.weight"], file) # (V, C)
@@ -399,12 +399,15 @@ def get_batch():
     # do one forward pass to generate ground truth for our C tests
     if not args.inference_only and args.write_tensors:
         logits, loss = model(x, y)
+        loss.backward()
         write_model(model, "gpt2_124M.bin")
         write_state(model, x, y, logits, loss, "gpt2_124M_debug_state.bin")
 
-    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, fused=True)
+    use_fused = True if device == "cuda" else False
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, fused=use_fused)
     timings = []
-    torch.cuda.reset_peak_memory_stats()
+    if device == "cuda":
+        torch.cuda.reset_peak_memory_stats()
     for i in range(args.num_iterations):
         t0 = time.time()
         logits, loss = model(x, y)
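
Reviewer note (not part of the patch): the train_gpt2.py changes above all follow one pattern, gating CUDA-only features on the active device so the script keeps working on the CPU-only runners the new CI matrix adds. Below is a minimal, self-contained Python sketch of that pattern; the toy nn.Linear model and the tensor shapes are illustrative assumptions, not code from this repository.

    import torch

    # Fall back to CPU when CUDA is absent, as on the macOS/Ubuntu CI runners.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(4, 4).to(device)

    # The fused Adam kernel is CUDA-only, so enable it per device,
    # mirroring the use_fused logic in the patch.
    use_fused = device == "cuda"
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, fused=use_fused)

    # torch.cuda.* calls can raise on CPU-only builds, hence the guard.
    if device == "cuda":
        torch.cuda.reset_peak_memory_stats()

    # backward() populates .grad, which is why the patch calls loss.backward()
    # before dumping the debug state consumed by the C tests.
    x = torch.randn(2, 4, device=device)
    loss = model(x).sum()
    loss.backward()
    optimizer.step()

    # GPU tensors must be copied to host memory before .numpy(), which is
    # why write_fp32() gained the .cpu() call.
    print(len(loss.detach().cpu().numpy().astype("float32").tobytes()))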
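
The CPU legs of the workflow should also be reproducible locally with the same commands the CI runs: make test_gpt2 train_gpt2 and OMP_NUM_THREADS=8 ./test_gpt2 for the OpenMP build, then NO_OMP=1 make test_gpt2 train_gpt2 and ./test_gpt2 for the build without it.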