From c7aad65a044e8dff336695cf9ddb5fbad94110a1 Mon Sep 17 00:00:00 2001
From: Rickard Hallerbäck
Date: Sun, 21 Apr 2024 23:35:30 +0200
Subject: [PATCH] Add a workflow that builds with and without CUDA and
 OpenMP, and runs the CPU tests on Ubuntu and macOS

---
 .github/workflows/ci.yml | 62 ++++++++++++++++++++++++++++++++++++++++
 Makefile                 | 55 +++++++++++++++++++----------------
 train_gpt2.py            |  9 ++++--
 3 files changed, 99 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 000000000..86494db4d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,62 @@
+name: Build and test
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build-and-test-cpu:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install OpenMP
+      run: |
+        if [ "${{ runner.os }}" == "Linux" ]; then
+          sudo apt-get update && sudo apt-get install -y libomp-dev
+        elif [ "${{ runner.os }}" == "macOS" ]; then
+          brew install libomp
+        fi
+
+    - name: Install dependencies
+      run: pip install -r requirements.txt
+
+    - name: Run preprocessing
+      run: python prepro_tinyshakespeare.py
+
+    - name: Train model
+      run: python train_gpt2.py
+
+    - name: Compile training and testing program
+      run: make test_gpt2 train_gpt2
+
+    - name: Execute testing program (With OpenMP)
+      run: OMP_NUM_THREADS=8 ./test_gpt2
+
+    - name: Compile training and testing program without OpenMP
+      run: NO_OMP=1 make test_gpt2 train_gpt2
+
+    - name: Execute testing program (No OpenMP)
+      run: ./test_gpt2
+
+  build-with-cuda:
+    runs-on: ubuntu-latest # Host OS, Docker will run on top of this
+    container:
+      image: nvidia/cuda:11.2.2-devel-ubuntu20.04 # Example CUDA development image with nvcc
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Build project
+      run: make train_gpt2cu test_gpt2cu
diff --git a/Makefile b/Makefile
index 0a089218d..06f7d960b 100644
--- a/Makefile
+++ b/Makefile
@@ -29,32 +29,39 @@ $(foreach flag,$(CFLAGS_COND),$(eval $(call check_and_add_flag,$(flag))))
 # e.g. on MacOS: brew install libomp
 # e.g. on Ubuntu: sudo apt-get install libomp-dev
 # later, run the program by prepending the number of threads, e.g.: OMP_NUM_THREADS=8 ./gpt2
-ifeq ($(shell uname), Darwin)
-  # Check if the libomp directory exists
-  ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists)
-    # macOS with Homebrew and directory exists
-    CFLAGS += -Xclang -fopenmp -DOMP
-    LDFLAGS += -L/opt/homebrew/opt/libomp/lib
-    LDLIBS += -lomp
-    INCLUDES += -I/opt/homebrew/opt/libomp/include
-    $(info OpenMP found, compiling with OpenMP support)
-  else ifeq ($(shell [ -d /usr/local/opt/libomp/lib ] && echo "exists"), exists)
-    CFLAGS += -Xclang -fopenmp -DOMP
-    LDFLAGS += -L/usr/local/opt/libomp/lib
-    LDLIBS += -lomp
-    INCLUDES += -I/usr/local/opt/libomp/include
-    $(info OpenMP found, compiling with OpenMP support)
-  else
-    $(warning OpenMP not found, skipping OpenMP support)
-  endif
+# First, check if NO_OMP is set to 1, if not, proceed with the OpenMP checks
+ifeq ($(NO_OMP), 1)
+  $(info OpenMP is manually disabled)
 else
-  ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0)
-    # Ubuntu or other Linux distributions
-    CFLAGS += -fopenmp -DOMP
-    LDLIBS += -lgomp
-    $(info OpenMP found, compiling with OpenMP support)
+  # Detect if running on macOS or Linux
+  ifeq ($(shell uname), Darwin)
+    # Check for Homebrew's libomp installation in different common directories
+    ifeq ($(shell [ -d /opt/homebrew/opt/libomp/lib ] && echo "exists"), exists)
+      # macOS with Homebrew on ARM (Apple Silicon)
+      CFLAGS += -Xclang -fopenmp -DOMP
+      LDFLAGS += -L/opt/homebrew/opt/libomp/lib
+      LDLIBS += -lomp
+      INCLUDES += -I/opt/homebrew/opt/libomp/include
+      $(info OpenMP found, compiling with OpenMP support)
+    else ifeq ($(shell [ -d /usr/local/opt/libomp/lib ] && echo "exists"), exists)
+      # macOS with Homebrew on Intel
+      CFLAGS += -Xclang -fopenmp -DOMP
+      LDFLAGS += -L/usr/local/opt/libomp/lib
+      LDLIBS += -lomp
+      INCLUDES += -I/usr/local/opt/libomp/include
+      $(info OpenMP found, compiling with OpenMP support)
+    else
+      $(warning OpenMP not found, skipping OpenMP support)
+    endif
   else
-    $(warning OpenMP not found, skipping OpenMP support)
+    # Check for OpenMP support in GCC or Clang on Linux
+    ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0)
+      CFLAGS += -fopenmp -DOMP
+      LDLIBS += -lgomp
+      $(info OpenMP found, compiling with OpenMP support)
+    else
+      $(warning OpenMP not found, skipping OpenMP support)
+    endif
   endif
 endif
 
diff --git a/train_gpt2.py b/train_gpt2.py
index bbb62dec2..0e93b8c1f 100644
--- a/train_gpt2.py
+++ b/train_gpt2.py
@@ -214,7 +214,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
 
 # a few utilities for saving params/grads/activations to files for loading in C
 def write_fp32(tensor, file):
-    file.write(tensor.detach().numpy().astype("float32").tobytes())
+    file.write(tensor.detach().cpu().numpy().astype("float32").tobytes())
 
 def write_tensors(model_tensors, L, file):
     write_fp32(model_tensors["transformer.wte.weight"], file) # (V, C)
@@ -399,12 +399,15 @@ def get_batch():
     # do one forward pass to generate ground truth for our C tests
     if not args.inference_only and args.write_tensors:
         logits, loss = model(x, y)
+        loss.backward()
         write_model(model, "gpt2_124M.bin")
         write_state(model, x, y, logits, loss, "gpt2_124M_debug_state.bin")
 
-    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, fused=True)
+    use_fused = True if device == "cuda" else False
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, fused=use_fused)
     timings = []
-    torch.cuda.reset_peak_memory_stats()
+    if device == "cuda":
+        torch.cuda.reset_peak_memory_stats()
     for i in range(args.num_iterations):
         t0 = time.time()
         logits, loss = model(x, y)
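
Reviewer note (not part of the patch): the train_gpt2.py changes above all follow one pattern, gating CUDA-only features on the active device so the script keeps working on the CPU-only runners the new CI matrix adds. Below is a minimal, self-contained Python sketch of that pattern; the toy nn.Linear model and the tensor shapes are illustrative assumptions, not code from this repository.

    import torch

    # Fall back to CPU when CUDA is absent, as on the macOS/Ubuntu CI runners.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(4, 4).to(device)

    # The fused Adam kernel is CUDA-only, so enable it per device,
    # mirroring the use_fused logic in the patch.
    use_fused = device == "cuda"
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, fused=use_fused)

    # torch.cuda.* calls can raise on CPU-only builds, hence the guard.
    if device == "cuda":
        torch.cuda.reset_peak_memory_stats()

    # backward() populates .grad, which is why the patch calls loss.backward()
    # before dumping the debug state consumed by the C tests.
    x = torch.randn(2, 4, device=device)
    loss = model(x).sum()
    loss.backward()
    optimizer.step()

    # GPU tensors must be copied to host memory before .numpy(), which is
    # why write_fp32() gained the .cpu() call.
    print(len(loss.detach().cpu().numpy().astype("float32").tobytes()))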
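
The CPU legs of the workflow should also be reproducible locally with the same commands the CI runs: make test_gpt2 train_gpt2 and OMP_NUM_THREADS=8 ./test_gpt2 for the OpenMP build, then NO_OMP=1 make test_gpt2 train_gpt2 and ./test_gpt2 for the build without it.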