From 37775e441b5b6b014e6aa62e0850cb44fe56fc5e Mon Sep 17 00:00:00 2001
From: Aisuko
Date: Sat, 3 Aug 2024 02:47:55 +0000
Subject: [PATCH 1/2] Add MLP with BatchNorm layer

Signed-off-by: Aisuko
---
 .devcontainer/devcontainer.json |   4 +-
 Makefile                        |   2 +
 src/models/mlp_batchnorm.py     | 196 ++++++++++++++++++++++++++++++++
 src/models/simple_gpt.py        |   1 +
 src/tests/test_mlp_batchnorm.py | 149 ++++++++++++++++++++++++
 5 files changed, 350 insertions(+), 2 deletions(-)
 create mode 100644 src/models/mlp_batchnorm.py
 create mode 100644 src/tests/test_mlp_batchnorm.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index fbbb1cc..7f2622d 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,7 +12,7 @@
         "ms-python.vscode-pylance"
       ]
     }
-  }
+  },
 
   // Features to add to the dev container. More info: https://containers.dev/features.
   // "features": {},
@@ -21,7 +21,7 @@
   // "forwardPorts": [],
 
   // Use 'postCreateCommand' to run commands after the container is created.
-  // "postCreateCommand": "pip3 install --user -r requirements.txt",
+  "postCreateCommand": "make prepare"
 
   // Configure tool-specific properties.
   // "customizations": {},
diff --git a/Makefile b/Makefile
index 4f93473..a2eba9e 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,8 @@ source:
 	@poetry config repositories.source https://pypi.org/project/kimchima
 
+.PHONY: prepare
+prepare: poetry install-dev
 
 ###################################################################################################
 # Commit and recommit changes to github
diff --git a/src/models/mlp_batchnorm.py b/src/models/mlp_batchnorm.py
new file mode 100644
index 0000000..68f517b
--- /dev/null
+++ b/src/models/mlp_batchnorm.py
@@ -0,0 +1,196 @@
+# coding=utf-8
+
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
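+
+# Character-level MLP language model built from scratch: hand-rolled Linear,
+# BatchNorm1d and Tanh layers, plus the MlpBatchNormTrainer helpers that load
+# the names dataset and build (context, next character) training examples.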
+
+
+import random
+import torch
+import torch.nn.functional as F
+from torch.utils.tensorboard import SummaryWriter
+
+# https://www.kaggle.com/code/aisuko/implement-neural-net-with-batch-norm-layer
+
+g=torch.Generator().manual_seed(2147483647)
+
+class Linear:
+    """
+    Linear layer
+    https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
+    """
+
+    def __init__(self, fan_in, fan_out, bias=True):
+        self.weight=torch.randn((fan_in, fan_out), generator=g)/fan_in**0.5 # divide by sqrt(fan_in) to keep the outputs near unit gaussian
+        self.bias=torch.zeros(fan_out) if bias else None # the bias is initialized to zeros by default
+
+    def __call__(self, x):
+        self.out=x@self.weight
+        if self.bias is not None:
+            self.out+=self.bias
+        return self.out
+
+    def parameters(self):
+        """
+        return the tensors that are parameters of this layer
+        """
+        return [self.weight]+([] if self.bias is None else [self.bias])
+
+
+class BatchNorm1d:
+    """
+    Batch normalization layer
+    Formula: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
+    """
+
+    def __init__(self, dim, eps=1e-5, momentum=0.1):
+        self.eps=eps # added to the variance in the division for numerical stability
+        self.momentum=momentum # momentum used to update the running stats
+        self.training=True
+
+        # parameters (trained with backprop)
+        self.gamma=torch.ones(dim)
+        self.beta=torch.zeros(dim)
+        # buffers (trained with a running 'momentum update')
+        self.running_mean=torch.zeros(dim)
+        self.running_var=torch.ones(dim)
+
+    def __call__(self, x):
+        """
+        Follows https://arxiv.org/pdf/1502.03167, Algorithm 1:
+
+        1. mini-batch mean
+        2. mini-batch variance
+        3. normalize
+        4. scale and shift
+        """
+
+        # calculating the forward pass
+        if self.training:
+            xmean=x.mean(0, keepdim=True) # batch mean
+            xvar=x.var(0, keepdim=True, unbiased=True) # batch variance
+        else:
+            xmean=self.running_mean
+            xvar=self.running_var
+
+        xhat=(x-xmean)/torch.sqrt(xvar+self.eps) # normalize to unit variance
+        self.out=self.gamma*xhat+self.beta # keep the out attribute so the training process can be visualized
+        # update the buffers
+
+        if self.training:
+            with torch.no_grad():
+                self.running_mean=(1-self.momentum)*self.running_mean+self.momentum*xmean
+                self.running_var=(1-self.momentum)*self.running_var+self.momentum*xvar
+        return self.out
+
+    def parameters(self):
+        return [self.gamma, self.beta]
+
+class Tanh:
+    """
+    Tanh activation layer
+    """
+    def __call__(self, x):
+        self.out=torch.tanh(x)
+        return self.out
+
+    def parameters(self):
+        """
+        no parameters in this layer
+        """
+        return []
+
+
+class MlpBatchNormTrainer:
+    """
+    Hyperparameters and dataset helpers for the MLP + BatchNorm model
+    """
+    ds_url="https://www.kaggleusercontent.com/kf/187064505/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZLA9V0sWqB_Px0312U15fQ.OO2wvSdp-fhBB0BTDAaLToek6CLGlzS4otHIsyHBd1feEJxIUq055-GIQb24Ez51pGq31hyzaN_vFeDRnqxFwyc12sDNqZ7uDhel-5xeXU08h0qNtOpoqXA-iJpPuV4u-dThq8Lk-zoOg_ZDmVNAW8XHZVAM2ZAHl9StyqN1n7eOGU0379mp_2ol2gyjXP01xNDH2n4kUSIIetktnagIon8Jm_tcLBB-DaWPTFwQ5L7NBP1t-omCUrKydTxAyPIFFwnid3T1vzEgSmYiUY8Ec-iC8OG5d2pKcod9FIAOkJH4Xu74Pvzp5UuOFzQXRByezEOkyD0ltAhfMOab0ebIi6YSTVKrna70HZhuxjWQRK9fIgvt0V7RMz84ZQspJWrgofowQrf7E1avVvXe7GQW4E7dYITqQoJvZ7dhlpujq1db6pkegRqOfuQzPJcD6UHBTpVRyi36rIQoLpFd63XLzY5eya4ScAy5H-frQhF0IU927Z86S9iR2AypqO3TXriPsMHjJ7o-DwXpnHCNkVfMJXeVxT36DRBiV9uCTL-e8_xOUKw50N5iG3NqTnos0IwSXvwrSBtHxUI71zo-I2Z-l5x_GqjEa9QVl1XX_q7GU_YFejlC-rT9KdcA_6TEVO6qaMpfvVvCc9kFYI7s7GQNbg.tIuWJu1a71qSZKZeG-TgPg/names.txt"
+    n_embed=10 # the dimensionality of the character embedding vectors
+    n_hidden=100 # the number of neurons in the hidden layer of the MLP
+    n_block_size=3 # context length: how many characters do we take to predict the next one?
+    max_steps=200000
+    batch_size=32
+
+
+    def __init__(self):
+        raise Exception("This class is not meant to be instantiated")
+
+    @classmethod
+    def set_hyperparameters(cls, **kwargs):
+        """
+        Set hyperparameters
+        """
+        cls.n_embed=kwargs.get("n_embed", 10)
+        cls.n_hidden=kwargs.get("n_hidden", 100)
+        cls.n_block_size=kwargs.get("n_block_size", 3)
+        cls.max_steps=kwargs.get("max_steps", 200000)
+        cls.batch_size=kwargs.get("batch_size", 32)
+
+    @classmethod
+    def load_dataset(cls, filePath:str)->str:
+        """
+        Load the dataset
+        """
+        with open(filePath, "r", encoding="utf-8") as f:
+            text=f.read()
+        return text
+
+    @classmethod
+    def unique_chars(cls, text:str)->list:
+        """
+        Get all the unique characters in the text
+        """
+        return sorted(list(set(''.join(text))))
+
+    @classmethod
+    def stoi(cls, chars:list)->dict:
+        """
+        Convert characters to indices
+        """
+        stoi={char:i+1 for i,char in enumerate(chars)}
+        stoi['.']=0
+        return stoi
+
+    @classmethod
+    def itos(cls, chars:list)->dict:
+        """
+        Convert indices to characters
+        """
+        itos={i:char for char,i in cls.stoi(chars).items()}
+        return itos
+
+    @classmethod
+    def build_vocab(cls, chars:list)->int:
+        """
+        Return the vocabulary size, i.e. the number of unique characters
+        """
+        return len(chars)
+
+    @classmethod
+    def build_dataset(cls, words:str, stoi: dict)->tuple[torch.Tensor, torch.Tensor]:
+        """
+        Build the dataset
+        """
+        X,Y=[],[]
+
+        for w in words:
+            context=[0]*cls.n_block_size
+            for ch in w+'.':
+                ix=stoi[ch]
+                X.append(context)
+                Y.append(ix)
+                context=context[1:]+[ix] # crop and append
+
+        X=torch.tensor(X) # convert to tensor
+        Y=torch.tensor(Y)
+        return X,Y
diff --git a/src/models/simple_gpt.py b/src/models/simple_gpt.py
index bd8bc47..22a2c76 100644
--- a/src/models/simple_gpt.py
+++ b/src/models/simple_gpt.py
@@ -19,6 +19,7 @@
 from torch.nn import functional as F
 from torch.utils.tensorboard import SummaryWriter
 
+# https://www.kaggle.com/code/aisuko/gpt-from-scratch-as-a-script
 
 class SimpleGPT(nn.Module):
     def __init__(self, vocab_size):
diff --git a/src/tests/test_mlp_batchnorm.py b/src/tests/test_mlp_batchnorm.py
new file mode 100644
index 0000000..2f52506
--- /dev/null
+++ b/src/tests/test_mlp_batchnorm.py
@@ -0,0 +1,149 @@
+# coding=utf-8
+
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
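+
+# End-to-end test: builds the character dataset from names.txt, trains the
+# scratch-built MLP + BatchNorm stack for a short number of steps, then samples
+# names from the model.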
+
+import os
+import random
+import unittest
+
+from pathlib import Path
+import torch
+from torch.nn import functional as F
+
+from models.mlp_batchnorm import MlpBatchNormTrainer,Linear, BatchNorm1d, Tanh
+from pkg.dataset_helper import DatasetHelper
+
+
+class TestMLPBatchNorm(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.n_embd=MlpBatchNormTrainer.n_embed
+        cls.n_hidden=MlpBatchNormTrainer.n_hidden
+        cls.n_block_size=MlpBatchNormTrainer.n_block_size
+
+        src_dir = Path(os.path.dirname(os.path.abspath(__file__))).parent
+        abs_file_path = os.path.join(src_dir, "input.txt")
+        _ = DatasetHelper.download_remote_file(MlpBatchNormTrainer.ds_url, abs_file_path)
+        cls.data=MlpBatchNormTrainer.load_dataset(abs_file_path)
+        cls.unique_chars=MlpBatchNormTrainer.unique_chars(cls.data.splitlines())
+        cls.stoi=MlpBatchNormTrainer.stoi(cls.unique_chars)
+        cls.itos=MlpBatchNormTrainer.itos(cls.unique_chars)
+        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.unique_chars)
+
+    def test_mlp_batchnorm_trainer(self):
+        random.seed(42)
+        words=self.data.splitlines()
+        random.shuffle(words)
+
+        n1=int(0.8*len(words))
+        n2=int(0.9*len(words))
+
+        Xtr, Ytr=MlpBatchNormTrainer.build_dataset(words[:n1], self.stoi) # 80%
+        Xdev, Ydev=MlpBatchNormTrainer.build_dataset(words[n1:n2],self.stoi) # 10%
+        Xte, Yte=MlpBatchNormTrainer.build_dataset(words[n2:],self.stoi) # 10%
+        g=torch.Generator().manual_seed(2147483647)
+
+        C=torch.randn((self.vocab_size, self.n_embd), generator=g)
+
+        # sequential 6 MLP layers
+        layers=[
+            Linear(self.n_embd*self.n_block_size, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.vocab_size, bias=False), BatchNorm1d(self.vocab_size)
+        ]
+
+        with torch.no_grad():
+            # here, our last layer is a batch norm layer, and we don't change the weights to make the softmax less confident;
+            # instead we change gamma (from Algorithm 1 in the batch norm paper),
+            # because gamma in the batchnorm is the variable that multiplicatively interacts with the output of that normalization
+            layers[-1].gamma*=0.1
+
+            # all other layers: apply the gain
+            for layer in layers[:-1]:
+                if isinstance(layer, Linear):
+                    layer.weight*=5/3 # boost the linear layer by the gain; 5/3 is the tanh gain from the torch documentation
+        # [C] the embedding matrix and all the parameters of all the layers
+        parameters=[C]+[p for layer in layers for p in layer.parameters()]
+        print(sum(p.nelement() for p in parameters)) # number of parameters in total
+        for p in parameters:
+            p.requires_grad=True
+
+
+        # training loop
+        lossi=[]
+        ud=[]
+
+        for i in range(MlpBatchNormTrainer.max_steps):
+            # minibatch construct
+            ix=torch.randint(0, Xtr.shape[0], (MlpBatchNormTrainer.batch_size,), generator=g)
+            Xb, Yb=Xtr[ix], Ytr[ix] # batch X,Y
+
+            # forward pass
+            emb= C[Xb] # embed the characters into vectors
+            x=emb.view(emb.shape[0], -1) # flatten/concatenate the vectors
+            for layer in layers:
+                x=layer(x)
+            loss=F.cross_entropy(x, Yb) # loss function
+
+            # backward pass
+            for layer in layers:
+                layer.out.retain_grad()
+
+            for p in parameters:
+                p.grad=None
+
+            loss.backward()
+
+            # update
+            lr=0.1 if i<100000 else 0.01 # step learning rate decay
+            for p in parameters:
+                p.data+=-lr*p.grad
+
+            # track stats
+            if i%10000==0: # print every once in a while
+                print(f'{i:7d}/{MlpBatchNormTrainer.max_steps:7d}: {loss.item():.4f}')
+            lossi.append(loss.log10().item())
+
+            with torch.no_grad():
+                ud.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameters])
+
+            if i>=1000:
+                break
+
+        g=torch.Generator().manual_seed(2147483647+10)
+
+        for _ in range(20):
+            out=[]
+            context=[0]*self.n_block_size
+            while True:
+                # forward pass the neural net
+                emb=C[torch.tensor([context])] # (1, block_size, n_embd)
+                x=emb.view(emb.shape[0], -1) # concatenate the vectors
+                for layer in layers:
+                    x=layer(x)
+                logits=x
+                probs=F.softmax(logits, dim=-1)
+                # sample from the distribution
+                ix=torch.multinomial(probs, num_samples=1, generator=g).item()
+                # shift the context window and track the samples
+                context=context[1:]+[ix]
+                out.append(ix)
+                if ix==0:
+                    break
+            print(''.join(self.itos[i] for i in out[:-1]))
\ No newline at end of file

From be698221b73959f421bcf482b2a6b7d656ea7c9d Mon Sep 17 00:00:00 2001
From: Aisuko
Date: Sat, 3 Aug 2024 04:47:01 +0000
Subject: [PATCH 2/2] Fix the wrong vocab size

Signed-off-by: Aisuko
---
 src/tests/test_mlp_batchnorm.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/tests/test_mlp_batchnorm.py b/src/tests/test_mlp_batchnorm.py
index 2f52506..dc7ede8 100644
--- a/src/tests/test_mlp_batchnorm.py
+++ b/src/tests/test_mlp_batchnorm.py
@@ -40,9 +40,11 @@ def setUpClass(cls):
         cls.unique_chars=MlpBatchNormTrainer.unique_chars(cls.data.splitlines())
         cls.stoi=MlpBatchNormTrainer.stoi(cls.unique_chars)
         cls.itos=MlpBatchNormTrainer.itos(cls.unique_chars)
-        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.unique_chars)
+        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.itos)
 
     def test_mlp_batchnorm_trainer(self):
+
+        self.assertEqual(self.vocab_size,27)
         random.seed(42)
         words=self.data.splitlines()
         random.shuffle(words)
@@ -54,8 +56,10 @@ def test_mlp_batchnorm_trainer(self):
         Xdev, Ydev=MlpBatchNormTrainer.build_dataset(words[n1:n2],self.stoi) # 10%
         Xte, Yte=MlpBatchNormTrainer.build_dataset(words[n2:],self.stoi) # 10%
         g=torch.Generator().manual_seed(2147483647)
-
+        self.assertEqual(self.n_embd, 10)
         C=torch.randn((self.vocab_size, self.n_embd), generator=g)
+
+        self.assertEqual(C.shape, torch.Size([27, 10]))
 
         # sequential 6 MLP layers
         layers=[
@@ -125,7 +129,9 @@ def test_mlp_batchnorm_trainer(self):
             if i>=1000:
                 break
-
+        for layer in layers:
+            layer.training=False
+
         g=torch.Generator().manual_seed(2147483647+10)
 
         for _ in range(20):
             out=[]
@@ -138,7 +144,8 @@ def test_mlp_batchnorm_trainer(self):
             for layer in layers:
                 x=layer(x)
             logits=x
-            probs=F.softmax(logits, dim=-1)
+            probs=F.softmax(logits, dim=1)
+            self.assertEqual(probs.shape, torch.Size([1, 27]))
             # sample from the distribution
             ix=torch.multinomial(probs, num_samples=1, generator=g).item()
             # shift the context window and track the samples
@@ -146,4 +153,4 @@ def test_mlp_batchnorm_trainer(self):
             out.append(ix)
             if ix==0:
                 break
-            print(''.join(self.itos[i] for i in out[:-1]))
\ No newline at end of file
+            self.assertIsNotNone(''.join(self.itos[i] for i in out[:-1]))
\ No newline at end of file
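
A minimal usage sketch (illustrative only, not part of either commit) of how the hand-rolled layers in src/models/mlp_batchnorm.py compose, and why the second commit flips layer.training=False before sampling. The sizes below mirror the test (3-character context, 10-dim embeddings, 27-character vocabulary); the random input tensors are stand-ins for the flattened character embeddings.

# Illustrative sketch: compose the scratch-built layers, run one training-mode
# forward pass, then switch BatchNorm1d to inference mode before sampling.
import torch
import torch.nn.functional as F

from models.mlp_batchnorm import Linear, BatchNorm1d, Tanh

n_in, n_hidden, vocab_size = 30, 100, 27   # 3-char context * 10-dim embeddings -> 27-char vocab

layers = [
    Linear(n_in, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size),
]

x = torch.randn(32, n_in)                  # stand-in for a mini-batch of flattened embeddings
for layer in layers:
    x = layer(x)                           # training=True: batch stats update the running buffers
loss = F.cross_entropy(x, torch.randint(0, vocab_size, (32,)))

for layer in layers:
    layer.training = False                 # BatchNorm1d now uses running_mean/running_var
probe = torch.randn(1, n_in)
for layer in layers:
    probe = layer(probe)                   # safe for a batch of one: no batch statistics are computed
probs = F.softmax(probe, dim=1)            # (1, 27) distribution over the next character

Without the training flag flipped, BatchNorm1d would try to compute batch statistics over a single sample during sampling, which is exactly what the added for layer in layers: layer.training=False loop in the second commit avoids.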