From 37775e441b5b6b014e6aa62e0850cb44fe56fc5e Mon Sep 17 00:00:00 2001
From: Aisuko
Date: Sat, 3 Aug 2024 02:47:55 +0000
Subject: [PATCH 1/2] Add MLP with BatchNorm layer

Signed-off-by: Aisuko
---
 .devcontainer/devcontainer.json |   4 +-
 Makefile                        |   2 +
 src/models/mlp_batchnorm.py     | 196 ++++++++++++++++++++++++++++++++
 src/models/simple_gpt.py        |   1 +
 src/tests/test_mlp_batchnorm.py | 149 ++++++++++++++++++++++++
 5 files changed, 350 insertions(+), 2 deletions(-)
 create mode 100644 src/models/mlp_batchnorm.py
 create mode 100644 src/tests/test_mlp_batchnorm.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index fbbb1cc..7f2622d 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,7 +12,7 @@
         "ms-python.vscode-pylance"
       ]
     }
-  }
+  },
 
   // Features to add to the dev container. More info: https://containers.dev/features.
   // "features": {},
@@ -21,7 +21,7 @@
   // "forwardPorts": [],
 
   // Use 'postCreateCommand' to run commands after the container is created.
-  // "postCreateCommand": "pip3 install --user -r requirements.txt",
+  "postCreateCommand": "make prepare"
 
   // Configure tool-specific properties.
   // "customizations": {},
diff --git a/Makefile b/Makefile
index 4f93473..a2eba9e 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,8 @@ source:
 	@poetry config repositories.source https://pypi.org/project/kimchima
 
+.PHONY: prepare
+prepare: poetry install-dev
 
 ###################################################################################################
 # Commit and recommit changes to github
diff --git a/src/models/mlp_batchnorm.py b/src/models/mlp_batchnorm.py
new file mode 100644
index 0000000..68f517b
--- /dev/null
+++ b/src/models/mlp_batchnorm.py
@@ -0,0 +1,196 @@
+# coding=utf-8
+
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
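+
+# Character-level MLP language model built from scratch: hand-rolled Linear,
+# BatchNorm1d and Tanh layers, plus the MlpBatchNormTrainer helpers that load
+# the names dataset and build (context, next character) training examples.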
+
+
+import random
+import torch
+import torch.nn.functional as F
+from torch.utils.tensorboard import SummaryWriter
+
+# https://www.kaggle.com/code/aisuko/implement-neural-net-with-batch-norm-layer
+
+g=torch.Generator().manual_seed(2147483647)
+
+class Linear:
+    """
+    Linear layer
+    https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
+    """
+
+    def __init__(self, fan_in, fan_out, bias=True):
+        self.weight=torch.randn((fan_in, fan_out), generator=g)/fan_in**0.5 # divide by sqrt(fan_in) to keep the outputs near unit gaussian
+        self.bias=torch.zeros(fan_out) if bias else None # the bias is initialized to zeros by default
+
+    def __call__(self, x):
+        self.out=x@self.weight
+        if self.bias is not None:
+            self.out+=self.bias
+        return self.out
+
+    def parameters(self):
+        """
+        return the tensors that are parameters of this layer
+        """
+        return [self.weight]+([] if self.bias is None else [self.bias])
+
+
+class BatchNorm1d:
+    """
+    Batch normalization layer
+    Formula: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
+    """
+
+    def __init__(self, dim, eps=1e-5, momentum=0.1):
+        self.eps=eps # added to the variance in the division for numerical stability
+        self.momentum=momentum # momentum used to update the running stats
+        self.training=True
+
+        # parameters (trained with backprop)
+        self.gamma=torch.ones(dim)
+        self.beta=torch.zeros(dim)
+        # buffers (trained with a running 'momentum update')
+        self.running_mean=torch.zeros(dim)
+        self.running_var=torch.ones(dim)
+
+    def __call__(self, x):
+        """
+        Follows https://arxiv.org/pdf/1502.03167, Algorithm 1:
+
+        1. mini-batch mean
+        2. mini-batch variance
+        3. normalize
+        4. scale and shift
+        """
+
+        # calculating the forward pass
+        if self.training:
+            xmean=x.mean(0, keepdim=True) # batch mean
+            xvar=x.var(0, keepdim=True, unbiased=True) # batch variance
+        else:
+            xmean=self.running_mean
+            xvar=self.running_var
+
+        xhat=(x-xmean)/torch.sqrt(xvar+self.eps) # normalize to unit variance
+        self.out=self.gamma*xhat+self.beta # keep the out attribute so the training process can be visualized
+        # update the buffers
+
+        if self.training:
+            with torch.no_grad():
+                self.running_mean=(1-self.momentum)*self.running_mean+self.momentum*xmean
+                self.running_var=(1-self.momentum)*self.running_var+self.momentum*xvar
+        return self.out
+
+    def parameters(self):
+        return [self.gamma, self.beta]
+
+class Tanh:
+    """
+    Tanh activation layer
+    """
+    def __call__(self, x):
+        self.out=torch.tanh(x)
+        return self.out
+
+    def parameters(self):
+        """
+        no parameters in this layer
+        """
+        return []
+
+
+class MlpBatchNormTrainer:
+    """
+    Hyperparameters and dataset helpers for the MLP + BatchNorm model
+    """
+    ds_url="https://www.kaggleusercontent.com/kf/187064505/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZLA9V0sWqB_Px0312U15fQ.OO2wvSdp-fhBB0BTDAaLToek6CLGlzS4otHIsyHBd1feEJxIUq055-GIQb24Ez51pGq31hyzaN_vFeDRnqxFwyc12sDNqZ7uDhel-5xeXU08h0qNtOpoqXA-iJpPuV4u-dThq8Lk-zoOg_ZDmVNAW8XHZVAM2ZAHl9StyqN1n7eOGU0379mp_2ol2gyjXP01xNDH2n4kUSIIetktnagIon8Jm_tcLBB-DaWPTFwQ5L7NBP1t-omCUrKydTxAyPIFFwnid3T1vzEgSmYiUY8Ec-iC8OG5d2pKcod9FIAOkJH4Xu74Pvzp5UuOFzQXRByezEOkyD0ltAhfMOab0ebIi6YSTVKrna70HZhuxjWQRK9fIgvt0V7RMz84ZQspJWrgofowQrf7E1avVvXe7GQW4E7dYITqQoJvZ7dhlpujq1db6pkegRqOfuQzPJcD6UHBTpVRyi36rIQoLpFd63XLzY5eya4ScAy5H-frQhF0IU927Z86S9iR2AypqO3TXriPsMHjJ7o-DwXpnHCNkVfMJXeVxT36DRBiV9uCTL-e8_xOUKw50N5iG3NqTnos0IwSXvwrSBtHxUI71zo-I2Z-l5x_GqjEa9QVl1XX_q7GU_YFejlC-rT9KdcA_6TEVO6qaMpfvVvCc9kFYI7s7GQNbg.tIuWJu1a71qSZKZeG-TgPg/names.txt"
+    n_embed=10 # the dimensionality of the character embedding vectors
+    n_hidden=100 # the number of neurons in the hidden layer of the MLP
+    n_block_size=3 # context length: how many characters do we take to predict the next one?
+    max_steps=200000
+    batch_size=32
+
+
+    def __init__(self):
+        raise Exception("This class is not meant to be instantiated")
+
+    @classmethod
+    def set_hyperparameters(cls, **kwargs):
+        """
+        Set hyperparameters
+        """
+        cls.n_embed=kwargs.get("n_embed", 10)
+        cls.n_hidden=kwargs.get("n_hidden", 100)
+        cls.n_block_size=kwargs.get("n_block_size", 3)
+        cls.max_steps=kwargs.get("max_steps", 200000)
+        cls.batch_size=kwargs.get("batch_size", 32)
+
+    @classmethod
+    def load_dataset(cls, filePath:str)->str:
+        """
+        Load the dataset
+        """
+        with open(filePath, "r", encoding="utf-8") as f:
+            text=f.read()
+        return text
+
+    @classmethod
+    def unique_chars(cls, text:str)->list:
+        """
+        Get all the unique characters in the text
+        """
+        return sorted(list(set(''.join(text))))
+
+    @classmethod
+    def stoi(cls, chars:list)->dict:
+        """
+        Convert characters to indices
+        """
+        stoi={char:i+1 for i,char in enumerate(chars)}
+        stoi['.']=0
+        return stoi
+
+    @classmethod
+    def itos(cls, chars:list)->dict:
+        """
+        Convert indices to characters
+        """
+        itos={i:char for char,i in cls.stoi(chars).items()}
+        return itos
+
+    @classmethod
+    def build_vocab(cls, chars:list)->int:
+        """
+        Return the vocabulary size, i.e. the number of unique characters
+        """
+        return len(chars)
+
+    @classmethod
+    def build_dataset(cls, words:str, stoi: dict)->tuple[torch.Tensor, torch.Tensor]:
+        """
+        Build the dataset
+        """
+        X,Y=[],[]
+
+        for w in words:
+            context=[0]*cls.n_block_size
+            for ch in w+'.':
+                ix=stoi[ch]
+                X.append(context)
+                Y.append(ix)
+                context=context[1:]+[ix] # crop and append
+
+        X=torch.tensor(X) # convert to tensor
+        Y=torch.tensor(Y)
+        return X,Y
diff --git a/src/models/simple_gpt.py b/src/models/simple_gpt.py
index bd8bc47..22a2c76 100644
--- a/src/models/simple_gpt.py
+++ b/src/models/simple_gpt.py
@@ -19,6 +19,7 @@
 from torch.nn import functional as F
 from torch.utils.tensorboard import SummaryWriter
 
+# https://www.kaggle.com/code/aisuko/gpt-from-scratch-as-a-script
 
 class SimpleGPT(nn.Module):
     def __init__(self, vocab_size):
diff --git a/src/tests/test_mlp_batchnorm.py b/src/tests/test_mlp_batchnorm.py
new file mode 100644
index 0000000..2f52506
--- /dev/null
+++ b/src/tests/test_mlp_batchnorm.py
@@ -0,0 +1,149 @@
+# coding=utf-8
+
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
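+
+# End-to-end test: builds the character dataset from names.txt, trains the
+# scratch-built MLP + BatchNorm stack for a short number of steps, then samples
+# names from the model.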
+
+import os
+import random
+import unittest
+
+from pathlib import Path
+import torch
+from torch.nn import functional as F
+
+from models.mlp_batchnorm import MlpBatchNormTrainer,Linear, BatchNorm1d, Tanh
+from pkg.dataset_helper import DatasetHelper
+
+
+class TestMLPBatchNorm(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.n_embd=MlpBatchNormTrainer.n_embed
+        cls.n_hidden=MlpBatchNormTrainer.n_hidden
+        cls.n_block_size=MlpBatchNormTrainer.n_block_size
+
+        src_dir = Path(os.path.dirname(os.path.abspath(__file__))).parent
+        abs_file_path = os.path.join(src_dir, "input.txt")
+        _ = DatasetHelper.download_remote_file(MlpBatchNormTrainer.ds_url, abs_file_path)
+        cls.data=MlpBatchNormTrainer.load_dataset(abs_file_path)
+        cls.unique_chars=MlpBatchNormTrainer.unique_chars(cls.data.splitlines())
+        cls.stoi=MlpBatchNormTrainer.stoi(cls.unique_chars)
+        cls.itos=MlpBatchNormTrainer.itos(cls.unique_chars)
+        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.unique_chars)
+
+    def test_mlp_batchnorm_trainer(self):
+        random.seed(42)
+        words=self.data.splitlines()
+        random.shuffle(words)
+
+        n1=int(0.8*len(words))
+        n2=int(0.9*len(words))
+
+        Xtr, Ytr=MlpBatchNormTrainer.build_dataset(words[:n1], self.stoi) # 80%
+        Xdev, Ydev=MlpBatchNormTrainer.build_dataset(words[n1:n2],self.stoi) # 10%
+        Xte, Yte=MlpBatchNormTrainer.build_dataset(words[n2:],self.stoi) # 10%
+        g=torch.Generator().manual_seed(2147483647)
+
+        C=torch.randn((self.vocab_size, self.n_embd), generator=g)
+
+        # sequential 6 MLP layers
+        layers=[
+            Linear(self.n_embd*self.n_block_size, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
+            Linear(self.n_hidden, self.vocab_size, bias=False), BatchNorm1d(self.vocab_size)
+        ]
+
+        with torch.no_grad():
+            # here, our last layer is a batch norm layer, and we don't change the weights to make the softmax less confident;
+            # instead we change gamma (from Algorithm 1 in the batch norm paper),
+            # because gamma in the batchnorm is the variable that multiplicatively interacts with the output of that normalization
+            layers[-1].gamma*=0.1
+
+            # all other layers: apply the gain
+            for layer in layers[:-1]:
+                if isinstance(layer, Linear):
+                    layer.weight*=5/3 # boost the linear layer by the gain; 5/3 is the tanh gain from the torch documentation
+        # [C] the embedding matrix and all the parameters of all the layers
+        parameters=[C]+[p for layer in layers for p in layer.parameters()]
+        print(sum(p.nelement() for p in parameters)) # number of parameters in total
+        for p in parameters:
+            p.requires_grad=True
+
+
+        # training loop
+        lossi=[]
+        ud=[]
+
+        for i in range(MlpBatchNormTrainer.max_steps):
+            # minibatch construct
+            ix=torch.randint(0, Xtr.shape[0], (MlpBatchNormTrainer.batch_size,), generator=g)
+            Xb, Yb=Xtr[ix], Ytr[ix] # batch X,Y
+
+            # forward pass
+            emb= C[Xb] # embed the characters into vectors
+            x=emb.view(emb.shape[0], -1) # flatten/concatenate the vectors
+            for layer in layers:
+                x=layer(x)
+            loss=F.cross_entropy(x, Yb) # loss function
+
+            # backward pass
+            for layer in layers:
+                layer.out.retain_grad()
+
+            for p in parameters:
+                p.grad=None
+
+            loss.backward()
+
+            # update
+            lr=0.1 if i<100000 else 0.01 # step learning rate decay
+            for p in parameters:
+                p.data+=-lr*p.grad
+
+            # track stats
+            if i%10000==0: # print every once in a while
+                print(f'{i:7d}/{MlpBatchNormTrainer.max_steps:7d}: {loss.item():.4f}')
+            lossi.append(loss.log10().item())
+
+            with torch.no_grad():
+                ud.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameters])
+
+            if i>=1000:
+                break
+
+        g=torch.Generator().manual_seed(2147483647+10)
+
+        for _ in range(20):
+            out=[]
+            context=[0]*self.n_block_size
+            while True:
+                # forward pass the neural net
+                emb=C[torch.tensor([context])] # (1, block_size, n_embd)
+                x=emb.view(emb.shape[0], -1) # concatenate the vectors
+                for layer in layers:
+                    x=layer(x)
+                logits=x
+                probs=F.softmax(logits, dim=-1)
+                # sample from the distribution
+                ix=torch.multinomial(probs, num_samples=1, generator=g).item()
+                # shift the context window and track the samples
+                context=context[1:]+[ix]
+                out.append(ix)
+                if ix==0:
+                    break
+            print(''.join(self.itos[i] for i in out[:-1]))
\ No newline at end of file

From be698221b73959f421bcf482b2a6b7d656ea7c9d Mon Sep 17 00:00:00 2001
From: Aisuko
Date: Sat, 3 Aug 2024 04:47:01 +0000
Subject: [PATCH 2/2] Fix the wrong vocab size

Signed-off-by: Aisuko
---
 src/tests/test_mlp_batchnorm.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/tests/test_mlp_batchnorm.py b/src/tests/test_mlp_batchnorm.py
index 2f52506..dc7ede8 100644
--- a/src/tests/test_mlp_batchnorm.py
+++ b/src/tests/test_mlp_batchnorm.py
@@ -40,9 +40,11 @@ def setUpClass(cls):
         cls.unique_chars=MlpBatchNormTrainer.unique_chars(cls.data.splitlines())
         cls.stoi=MlpBatchNormTrainer.stoi(cls.unique_chars)
         cls.itos=MlpBatchNormTrainer.itos(cls.unique_chars)
-        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.unique_chars)
+        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.itos)
 
     def test_mlp_batchnorm_trainer(self):
+
+        self.assertEqual(self.vocab_size,27)
         random.seed(42)
         words=self.data.splitlines()
         random.shuffle(words)
@@ -54,8 +56,10 @@ def test_mlp_batchnorm_trainer(self):
         Xdev, Ydev=MlpBatchNormTrainer.build_dataset(words[n1:n2],self.stoi) # 10%
         Xte, Yte=MlpBatchNormTrainer.build_dataset(words[n2:],self.stoi) # 10%
         g=torch.Generator().manual_seed(2147483647)
-
+        self.assertEqual(self.n_embd, 10)
         C=torch.randn((self.vocab_size, self.n_embd), generator=g)
+
+        self.assertEqual(C.shape, torch.Size([27, 10]))
 
         # sequential 6 MLP layers
         layers=[
@@ -125,7 +129,9 @@ def test_mlp_batchnorm_trainer(self):
             if i>=1000:
                 break
-
+        for layer in layers:
+            layer.training=False
+
         g=torch.Generator().manual_seed(2147483647+10)
 
         for _ in range(20):
             out=[]
@@ -138,7 +144,8 @@ def test_mlp_batchnorm_trainer(self):
             for layer in layers:
                 x=layer(x)
             logits=x
-            probs=F.softmax(logits, dim=-1)
+            probs=F.softmax(logits, dim=1)
+            self.assertEqual(probs.shape, torch.Size([1, 27]))
             # sample from the distribution
             ix=torch.multinomial(probs, num_samples=1, generator=g).item()
             # shift the context window and track the samples
@@ -146,4 +153,4 @@ def test_mlp_batchnorm_trainer(self):
             out.append(ix)
             if ix==0:
                 break
-            print(''.join(self.itos[i] for i in out[:-1]))
\ No newline at end of file
+            self.assertIsNotNone(''.join(self.itos[i] for i in out[:-1]))
\ No newline at end of file
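
A minimal usage sketch (illustrative only, not part of either commit) of how the hand-rolled layers in src/models/mlp_batchnorm.py compose, and why the second commit flips layer.training=False before sampling. The sizes below mirror the test (3-character context, 10-dim embeddings, 27-character vocabulary); the random input tensors are stand-ins for the flattened character embeddings.

# Illustrative sketch: compose the scratch-built layers, run one training-mode
# forward pass, then switch BatchNorm1d to inference mode before sampling.
import torch
import torch.nn.functional as F

from models.mlp_batchnorm import Linear, BatchNorm1d, Tanh

n_in, n_hidden, vocab_size = 30, 100, 27   # 3-char context * 10-dim embeddings -> 27-char vocab

layers = [
    Linear(n_in, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size),
]

x = torch.randn(32, n_in)                  # stand-in for a mini-batch of flattened embeddings
for layer in layers:
    x = layer(x)                           # training=True: batch stats update the running buffers
loss = F.cross_entropy(x, torch.randint(0, vocab_size, (32,)))

for layer in layers:
    layer.training = False                 # BatchNorm1d now uses running_mean/running_var
probe = torch.randn(1, n_in)
for layer in layers:
    probe = layer(probe)                   # safe for a batch of one: no batch statistics are computed
probs = F.softmax(probe, dim=1)            # (1, 27) distribution over the next character

Without the training flag flipped, BatchNorm1d would try to compute batch statistics over a single sample during sampling, which is exactly what the added for layer in layers: layer.training=False loop in the second commit avoids.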