Add MLP with batchNorm layer #123

Merged (2 commits, Aug 3, 2024)
4 changes: 2 additions & 2 deletions .devcontainer/devcontainer.json
@@ -12,7 +12,7 @@
"ms-python.vscode-pylance"
]
}
-}
+},

// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
@@ -21,7 +21,7 @@
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "pip3 install --user -r requirements.txt",
"postCreateCommand": "make prepare"

// Configure tool-specific properties.
// "customizations": {},
2 changes: 2 additions & 0 deletions Makefile
@@ -67,6 +67,8 @@ source:
@poetry config repositories.source https://pypi.org/project/kimchima


.PHONY: prepare
prepare: poetry install-dev

###################################################################################################
# Commit and recommit changes to github
196 changes: 196 additions & 0 deletions src/models/mlp_batchnorm.py
@@ -0,0 +1,196 @@
# coding=utf-8

# Copyright [2024] [SkywardAI]
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import random

Check failure: GitHub Actions / Code Quality 📦 (ubuntu-latest, 3.11, 1.8.2), Ruff (F401): src/models/mlp_batchnorm.py:17:8: `random` imported but unused
import torch
import torch.nn.functional as F

Check failure: GitHub Actions / Code Quality 📦 (ubuntu-latest, 3.11, 1.8.2), Ruff (F401): src/models/mlp_batchnorm.py:19:31: `torch.nn.functional` imported but unused
from torch.utils.tensorboard import SummaryWriter

Check failure: GitHub Actions / Code Quality 📦 (ubuntu-latest, 3.11, 1.8.2), Ruff (F401): src/models/mlp_batchnorm.py:20:37: `torch.utils.tensorboard.SummaryWriter` imported but unused

# https://www.kaggle.com/code/aisuko/implement-neural-net-with-batch-norm-layer

g=torch.Generator().manual_seed(2147483647)

class Linear:
    """
    Linear layer
    https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
    """

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight=torch.randn((fan_in, fan_out), generator=g)/fan_in**0.5 # unit gaussian scaled by sqrt(fan_in)
        self.bias=torch.zeros(fan_out) if bias else None # bias initialized to zeros by default

    def __call__(self, x):
        self.out=x@self.weight
        if self.bias is not None:
            self.out+=self.bias
        return self.out

    def parameters(self):
        """
        return tensors that are parameters of this layer
        """
        return [self.weight]+([] if self.bias is None else [self.bias])


class BatchNorm1d:
    """
    Batch normalization layer
    Formula follows https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps=eps # added to the variance to avoid division by zero
        self.momentum=momentum # used to update the running stats
        self.training=True

        # parameters (trained with backprop)
        self.gamma=torch.ones(dim)
        self.beta=torch.zeros(dim)
        # buffers (updated with a running 'momentum update')
        self.running_mean=torch.zeros(dim)
        self.running_var=torch.ones(dim)

    def __call__(self, x):
        """
        Follows https://arxiv.org/pdf/1502.03167

        Algorithm 1
        1. mini-batch mean
        2. mini-batch variance
        3. normalize
        4. scale and shift
        """

        # calculating the forward pass
        if self.training:
            xmean=x.mean(0, keepdim=True) # batch mean
            xvar=x.var(0, keepdim=True, unbiased=True) # batch variance
        else:
            xmean=self.running_mean
            xvar=self.running_var

        xhat=(x-xmean)/torch.sqrt(xvar+self.eps) # normalize to unit variance
        self.out=self.gamma*xhat+self.beta # create the `out` attribute for visualizing the training process
        # update the buffers

        if self.training:
            with torch.no_grad():
                self.running_mean=(1-self.momentum)*self.running_mean+self.momentum*xmean
                self.running_var=(1-self.momentum)*self.running_var+self.momentum*xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    """
    Tanh activation layer
    """
    def __call__(self, x):
        self.out=torch.tanh(x)
        return self.out

    def parameters(self):
        """
        no parameters in this layer
        """
        return []


class MlpBatchNormTrainer:
    """
    MlpBatchNormTrainer
    """
    ds_url="https://www.kaggleusercontent.com/kf/187064505/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..ZLA9V0sWqB_Px0312U15fQ.OO2wvSdp-fhBB0BTDAaLToek6CLGlzS4otHIsyHBd1feEJxIUq055-GIQb24Ez51pGq31hyzaN_vFeDRnqxFwyc12sDNqZ7uDhel-5xeXU08h0qNtOpoqXA-iJpPuV4u-dThq8Lk-zoOg_ZDmVNAW8XHZVAM2ZAHl9StyqN1n7eOGU0379mp_2ol2gyjXP01xNDH2n4kUSIIetktnagIon8Jm_tcLBB-DaWPTFwQ5L7NBP1t-omCUrKydTxAyPIFFwnid3T1vzEgSmYiUY8Ec-iC8OG5d2pKcod9FIAOkJH4Xu74Pvzp5UuOFzQXRByezEOkyD0ltAhfMOab0ebIi6YSTVKrna70HZhuxjWQRK9fIgvt0V7RMz84ZQspJWrgofowQrf7E1avVvXe7GQW4E7dYITqQoJvZ7dhlpujq1db6pkegRqOfuQzPJcD6UHBTpVRyi36rIQoLpFd63XLzY5eya4ScAy5H-frQhF0IU927Z86S9iR2AypqO3TXriPsMHjJ7o-DwXpnHCNkVfMJXeVxT36DRBiV9uCTL-e8_xOUKw50N5iG3NqTnos0IwSXvwrSBtHxUI71zo-I2Z-l5x_GqjEa9QVl1XX_q7GU_YFejlC-rT9KdcA_6TEVO6qaMpfvVvCc9kFYI7s7GQNbg.tIuWJu1a71qSZKZeG-TgPg/names.txt"
    n_embed=10 # the dimensionality of the character embedding vectors
    n_hidden=100 # the number of neurons in the hidden layer of the MLP
    n_block_size=3 # context length: how many characters do we take to predict the next one?
    max_steps=200000
    batch_size=32


    def __init__(self):
        raise Exception("This class is not meant to be instantiated")

    @classmethod
    def set_hyperparameters(cls, **kwargs):
        """
        Set hyperparameters
        """
        cls.n_embed=kwargs.get("n_embed", 10)
        cls.n_hidden=kwargs.get("n_hidden", 100)
        cls.n_block_size=kwargs.get("n_block_size", 3)
        cls.max_steps=kwargs.get("max_steps", 200000)
        cls.batch_size=kwargs.get("batch_size", 32)

    @classmethod
    def load_dataset(cls, filePath:str)->str:
        """
        Load the dataset
        """
        with open(filePath, "r", encoding="utf-8") as f:
            text=f.read()
        return text

    @classmethod
    def unique_chars(cls, text:str)->list:
        """
        Get all the unique characters in the text
        """
        return sorted(list(set(''.join(text))))

    @classmethod
    def stoi(cls, chars:list)->dict:
        """
        Convert characters to indices
        """
        stoi={char:i+1 for i,char in enumerate(chars)}
        stoi['.']=0
        return stoi

    @classmethod
    def itos(cls, chars:list)->dict:
        """
        Convert indices to characters
        """
        itos={i:char for char,i in cls.stoi(chars).items()}
        return itos

    @classmethod
    def build_vocab(cls, chars:list)->int:
        """
        Build a vocabulary from the unique characters
        """
        return len(chars)

    @classmethod
    def build_dataset(cls, words:list, stoi: dict)->tuple[torch.Tensor, torch.Tensor]:
        """
        Build the dataset
        """
        X,Y=[],[]

        for w in words:
            context=[0]*cls.n_block_size
            for ch in w+'.':
                ix=stoi[ch]
                X.append(context)
                Y.append(ix)
                context=context[1:]+[ix] # crop and append

        X=torch.tensor(X) # convert to tensor
        Y=torch.tensor(Y)
        return X,Y
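A minimal usage sketch of the layers added above (assuming the module is importable as models.mlp_batchnorm, the import path used by the test file further down): in training mode BatchNorm1d normalizes with the statistics of the current batch and updates its running buffers, while flipping the training flag switches it to the running estimates.

import torch
from models.mlp_batchnorm import Linear, BatchNorm1d, Tanh

block=[Linear(30, 100, bias=False), BatchNorm1d(100), Tanh()]

# training mode (the default): normalize with batch statistics, update the running buffers
x=torch.randn(32, 30)              # a batch of 32 examples with 30 features each
for layer in block:
    x=layer(x)
print(x.shape)                     # torch.Size([32, 100])

# evaluation mode: use the running estimates instead (also safe for a batch of one)
for layer in block:
    layer.training=False
x=torch.randn(1, 30)
for layer in block:
    x=layer(x)
print(x.shape)                     # torch.Size([1, 100])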
1 change: 1 addition & 0 deletions src/models/simple_gpt.py
@@ -19,6 +19,7 @@
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter

# https://www.kaggle.com/code/aisuko/gpt-from-scratch-as-a-script

class SimpleGPT(nn.Module):
    def __init__(self, vocab_size):
156 changes: 156 additions & 0 deletions src/tests/test_mlp_batchnorm.py
@@ -0,0 +1,156 @@
# coding=utf-8

# Copyright [2024] [SkywardAI]
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
import unittest

from pathlib import Path
import torch
from torch.nn import functional as F

from models.mlp_batchnorm import MlpBatchNormTrainer, Linear, BatchNorm1d, Tanh
from pkg.dataset_helper import DatasetHelper


class TestMLPBatchNorm(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.n_embd=MlpBatchNormTrainer.n_embed
        cls.n_hidden=MlpBatchNormTrainer.n_hidden
        cls.n_block_size=MlpBatchNormTrainer.n_block_size

        src_dir = Path(os.path.dirname(os.path.abspath(__file__))).parent
        abs_file_path = os.path.join(src_dir, "input.txt")
        _ = DatasetHelper.download_remote_file(MlpBatchNormTrainer.ds_url, abs_file_path)
        cls.data=MlpBatchNormTrainer.load_dataset(abs_file_path)
        cls.unique_chars=MlpBatchNormTrainer.unique_chars(cls.data.splitlines())
        cls.stoi=MlpBatchNormTrainer.stoi(cls.unique_chars)
        cls.itos=MlpBatchNormTrainer.itos(cls.unique_chars)
        cls.vocab_size=MlpBatchNormTrainer.build_vocab(cls.itos)

    def test_mlp_batchnorm_trainer(self):

        self.assertEqual(self.vocab_size,27)
        random.seed(42)
        words=self.data.splitlines()
        random.shuffle(words)

        n1=int(0.8*len(words))
        n2=int(0.9*len(words))

        Xtr, Ytr=MlpBatchNormTrainer.build_dataset(words[:n1], self.stoi) # 80%
        Xdev, Ydev=MlpBatchNormTrainer.build_dataset(words[n1:n2],self.stoi) # 10%
        Xte, Yte=MlpBatchNormTrainer.build_dataset(words[n2:],self.stoi) # 10%
        g=torch.Generator().manual_seed(2147483647)
        self.assertEqual(self.n_embd, 10)
        C=torch.randn((self.vocab_size, self.n_embd), generator=g)

        self.assertEqual(C.shape, torch.Size([27, 10]))

        # sequential MLP with 6 Linear + BatchNorm layers
        layers=[
            Linear(self.n_embd*self.n_block_size, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.n_hidden, bias=False), BatchNorm1d(self.n_hidden), Tanh(),
            Linear(self.n_hidden, self.vocab_size, bias=False), BatchNorm1d(self.vocab_size)
        ]

        with torch.no_grad():
            # here, our last layer is a batch norm layer, so we don't scale its weights to make the softmax less confident;
            # instead we scale gamma (from Algorithm 1 of the batch norm paper),
            # because gamma is the variable that multiplicatively interacts with the output of the normalization
            layers[-1].gamma*=0.1

            # all other layers: apply a gain
            for layer in layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weight*=5/3 # boost the linear layer by the tanh gain of 5/3 (the value from the PyTorch init documentation)
        # [C] the embedding matrix and all the parameters of all the layers
        parameters=[C]+[p for layer in layers for p in layer.parameters()]
        print(sum(p.nelement() for p in parameters)) # number of parameters in total
        for p in parameters:
            p.requires_grad=True


        # training loop
        lossi=[]
        ud=[]

        for i in range(MlpBatchNormTrainer.max_steps):
            # minibatch construct
            ix=torch.randint(0, Xtr.shape[0], (MlpBatchNormTrainer.batch_size,), generator=g)
            Xb, Yb=Xtr[ix], Ytr[ix] # batch X,Y

            # forward pass
            emb=C[Xb] # embed the characters into vectors
            x=emb.view(emb.shape[0], -1) # flatten/concatenate the vectors
            for layer in layers:
                x=layer(x)
            loss=F.cross_entropy(x, Yb) # loss function

            # backward pass
            for layer in layers:
                layer.out.retain_grad()

            for p in parameters:
                p.grad=None

            loss.backward()

            # update
            lr=0.1 if i<100000 else 0.01 # step learning rate decay
            for p in parameters:
                p.data+=-lr*p.grad

            # track stats
            if i%10000==0: # print every once in a while
                print(f'{i:7d}/{MlpBatchNormTrainer.max_steps:7d}: {loss.item():.4f}')
            lossi.append(loss.log10().item())

            with torch.no_grad():
                ud.append([(lr*p.grad.std()/p.data.std()).log10().item() for p in parameters])
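            # `ud` records, per parameter, the log10 of (update size / parameter std), i.e. how large each
            # step is relative to the data it changes; as a rough heuristic from the referenced notebook
            # (not asserted by this test), values around -3 indicate a reasonably tuned learning rate.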

            if i>=1000:
                break

        for layer in layers:
            layer.training=False

        g=torch.Generator().manual_seed(2147483647+10)

        for _ in range(20):
            out=[]
            context=[0]*self.n_block_size
            while True:
                # forward pass the neural net
                emb=C[torch.tensor([context])] # (1, block_size, n_embd)
                x=emb.view(emb.shape[0], -1) # concatenate the vectors
                for layer in layers:
                    x=layer(x)
                logits=x
                probs=F.softmax(logits, dim=1)
                self.assertEqual(probs.shape, torch.Size([1, 27]))
                # sample from the distribution
                ix=torch.multinomial(probs, num_samples=1, generator=g).item()
                # shift the context window and track the samples
                context=context[1:]+[ix]
                out.append(ix)
                if ix==0:
                    break
            self.assertIsNotNone(''.join(self.itos[i] for i in out[:-1]))