Commit a92bc7c
fixed training, some nlg
IvanBongiorni committed Jul 3, 2023
1 parent 3e1f9f1 commit a92bc7c
Showing 6 changed files with 193 additions and 31 deletions.
106 changes: 106 additions & 0 deletions chat.py
@@ -0,0 +1,106 @@
"""
Inference
"""
import os

import numpy as np
import tensorflow as tf
import maximal
import yaml
from tqdm import tqdm

from config import config
from model import load_or_build_model


def generate_text(
gpt: tf.keras.models.Model,
prompt: str,
char2idx: dict,
idx2char: dict,
n: int = config.N_GENERATION,
temperature: float = config.TEMPERATURE,
k: int = config.TOP_K_SAMPLE
) -> str:
"""
Inference time for the GPT.
Args:
gpt (tf.keras.models.Model): trained GPT used for next-token prediction
prompt (str): input text
char2idx (dict): char -> idx mapping
idx2char (dict): idx -> char mapping (inverse of original char2idx)
n (int): number of tokens to be generated
temperature (float): noise in the output probability
(>1. = noisy sampling; <1. = conservative sampling.)
k (int): restricts to number of top-k tokens to be sampled from
Returns:
generated_text (str): GPT completion
"""
# If prompt is shorter than INPUT_LENGTH raise error (no padding in this simple tutorial)
assert len(prompt) >= config.INPUT_LENGTH, f"Prompt must be at least {config.INPUT_LENGTH} characters long"

# If prompt is longer than INPUT_LENGTH crop it to last piece
if len(prompt) > config.INPUT_LENGTH:
prompt = prompt[-config.INPUT_LENGTH:]

generated_text = []

for i in tqdm(range(n)):
# vectorize prompt and adjust np.array shape
vectorized_text = [char2idx[c] for c in prompt]
vectorized_text = np.array(vectorized_text).reshape((1, len(vectorized_text)))

# next token prediction
pred = gpt.predict(vectorized_text, verbose=0)
pred = np.squeeze(pred[:, -1, :])

# temperature scaling
pred /= temperature

# restrict sampling to top k tokens
probs, indices = tf.math.top_k(pred, k, sorted=True)

# sample token id
probs = tf.nn.softmax(probs).numpy()
pred_id = np.random.choice(indices.numpy(), p=probs)

# update prompt
next_char = idx2char[pred_id]
prompt = prompt[1:] + next_char
generated_text.append(next_char)

generated_text = ''.join(generated_text)

return generated_text
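
The temperature and top-k logic above is the standard sampling recipe; here is a minimal, self-contained sketch of the same steps on a toy logits vector (the values and vocabulary size are assumptions for illustration, not taken from the model):

import numpy as np
import tensorflow as tf

# Toy next-token logits over a 5-token vocabulary (assumed values)
logits = np.array([2.0, 1.0, 0.5, 0.1, -1.0])

# Temperature scaling: >1.0 flattens the distribution, <1.0 sharpens it
logits = logits / 0.8

# Keep only the k largest logits and their vocabulary indices
values, indices = tf.math.top_k(logits, k=3, sorted=True)

# Softmax over the surviving logits, then sample one token id
probs = tf.nn.softmax(values).numpy()
next_id = np.random.choice(indices.numpy(), p=probs)
print(next_id)  # one of the 3 most likely token ids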


def nlg():
"""
Natural Language Generation.
Starts an infinite loop that can be broken only via Ctrl+C or by
typing "exit" as prompt.
"""
# Load model
print(f"Loading model: {config.MODEL_NAME}.h5")
gpt = tf.keras.models.load_model(os.path.join(os.getcwd(), "saved_models", config.MODEL_NAME + ".h5"))
print("Completed.")

# Load the char -> idx mapping saved by train.py and invert it for decoding
char2idx = yaml.safe_load(open(os.path.join(os.getcwd(), "saved_models", f"{config.MODEL_NAME}_char_idx_map.yaml"), "r"))
idx2char = {v: k for k, v in char2idx.items()}

print(config.MSG_GREETINGS)

# Start infinite loop
while True:
prompt = input("\nUser:\n")

# Check "exit" before the length check ("exit" is shorter than INPUT_LENGTH)
if prompt == "exit":
print(config.MSG_FAREWELL)
quit()

# If prompt is too short send a shakespearean message
elif len(prompt) < config.INPUT_LENGTH:
print(config.MSG_INPUT_TOO_SHORT.format(config.INPUT_LENGTH))
continue

generated_text = generate_text(gpt=gpt, prompt=prompt, char2idx=char2idx, idx2char=idx2char)
print(f"\nShakespeare-GPT:\n{generated_text}\n")


if __name__ == "__main__":
nlg()
8 changes: 8 additions & 0 deletions config.py
@@ -3,8 +3,16 @@
"""
from utils import StrMessages


class config(StrMessages):
MODEL_NAME = "gpt_maximal_00"

# NLG
N_GENERATION = 1000
TEMPERATURE = 1.0
TOP_K_SAMPLE = 10

# Model architecture
INPUT_LENGTH = 128
DEPTH = 512
HEADS = 4
38 changes: 24 additions & 14 deletions model.py
@@ -13,17 +13,20 @@
from config import config


def build_model() -> tf.keras.models.Model:
"""
Builds a GPT using Maximal and TensorFlow.
Args: none (hyperparameters are read from config)
Returns: GPT model (tf.keras.models.Model)
"""
# Define nodes of the graph
input_batch = Input(shape=(config.INPUT_LENGTH,), dtype=tf.int32)

embedding = PositionalEmbedding(config.INPUT_LENGTH, config.VOCAB_SIZE, config.DEPTH)

gpt_layers = [GPTLayer(depth=config.DEPTH, heads=config.HEADS, ff_nodes=config.FF_NODES) for _ in range(config.N_LAYERS)]

classification_layer = Dense(config.VOCAB_SIZE)

# Build the computational graph
x = embedding(input_batch)
@@ -39,18 +42,25 @@ def build_model():
)
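
As a quick sanity check, the graph maps a batch of token ids to one row of vocabulary logits per position. A hedged sketch of that shape contract (assumes the config attributes used above, including config.VOCAB_SIZE):

import numpy as np
from config import config
from model import build_model

gpt = build_model()

# A random batch of 2 sequences of INPUT_LENGTH token ids
x = np.random.randint(0, config.VOCAB_SIZE, size=(2, config.INPUT_LENGTH))

logits = gpt.predict(x, verbose=0)
print(logits.shape)  # expected: (2, config.INPUT_LENGTH, config.VOCAB_SIZE)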


def load_or_build_model(verbose: bool = False) -> tf.keras.models.Model:
"""
Checks if a model with name MODEL_NAME is already stored in the /saved_models
folder. If present, loads the existing model (to train it further); if not,
builds a new one.
Args:
verbose (bool): print model.summary() or not - defaults to False
Returns:
gpt (tf.keras.models.Model): the loaded or newly built model
"""
filenames = os.listdir(os.path.join(os.getcwd(), "saved_models"))

# Check whether a saved model with this name already exists
if config.MODEL_NAME + ".h5" in filenames:
print(f"Loading existing model: {config.MODEL_NAME}.h5")
gpt = tf.keras.models.load_model(os.path.join(os.getcwd(), "saved_models", config.MODEL_NAME + ".h5"))
else:
print(f"Creating a new model: {config.MODEL_NAME}.h5")
gpt = build_model()

# Optionally print the model architecture
if verbose:
print(gpt.summary())

return gpt
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
numpy
tensorflow>2.1
maximal>=1.0
matplotlib
tqdm
51 changes: 39 additions & 12 deletions train.py
@@ -1,7 +1,9 @@
"""
Training
"""
import os
import requests
import yaml

import numpy as np
import tensorflow as tf
@@ -12,25 +14,36 @@


# globals
gpt = load_or_build_model(verbose=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE)


def numerical_encoding(text: str, char_dict: dict) -> np.array:
"""
First breaks text into a list of chars, then converts each to
its numerical idx (np.array)
Args:
text (str): corpus to be vectorized
char_dict (dict): dictionary to map chars to indexes
Returns:
chars_list (np.array): vectorized corpus
"""
chars_list = [ char for char in text ]
chars_list = [ char_dict[char] for char in chars_list ]
chars_list = np.array(chars_list)
return chars_list
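
For example, with a toy two-character alphabet (a sketch; the real char2idx is built in process_corpus below):

char2idx = {"a": 0, "b": 1}
print(numerical_encoding("abba", char2idx))  # -> [0 1 1 0]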


def get_text_matrix(sequence: np.array, len_input: int) -> np.array:
"""
Generates a matrix containing all sequences
of length INPUT_LENGTH to be fed into the Network
Args:
sequence (np.array): array to be processed
len_input (int): length of model input
Returns:
X (np.array): 2D array of sliding-window sequences
"""
# create empty matrix
X = np.empty((len(sequence)-len_input, len_input))
@@ -42,7 +55,17 @@ def get_text_matrix(sequence, len_input):
return X
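
The loop that fills X is collapsed in this view; the intended result is a sliding window over the sequence. A small example, assuming a one-step stride (np.empty defaults to float64, hence the floats):

seq = np.arange(6)            # [0 1 2 3 4 5]
M = get_text_matrix(seq, 3)
print(M)
# Expected, assuming a one-step sliding window:
# [[0. 1. 2.]
#  [1. 2. 3.]
#  [2. 3. 4.]]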


def process_corpus() -> (np.array, dict):
"""
Text preprocessing steps: 1. downloads corpus; 2. extracts the set of
unique chars; 3. maps every char to an int; 4. vectorizes the text;
5. processes the vectorized text into a 2D array for model training
(a sliding window over the text is produced)
Returns:
X (np.array): 2D array for model training
char2idx (dict): dictionary to preserve char-index mapping
"""
page = requests.get(config.CORPUS_URL)
text = page.text

@@ -53,14 +76,13 @@ def process_corpus():
# Map every letter in our alphabet to an int
char2idx = {char[1]: char[0] for char in enumerate(unique_chars)}

# Produce a reverse dictionary to go back from int to str later
idx2char = {v: k for k, v in char2idx.items()}

# vectorize text
encoded_text = numerical_encoding(text, char2idx)

# Sequence of vectorized chars to 2D array
X = get_text_matrix(encoded_text, config.INPUT_LENGTH + 1)

return X, char2idx
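
Each row of X is config.INPUT_LENGTH + 1 characters long, which points at the usual next-character setup: the first INPUT_LENGTH ids are the model input and the same window shifted one step right is the target. A hedged sketch of that split (an assumption; the actual batching code in main() is collapsed in this view):

X, char2idx = process_corpus()

x_batch = X[:32, :-1]   # (32, INPUT_LENGTH) inputs
y_batch = X[:32, 1:]    # (32, INPUT_LENGTH) next-char targets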


@tf.function
@@ -79,7 +101,7 @@ def train_on_batch(x, y):
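
The body of train_on_batch is collapsed in this view. A minimal sketch of what such a step typically looks like for this setup (an assumption, not the author's exact code), reusing the module-level gpt and optimizer:

@tf.function
def train_on_batch_sketch(x, y):
    # Forward pass, per-position next-char loss on logits, one optimizer step
    with tf.GradientTape() as tape:
        logits = gpt(x, training=True)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(y, logits, from_logits=True)
        )
    gradients = tape.gradient(loss, gpt.trainable_variables)
    optimizer.apply_gradients(zip(gradients, gpt.trainable_variables))
    return loss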


def main():
X, char2idx = process_corpus()

loss_history = []

@@ -119,10 +141,15 @@ def main():
plt.show()

# Save model
gpt.save(os.path.join(os.getcwd(), "saved_models", config.MODEL_NAME + ".h5"))

# Save char2idx mapping as yaml
yaml.dump(
char2idx,
open(os.path.join(os.getcwd(), "saved_models", f"{config.MODEL_NAME}_char_idx_map.yaml"), "w")
)

return None


if __name__ == "__main__":
main()
19 changes: 14 additions & 5 deletions utils.py
@@ -12,21 +12,30 @@ class StrMessages:
With words of lofty prose, I shall adorn
Thy journey through this CLI program's feed.
Thou art most welcome to this humble stage,
Where bytes and lines doth dance in harmony.
Methinks thou seeketh knowledge of this age,
And for thy query, I shall thee gladly see.
Inscribe the word "exit," a concise decree,
Or wield the key combination, Ctrl-C, with glee.
By this act, thou shalt gracefully conclude thy stay,
And from this program's realm, thou may swiftly stray.
"""

MSG_INPUT_TOO_SHORT = """
** InputError: Input length too short **
Pray, kind user, if it be not too much to ask,
I beseech thee, extend thy prompt, a greater task.
Yet, one more thing I must humbly request,
A length of {} characters, at its behest.
"""

MSG_FAREWELL = """
Fair user, thou hast spoken the chosen word,
"Exit" resounds, like a song of a departing bird.
With brevity, I bid thee a swift farewell,
May fortune attend thee, as thou venturest, as well.
Fare thee well, dear user, with gratitude and grace,
Till we meet again, in another time and place.
Shakespeare-GPT awaits, shouldst thou return anew,
With words of wisdom and verses, old and true.
"""
