diff --git a/chat.py b/chat.py
new file mode 100644
index 0000000..ce285ec
--- /dev/null
+++ b/chat.py
@@ -0,0 +1,106 @@
+"""
+Inference
+"""
+import os
+
+import numpy as np
+import tensorflow as tf
+import yaml
+import maximal
+from tqdm import tqdm
+
+from config import config
+
+
+def generate_text(
+        gpt: tf.keras.models.Model,
+        prompt: str,
+        char2idx: dict,
+        idx2char: dict,
+        n: int = config.N_GENERATION,
+        temperature: float = config.TEMPERATURE,
+        k: int = config.TOP_K_SAMPLE
+        ) -> str:
+    """
+    Inference time for the GPT.
+
+    Args:
+        gpt (tf.keras.models.Model): trained GPT loaded from /saved_models
+        prompt (str): input text
+        char2idx (dict): char -> idx mapping
+        idx2char (dict): idx -> char mapping (inverse of original char2idx)
+        n (int): number of tokens to be generated
+        temperature (float): noise in the output probability
+            (>1. = noisy sampling; <1. = conservative sampling.)
+        k (int): restricts sampling to the top-k most likely tokens
+
+    Returns:
+        generated_text (str): GPT completion
+    """
+    # If prompt is shorter than INPUT_LENGTH raise error (no padding in this simple tutorial)
+    assert len(prompt) >= config.INPUT_LENGTH, f"Prompt must be at least {config.INPUT_LENGTH} characters long"
+
+    # If prompt is longer than INPUT_LENGTH crop it to its last piece
+    if len(prompt) > config.INPUT_LENGTH:
+        prompt = prompt[-config.INPUT_LENGTH:]
+
+    generated_text = []
+
+    for i in tqdm(range(n)):
+        # vectorize prompt and adjust np.array shape
+        vectorized_text = [char2idx[c] for c in prompt]
+        vectorized_text = np.array(vectorized_text).reshape((1, len(vectorized_text)))
+
+        # next token prediction (keep only the logits of the last position)
+        pred = gpt.predict(vectorized_text, verbose=0)
+        pred = np.squeeze(pred[:, -1, :])
+
+        # temperature scaling
+        pred /= temperature
+
+        # restrict sampling to top k tokens
+        probs, indices = tf.math.top_k(pred, k, sorted=True)
+
+        # sample token id
+        probs = tf.nn.softmax(probs).numpy()
+        pred_id = np.random.choice(indices.numpy(), p=probs)
+
+        # update prompt: drop the first char, append the sampled one
+        next_char = idx2char[pred_id]
+        prompt = prompt[1:] + next_char
+        generated_text.append(next_char)
+
+    generated_text = ''.join(generated_text)
+
+    return generated_text
+
+
+def nlg():
+    """
+    Natural Language Generation.
+    Starts an infinite loop that can be broken only via Ctrl+C or by
+    typing "exit" as prompt.
+    """
+    # Load model
+    print(f"Loading model: {config.MODEL_NAME}.h5")
+    gpt = tf.keras.models.load_model(os.path.join(os.getcwd(), "saved_models", f"{config.MODEL_NAME}.h5"))
+    print("Completed.")
+
+    # Load the char -> idx mapping saved at training time and build its inverse
+    with open(os.path.join(os.getcwd(), "saved_models", f"{config.MODEL_NAME}_char_idx_map.yaml"), "r") as f:
+        char2idx = yaml.safe_load(f)
+    idx2char = {v: k for k, v in char2idx.items()}
+
+    print(config.MSG_GREETINGS)
+
+    # Start infinite loop
+    while True:
+        prompt = input("\nUser:\n")
+
+        if prompt == "exit":
+            print(config.MSG_FAREWELL)
+            quit()
+
+        # If prompt is too short send a Shakespearean message
+        elif len(prompt) < config.INPUT_LENGTH:
+            print(config.MSG_INPUT_TOO_SHORT.format(config.INPUT_LENGTH))
+            continue
+
+        generated_text = generate_text(gpt=gpt, prompt=prompt, char2idx=char2idx, idx2char=idx2char)
+        print(f"\nShakespeare-GPT:\n{generated_text}\n")
+
+
+if __name__ == "__main__":
+    nlg()
\ No newline at end of file
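Aside, not part of the diff: the sampling step inside generate_text() is temperature scaling followed by a top-k softmax. A minimal sketch on a made-up logits vector (the numbers below are arbitrary, only the mechanics match the code above):

import numpy as np
import tensorflow as tf

logits = np.array([2.0, 1.0, 0.5, -1.0, -3.0])   # fake next-char logits

for temperature in (0.5, 1.0, 2.0):
    # keep only the k most likely tokens, then renormalize with a softmax
    values, indices = tf.math.top_k(logits / temperature, k=3, sorted=True)
    probs = tf.nn.softmax(values).numpy()
    print(temperature, dict(zip(indices.numpy().tolist(), probs.round(3).tolist())))

# one sampled token id, as in generate_text()
pred_id = np.random.choice(indices.numpy(), p=probs)

Temperatures below 1.0 concentrate probability mass on the top token (conservative sampling), while values above 1.0 flatten the distribution before the top-k softmax (noisier sampling).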
+ """ + # Load model + print(f"Loading model: {config.MODEL_NAME}.h5") + gpt = tf.keras.models.load_model(os.path.join(os.getcwd(), "saved_models", config.MODEL_NAME)) + print("Completed.") + + print(config.MSG_GREETINGS) + + # Start infinite loop + while true: + prompt = input("\nUser:\n") + + if prompt < config.INPUT_LENGTH: + print(f"Please provide a prompt of {config.INPUT_LENGTH}") + + # If prompt too short send a shakespearean message + print(config.MSG_INPUT_TOO_SHORT.format(config.INPUT_LENGTH)) + continue + elif prompt == "exit": + print(config.MSG_FAREWELL) + quit() + + generated_text = generate_text(prompt=prompt) + print(f"\nShakespeare-GPT:\n{generated_text}\n") + + +if __name__ == "__main__": + nlg() \ No newline at end of file diff --git a/config.py b/config.py index 64ce1f9..6e98536 100644 --- a/config.py +++ b/config.py @@ -3,8 +3,16 @@ """ from utils import StrMessages + class config(StrMessages): MODEL_NAME = "gpt_maximal_00" + + # NLG + N_GENERATION = 1000 + TEMPERATURE = 1.0 + TOP_K_SAMPLE = 10 + + # Model architecture INPUT_LENGTH = 128 DEPTH = 512 HEADS = 4 diff --git a/model.py b/model.py index 55d76fd..8847426 100644 --- a/model.py +++ b/model.py @@ -13,17 +13,20 @@ from config import config -def build_model(): +def build_model() -> tf.keras.models.Model: """ Builds a GPT using Maximal and TensorFlow. Args: / (just needs config params) Returns: GPT model (tf.keras.models.Model) """ # Define nodes of the graph - input_batch = Input(shape=(INPUT_LENGTH,), dtype=tf.int32) - embedding = PositionalEmbedding(INPUT_LENGTH, VOCAB_SIZE, DEPTH) - gpt_layers = [GPTLayer(depth=DEPTH, heads=HEADS, ff_nodes=FF_NODES) for _ in range(N_LAYERS)] - classification_layer = Dense(VOCAB_SIZE) + input_batch = Input(shape=(config.INPUT_LENGTH,), dtype=tf.int32) + + embedding = PositionalEmbedding(config.INPUT_LENGTH, config.VOCAB_SIZE, config.DEPTH) + + gpt_layers = [GPTLayer(depth=config.DEPTH, heads=config.HEADS, ff_nodes=config.FF_NODES) for _ in range(config.N_LAYERS)] + + classification_layer = Dense(config.VOCAB_SIZE) # Build the computational graph x = embedding(input_batch) @@ -39,18 +42,25 @@ def build_model(): ) -def load_model(): +def load_or_build_model(verbose: bool =False) -> tf.keras.models.Model: """ - If a model with a given name already exists - :return: - """ - return gpt - + Checks if a model with name MODEL_NAME is already stored in /saved_models + folder. If present, loads the existing one (to train it further). If not, it + builds a new one. 
diff --git a/requirements.txt b/requirements.txt
index 6075ec6..3c9dd7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
 numpy
 tensorflow>2.1
 maximal>=1.0
+matplotlib
+pyyaml
+tqdm
\ No newline at end of file
diff --git a/train.py b/train.py
index d4f6d2b..0330825 100644
--- a/train.py
+++ b/train.py
@@ -1,7 +1,9 @@
 """
 Training
 """
+import os
 import requests
+import yaml
 
 import numpy as np
 import tensorflow as tf
@@ -12,14 +14,21 @@
 
 # globals
-gpt = load_or_build_model()
+gpt = load_or_build_model(verbose=True)
 optimizer = tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE)
 
 
-def numerical_encoding(text, char_dict):
+def numerical_encoding(text: str, char_dict: dict) -> np.ndarray:
     """
     First breaks text into a list of chars, then converts each to
     its numerical idx (np.array)
+
+    Args:
+        text (str): corpus to be vectorized
+        char_dict (dict): dictionary to map chars to indexes
+
+    Returns:
+        chars_list (np.array): vectorized corpus
     """
     chars_list = [ char for char in text ]
     chars_list = [ char_dict[char] for char in chars_list ]
@@ -27,10 +36,14 @@
     return chars_list
 
 
-def get_text_matrix(sequence, len_input):
+def get_text_matrix(sequence: np.ndarray, len_input: int) -> np.ndarray:
     """
-    This generates a matrix containing all the sequences
+    Generates a matrix containing all sequences of length len_input
     to be fed into the Network
+
+    Args:
+        sequence (np.array): array to be processed
+        len_input (int): length of model input
     """
     # create empty matrix
     X = np.empty((len(sequence)-len_input, len_input))
@@ -42,7 +55,17 @@
     return X
 
 
-def process_corpus():
+def process_corpus() -> (np.ndarray, dict):
+    """
+    Text preprocessing steps: 1. downloads the corpus; 2. extracts the set of
+    unique chars; 3. maps every char to an int; 4. vectorizes the text;
+    5. processes the vectorized text into a 2D array for model training
+    (a sliding window over the text is produced)
+
+    Returns:
+        X (np.array): 2D array for model training
+        char2idx (dict): dictionary to preserve the char-index mapping
+    """
     page = requests.get(config.CORPUS_URL)
     text = page.text
@@ -53,14 +76,13 @@
     # Map every letter in our alphabet to an int
     char2idx = {char[1]: char[0] for char in enumerate(unique_chars)}
 
-    # Produce a reverse dictionary to go back from int to str later
-    idx2char = {v: k for k, v in char2idx.items()}
-
+    # vectorize text
    encoded_text = numerical_encoding(text, char2idx)
 
+    # Sequence of vectorized chars to 2D array
     X = get_text_matrix(encoded_text, INPUT_LENGTH + 1)
 
-    return X
+    return X, char2idx
 
 
 @tf.function
@@ -79,7 +101,7 @@ def train_on_batch(x, y):
 
 
 def main():
-    X = process_corpus()
+    X, char2idx = process_corpus()
 
     loss_history = []
@@ -119,10 +141,15 @@ def main():
     plt.show()
 
     # Save model
-    gpt.save(os.path.join(os.getcwd(), "saved_models", config.MODEL_NAME))
+    gpt.save(os.path.join(os.getcwd(), "saved_models", config.MODEL_NAME + ".h5"))
 
-    return None
+    # Save char2idx mapping as yaml (chat.py reloads it at inference time)
+    with open(os.path.join(os.getcwd(), "saved_models", f"{config.MODEL_NAME}_char_idx_map.yaml"), "w") as f:
+        yaml.dump(char2idx, f)
+
+    return None
 
 
 if __name__ == "__main__":
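Aside, not part of the diff: the "sliding window" that get_text_matrix() builds (and that process_corpus() returns) can be sketched on a toy sequence; the loop body below is an illustration of the behavior described in the docstring, not the repo's code:

import numpy as np

seq = np.arange(6)            # stand-in for the encoded corpus
len_input = 4
X = np.empty((len(seq) - len_input, len_input))
for i in range(X.shape[0]):
    X[i] = seq[i:i + len_input]
print(X)
# [[0. 1. 2. 3.]
#  [1. 2. 3. 4.]]

Each row is the previous row shifted by one character; since train.py calls get_text_matrix(encoded_text, INPUT_LENGTH + 1), each row presumably holds an INPUT_LENGTH-long model input together with its one-step-shifted targets.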
diff --git a/utils.py b/utils.py
index 597c1b6..735503d 100644
--- a/utils.py
+++ b/utils.py
@@ -12,11 +12,6 @@ class StrMessages:
     With words of lofty prose, I shall adorn
     Thy journey through this CLI program's feed.
 
-    Thou art most welcome to this humble stage,
-    Where bytes and lines doth dance in harmony.
-    Methinks thou seeketh knowledge of this age,
-    And for thy query, I shall thee gladly see.
-
     Inscribe the word "exit," a concise decree,
     Or wield the key combination, Ctrl-C, with glee.
     By this act, thou shalt gracefully conclude thy stay,
@@ -24,9 +19,23 @@ class StrMessages:
     """
 
     MSG_INPUT_TOO_SHORT = """
+    ** InputError: Input length too short **
+
     Pray, kind user, if it be not too much to ask,
     I beseech thee, extend thy prompt, a greater task.
     Yet, one more thing I must humbly request,
     A length of {} characters, at its behest.
     """
+
+    MSG_FAREWELL = """
+    Fair user, thou hast spoken the chosen word,
+    "Exit" resounds, like a song of a departing bird.
+    With brevity, I bid thee a swift farewell,
+    May fortune attend thee, as thou venturest, as well.
+
+    Fare thee well, dear user, with gratitude and grace,
+    Till we meet again, in another time and place.
+    Shakespeare-GPT awaits, shouldst thou return anew,
+    With words of wisdom and verses, old and true.
+    """
+