forked from scoutbee/pytorch-nlp-notebooks
add transformer modules and notebook
1 parent 4a99292 · commit b53490e
Showing 13 changed files with 1,208 additions and 0 deletions.
(Some of the changed files are not rendered in the diff view; one file was renamed without changes and one new file is empty.)
@@ -0,0 +1,214 @@
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import wordpunct_tokenize
from torch import optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Subset


def tokenize(text):
    """Turn text into discrete tokens.
    Remove tokens that are not words.
    """
    text = text.lower()
    tokens = wordpunct_tokenize(text)

    # Only keep words
    tokens = [token for token in tokens
              if all(char.isalpha() for char in token)]

    return tokens


class EnglishFrenchTranslations(Dataset):
    def __init__(self, path, max_vocab, max_seq_len):
        self.max_vocab = max_vocab

        # Extra tokens to add
        self.padding_token = '<PAD>'
        self.start_of_sequence_token = '<SOS>'
        self.end_of_sequence_token = '<EOS>'
        self.unknown_word_token = '<UNK>'
        self.max_seq_len = max_seq_len

        # Helper function
        self.flatten = lambda x: [sublst for lst in x for sublst in lst]

        # Load the data into a DataFrame
        df = pd.read_csv(path, names=['english', 'french'], sep='\t')

        # Filter out sequences that are too long
        df = self.filter_seq_len(df, max_len=self.max_seq_len)

        # Tokenize inputs (English) and targets (French)
        self.tokenize_df(df)

        # To reduce computational complexity, replace rare words with <UNK>
        self.replace_rare_tokens(df)

        # Prepare variables with mappings of tokens to indices
        self.create_token2idx(df)

        # Remove sequences with mostly <UNK>
        df = self.remove_mostly_unk(df)

        # Every sequence (input and target) should start with <SOS>
        # and end with <EOS>
        self.add_start_and_end_to_tokens(df)

        # Convert tokens to indices
        self.tokens_to_indices(df)

    def __getitem__(self, idx):
        """Return example at index idx."""
        return self.indices_pairs[idx][0], self.indices_pairs[idx][1]

    def tokenize_df(self, df):
        """Turn inputs and targets into tokens."""
        df['tokens_inputs'] = df.english.apply(tokenize)
        df['tokens_targets'] = df.french.apply(tokenize)

    def replace_rare_tokens(self, df):
        """Replace rare tokens with <UNK>."""
        common_tokens_inputs = self.get_most_common_tokens(
            df.tokens_inputs.tolist(),
        )
        common_tokens_targets = self.get_most_common_tokens(
            df.tokens_targets.tolist(),
        )

        df.loc[:, 'tokens_inputs'] = df.tokens_inputs.apply(
            lambda tokens: [token if token in common_tokens_inputs
                            else self.unknown_word_token for token in tokens]
        )
        df.loc[:, 'tokens_targets'] = df.tokens_targets.apply(
            lambda tokens: [token if token in common_tokens_targets
                            else self.unknown_word_token for token in tokens]
        )

    def get_most_common_tokens(self, tokens_series):
        """Return the max_vocab most common tokens."""
        all_tokens = self.flatten(tokens_series)
        # Subtract 4 for <PAD>, <SOS>, <EOS>, and <UNK>
        common_tokens = set(list(zip(*Counter(all_tokens).most_common(
            self.max_vocab - 4)))[0])
        return common_tokens

    def remove_mostly_unk(self, df, threshold=0.99):
        """Remove sequences whose fraction of <UNK> tokens is at least (1 - threshold)."""
        calculate_ratio = (
            lambda tokens: sum(1 for token in tokens if token != '<UNK>')
            / len(tokens) > threshold
        )
        df = df[df.tokens_inputs.apply(calculate_ratio)]
        df = df[df.tokens_targets.apply(calculate_ratio)]
        return df

    def filter_seq_len(self, df, max_len=100):
        """Keep only rows whose English and French sentences are shorter than max_len words (approximated by space count)."""
        mask = (df['english'].str.count(' ') < max_len) & (df['french'].str.count(' ') < max_len)
        return df.loc[mask]

    def create_token2idx(self, df):
        """Create variables with mappings from tokens to indices."""
        unique_tokens_inputs = set(self.flatten(df.tokens_inputs))
        unique_tokens_targets = set(self.flatten(df.tokens_targets))

        # Make sure the special tokens are not already in the vocabularies
        for token in reversed([
            self.padding_token,
            self.start_of_sequence_token,
            self.end_of_sequence_token,
            self.unknown_word_token,
        ]):
            if token in unique_tokens_inputs:
                unique_tokens_inputs.remove(token)
            if token in unique_tokens_targets:
                unique_tokens_targets.remove(token)

        unique_tokens_inputs = sorted(list(unique_tokens_inputs))
        unique_tokens_targets = sorted(list(unique_tokens_targets))

        # Add <PAD>, <SOS>, <EOS>, and <UNK> tokens at the front of the vocabularies
        for token in reversed([
            self.padding_token,
            self.start_of_sequence_token,
            self.end_of_sequence_token,
            self.unknown_word_token,
        ]):
            unique_tokens_inputs = [token] + unique_tokens_inputs
            unique_tokens_targets = [token] + unique_tokens_targets

        self.token2idx_inputs = {token: idx for idx, token
                                 in enumerate(unique_tokens_inputs)}
        self.idx2token_inputs = {idx: token for token, idx
                                 in self.token2idx_inputs.items()}

        self.token2idx_targets = {token: idx for idx, token
                                  in enumerate(unique_tokens_targets)}
        self.idx2token_targets = {idx: token for token, idx
                                  in self.token2idx_targets.items()}

    def add_start_and_end_to_tokens(self, df):
        """Prepend <SOS> and append <EOS> to every input and target sequence."""
        df.loc[:, 'tokens_inputs'] = (
            [self.start_of_sequence_token]
            + df.tokens_inputs
            + [self.end_of_sequence_token]
        )
        df.loc[:, 'tokens_targets'] = (
            [self.start_of_sequence_token]
            + df.tokens_targets
            + [self.end_of_sequence_token]
        )

    def tokens_to_indices(self, df):
        """Convert tokens to indices."""
        df['indices_inputs'] = df.tokens_inputs.apply(
            lambda tokens: [self.token2idx_inputs[token] for token in tokens])
        df['indices_targets'] = df.tokens_targets.apply(
            lambda tokens: [self.token2idx_targets[token] for token in tokens])

        self.indices_pairs = list(zip(df.indices_inputs, df.indices_targets))

    def __len__(self):
        return len(self.indices_pairs)


def collate(batch, src_pad, trg_pad, device):
    """Pad and batch (input, target) index sequences and move them to device."""
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]

    # Pad sequences so that they are all the same length (within one minibatch)
    padded_inputs = pad_sequence(inputs, padding_value=src_pad, batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=trg_pad, batch_first=True)

    # Sort by length for CUDA optimizations
    lengths = torch.LongTensor([len(x) for x in inputs])
    lengths, permutation = lengths.sort(dim=0, descending=True)

    return padded_inputs[permutation].to(device), padded_targets[permutation].to(device), lengths.to(device)


def no_peak_mask(size):
    """Upper-triangular mask that hides future positions from decoder self-attention."""
    mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    mask = Variable(torch.from_numpy(mask) == 0)
    return mask


def create_masks(src, trg, src_pad_idx, trg_pad_idx):
    """Build a padding mask for the source and a combined padding/no-peak mask for the target."""
    src_mask = (src != src_pad_idx).unsqueeze(-2)
    if trg is not None:
        trg_mask = (trg != trg_pad_idx).unsqueeze(-2)
        size = trg.size(1)  # get seq_len for matrix
        np_mask = no_peak_mask(size).to(trg_mask.device)
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    return src_mask, trg_mask
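
Taken together, the dataset class, the collate function, and the mask helpers above are intended to feed a Transformer training loop. A minimal usage sketch follows; the data path, vocabulary size, sequence limit, and batch size are illustrative assumptions, not values taken from this commit.

from functools import partial

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hypothetical configuration -- adjust to the actual data file and hardware
dataset = EnglishFrenchTranslations('data/eng-fra.txt', max_vocab=10000, max_seq_len=50)
src_pad_idx = dataset.token2idx_inputs[dataset.padding_token]
trg_pad_idx = dataset.token2idx_targets[dataset.padding_token]

loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=partial(collate, src_pad=src_pad_idx, trg_pad=trg_pad_idx, device=device),
)

for inputs, targets, lengths in loader:
    src_mask, trg_mask = create_masks(inputs, targets, src_pad_idx, trg_pad_idx)
    # inputs: (batch, src_len), targets: (batch, trg_len)
    # src_mask: (batch, 1, src_len), trg_mask: (batch, trg_len, trg_len)
    break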
@@ -0,0 +1,39 @@
import torch
import torch.nn as nn
import math
from torch.autograd import Variable


class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        x = self.embed(x)
        # make embeddings relatively larger
        return x * math.sqrt(self.d_model)


class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # create constant 'pe' matrix
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size()[-2]
        pe = Variable(self.pe[:, :seq_len], requires_grad=False)
        if x.is_cuda:
            # .cuda() is not in-place; assign the result so the encoding is actually moved
            pe = pe.cuda()
        x = x + pe
        return self.dropout(x)
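
As a quick shape check, the two modules compose as follows; the vocabulary size, model width, batch size, and sequence length here are arbitrary assumptions for illustration.

vocab_size, d_model = 10000, 512  # hypothetical values
embed = Embedder(vocab_size, d_model)
pos_enc = PositionalEncoder(d_model, max_seq_len=200)

tokens = torch.randint(0, vocab_size, (8, 25))  # (batch, seq_len) token indices
x = pos_enc(embed(tokens))                      # (8, 25, 512): scaled embeddings plus positional encodings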
@@ -0,0 +1,52 @@
import torch
import torch.nn as nn

from .sublayer import FeedForward, MultiHeadAttention, Norm


class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        # encoder self-attention
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Build a decoder layer with two multi-head attention sub-layers and one feed-forward sub-layer."""
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        # decoder self-attention with target (no-peak) masking
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        # encoder-decoder attention: queries come from the decoder output,
        # keys and values come from the encoder output
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x
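
The full Encoder and Decoder modules that stack these layers are not part of this diff, so the following is only a sketch of how N encoder layers are typically composed. The names get_clones and Encoder and the constructor signature are assumptions for illustration, and the sketch assumes Embedder, PositionalEncoder, EncoderLayer, and Norm are importable from the modules above.

import copy

import torch.nn as nn


def get_clones(module, N):
    # N independent copies of a layer, each with its own parameters
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    # Illustrative encoder stack: embeddings + positional encoding + N encoder layers
    def __init__(self, vocab_size, d_model, N, heads, dropout=0.1):
        super().__init__()
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, src_mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, src_mask)
        return self.norm(x)

A decoder stack would mirror this structure with DecoderLayer, additionally threading the encoder outputs and the source mask through each layer.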