Commit

add transformer modules and notebook
jeffrey-hsu committed Nov 13, 2019
1 parent 4a99292 commit b53490e
Showing 13 changed files with 1,208 additions and 0 deletions.
496 changes: 496 additions & 0 deletions 6_transformer_translation.ipynb

Large diffs are not rendered by default.

File renamed without changes.
163 changes: 163 additions & 0 deletions images/beam-search.svg
Binary file added images/encoder_decoder_stack.png
Binary file added images/multi_head_attention.png
Binary file added images/scaled_dot_product_attention.png
Binary file modified images/transformer.png
Empty file added transformer/__init__.py
214 changes: 214 additions & 0 deletions transformer/batch.py
@@ -0,0 +1,214 @@
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import wordpunct_tokenize
from torch import optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Subset


def tokenize(text):
"""Turn text into discrete tokens.
Remove tokens that are not words.
"""
text = text.lower()
tokens = wordpunct_tokenize(text)

# Only keep words
tokens = [token for token in tokens
if all(char.isalpha() for char in token)]

return tokens


class EnglishFrenchTranslations(Dataset):
def __init__(self, path, max_vocab, max_seq_len):
self.max_vocab = max_vocab

# Extra tokens to add
self.padding_token = '<PAD>'
self.start_of_sequence_token = '<SOS>'
self.end_of_sequence_token = '<EOS>'
self.unknown_word_token = '<UNK>'
self.max_seq_len = max_seq_len

        # Helper to flatten a list of token lists into a single list of tokens
        self.flatten = lambda lists: [token for sublist in lists for token in sublist]

# Load the data into a DataFrame
df = pd.read_csv(path, names=['english', 'french'], sep='\t')

        # Filter out sequences that are too long
df = self.filter_seq_len(df, max_len=self.max_seq_len)

# Tokenize inputs (English) and targets (French)
self.tokenize_df(df)

# To reduce computational complexity, replace rare words with <UNK>
self.replace_rare_tokens(df)

# Prepare variables with mappings of tokens to indices
self.create_token2idx(df)

# Remove sequences with mostly <UNK>
df = self.remove_mostly_unk(df)

# Every sequence (input and target) should start with <SOS>
# and end with <EOS>
self.add_start_and_end_to_tokens(df)

# Convert tokens to indices
self.tokens_to_indices(df)

def __getitem__(self, idx):
"""Return example at index idx."""
return self.indices_pairs[idx][0], self.indices_pairs[idx][1]

def tokenize_df(self, df):
"""Turn inputs and targets into tokens."""
df['tokens_inputs'] = df.english.apply(tokenize)
df['tokens_targets'] = df.french.apply(tokenize)

def replace_rare_tokens(self, df):
"""Replace rare tokens with <UNK>."""
common_tokens_inputs = self.get_most_common_tokens(
df.tokens_inputs.tolist(),
)
common_tokens_targets = self.get_most_common_tokens(
df.tokens_targets.tolist(),
)

df.loc[:, 'tokens_inputs'] = df.tokens_inputs.apply(
lambda tokens: [token if token in common_tokens_inputs
else self.unknown_word_token for token in tokens]
)
df.loc[:, 'tokens_targets'] = df.tokens_targets.apply(
lambda tokens: [token if token in common_tokens_targets
else self.unknown_word_token for token in tokens]
)

def get_most_common_tokens(self, tokens_series):
"""Return the max_vocab most common tokens."""
all_tokens = self.flatten(tokens_series)
        # Subtract 4 to leave room for <PAD>, <SOS>, <EOS>, and <UNK>
common_tokens = set(list(zip(*Counter(all_tokens).most_common(
self.max_vocab - 4)))[0])
return common_tokens

    def remove_mostly_unk(self, df, threshold=0.99):
        """Keep only sequences in which more than `threshold` of the tokens are known (not <UNK>)."""
        mostly_known = (
            lambda tokens: sum(1 for token in tokens
                               if token != self.unknown_word_token)
            / len(tokens) > threshold
        )
        df = df[df.tokens_inputs.apply(mostly_known)]
        df = df[df.tokens_targets.apply(mostly_known)]
        return df

    def filter_seq_len(self, df, max_len=100):
        """Keep only pairs whose English and French texts each contain fewer than max_len spaces (a rough word-count cap)."""
        mask = (
            (df['english'].str.count(' ') < max_len)
            & (df['french'].str.count(' ') < max_len)
        )
        return df.loc[mask]

def create_token2idx(self, df):
"""Create variables with mappings from tokens to indices."""
unique_tokens_inputs = set(self.flatten(df.tokens_inputs))
unique_tokens_targets = set(self.flatten(df.tokens_targets))

for token in reversed([
self.padding_token,
self.start_of_sequence_token,
self.end_of_sequence_token,
self.unknown_word_token,
]):
if token in unique_tokens_inputs:
unique_tokens_inputs.remove(token)
if token in unique_tokens_targets:
unique_tokens_targets.remove(token)

unique_tokens_inputs = sorted(list(unique_tokens_inputs))
unique_tokens_targets = sorted(list(unique_tokens_targets))

        # Re-add <PAD>, <SOS>, <EOS>, and <UNK> at the front of both vocabularies
        for token in reversed([
            self.padding_token,
            self.start_of_sequence_token,
            self.end_of_sequence_token,
            self.unknown_word_token,
        ]):
            unique_tokens_inputs = [token] + unique_tokens_inputs
            unique_tokens_targets = [token] + unique_tokens_targets

self.token2idx_inputs = {token: idx for idx, token
in enumerate(unique_tokens_inputs)}
self.idx2token_inputs = {idx: token for token, idx
in self.token2idx_inputs.items()}

self.token2idx_targets = {token: idx for idx, token
in enumerate(unique_tokens_targets)}
self.idx2token_targets = {idx: token for token, idx
in self.token2idx_targets.items()}

    def add_start_and_end_to_tokens(self, df):
        """Prepend <SOS> and append <EOS> to every input and target sequence."""
        df.loc[:, 'tokens_inputs'] = df.tokens_inputs.apply(
            lambda tokens: [self.start_of_sequence_token]
            + tokens
            + [self.end_of_sequence_token]
        )
        df.loc[:, 'tokens_targets'] = df.tokens_targets.apply(
            lambda tokens: [self.start_of_sequence_token]
            + tokens
            + [self.end_of_sequence_token]
        )

def tokens_to_indices(self, df):
"""Convert tokens to indices."""
df['indices_inputs'] = df.tokens_inputs.apply(
lambda tokens: [self.token2idx_inputs[token] for token in tokens])
df['indices_targets'] = df.tokens_targets.apply(
lambda tokens: [self.token2idx_targets[token] for token in tokens])

self.indices_pairs = list(zip(df.indices_inputs, df.indices_targets))

def __len__(self):
return len(self.indices_pairs)


def collate(batch, src_pad, trg_pad, device):
    """Pad, length-sort, and move one minibatch of (input, target) index sequences to the device."""
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]

    # Pad sequences so that they are all the same length (within one minibatch)
padded_inputs = pad_sequence(inputs, padding_value=src_pad, batch_first=True)
padded_targets = pad_sequence(targets, padding_value=trg_pad, batch_first=True)

# Sort by length for CUDA optimizations
lengths = torch.LongTensor([len(x) for x in inputs])
lengths, permutation = lengths.sort(dim=0, descending=True)

return padded_inputs[permutation].to(device), padded_targets[permutation].to(device), lengths.to(device)


def no_peak_mask(size):
    """Upper-triangular mask that blocks attention to positions later in the sequence (no peeking ahead)."""
    mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    mask = torch.from_numpy(mask) == 0
    return mask


def create_masks(src, trg, src_pad_idx, trg_pad_idx):
    """Build the source padding mask and the combined target padding + no-peek mask."""
    src_mask = (src != src_pad_idx).unsqueeze(-2)
if trg is not None:
trg_mask = (trg != trg_pad_idx).unsqueeze(-2)
size = trg.size(1) # get seq_len for matrix
np_mask = no_peak_mask(size).to(trg_mask.device)
trg_mask = trg_mask & np_mask
else:
trg_mask = None
return src_mask, trg_mask
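
The notebook that drives these helpers (6_transformer_translation.ipynb) is not rendered above, so the following is only a minimal sketch of how the pieces in batch.py are typically wired together. The data path, vocabulary size, batch size, and the right-shifting of the target are illustrative assumptions, not values taken from this commit.

from functools import partial

import torch
from torch.utils.data import DataLoader

from transformer.batch import EnglishFrenchTranslations, collate, create_masks

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hypothetical tab-separated English/French file; adjust the path to your copy of the data
dataset = EnglishFrenchTranslations('data/eng-fra.txt', max_vocab=10000, max_seq_len=40)
src_pad = dataset.token2idx_inputs[dataset.padding_token]
trg_pad = dataset.token2idx_targets[dataset.padding_token]

loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=partial(collate, src_pad=src_pad, trg_pad=trg_pad, device=device),
)

for src, trg, lengths in loader:
    # Feed the target shifted right: position t is predicted from positions < t
    trg_input = trg[:, :-1]
    src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad)
    break
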
39 changes: 39 additions & 0 deletions transformer/embed.py
@@ -0,0 +1,39 @@
import torch
import torch.nn as nn
import math
from torch.autograd import Variable


class Embedder(nn.Module):
def __init__(self, vocab_size, d_model):
super().__init__()
self.d_model = d_model
self.embed = nn.Embedding(vocab_size, d_model)

def forward(self, x):
x = self.embed(x)
        # Scale embeddings by sqrt(d_model) so they are not dwarfed by the positional encodings
        return x * math.sqrt(self.d_model)


class PositionalEncoder(nn.Module):
def __init__(self, d_model, max_seq_len=200, dropout=0.1):
super().__init__()
self.d_model = d_model
self.dropout = nn.Dropout(dropout)
        # Create the constant positional-encoding matrix 'pe' once, up front:
        # sine on even indices, cosine on odd indices
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)  # add a batch dimension: (1, max_seq_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(-2)
        # 'pe' is a registered buffer, so it is already on the same device as the module
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)
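
A quick shape check for how Embedder and PositionalEncoder compose; the sizes below are assumptions chosen for illustration, not values from the notebook.

import torch

from transformer.embed import Embedder, PositionalEncoder

vocab_size, d_model = 10000, 512          # illustrative sizes only

embed = Embedder(vocab_size, d_model)
pos_enc = PositionalEncoder(d_model, max_seq_len=200, dropout=0.1)

tokens = torch.randint(0, vocab_size, (2, 15))   # (batch, seq_len) of token indices
x = pos_enc(embed(tokens))                       # (2, 15, 512): scaled embeddings + positions
print(x.shape)
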
52 changes: 52 additions & 0 deletions transformer/layer.py
@@ -0,0 +1,52 @@
import torch
import torch.nn as nn

from .sublayer import FeedForward, MultiHeadAttention, Norm


class EncoderLayer(nn.Module):
    """Encoder layer: one self-attention sublayer and one feed-forward sublayer, each with a pre-norm residual connection."""
    def __init__(self, d_model, heads, dropout=0.1):
super().__init__()
self.norm_1 = Norm(d_model)
self.norm_2 = Norm(d_model)
self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = FeedForward(d_model, dropout=dropout)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)

def forward(self, x, mask):
x2 = self.norm_1(x)
# encoder self-attention
x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
x2 = self.norm_2(x)
x = x + self.dropout_2(self.ff(x2))
return x


class DecoderLayer(nn.Module):
    """Decoder layer: two multi-head attention sublayers (masked self-attention and encoder-decoder attention) and one feed-forward sublayer, each with a pre-norm residual connection."""
def __init__(self, d_model, heads, dropout=0.1):
super().__init__()
self.norm_1 = Norm(d_model)
self.norm_2 = Norm(d_model)
self.norm_3 = Norm(d_model)

self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
self.dropout_3 = nn.Dropout(dropout)

self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = FeedForward(d_model, dropout=dropout)

def forward(self, x, e_outputs, src_mask, trg_mask):
x2 = self.norm_1(x)
# decoder self-attention with target masking
x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
x2 = self.norm_2(x)
        # encoder-decoder attention:
        # queries come from the decoder state, keys and values from the encoder output
x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
x2 = self.norm_3(x)
x = x + self.dropout_3(self.ff(x2))
return x
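
The full encoder/decoder stack and the training notebook are among the files not rendered on this page, so the class below is only a sketch of how EncoderLayer is conventionally stacked on top of Embedder and PositionalEncoder; the Encoder class and the clone_layers helper are assumptions for illustration, not code from this commit.

import copy

import torch.nn as nn

from transformer.embed import Embedder, PositionalEncoder
from transformer.layer import EncoderLayer
from transformer.sublayer import Norm


def clone_layers(module, n):
    """Return n independent copies of a layer."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


class Encoder(nn.Module):
    """Embed the source tokens, add positional encodings, and apply n encoder layers."""
    def __init__(self, vocab_size, d_model, n_layers, heads, dropout=0.1):
        super().__init__()
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = clone_layers(EncoderLayer(d_model, heads, dropout), n_layers)
        self.norm = Norm(d_model)

    def forward(self, src, src_mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, src_mask)
        return self.norm(x)
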