Added project 9 and 10 #80

Open · wants to merge 1 commit into master
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ keras-sign/sign-language
**/.ipynb_checkpoints
**/glove*
**/aclImdb/
.vscode
2 changes: 1 addition & 1 deletion README.md
@@ -14,7 +14,7 @@ These are specific bite-sized projects to learn an aspect of deep learning, star
| Predict the weather with an RNN | [projects/6-rnn-timeseries](https://github.com/lukas/ml-class/tree/master/projects/6-rnn-timeseries) | [Recurrent Neural Networks](https://www.youtube.com/watch?v=8lbGjKhrJOo) |
| Build a text generator | [projects/7-text-generation](https://github.com/lukas/ml-class/tree/master/projects/7-text-generation) | [Text Generation using LSTMs and GRUs](https://www.youtube.com/watch?v=4F69m3krMHw) |
| Build a sentiment classifier on Amazon reviews. | [projects/8-text-classification](https://github.com/lukas/ml-class/tree/master/projects/8-text-classification) | [Text Classification using CNNs](https://www.youtube.com/watch?v=8YsZXTpFRO0) |
| | | [Hybrid LSTM/CNNs](https://www.youtube.com/watch?v=NysY9FN9Uac) |
| | | [Hybrid LSTM/CNNs](https://www.youtube.com/watch?v=NysY9FN9Uac) |
| | | [Seq2seq Models](https://www.youtube.com/watch?v=MqugtGD605k) |
| | | [Transfer Learning](https://www.youtube.com/watch?v=vbhEnEbj3JM) |
| | | [One Shot Learning](https://www.youtube.com/watch?v=H4MPIWX6ftE) |
136 changes: 136 additions & 0 deletions projects/10-seq2seq/train.py
@@ -0,0 +1,136 @@
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, RepeatVector, Dense
import numpy as np
import wandb
from wandb.keras import WandbCallback

wandb.init()
config = wandb.config

class CharacterTable(object):
"""Given a set of characters:
+ Encode them to a one hot integer representation
+ Decode the one hot integer representation to their character output
+ Decode a vector of probabilities to their character output
"""
def __init__(self, chars):
"""Initialize character table.
# Arguments
chars: Characters that can appear in the input.
"""
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

def encode(self, C, num_rows):
"""One hot encode given string C.
# Arguments
num_rows: Number of rows in the returned one hot encoding. This is
used to keep the # of rows for each data the same.
"""
x = np.zeros((num_rows, len(self.chars)))
for i, c in enumerate(C):
x[i, self.char_indices[c]] = 1
return x

def decode(self, x, calc_argmax=True):
if calc_argmax:
x = x.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in x)
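
# Example: with chars = '0123456789+- ' (13 characters), ctable.encode('12 ', 3)
# returns a 3x13 one-hot matrix, and ctable.decode() of that matrix gives back '12 '.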

# Parameters for the model and dataset.
config.training_size = 50000
config.digits = 5
config.hidden_size = 128
config.batch_size = 128

# Maximum length of the input is 'int-int' (e.g., '34567-89'). Maximum length
# of each int is config.digits.
maxlen = config.digits + 1 + config.digits

# All the digits, the '+' and '-' signs, and a space for padding.
chars = '0123456789+- '
ctable = CharacterTable(chars)

questions = []
expected = []
seen = set()
print('Generating data...')
while len(questions) < config.training_size:
f = lambda: int(''.join(np.random.choice(list('0123456789'))
for i in range(np.random.randint(1, config.digits + 1))))
a, b = f(), f()
    # Skip any subtraction questions we have already seen. Because the key is
    # the sorted pair, the mirrored question (b, a) is also skipped once (a, b)
    # has been used.
key = tuple(sorted((a, b)))
if key in seen:
continue
seen.add(key)
# Pad the data with spaces such that it is always MAXLEN.
q = '{}-{}'.format(a, b)
query = q + ' ' * (maxlen - len(q))
ans = str(a - b)
# Answers can be of maximum size DIGITS + 1.
ans += ' ' * (config.digits + 1 - len(ans))

questions.append(query)
expected.append(ans)

print('Total subtraction questions:', len(questions))
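
# Example row: a=345, b=78 gives the query '345-78     ' (padded with spaces to
# maxlen=11) and the expected answer '267   ' (padded to config.digits + 1 = 6
# characters).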

print('Vectorization...')
x = np.zeros((len(questions), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(questions), config.digits + 1, len(chars)), dtype=bool)
for i, sentence in enumerate(questions):
x[i] = ctable.encode(sentence, maxlen)
for i, sentence in enumerate(expected):
y[i] = ctable.encode(sentence, config.digits + 1)
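
# x has shape (training_size, maxlen, len(chars)) = (50000, 11, 13) and
# y has shape (training_size, digits + 1, len(chars)) = (50000, 6, 13).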

# Shuffle (x, y) in unison as the later parts of x will almost all be larger
# digits.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]
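
# Encoder-decoder (seq2seq) architecture:
#   1. An encoder LSTM reads the one-hot question and summarizes it in its
#      final hidden state.
#   2. RepeatVector copies that state digits + 1 times, once per output
#      character position.
#   3. A decoder LSTM with return_sequences=True emits one vector per output
#      position.
#   4. A TimeDistributed softmax Dense layer turns each vector into a
#      distribution over the 13 characters.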

model = Sequential()
model.add(LSTM(config.hidden_size, input_shape=(maxlen, len(chars))))
model.add(RepeatVector(config.digits + 1))
model.add(LSTM(config.hidden_size, return_sequences=True))
model.add(TimeDistributed(Dense(len(chars), activation='softmax')))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()

# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 200):
print()
print('-' * 50)
print('Iteration', iteration)
model.fit(x_train, y_train,
batch_size=config.batch_size,
epochs=1,
              validation_data=(x_val, y_val), callbacks=[WandbCallback()])
# Select 10 samples from the validation set at random so we can visualize
# errors.
for i in range(10):
ind = np.random.randint(0, len(x_val))
rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        # Take the argmax over the softmax output to get predicted character indices.
        preds = np.argmax(model.predict(rowx, verbose=0), axis=-1)
q = ctable.decode(rowx[0])
correct = ctable.decode(rowy[0])
guess = ctable.decode(preds[0], calc_argmax=False)
print('Q', q, end=' ')
print('T', correct, end=' ')
if correct == guess:
print('☑', end=' ')
else:
print('☒', end=' ')
print(guess)
29 changes: 29 additions & 0 deletions projects/9-lstm-classifier/download-imdb.py
@@ -0,0 +1,29 @@
import os
import shutil
import sys
import tempfile
import urllib.request


IMDB_URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
OUTPUT_NAME = "aclImdb"

def main():
download_and_extract_archive()


def download_and_extract_archive():
if os.path.exists(OUTPUT_NAME):
print("Imdb dataset download target exists at " + OUTPUT_NAME)
else:
        with urllib.request.urlopen(IMDB_URL) as response:
            with tempfile.NamedTemporaryFile() as temp_archive:
                temp_archive.write(response.read())
                # Flush so the full archive is on disk before unpacking.
                temp_archive.flush()
                shutil.unpack_archive(
                    temp_archive.name, extract_dir=".", format="gztar")

return


if __name__ == "__main__":
sys.exit(main())
50 changes: 50 additions & 0 deletions projects/9-lstm-classifier/imdb-lstm.py
@@ -0,0 +1,50 @@
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
import wandb
from wandb.keras import WandbCallback

import imdb

wandb.init()
config = wandb.config

# set parameters:
config.vocab_size = 1000
config.maxlen = 300
config.batch_size = 32
config.embedding_dims = 50
config.filters = 10
config.kernel_size = 3
config.hidden_dims = 10
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

tokenizer = Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
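# The word index was built from the training reviews above; texts_to_sequences
# turns each review into a list of integer word indices (only the
# config.vocab_size most frequent words are kept).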
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
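
# X_train and X_test are now integer matrices of shape (num_reviews, maxlen);
# each row is a zero-padded (or truncated) sequence of word indices.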

model = Sequential()
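# Embedding maps each word index to a config.embedding_dims-dimensional vector,
# the LSTM reads the embedded sequence and keeps its final hidden state, and a
# single sigmoid unit turns that state into the probability of a positive review.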
model.add(Embedding(config.vocab_size,
config.embedding_dims,
input_length=config.maxlen))
model.add(LSTM(config.hidden_dims, activation="sigmoid"))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])

model.fit(X_train, y_train,
batch_size=config.batch_size,
epochs=config.epochs,
validation_data=(X_test, y_test), callbacks=[WandbCallback()])
32 changes: 32 additions & 0 deletions projects/9-lstm-classifier/imdb.py
@@ -0,0 +1,32 @@
import numpy as np
import os

def load_imdb():
    """Load the aclImdb sentiment dataset from disk.

    The archive unpacks to aclImdb/{train,test}/{pos,neg}/, with 12,500
    plain-text reviews (one per .txt file) in each folder.
    """
    def read_reviews(split, label_dir):
        path = os.path.join('aclImdb', split, label_dir)
        texts = []
        for fname in sorted(os.listdir(path)):
            if fname.endswith('.txt'):
                with open(os.path.join(path, fname), encoding='utf-8') as f:
                    texts.append(f.read())
        return texts

    train_pos = read_reviews('train', 'pos')
    train_neg = read_reviews('train', 'neg')
    X_train = train_pos + train_neg
    y_train = np.array([1] * len(train_pos) + [0] * len(train_neg), dtype=np.int32)

    test_pos = read_reviews('test', 'pos')
    test_neg = read_reviews('test', 'neg')
    X_test = test_pos + test_neg
    y_test = np.array([1] * len(test_pos) + [0] * len(test_neg), dtype=np.int32)

    return (X_train, y_train), (X_test, y_test)