In [None]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# vocabulary cap passed to load_data as num_words
max_words = 20_000
# sequence-length cutoff; defined here but not applied in the cells shown
max_len = 200

# load_data returns a (sequences, labels) pair for each of the two splits
imdb_train, imdb_test = tf.keras.datasets.imdb.load_data(num_words=max_words)
train_sequences, train_labels = imdb_train
test_sequences, test_labels = imdb_test


def preprocess(sequences, labels, max_len=None):
    """Cast the labels to int32 and optionally drop overly long sequences.

    Args:
        sequences: collection of variable-length token-id sequences
            (a NumPy object array or a plain list).
        labels: NumPy array with one label per sequence.
        max_len: if given, only examples whose sequence has at most
            ``max_len`` elements are kept. ``None`` (the default) keeps
            every example, so two-argument calls behave as before.

    Returns:
        Tuple ``(sequences, labels)``; labels are an int32 NumPy array.
        When filtering, sequences keep their container type
        (array stays array, anything else becomes a list).
    """
    labels = np.asarray(labels).astype(np.int32)
    if max_len is None:
        return sequences, labels

    # one boolean per example: True where the sequence is short enough to keep
    keep = np.fromiter((len(seq) <= max_len for seq in sequences),
                       dtype=bool, count=len(sequences))
    if isinstance(sequences, np.ndarray):
        kept_sequences = sequences[keep]
    else:
        kept_sequences = [seq for seq, flag in zip(sequences, keep) if flag]
    return kept_sequences, labels[keep]

# run both splits through the same preprocessing step
train_sequences, train_labels = preprocess(sequences=train_sequences, labels=train_labels)
test_sequences, test_labels = preprocess(sequences=test_sequences, labels=test_labels)

In [None]:
# get_word_index() returns the raw word -> rank mapping, without any offset.
# load_data() (with its default arguments) shifts every rank by index_from=3 and
# reserves 0 = padding, 1 = start-of-sequence, 2 = out-of-vocabulary.
# the lookup tables apply the same shift so that they line up with the ids
# that actually occur in train_sequences / test_sequences.
vocabulary = tf.keras.datasets.imdb.get_word_index()
index_from = 3  # default of tf.keras.datasets.imdb.load_data

# despite the "char" in the names, the entries are whole words.
# words whose shifted id is >= the num_words passed to load_data show up as 2 (OOV)
# in the loaded sequences.
char_to_ind = {word: rank + index_from for word, rank in vocabulary.items()}
ind_to_char = {ind: word for word, ind in char_to_ind.items()}
ind_to_char.update({0: "<PAD>", 1: "<START>", 2: "<OOV>"})

In [None]:
# remember this? it doesn't work: the sequences have different lengths, so they
# cannot be converted into one rectangular tensor.
# the error is caught and printed so that the failure stays visible while
# "Restart & Run All" still gets past this cell.
try:
    train_data = tf.data.Dataset.from_tensor_slices((train_sequences, train_labels))
except (ValueError, TypeError) as error:
    print(f"from_tensor_slices failed: {error}")

In [None]:
# we can create a dataset from a python generator. first, we have to write the generator
# this is a very simple one, but we could execute arbitrary python code in here
# (say, loading files from disk and preparing the loaded inputs somehow)
def gen():
    """Yield one (sequence, label) pair per training example, in order."""
    yield from zip(train_sequences, train_labels)

In [None]:
# from_generator needs a description of what gen() yields ("Tensor Specification"):
# here a 1-D int32 sequence of unknown length plus a scalar int32 label.
#
# a regular .batch wouldn't work because the inputs have different lengths.
# padded_batch automatically pads all elements in the batch to the longest length
# per dimension.
# different target shapes and padding values other than 0 can be specified as well.
# padding is always "post"
train_data = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
).padded_batch(32)

In [None]:
# print the shapes of the first few batches only. .take() bounds the loop, so the
# cell finishes on its own instead of waiting for keyboard input after every batch
# (which blocks "Run All" and fails when the notebook is executed non-interactively).
for sequence, label in train_data.take(10):
    print(sequence.shape, label.shape)

In [None]:
# alternatively, we can use bucketing. the idea is to define buckets for specific
# sequence lengths, and put all sequences in their corresponding bucket.
# when a batch is requested, first a bucket is selected and then all elements of
# the batch are taken from this bucket.
# this guarantees that all elements in a batch are roughly the same length,
# minimizing the amount of padding.

# here is an example with bucket boundaries in steps of 50 (50, 100, ..., 500).
# all sequences of length 500 and above end up in the same bucket. same for
# sequences below length 50.
# do note that I by no means claim that this is a "good" bucketing. play around with it!
buckets = list(range(50, 501, 50))
# one batch size per bucket; there is one more bucket than there are boundaries
bucket_batch_size = [32 for _ in range(len(buckets) + 1)]

# as before, we have to tell TF what to expect from the generator ("Tensor Specification").
# the length function maps one (sequence, label) element to the sequence's length.
train_data = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
).bucket_by_sequence_length(
    element_length_func=lambda tokens, target: tf.shape(tokens)[0],
    bucket_boundaries=buckets,
    bucket_batch_sizes=bucket_batch_size,
)

In [None]:
# NOTE!!
# you should probably still remove very long sequences (longer than some cutoff,
# e.g. the max_len defined near the top) before converting to a dataset

In [None]:
# compare the batch shapes with the padded_batch example. there, batches are
# often length 800 or so because the longest sequence in the batch happened to
# have that length.
# with bucketing, we get many much smaller batches, meaning more efficient training.
# only the first few batches are printed: .take() bounds the loop, so the cell
# finishes on its own instead of waiting for keyboard input after every batch.
for sequence, label in train_data.take(10):
    print(sequence.shape, label.shape)

In [None]:
# a very simple toy example of a keras lstm model.
# the "hidden dimensions" are just randomly chosen.
# you probably don't want to use a hidden size of 12 =) (but maybe it's actually really good?)
model = tf.keras.Sequential()

# the embedding comes first and replaces one-hot vectors.
# mask_zero=True prevents computations on padded time steps.
model.add(tf.keras.layers.Embedding(input_dim=max_words, output_dim=20, mask_zero=True))

# then an arbitrary number of RNN layers.
# deeper RNN layers take the state sequence of the layer before as their input sequence,
# so all layers except the last one should use return_sequences=True
model.add(tf.keras.layers.LSTM(units=12, return_sequences=True))
model.add(tf.keras.layers.LSTM(units=15))

# finally, a Dense layer for the output, since the output computation is *not*
# included in the RNN cells; all cells provided by Keras only compute the states
model.add(tf.keras.layers.Dense(units=1))


# FYI, the LSTM layer below is equivalent to wrapping an LSTMCell in a generic RNN layer:
#     rnn_cell = tf.keras.layers.LSTMCell(12)
#     rnn = tf.keras.layers.RNN(rnn_cell, return_sequences=False)
# the LSTM layer can use a much more efficient implementation, it will be SOOO much faster.
# try it yourself!
rnn = tf.keras.layers.LSTM(units=12, return_sequences=False)

In [None]:
# calling RNN layers is easy!
# fetch one batch explicitly instead of relying on a loop variable that happens to be
# left over from an earlier cell.
example_sequences, example_labels = next(iter(train_data))

# one_hot appends a trailing axis of size max_words to the batch of word ids
one_hot_batch = tf.one_hot(example_sequences, depth=max_words)
rnn(one_hot_batch)