In [0]:
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

Sample Text

In [0]:
# Source text: a four-line nursery rhyme that serves as the whole training corpus.
# Each line ends with an explicit "\n" escape in addition to the physical line
# break, and continuation lines are indented, so the string contains extra
# whitespace between the verses.
data = """ Jack and Jill went up the hill\n
        To fetch a pail of water\n
        Jack fell down and broke his crown\n
        And Jill came tumbling after\n """

Word to integer encoding

In [0]:
# Create a Tokenizer and learn the word -> integer vocabulary from the corpus.
tokenizer = Tokenizer()
# The vocabulary has to be fitted before encoding: on an unfitted tokenizer
# `word_index` is empty and `texts_to_sequences` yields an empty list.
tokenizer.fit_on_texts([data])
# `texts_to_sequences` returns one list of indices per input text; a single
# text was passed, so take the first (only) entry.
encoded = tokenizer.texts_to_sequences([data])[0]

Vocabulary size of text

In [0]:
# Input dimension for the Embedding layer and width of the softmax output:
# one slot per distinct word, plus one because the Keras Tokenizer reserves
# index 0 and never assigns it to a word.
vocab_size = 1 + len(tokenizer.word_index)

Create X, y for fitting model

In [0]:
# Build (current word, next word) training pairs from every two consecutive
# tokens: zipping the token list with its one-step shift yields each adjacent
# pair exactly once.
# The reshape keeps the array two-dimensional even when the corpus has fewer
# than two tokens, so the column slicing below never fails on an empty input.
sequences = np.array(list(zip(encoded[:-1], encoded[1:])), dtype=int).reshape(-1, 2)

# Column 0 is the input word index, column 1 is the index of the word after it.
X, y = sequences[:, 0], sequences[:, 1]

One-hot encoding of ‘y’

In [0]:
# One-hot encode the target word indices so they match the softmax output.
# Encoding from the pair array instead of from `y` itself keeps this cell
# idempotent: re-running it never one-hot encodes an already encoded array.
y = to_categorical(sequences[:, 1], num_classes=vocab_size)

Model Definition

In [76]:
# Next-word model: embed the single input word, pass it through an LSTM, and
# score every vocabulary entry with a softmax layer.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
# The recurrent layer reduces the (1, 10) embedded step to a 50-unit vector, so
# the Dense layer emits one distribution per sample, matching the one-hot `y`.
# Without it the Dense layer would be applied per time step and output a
# (None, 1, vocab_size) tensor.
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# Print the layer / parameter table.
model.summary()
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 10)             220       
lstm_2 (LSTM)                (None, 50)                12200     
dense_2 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0

Compiling and fitting model

In [83]:
              metrics=['accuracy']), y, epochs=10, verbose=1);
Epoch 1/10
24/24 [==============================] - 1s 35ms/step - loss: 0.1951 - acc: 0.8750
Epoch 2/10
24/24 [==============================] - 0s 594us/step - loss: 0.1952 - acc: 0.8750
Epoch 3/10
24/24 [==============================] - 0s 431us/step - loss: 0.1951 - acc: 0.8750
Epoch 4/10
24/24 [==============================] - 0s 501us/step - loss: 0.1952 - acc: 0.8750
Epoch 5/10
24/24 [==============================] - 0s 488us/step - loss: 0.1952 - acc: 0.8750
Epoch 6/10
24/24 [==============================] - 0s 461us/step - loss: 0.1951 - acc: 0.8750
Epoch 7/10
24/24 [==============================] - 0s 439us/step - loss: 0.1951 - acc: 0.8750
Epoch 8/10
24/24 [==============================] - 0s 461us/step - loss: 0.1951 - acc: 0.8750
Epoch 9/10
24/24 [==============================] - 0s 441us/step - loss: 0.1951 - acc: 0.8750
Epoch 10/10
24/24 [==============================] - 0s 488us/step - loss: 0.1951 - acc: 0.8750

Function to create index-word mapping from tokenizer

In [0]:
def index_word_mapping(tokenizer):
    """Invert the tokenizer's vocabulary.

    Reads ``tokenizer.word_index`` (word -> index) and returns a new dict that
    maps each index back to its word.
    """
    return {position: token for token, position in tokenizer.word_index.items()}

Function to generate a sequence of words from a seed word

In [0]:
def generate_sequence(model, tokenizer, seed_text='Jack', n_words=10):
    """Generate text by repeatedly predicting the word that follows the last one.

    Args:
        model: Fitted model whose ``predict`` takes a batch of single word
            indices and returns one score per vocabulary index for each sample.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Text to start from. It is kept verbatim at the start of the
            result; only its last in-vocabulary word is fed to the model.
        n_words: Maximum number of words to generate after the seed.

    Returns:
        The seed text followed by the generated words, separated by spaces.

    Raises:
        ValueError: If no word of ``seed_text`` is in the tokenizer vocabulary.
    """
    index_word_dict = index_word_mapping(tokenizer)

    # Lower-case only for the vocabulary lookup; the seed is emitted as typed.
    seed_indices = tokenizer.texts_to_sequences([seed_text.lower()])[0]
    if not seed_indices:
        raise ValueError(
            'seed_text {!r} contains no word known to the tokenizer'.format(seed_text)
        )

    sequences = [seed_text]
    # The model looks at one word per sample, so start from the most recent one.
    current_index = seed_indices[-1]
    for _ in range(n_words):
        # Feed a (1, 1) batch: one sample holding one word index.
        scores = model.predict(np.array([[current_index]]), verbose=0)
        # argmax over the class scores replaces `predict_classes`, which newer
        # Keras releases no longer provide.
        current_index = int(np.argmax(scores, axis=-1)[0])
        next_word = index_word_dict.get(current_index)
        if next_word is None:
            # The predicted index has no word (e.g. the reserved index 0):
            # there is nothing sensible to emit, so stop generating.
            break
        sequences.append(next_word)
    return ' '.join(sequences)

Sequence Prediction

In [183]:
generate_sequence(model, tokenizer, seed_text='Jack', n_words=10)
'Jack fell down and jill came tumbling after pail of water'