In [0]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
Sample Text¶
In [0]:
# source text
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """
Word-to-integer encoding¶
In [0]:
# Create a Tokenizer from provided text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
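As a quick sanity check (not in the original notebook), we can inspect the fitted vocabulary and the encoded sequence. The exact indices depend on word frequency, since the Tokenizer assigns lower indices to more frequent words.
In [0]:
# Inspect the learned word -> index mapping and the encoded text
print(tokenizer.word_index)  # e.g. {'and': 1, 'jack': 2, 'jill': 3, ...}
print(encoded)               # the rhyme as a list of integer indices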
Vocabulary size of the text¶
In [0]:
# Vocabulary size, needed when adding the Embedding layer
vocab_size = len(tokenizer.word_index) + 1
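The +1 matters: Tokenizer indices start at 1 (index 0 is reserved, conventionally for padding), so the Embedding layer's input_dim must be the highest index plus one. This rhyme has 21 unique words, giving a vocab_size of 22.
In [0]:
# Highest word index equals len(word_index); index 0 is never produced by the Tokenizer
print(vocab_size)  # 22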
Create X, y for fitting the model¶
In [0]:
sequences = list()
for i in range(1, len(encoded)):
    sequence = (encoded[i-1], encoded[i])
    sequences.append(sequence)
sequences = np.array(sequences)
X, y = sequences[:,0], sequences[:, 1]
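Each training example is a bigram: the model reads one word (X) and learns to predict the word that follows (y). Decoding the first few pairs back to words (a quick check, not in the original) confirms the alignment.
In [0]:
# Map the first few (current word, next word) pairs back to text
inv = {index: word for word, index in tokenizer.word_index.items()}
print([(inv[a], inv[b]) for a, b in sequences[:3]])
# [('jack', 'and'), ('and', 'jill'), ('jill', 'went')]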
One-hot encoding of ‘y’¶
In [0]:
y = to_categorical(y, num_classes=vocab_size)
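to_categorical turns each integer label into a one-hot row of length vocab_size, matching the softmax output layer defined below. The rhyme's 25 words yield 24 bigram pairs, so y becomes a (24, 22) matrix with exactly one 1 per row.
In [0]:
# One-hot labels: one row per bigram pair, one column per vocabulary word
print(y.shape)     # (24, 22)
print(y[0].sum())  # 1.0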
Model Definition¶
In [76]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()
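Given the layer sizes above, the summary should report 220 parameters for the Embedding (22 × 10), 12,200 for the LSTM (4 × (10 × 50 + 50 × 50 + 50)), and 1,122 for the Dense layer ((50 + 1) × 22), 13,542 in total. The Embedding outputs a (batch, 1, 10) tensor, i.e. a length-1 sequence of 10-dimensional vectors, which is exactly what the LSTM consumes.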
Compiling and fitting the model¶
In [83]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X, y, epochs=10, verbose=1);
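With only 24 training pairs, 10 epochs will barely move the weights; since the corpus is tiny, training longer is cheap. A hedged alternative (the epoch count is my suggestion, not from the original notebook):
In [0]:
# Train longer on the toy corpus, then check accuracy on the training pairs
model.fit(X, y, epochs=500, verbose=0)
loss, acc = model.evaluate(X, y, verbose=0)
print(loss, acc)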
Function to create an index-to-word mapping from the tokenizer¶
In [0]:
def index_word_mapping(tokenizer):
    index_word = dict()
    for word, index in tokenizer.word_index.items():
        index_word[index] = word
    return index_word
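If your Keras version is recent enough, the Tokenizer already maintains this reverse mapping itself, so the helper can be replaced with a one-liner (this is an assumption about your installed version; older releases lack the attribute):
In [0]:
# Recent Keras Tokenizers expose the reverse mapping directly
index_word = tokenizer.index_word  # equivalent to index_word_mapping(tokenizer)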
Function to generate a sequence from a seed word¶
In [0]:
def generate_sequence(model, tokenizer, seed_text='Jack', n_words=10):
    # Start the generated text with the seed word
    sequences = list()
    sequences.append(seed_text)
    index_word_dict = index_word_mapping(tokenizer)
    X = seed_text.lower()
    for i in range(n_words):
        # Encode the current word, predict the next one, and feed it back in
        X_word_index = tokenizer.texts_to_sequences([X])[0]
        next_word_index = model.predict_classes(X_word_index)
        next_word = index_word_dict[next_word_index[0]]
        sequences.append(next_word)
        X = next_word
    return ' '.join(sequences)
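Note that predict_classes was removed from Sequential models in newer TensorFlow/Keras releases. If you hit an AttributeError, the equivalent is to take the argmax over the softmax output; a minimal sketch, assuming a newer TF/Keras (predict_next_index is a hypothetical helper name):
In [0]:
# Equivalent of predict_classes for Keras versions that removed it
def predict_next_index(model, encoded_word):
    # encoded_word: a list holding a single integer word index
    probs = model.predict(np.array(encoded_word).reshape(1, 1), verbose=0)
    return int(np.argmax(probs, axis=-1)[0])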
Sequence Prediction¶
In [183]:
generate_sequence(model, tokenizer, seed_text='Jack', n_words=10)
Out[183]: