Text generation with an RNN#

import tensorflow as tf

# Load the raw text corpus
with open("moon.txt") as f:
    moon_text = f.read()

print(moon_text[:250])

moon_text = moon_text[:100_000] # keep only the first 100,000 characters
# split the text into individual characters, then map each character to an integer
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize="lower")
text_vec_layer.adapt([moon_text])
encoded = text_vec_layer([moon_text])[0]

print(encoded[:250])
print(text_vec_layer.get_vocabulary())
encoded -= 2 # drop the padding and unknown tokens (ids 0 and 1) by shifting all ids down by 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)
print(n_tokens, dataset_size)
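
To see what this encoding gives in practice, here is a minimal sketch that decodes the first ids back into characters via get_vocabulary(), undoing the -2 shift from above (the name vocab is only for illustration):

# Sketch: decode the first 50 ids back into characters,
# undoing the -2 shift (ids 0 and 1 were dropped above).
vocab = text_vec_layer.get_vocabulary()
print("".join(vocab[i + 2] for i in encoded[:50].numpy())) # first 50 characters, lowercased
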
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """ Maak inputs en targets. Beide zijn 'length' tekens lang.
    Een target begint één karakter verder dan zijn input en eindigt ook één karakter verderop.
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length+1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length+1)) # length+1 to leave room for the target
    if shuffle:
        ds = ds.shuffle(buffer_size=100, seed=seed) # note: such a small buffer only shuffles locally
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:,:-1], window[:,1:])).prefetch(1) # split each window: input is all but the last character, target all but the first
list(to_dataset(text_vec_layer(["To be"])[0], length=4)) # quick sanity check on a tiny example
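
The one-character shift is easier to see as text. A minimal sketch that decodes the "To be" windows back into characters (this demo sequence was not shifted by -2, so its ids index the vocabulary directly):

# Sketch: print the input/target pairs of the tiny "To be" dataset as text.
vocab = text_vec_layer.get_vocabulary()
for X_batch, y_batch in to_dataset(text_vec_layer(["To be"])[0], length=4):
    print("input: ", "".join(vocab[i] for i in X_batch[0].numpy()))
    print("target:", "".join(vocab[i] for i in y_batch[0].numpy()))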

Building and training the model#

# Train/validation/test split: 90% / 5% / 5%
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:90_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[90_000:95_000], length=length, shuffle=True, seed=42)
test_set = to_dataset(encoded[95_000:], length=length, shuffle=True, seed=42)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16), # map each token id to a 16-dimensional character embedding (see the lecture)
    tf.keras.layers.GRU(128, return_sequences=True), # recurrent layer whose units have "memory"
    tf.keras.layers.Dense(n_tokens, activation="softmax") # output layer of size n_tokens, so it predicts one token (character); softmax makes the probabilities sum to 1
])
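
As a quick sanity check, one batch can be pushed through the still untrained model; the output should contain one probability distribution over n_tokens for every one of the 100 positions. A minimal sketch:

# Sketch: verify the output shape on one training batch.
for X_batch, y_batch in train_set.take(1):
    print(model(X_batch).shape) # (32, 100, n_tokens)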

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"]) # with one-hot-encoded targets you would use loss="categorical_crossentropy" instead
steps_per_epoch = 90_000 // 32
# Training is commented out here; the trained model is loaded from disk below.
#history = model.fit(train_set, validation_data=valid_set, epochs=10, steps_per_epoch=steps_per_epoch)
#model.save("def_moon_model.keras")

Making predictions#

model = tf.keras.models.load_model("def_moon_model.keras")
moon_model = tf.keras.Sequential([
    text_vec_layer,                           # text -> character ids
    tf.keras.layers.Lambda(lambda X: X - 2),  # apply the same id shift as during training
    model
])
y_proba = moon_model.predict(tf.constant(["Human brai"]), verbose=0)
y_pred = tf.argmax(y_proba[0, -1]) # most likely next character (greedy)
text_vec_layer.get_vocabulary()[y_pred + 2] # shift back by 2 to index the vocabulary
def next_char(text, temperature=1):
    """Sample the next character from the model's output distribution;
    the temperature controls how random the choice is."""
    y_probas = moon_model.predict(tf.constant([text]), verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_probas) / temperature # log-probabilities / temperature act as logits
    next_char_index = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    return text_vec_layer.get_vocabulary()[next_char_index + 2]

def extend_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text
tf.random.set_seed(42)
print(extend_text("Human brain has around ten-to-the-tenth neuron", n_chars=200, temperature=0.01))
print(extend_text("Human brain has around ten-to-the-tenth neuron", n_chars=200, temperature=0.5))
print(extend_text("Human brain has around ten-to-the-tenth neuron", n_chars=200, temperature=1))
print(extend_text("Human brain has around ten-to-the-tenth neuron", n_chars=200, temperature=100))