# Resources for seq2seq models

## Beam Search

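Beam search replaces greedy decoding by exploring a fixed-width search tree over the space of possible outputs: at each step only the `beam_width` most probable partial translations are kept. This version also sends the final translation to Google Translate (back into English) for comparison.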

In [None]:
# delete or comment out pip line when not needed
!pip install googletrans==4.0.0-rc1
from googletrans import Translator

# Marker words for the decoder sequences (change them to match your data):
# START is prepended to every decoder input, and a translation that ends
# with END is treated as finished.
START, END = "starttoken", "endtoken"

def beam_search(a_model, sentence_en, beam_width, verbose=False):
    """Translate ``sentence_en`` with beam search and back-translate the result.

    Starting from the ``beam_width`` most probable first tokens, every step
    extends each unfinished hypothesis by one token and keeps the
    ``beam_width`` hypotheses with the highest summed log-probability. The
    search stops as soon as every kept hypothesis ends with ``END``, or after
    ``max_length`` tokens, whichever comes first. The best hypothesis is then
    sent to ``translator`` (translated to English) for comparison.

    Relies on names defined elsewhere in the notebook: ``tf``, ``np``,
    ``layer_spanish_vectorization``, ``max_length`` and ``translator``.
    NOTE(review): ``translator`` is not created in this cell -- presumably a
    ``googletrans.Translator()`` instance; confirm it exists before calling.

    Args:
        a_model: Model whose ``predict((encoder_input, decoder_input))``
            output is indexed as ``[0, position]`` to get one score per
            vocabulary id (treated here as probabilities).
        sentence_en: Source sentence fed to the encoder.
        beam_width: Number of hypotheses kept after each step (at least 1).
        verbose: When true, print the beam after every step and the final
            translation.

    Returns:
        ``(back_translation, translation)``: the English text returned by
        ``translator`` and the best translation with ``END`` stripped.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
        RuntimeError: If ``translator`` keeps returning an empty result.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    unk_id = 1  # presumably the [UNK] id of the vectorization layer -- TODO confirm
    unk_penalty = 10  # the probability of unk_id is divided by this factor
    max_translate_attempts = 5

    # Fetch the vocabulary once instead of on every lookup.
    vocabulary = layer_spanish_vectorization.get_vocabulary()

    X = tf.convert_to_tensor([sentence_en])  # encoder input, identical at every step
    X_dec = tf.convert_to_tensor([START])  # decoder input
    y_proba = a_model.predict((X, X_dec), verbose=0)[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values.numpy(), top_k.indices.numpy())
    ]

    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        # Each candidate is (score, prefix, word_id); word_id is None for a
        # hypothesis that is already finished and is carried over unchanged.
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith(END):
                candidates.append((log_proba, translation, None))
                continue  # translation is finished, so don't try to extend it
            X_dec = tf.convert_to_tensor([START + " " + translation])  # decoder input
            y_proba = a_model.predict((X, X_dec), verbose=0)[0, idx]  # last token's proba
            word_probas = np.array(y_proba)  # copy so the model output is not mutated
            word_probas[unk_id] /= unk_penalty
            with np.errstate(divide="ignore"):  # log(0) -> -inf is acceptable here
                scores = log_proba + np.log(word_probas)
            # Only the beam_width best extensions of a hypothesis can make it
            # into the overall top beam_width, so prune before building tuples.
            candidates.extend(
                (scores[word_id], translation, word_id)
                for word_id in np.argsort(scores)[-beam_width:]
            )

        best = sorted(candidates, key=lambda cand: cand[0], reverse=True)[:beam_width]
        top_translations = [
            (score, prefix if word_id is None else f"{prefix} {vocabulary[word_id]}")
            for score, prefix, word_id in best
        ]

        if verbose:
            print("\nTop translations so far:", top_translations)

        if all(tr.endswith(END) for _, tr in top_translations):
            break

    # Reached either because every hypothesis finished or because max_length
    # was hit; in both cases return the best hypothesis rather than None.
    result = top_translations[0][1].replace(END, "").strip()
    if verbose:
        print("**************", result)

    # Retry a bounded number of times instead of looping forever.
    for _ in range(max_translate_attempts):
        googled_rev = translator.translate(result, dest="en")
        if googled_rev:
            return googled_rev.text, result
    raise RuntimeError(
        f"translator returned no result after {max_translate_attempts} attempts"
    )

## Attention Layers

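These can be incorporated into last class's notebook for English-Spanish translation. The attention layer sits on top of the encoder and decoder layers and links the two: it takes the decoder outputs as queries and the encoder outputs as values, so each decoding step can attend to every encoder position.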

In [None]:
# Bidirectional LSTM encoder. return_sequences=True exposes the per-step
# outputs and return_state=True additionally returns the final states.
encoder = tf.keras.layers.Bidirectional(
    layer=tf.keras.layers.LSTM(
        units=256,
        return_sequences=True,
        return_state=True,
        dropout=0.25,
    )
)

In [None]:
# Encoder: the first returned value is the per-step output sequence, the
# remaining values are the final states.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
# presumably merges the forward/backward states so they fit the 512-unit
# decoder -- confirm against the definition of concatenate_states
encoder_state = Lambda(concatenate_states)(encoder_state)

# Decoder: starts from the encoder's final state.
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Attention: decoder outputs are the queries, encoder outputs the values.
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

# Softmax over the vocabulary for every decoder position.
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

# Optional variant with dropout before the output layer:
# dropout_layer = tf.keras.layers.Dropout(0.5)
# Y_proba = output_layer(dropout_layer(attention_outputs))

In [None]:
# NOTE: the model-saving part of this callback isn't working right now.
save_best_early_stop = SaveBestModelWithEarlyStopping(
    save_path="snapshots/translator_C.keras",
    patience=2,
)

model = tf.keras.Model(
    inputs=[encoder_input_layer, decoder_input_layer],
    outputs=[Y_proba],
)
# from_logits=False because the loss is given probabilities, not raw logits.
model.compile(
    optimizer="nadam",
    loss=MaskedSparseCategoricalCrossentropy(from_logits=False),
    metrics=[MaskedSparseCategoricalAccuracy()],
)
model.fit(
    x=(X_train_encoder, X_train_decoder),
    y=Y_train,
    batch_size=64,
    epochs=20,
    validation_data=((X_valid_encoder, X_valid_decoder), Y_valid),
    callbacks=[save_best_early_stop],
)