|
|
|
"""french-to-english-translation-using-seq2seq.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1I_pfLKfUYqIWiX3przMoSFvczO_H83er |
|
""" |
|
|
|
import warnings |
|
warnings.filterwarnings('ignore') |
|
import string |
|
import re |
|
from unicodedata import normalize |
|
import numpy as np |
|
from keras.preprocessing.text import Tokenizer |
|
from keras.preprocessing.sequence import pad_sequences |
|
from keras.utils import to_categorical |
|
from keras.models import Sequential,load_model |
|
from keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed |
|
from keras.callbacks import EarlyStopping |
|
from nltk.translate.bleu_score import corpus_bleu |
|
import pandas as pd |
|
from string import punctuation |
|
import matplotlib.pyplot as plt |
|
from IPython.display import Markdown, display |
|
|
|
def printmd(text):
    """Render a string as Markdown in the notebook output."""
    display(Markdown(text))
|
|
|
from google.colab import drive |
|
drive.mount('/content/drive') |
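
# Load a fixed number of English-French sentence pairs from the CSV stored on Drive.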
|
|
|
total_sentences = 10000 |
|
|
|
|
|
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv", nrows = total_sentences) |
|
|
|
|
|
# Hold out the last 10% of the (shuffled) pairs as a test set.
test_proportion = 0.1
train_test_threshold = int((1 - test_proportion) * total_sentences)
|
|
|
printmd(f'## {total_sentences} parallel sentence pairs will be loaded (original sentence + its translation)')

printmd(f'## {train_test_threshold} pairs will be used to train the model')

printmd(f'## {total_sentences - train_test_threshold} pairs will be used to test the model')
|
|
|
|
|
# Shuffle the sentence pairs (fixed seed for reproducibility) and preview a few rows.
dataset = dataset.sample(frac=1, random_state=0)
dataset.iloc[1000:1010]
|
|
|
def clean(text):
    """Lowercase, remove punctuation/digits/guillemets, and normalise whitespace."""
    text = text.replace("\u202f", " ")  # narrow no-break space used in French typography
    text = text.lower()

    # Replace punctuation, French quotation marks and digits with spaces.
    for p in punctuation + "«»" + "0123456789":
        text = text.replace(p, " ")

    # Collapse repeated whitespace and trim.
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text
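
# Example of the cleaning above: clean("Bonjour, le monde !") returns "bonjour le monde".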
|
|
|
|
|
dataset["English words/sentences"] = dataset["English words/sentences"].apply(lambda x: clean(x)) |
|
dataset["French words/sentences"] = dataset["French words/sentences"].apply(lambda x: clean(x)) |
|
|
|
|
|
# Keep the cleaned text as a NumPy array of [English, French] pairs, then split into train/test.
dataset = dataset.values
dataset = dataset[:total_sentences]

train, test = dataset[:train_test_threshold], dataset[train_test_threshold:]
|
|
|
|
|
|
|
# Translation direction: French (source) -> English (target).
source_str, target_str = "French", "English"

# Column indices in the dataset array: 0 = English (target), 1 = French (source).
idx_src, idx_tar = 1, 0
|
|
|
|
|
pd.DataFrame(dataset[1000:1010]) |
|
|
|
def create_tokenizer(lines):
    """Fit a Keras Tokenizer on the given lines of text."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
|
|
|
def max_len(lines):
    """Return the length (in words) of the longest line."""
    return max(len(line.split()) for line in lines)
|
|
|
def encode_sequences(tokenizer, length, lines):
    """Convert text lines to integer sequences and pad them to a fixed length."""
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
|
|
|
def encode_output(sequences, vocab_size):
    """One-hot encode target sequences for the softmax output layer."""
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y
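
# Note: the one-hot targets have shape (num_sentences, tar_length, vocab_size), which can be memory-hungry for larger corpora.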
|
|
|
|
|
# Fit the target (English) tokenizer on the full dataset and record its vocabulary size and maximum sentence length.
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])
tar_vocab_size = len(tar_tokenizer.word_index) + 1
tar_length = max_len(dataset[:, idx_tar])
printmd(f'Target ({target_str}) Vocabulary Size: {tar_vocab_size}')
printmd(f'Target ({target_str}) Max Length: {tar_length}')
|
|
|
|
|
# Fit the source (French) tokenizer the same way.
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1
src_length = max_len(dataset[:, idx_src])
printmd(f'Source ({source_str}) Vocabulary Size: {src_vocab_size}')
printmd(f'Source ({source_str}) Max Length: {src_length}')
|
|
|
|
|
# Encode the training data: padded integer sequences for X, one-hot targets for Y.
trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])
trainY = encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar])
trainY = encode_output(trainY, tar_vocab_size)

# Encode the test data the same way.
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar])
testY = encode_output(testY, tar_vocab_size)
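
# For reference: trainX has shape (n_train, src_length) and trainY has shape (n_train, tar_length, tar_vocab_size).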
|
|
|
def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Encoder-decoder seq2seq: embed and encode the source, repeat the context, decode to the target vocabulary."""
    model = Sequential()
    # Use the function arguments (not the globals) so the model matches whatever is passed in.
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
|
|
|
|
|
# Build and compile the model (256 hidden units).
model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
|
|
|
# Train with early stopping on the validation loss, keeping the best weights.
history = model.fit(trainX,
                    trainY,
                    epochs=20,
                    batch_size=64,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=[
                        EarlyStopping(
                            monitor='val_loss',
                            patience=10,
                            restore_best_weights=True
                        )
                    ])
|
|
|
# Plot the training and validation loss curves.
pd.DataFrame(history.history).plot()
plt.title("Loss")
plt.show()
|
|
|
def word_for_id(integer, tokenizer):
    """Map a predicted integer index back to its word (None if not found, e.g. padding)."""
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
|
|
|
def predict_seq(model, tokenizer, source):
    """Greedy decoding: take the argmax word at each time step and stop at the first unknown/padding index."""
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
|
|
|
def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):
    """Print source sentence, reference translation and model translation side by side for the first `limit` examples."""
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')

    for i, source in enumerate(sources[:limit]):
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(f'{raw_src:30} {raw_target:25} {translation}')
|
|
|
|
|
print('### Result on the Training Set ###') |
|
compare_prediction(model, tar_tokenizer, trainX, train) |
|
|
|
|
|
print('\n\n### Result on the Test Set ###') |
|
compare_prediction(model, tar_tokenizer, testX, test) |
|
|
|
|
|
|
|
def bleu_score(model, tokenizer, sources, raw_dataset):
    """Compute corpus BLEU with 1- to 4-gram weightings over the given encoded sources."""
    actual, predicted = [], []
    for i, source in enumerate(sources):
        source = source.reshape((1, source.shape[0]))
        # Use the tokenizer passed as an argument rather than the global one.
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        actual.append([raw_target.split()])
        predicted.append(translation.split())

    bleu_dic = {}
    bleu_dic['1-grams'] = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    bleu_dic['1-2-grams'] = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    bleu_dic['1-3-grams'] = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic['1-4-grams'] = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))

    return bleu_dic
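
# BLEU ranges from 0 (worst) to 1 (best); expect noticeably higher scores on the training set than on the test set for a small model like this.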
|
|
|
|
|
bleu_train = bleu_score(model, tar_tokenizer, trainX, train)

bleu_test = bleu_score(model, tar_tokenizer, testX, test)


plt.bar(x=bleu_train.keys(), height=bleu_train.values())
plt.title("BLEU Score with the training set")
plt.ylim((0, 1))
plt.show()


plt.bar(x=bleu_test.keys(), height=bleu_test.values())
plt.title("BLEU Score with the test set")
plt.ylim((0, 1))
plt.show()
|
|
|
# Save the trained model to Drive.
model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')
|
|
|
import gradio as gr


# Reload the saved model so this cell can run independently of the training cells above.
model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')
|
|
|
|
|
def translate_french_to_english(french_sentence):
    """Clean, encode and translate a single French sentence to English."""
    french_sentence = clean(french_sentence)

    # Encode with the source tokenizer and pad to the training-time source length.
    input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])

    english_translation = predict_seq(model, tar_tokenizer, input_sequence)
    return english_translation
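
# Quick sanity check before launching the UI (illustrative input, not taken from the dataset):
# print(translate_french_to_english("je suis content"))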
|
|
|
|
|
gr.Interface( |
|
fn=translate_french_to_english, |
|
inputs="text", |
|
outputs="text", |
|
title="French to English Translator", |
|
description="Translate French sentences to English." |
|
).launch() |
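
# launch() starts the Gradio demo in the notebook; pass share=True if a public link is needed.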