# -*- coding: utf-8 -*-
"""french-to-english-translation-using-seq2seq.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1I_pfLKfUYqIWiX3przMoSFvczO_H83er
"""
import warnings
warnings.filterwarnings('ignore')
import string
import re
from unicodedata import normalize
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential,load_model
from keras.layers import LSTM,Dense,Embedding,RepeatVector,TimeDistributed
from keras.callbacks import EarlyStopping
from nltk.translate.bleu_score import corpus_bleu
import pandas as pd
from string import punctuation
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
def printmd(text):
    # Render a string as Markdown in the notebook output
    display(Markdown(text))
from google.colab import drive
drive.mount('/content/drive')
total_sentences = 10000
# Load the dataset
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset/eng_-french.csv", nrows = total_sentences)
# What proportion of the sentences will be used for the test set
test_proportion = 0.1
train_test_threshold = int((1 - test_proportion) * total_sentences)
printmd(f'## {total_sentences} "parallel sentences" will be loaded (original sentence + its translation)')
printmd(f'## {train_test_threshold} "parallel sentences" will be used to train the model')
printmd(f'## {total_sentences-train_test_threshold} "parallel sentences" will be used to test the model')
# Shuffle the dataset
dataset = dataset.sample(frac=1, random_state=0)
dataset.iloc[1000:1010]
def clean(text):
    # Lowercase a sentence and strip punctuation, digits, and extra whitespace
    text = text.replace("\u202f", " ")  # Replace narrow no-break space with a space
    text = text.lower()
    # Replace punctuation, guillemets, and digits with spaces
    for p in punctuation + "«»" + "0123456789":
        text = text.replace(p, " ")
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace into one space
    return text.strip()
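# Quick sanity check of clean() on a made-up sentence (the example string is
# an assumption, not a row from the dataset):
assert clean("Bonjour, le monde 123\u202f!") == "bonjour le monde"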
# Clean the sentences
dataset["English words/sentences"] = dataset["English words/sentences"].apply(lambda x: clean(x))
dataset["French words/sentences"] = dataset["French words/sentences"].apply(lambda x: clean(x))
# Select one part of the dataset
dataset = dataset.values
dataset = dataset[:total_sentences]
# split into train/test
train, test = dataset[:train_test_threshold], dataset[train_test_threshold:]
# Define the name of the source and of the target
# This will be used in the outputs of this notebook
source_str, target_str = "French", "English"
# The index in the numpy array of the source and of the target
idx_src, idx_tar = 1, 0
# Display the result after cleaning
pd.DataFrame(dataset[1000:1010])
def create_tokenizer(lines):
# fit a tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
return tokenizer
def max_len(lines):
# max sentence length
return max(len(line.split()) for line in lines)
def encode_sequences(tokenizer, length, lines):
# encode and pad sequences
X = tokenizer.texts_to_sequences(lines) # integer encode sequences
X = pad_sequences(X, maxlen=length, padding='post') # pad sequences with 0 values
return X
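# Toy illustration of tokenization plus post-padding (the two example
# sentences are assumptions, not dataset rows); Keras reserves index 0 for
# padding, so "hello world" encodes to [1, 2] and is padded with zeros:
_toy_tok = create_tokenizer(["hello world", "hello there"])
_toy_X = encode_sequences(_toy_tok, 4, ["hello world"])  # -> [[1, 2, 0, 0]]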
def encode_output(sequences, vocab_size):
# one hot encode target sequence
ylist = list()
for sequence in sequences:
encoded = to_categorical(sequence, num_classes=vocab_size)
ylist.append(encoded)
y = np.array(ylist)
y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
return y
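# Note: one-hot encoding the targets yields a dense array of shape
# (n_sentences, tar_length, tar_vocab_size), which grows quickly with the
# vocabulary size; it is the layout categorical_crossentropy below expects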
# Prepare target tokenizer
tar_tokenizer = create_tokenizer(dataset[:, idx_tar])
tar_vocab_size = len(tar_tokenizer.word_index) + 1
tar_length = max_len(dataset[:, idx_tar])
printmd(f'\nTarget ({target_str}) Vocabulary Size: {tar_vocab_size}')
printmd(f'Target ({target_str}) Max Length: {tar_length}')
# Prepare source tokenizer
src_tokenizer = create_tokenizer(dataset[:, idx_src])
src_vocab_size = len(src_tokenizer.word_index) + 1
src_length = max_len(dataset[:, idx_src])
printmd(f'\nSource ({source_str}) Vocabulary Size: {src_vocab_size}')
printmd(f'Source ({source_str}) Max Length: {src_length}\n')
# Prepare training data
trainX = encode_sequences(src_tokenizer, src_length, train[:, idx_src])
trainY = encode_sequences(tar_tokenizer, tar_length, train[:, idx_tar])
trainY = encode_output(trainY, tar_vocab_size)
# Prepare test data
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_sequences(tar_tokenizer, tar_length, test[:, idx_tar])
testY = encode_output(testY, tar_vocab_size)
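# Resulting shapes: trainX/testX are (n, src_length) arrays of word indices,
# while trainY/testY are (n, tar_length, tar_vocab_size) one-hot arrays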
def create_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    # Encoder-decoder model: the encoder LSTM compresses the source sentence
    # into one fixed-size vector, RepeatVector feeds that vector to the
    # decoder LSTM at every target timestep, and the TimeDistributed softmax
    # predicts one target word per timestep
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
# Create model
model = create_model(src_vocab_size, tar_vocab_size, src_length, tar_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
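# Optional sketch: print a layer-by-layer summary to verify the output shapes
# (Embedding -> LSTM -> RepeatVector -> LSTM -> TimeDistributed Dense)
model.summary()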
history = model.fit(trainX,
trainY,
epochs=20,
batch_size=64,
validation_split=0.1,
verbose=1,
callbacks=[
EarlyStopping(
monitor='val_loss',
patience=10,
restore_best_weights=True
)
])
pd.DataFrame(history.history).plot()
plt.title("Loss")
plt.show()
def word_for_id(integer, tokenizer):
# map an integer to a word
for word, index in tokenizer.word_index.items():
if index == integer:
return word
return None
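# Note: Keras tokenizers also expose an index_word dict, so the lookup above
# could be written as tokenizer.index_word.get(integer) without the loop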
def predict_seq(model, tokenizer, source):
    # Generate a target sentence from one encoded source sequence
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:  # index 0 (padding) maps to no word, so stop decoding
            break
        target.append(word)
    return ' '.join(target)
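# Illustrative call on one encoded training row (assumes the model above has
# finished training):
# predict_seq(model, tar_tokenizer, trainX[0:1])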
def compare_prediction(model, tokenizer, sources, raw_dataset, limit=20):
    # Print source, reference, and model translation side by side
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')
    for i, source in enumerate(sources):  # Translate the encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(f'{raw_src:30} {raw_target:25} {translation}')
        if i >= limit:  # Only display the first rows
            break
# test on some training sequences
print('### Result on the Training Set ###')
compare_prediction(model, tar_tokenizer, trainX, train)
# test on some test sequences
print('\n\n### Result on the Test Set ###')
compare_prediction(model, tar_tokenizer, testX, test)
# Computing the BLEU score over a whole set can take a while
def bleu_score(model, tokenizer, sources, raw_dataset):
    # Compute corpus-level BLEU scores for a model
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # Translate the encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    bleu_dic = {}
    bleu_dic['1-grams'] = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    bleu_dic['1-2-grams'] = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    bleu_dic['1-3-grams'] = corpus_bleu(actual, predicted, weights=(1/3, 1/3, 1/3, 0))
    bleu_dic['1-4-grams'] = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
    return bleu_dic
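# Note: corpus_bleu expects, per sentence, a list of reference token lists
# (hence the [raw_target.split()] nesting) plus one hypothesis token list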
# Compute the BLEU Score
bleu_train = bleu_score(model, tar_tokenizer, trainX, train)
bleu_test = bleu_score(model, tar_tokenizer, testX, test)
plt.bar(x = bleu_train.keys(), height = bleu_train.values())
plt.title("BLEU Score with the training set")
plt.ylim((0,1))
plt.show()
plt.bar(x = bleu_test.keys(), height = bleu_test.values())
plt.title("BLEU Score with the test set")
plt.ylim((0,1))
plt.show()
model.save('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')
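# The .h5 extension saves in Keras's legacy HDF5 format; recent Keras versions
# prefer the native .keras format, but load_model still reads HDF5 files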
import gradio as gr
# Load the trained model
model = load_model('/content/drive/MyDrive/Colab Notebooks/Models/french_to_english_translator.h5')
# Function to translate French to English
def translate_french_to_english(french_sentence):
# Clean the input sentence
french_sentence = clean(french_sentence)
# Tokenize and pad the input sentence
input_sequence = encode_sequences(src_tokenizer, src_length, [french_sentence])
# Generate the translation
english_translation = predict_seq(model, tar_tokenizer, input_sequence)
return english_translation
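# Illustrative call (the French sentence is an assumption):
# translate_french_to_english("je suis heureux")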
# Create a Gradio interface
gr.Interface(
fn=translate_french_to_english,
inputs="text",
outputs="text",
title="French to English Translator",
description="Translate French sentences to English."
).launch()