# avr23-cds-translation/tabs/exploration_tab.py
import streamlit as st
import os
import numpy as np
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
from translate_app import tr
if st.session_state.Cloud == 0:
    import nltk
    import contextlib
import re
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
# from PIL import Image
# import time
# import random
title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"
dataPath = st.session_state.DataPath
# Whether to remove stop words. This is a slow process.
stopwords_to_do = True
# Whether to lemmatize the sentences once the stop words are removed. This is a slow process (roughly 8 minutes).
lemmatize_to_do = True
# Whether to compute the BLEU score for the whole corpus. This is a very slow process (roughly 10 minutes for the 10 dictionaries).
bleu_score_to_do = True
# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line+max_lines) > 137860):
    max_lines = max(137860-first_line, 0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50
download('punkt')
if st.session_state.Cloud == 0:
    download('averaged_perceptron_tagger')
    with contextlib.redirect_stdout(open(os.devnull, "w")):
        download('stopwords')
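# Note: the 'punkt' tokenizer models are required by nltk.word_tokenize, and the 'stopwords'
# corpus by stopwords.words() further down; NLTK skips downloads that are already cached.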
@st.cache_data
def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()
    # Convert uppercase letters to lowercase
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data), first_line+max_lines)]
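# load_preprocessed_data reads one preprocessing artefact; data_type selects its format:
#   0 -> plain text, one sentence per line (the trailing empty line is dropped)
#   1 -> CSV file loaded into a DataFrame (bag-of-words counts)
#   2 -> one Python literal per line, parsed with eval()
#   3 -> one Python literal per line, parsed with ast.literal_eval() (token lists)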
@st.cache_data
def load_preprocessed_data(path, data_type):
    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type == 0:
            data = data[:-1]
        elif data_type == 2:
            data = [eval(i) for i in data[:-1]]
        elif data_type == 3:
            data2 = []
            for d in data[:-1]:
                data2.append(literal_eval(d))
            data = data2
        return data
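# The bag-of-words DataFrame is stored as two CSV halves (preprocess_df_count_word1_ /
# preprocess_df_count_word2_), presumably to keep each file small; they are concatenated
# back into a single DataFrame below.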
@st.cache_data
def load_all_preprocessed_data(lang):
    txt = load_preprocessed_data(dataPath+'/preprocess_txt_'+lang, 0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang, 3)
    txt_lem = load_preprocessed_data(dataPath+'/preprocess_txt_lem_'+lang, 0)
    txt_wo_stopword = load_preprocessed_data(dataPath+'/preprocess_txt_wo_stopword_'+lang, 0)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang, 1),
                               load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang, 1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word
# Load the full texts in both languages
full_txt_en = load_data(dataPath+'/small_vocab_en')
full_txt_fr = load_data(dataPath+'/small_vocab_fr')

# Load the preprocessing results if st.session_state.reCalcule == False
if not st.session_state.reCalcule:
    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
else:
    def remove_stopwords(text, lang):
        # stop_words contains the set of stop words for the given language
        stop_words = set(stopwords.words(lang))
        filtered_sentence = []
        for word in text.split():
            if word not in stop_words:
                filtered_sentence.append(word)
        return " ".join(filtered_sentence)
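    # For example, with NLTK's English stop word list,
    # remove_stopwords("she is driving the car", "english") returns "driving car".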
    def clean_undesirable_from_text(sentence, lang):
        # Remove URLs
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)
        # Remove punctuation (the '.' character is kept) and expand a few contractions
        REPLACEMENTS = [("..", "."),
                        (",", ""),
                        (";", ""),
                        (":", ""),
                        ("?", ""),
                        ('"', ""),
                        ("-", " "),
                        ("it's", "it is"),
                        ("isn't", "is not"),
                        ("'", " ")
                        ]
        for old, new in REPLACEMENTS:
            sentence = sentence.replace(old, new)
        # Remove digits
        sentence = re.sub(r'[0-9]', '', sentence)
        # Collapse multiple spaces
        sentence = re.sub(' +', ' ', sentence)
        return sentence
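    # clean_untranslated_sentence drops the sentence pairs that are strictly identical in
    # both languages, i.e. sentences apparently left untranslated in the corpus.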
    def clean_untranslated_sentence(data1, data2):
        i = 0
        while i < len(data1):
            if data1[i] == data2[i]:
                data1.pop(i)
                data2.pop(i)
            else:
                i += 1
        return data1, data2
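    # spaCy is only needed when the preprocessing is recomputed; the small English and French
    # pipelines (en_core_web_sm, fr_core_news_sm) must be installed beforehand.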
    import spacy
    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')
    def lemmatize(sentence, lang):
        # Select the spaCy pipeline matching the language
        if lang == 'en':
            nlp = nlp_en
        elif lang == 'fr':
            nlp = nlp_fr
        else:
            return
        # Create a Doc object and join the lemma of each token
        doc = nlp(sentence)
        lemmatized_sentence = " ".join([token.lemma_ for token in doc])
        return lemmatized_sentence
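    # preprocess_txt cleans the selected sentences, tokenises them, lemmatises them in
    # batches, removes stop words and builds the bag-of-words matrix. It returns the cleaned
    # sentences, the corpus of unique words, the token lists, the lemmatised sentences, the
    # sentences without stop words, the bag-of-words DataFrame, and the sentence lengths
    # (raw, without stop words, lemmatised).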
    def preprocess_txt(data, lang):
        word_count = collections.Counter()
        word_lem_count = collections.Counter()
        word_wosw_count = collections.Counter()
        corpus = []
        data_split = []
        sentence_length = []
        data_split_wo_stopwords = []
        data_length_wo_stopwords = []
        data_lem = []
        data_lem_length = []

        # Join all sentences into a single string so that the cleaning is done in one pass
        txt_en_one_string = ". ".join([s for s in data])
        txt_en_one_string = txt_en_one_string.replace('..', '.')
        txt_en_one_string = " "+clean_undesirable_from_text(txt_en_one_string, lang)
        data = txt_en_one_string.split('.')
        if data[-1] == "":
            data.pop(-1)
        # Strip the spaces that start and end each sentence
        for i in range(len(data)):
            if data[i][0] == ' ':
                data[i] = data[i][1:]
            if data[i][-1] == ' ':
                data[i] = data[i][:-1]
        nb_phrases = len(data)
        # Build a list of tokens per sentence (sentence_split)
        for i, sentence in enumerate(data):
            sentence_split = word_tokenize(sentence)
            word_count.update(sentence_split)
            data_split.append(sentence_split)
            sentence_length.append(len(sentence_split))

        # Lemmatisation and stop word removal are done in batches for speed
        # (instead of sentence by sentence).
        # Both steps need to know the language of the corpus.
        if lang == 'en':
            l = 'english'
        elif lang == 'fr':
            l = 'french'
        else:
            l = "unknown"

        if l != "unknown":
            # Lemmatise in 12 batches (no more than 1M characters can be lemmatised at once)
            data_lemmatized = ""
            if lemmatize_to_do:
                n_batch = 12
                batch_size = round((nb_phrases / n_batch) + 0.5)
                for i in range(n_batch):
                    to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
                    data_lemmatized = data_lemmatized + "." + lemmatize(to_lem, lang).lower()
                data_lem_for_sw = data_lemmatized[1:]
                data_lemmatized = data_lem_for_sw.split('.')
                for i in range(nb_phrases):
                    data_lem.append(data_lemmatized[i].split())
                    data_lem_length.append(len(data_lemmatized[i].split()))
                    word_lem_count.update(data_lem[-1])
            # Remove the stop words in a single batch.
            # Stop words are removed from the lemmatised sentences if that step took place.
            # (wosw stands for "WithOut Stop Words")
            if stopwords_to_do:
                if lemmatize_to_do:
                    data_wosw = remove_stopwords(data_lem_for_sw, l)
                else:
                    data_wosw = remove_stopwords(txt_en_one_string, l)
                data_wosw = data_wosw.split('.')
                for i in range(nb_phrases):
                    data_split_wo_stopwords.append(data_wosw[i].split())
                    data_length_wo_stopwords.append(len(data_wosw[i].split()))
                    word_wosw_count.update(data_split_wo_stopwords[-1])

        corpus = list(word_count.keys())
        # Build the DataFrame txt_n_unique_val:
        #   columns    = words
        #   rows       = sentences
        #   cell value = number of occurrences of the word in the sentence
        ## BOW
        from sklearn.feature_extraction.text import CountVectorizer

        count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+")
        # Count the number of occurrences of each word in each sentence
        countvectors = count_vectorizer.fit_transform(data)
        corpus = count_vectorizer.get_feature_names_out()
        txt_n_unique_val = pd.DataFrame(columns=corpus, index=range(nb_phrases), data=countvectors.todense()).astype(float)

        return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
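# count_world tokenises each sentence and returns the corpus of unique words,
# the total number of words and the number of unique words.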
def count_world(data):
    word_count = collections.Counter()
    for sentence in data:
        word_count.update(word_tokenize(sentence))
    corpus = list(word_count.keys())
    nb_mots = sum(word_count.values())
    nb_mots_uniques = len(corpus)
    return corpus, nb_mots, nb_mots_uniques
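# display_preprocess_results shows the preprocessing results in four Streamlit tabs:
# summary, tokenisation, lemmatisation and text without stop words.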
def display_preprocess_results(lang, data, data_split, data_lem, data_wosw, txt_n_unique_val):
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do

    corpus = []
    nb_phrases = len(data)
    corpus, nb_mots, nb_mots_uniques = count_world(data)
    mots_lem, _, nb_mots_lem = count_world(data_lem)
    mots_wo_sw, _, nb_mots_wo_stopword = count_world(data_wosw)

    # Identify the columns containing only zeros and drop them
    columns_with_only_zeros = txt_n_unique_val.columns[txt_n_unique_val.eq(0).all()]
    txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)

    # Display the word counts according to the preprocessing performed
    tab1, tab2, tab3, tab4 = st.tabs([tr("Résumé"), tr("Tokenisation"), tr("Lemmatisation"), tr("Sans Stopword")])
    with tab1:
        st.subheader(tr("Résumé du pré-processing"))
        st.write("**"+tr("Nombre de phrases")+" : "+str(nb_phrases)+"**")
        st.write("**"+tr("Nombre de mots")+" : "+str(nb_mots)+"**")
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab2:
        st.subheader(tr("Tokenisation"))
        st.write(tr('Texte "splited":'))
        st.dataframe(pd.DataFrame(data=data_split, index=range(first_line, last_line)).head(max_lines_to_display).fillna(''), width=800)
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Mots uniques")+":**")
        st.markdown(corpus[:500])
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab3:
        st.subheader(tr("Lemmatisation"))
        if lemmatize_to_do:
            st.dataframe(pd.DataFrame(data=data_lem, columns=[tr('Texte lemmatisé')], index=range(first_line, last_line)).head(max_lines_to_display), width=800)
            # For English, the POS tagging of the words could be displayed:
            # if lang == 'en':
            #     for i in range(min(5, len(data))):
            #         s = str(nltk.pos_tag(data_split[i]))
            #         st.markdown("**Texte avec Tags "+str(i)+"** : "+s)
            st.write("**"+tr("Nombre de mots uniques lemmatisés")+" : "+str(nb_mots_lem)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques lemmatisés:")+"**")
            st.markdown(mots_lem[:500])
    with tab4:
        st.subheader(tr("Sans Stopword"))
        if stopwords_to_do:
            st.dataframe(pd.DataFrame(data=data_wosw, columns=['Texte sans stopwords'], index=range(first_line, last_line)).head(max_lines_to_display), width=800)
            st.write("**"+tr("Nombre de mots uniques sans stop words")+": "+str(nb_mots_wo_stopword)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques sans stop words")+":**")
            st.markdown(mots_wo_sw[:500])
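# run() is the entry point of this tab; it is presumably called by the application's main
# script when "Exploration et Preprocessing" is selected in the sidebar.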
def run():
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    global full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en
    global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr

    st.write("")
    st.title(tr(title))

    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
        """
        Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
        Lors de notre projet, nous avons étudié le dataset small_vocab, proposés par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
        Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
        :red[**Small_vocab**] contient 137 860 phrases en anglais et français.
        """)
        , unsafe_allow_html=True)
    st.markdown(tr(
        """
        Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
        Ces tâches sont, par exemple :
        """)
        , unsafe_allow_html=True)
    st.markdown(
        "* "+tr("le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)")+"\n"+ \
        "* "+tr("la :red[**tokenisation**] (découpage du texte en mots)")+"\n"+ \
        "* "+tr("la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les \"variations\" d'un même mot)")+"\n"+ \
        "* "+tr("l'élimination des :red[**mots \"transparents\"**] (sans utilité pour la compréhension, tels que les articles).")+" \n"+ \
        tr("Ce prétraitement se conclut avec la construction d'un :red[**Bag Of Words**], c'est-à-dire une matrice qui compte le nombre d'apparitions de chaque mot (colonne) dans chaque phrase (ligne)")
        , unsafe_allow_html=True)
    #
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'), ('Anglais', 'Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser:'), 0, 137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser:'),
                                 options=[1, 5, 10, 15, 100, 500, 1000, 'Max'])
    if max_lines == 'Max':
        max_lines = 137860
    if ((first_line + max_lines) > 137860):
        max_lines = max(137860 - first_line, 0)
    last_line = first_line + max_lines

    if (Langue == 'Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")
    # Load the selected texts in both languages (at most max_lines lines)
    txt_en = full_txt_en[first_line:last_line]
    txt_fr = full_txt_fr[first_line:last_line]

    # Remove untranslated sentences
    # txt_en, txt_fr = clean_untranslated_sentence(txt_en, txt_fr)

    if not st.session_state.reCalcule:
        txt_split_en = full_txt_split_en[first_line:last_line]
        txt_lem_en = full_txt_lem_en[first_line:last_line]
        txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        txt_lem_fr = full_txt_lem_fr[first_line:last_line]
        txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]

    # Run the preprocessing, which cleans the sentences, splits them into words
    # and counts the number of occurrences of each word in each sentence
    if (Langue == 'Anglais'):
        st.write("## **"+tr("Préprocessing de small_vocab_en")+" :**\n")
        if max_lines > 10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
                display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
        else:
            if st.session_state.reCalcule:
                txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
            display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
    else:
        st.write("## **"+tr("Préprocessing de small_vocab_fr")+" :**\n")
        if max_lines > 10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
                display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
        else:
            if st.session_state.reCalcule:
                txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
            display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)