import streamlit as st
import os
import numpy as np
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
from translate_app import tr

if st.session_state.Cloud == 0:
    import nltk
    import contextlib
    import re
    from nltk.corpus import stopwords
    import warnings
    warnings.filterwarnings('ignore')
# from PIL import Image
# import time
# import random

title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"

dataPath = st.session_state.DataPath

# Whether to remove stop words. This is a slow step.
stopwords_to_do = True
# Whether to lemmatize the sentences once stop words have been removed. This is a slow step (roughly 8 minutes).
lemmatize_to_do = True
# Whether to compute the BLEU score for the whole corpus. This is a very slow step (roughly 10 minutes for the 10 dictionaries).
bleu_score_to_do = True
# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if (first_line + max_lines) > 137860:
    max_lines = max(137860 - first_line, 0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50

download('punkt')
if st.session_state.Cloud == 0:
    download('averaged_perceptron_tagger')
    with contextlib.redirect_stdout(open(os.devnull, "w")):
        download('stopwords')


@st.cache_data
def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert upper case letters to lower case
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data), first_line + max_lines)]


@st.cache_data
def load_preprocessed_data(path, data_type):
    # data_type: 0 = plain text (one sentence per line), 1 = CSV loaded into a DataFrame,
    #            2 = one Python expression per line (eval), 3 = one Python literal per line (ast.literal_eval)
    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
            data = data.split('\n')
            if data_type == 0:
                data = data[:-1]
            elif data_type == 2:
                data = [eval(i) for i in data[:-1]]
            elif data_type == 3:
                data2 = []
                for d in data[:-1]:
                    data2.append(literal_eval(d))
                data = data2
        return data


@st.cache_data
def load_all_preprocessed_data(lang):
    txt = load_preprocessed_data(dataPath + '/preprocess_txt_' + lang, 0)
    txt_split = load_preprocessed_data(dataPath + '/preprocess_txt_split_' + lang, 3)
    txt_lem = load_preprocessed_data(dataPath + '/preprocess_txt_lem_' + lang, 0)
    txt_wo_stopword = load_preprocessed_data(dataPath + '/preprocess_txt_wo_stopword_' + lang, 0)
    df_count_word = pd.concat([load_preprocessed_data(dataPath + '/preprocess_df_count_word1_' + lang, 1),
                               load_preprocessed_data(dataPath + '/preprocess_df_count_word2_' + lang, 1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word


# Load the full texts in both languages
full_txt_en = load_data(dataPath + '/small_vocab_en')
full_txt_fr = load_data(dataPath + '/small_vocab_fr')

# Load the preprocessing results, if st.session_state.reCalcule == False
if not st.session_state.reCalcule:
    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
else:
    def remove_stopwords(text, lang):
        # stop_words contains the set of stop words for the given language
        stop_words = set(stopwords.words(lang))
        filtered_sentence = []
        for word in text.split():
            if word not in stop_words:
                filtered_sentence.append(word)
        return " ".join(filtered_sentence)
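    # Example (assuming the English NLTK stop word list has been downloaded):
    #   remove_stopwords("the truck is driven by the man", "english")  ->  "truck driven man"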
    def clean_undesirable_from_text(sentence, lang):
        # Removing URLs
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)

        # Removing punctuation (we keep the . character)
        REPLACEMENTS = [("..", "."),
                        (",", ""),
                        (";", ""),
                        (":", ""),
                        ("?", ""),
                        ('"', ""),
                        ("-", " "),
                        ("it's", "it is"),
                        ("isn't", "is not"),
                        ("'", " ")]
        for old, new in REPLACEMENTS:
            sentence = sentence.replace(old, new)

        # Removing digits
        sentence = re.sub(r'[0-9]', '', sentence)

        # Removing additional spaces
        sentence = re.sub(' +', ' ', sentence)
        return sentence

    def clean_untranslated_sentence(data1, data2):
        # Remove sentence pairs that were left untranslated
        # (assumption: a pair is dropped when the two sentences are identical).
        i = 0
        while i < len(data1):
            if data1[i] == data2[i]:
                del data1[i]
                del data2[i]
            else:
                i += 1
        return data1, data2


# Langue, first_line and max_lines are expected to be set by the page's selection
# widgets (language choice and line-range inputs) before this point.
if (first_line + max_lines) > 137860:
    max_lines = max(137860 - first_line, 0)
last_line = first_line + max_lines

if Langue == 'Anglais':
    st.dataframe(pd.DataFrame(data=full_txt_en, columns=['Texte']).loc[first_line:last_line - 1].head(max_lines_to_display),
                 width=800)
else:
    st.dataframe(pd.DataFrame(data=full_txt_fr, columns=['Texte']).loc[first_line:last_line - 1].head(max_lines_to_display),
                 width=800)
st.write("")

# Load the selected texts in both languages (max number of lines = max_lines)
txt_en = full_txt_en[first_line:last_line]
txt_fr = full_txt_fr[first_line:last_line]

# Removal of untranslated sentences
# txt_en, txt_fr = clean_untranslated_sentence(txt_en, txt_fr)

if not st.session_state.reCalcule:
    txt_split_en = full_txt_split_en[first_line:last_line]
    txt_lem_en = full_txt_lem_en[first_line:last_line]
    txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
    df_count_word_en = full_df_count_word_en.loc[first_line:last_line - 1]
    txt_split_fr = full_txt_split_fr[first_line:last_line]
    txt_lem_fr = full_txt_lem_fr[first_line:last_line]
    txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
    df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line - 1]

# Run the text preprocessing, which cleans the sentences, splits them into words
# and counts the number of occurrences of each word in each sentence
if Langue == 'Anglais':
    st.write("## **" + tr("Préprocessing de small_vocab_en") + " :**\n")
    if max_lines > 10000:
        with st.status(":sunglasses:", expanded=True):
            if st.session_state.reCalcule:
                txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
            display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
    else:
        if st.session_state.reCalcule:
            txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
        display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
else:
    st.write("## **" + tr("Préprocessing de small_vocab_fr") + " :**\n")
    if max_lines > 10000:
        with st.status(":sunglasses:", expanded=True):
            if st.session_state.reCalcule:
                txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
            display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
    else:
        if st.session_state.reCalcule:
            txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
        display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
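# NOTE (assumption): preprocess_txt and display_preprocess_results are defined elsewhere in
# this page. Based on how they are called above, preprocess_txt is expected to take a list of
# sentences and a language code and return:
#   txt             : cleaned sentences (list of str)
#   corpus          : vocabulary of the corpus
#   txt_split       : sentences split into word tokens (list of list of str)
#   txt_lem         : lemmatized sentences (list of str)
#   txt_wo_stopword : sentences with stop words removed (list of str)
#   df_count_word   : per-sentence word-occurrence counts (DataFrame)
#   sent_len, sent_wo_sw_len, sent_lem_len : sentence lengths before preprocessing,
#                                            without stop words, and after lemmatization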