import streamlit as st
import os
import numpy as np
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
# import contextlib
# import re
# import nltk
# from nltk.corpus import stopwords

title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"

# Indicate whether stop words should be removed. This is a long process.
stopwords_to_do = True
# Indicate whether the sentences should be lemmatized once the stop words are removed. This is a long process (approximately 8 minutes).
lemmatize_to_do = True
# Indicate whether the BLEU score should be computed for the whole corpus. This is a very long process (approximately 10 minutes for the 10 dictionaries).
bleu_score_to_do = True
# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line + max_lines) > 137860):
    max_lines = max(137860 - first_line, 0)
# Maximum number of lines to display in the DataFrames
max_lines_to_display = 50

download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')


@st.cache_data
def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert uppercase letters to lowercase
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data), first_line + max_lines)]


@st.cache_data
def load_preprocessed_data(path, data_type):
    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type == 0:
            data = data[:-1]
        elif data_type == 2:
            data = [eval(i) for i in data[:-1]]
        elif data_type == 3:
            data2 = []
            for d in data[:-1]:
                data2.append(literal_eval(d))
            data = data2
        return data


# @st.cache_data(ttl='1h00s')
def load_all_preprocessed_data(lang):
    txt = load_preprocessed_data('data/preprocess_txt_' + lang, 0)
    txt_split = load_preprocessed_data('data/preprocess_txt_split_' + lang, 3)
    txt_lem = load_preprocessed_data('data/preprocess_txt_lem_' + lang, 0)
    txt_wo_stopword = load_preprocessed_data('data/preprocess_txt_wo_stopword_' + lang, 0)
    df_count_word = pd.concat([load_preprocessed_data('data/preprocess_df_count_word1_' + lang, 1),
                               load_preprocessed_data('data/preprocess_df_count_word2_' + lang, 1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word


# Load the full texts in both languages
full_txt_en = load_data('data/small_vocab_en')
full_txt_fr = load_data('data/small_vocab_fr')

# Load the preprocessing results
_, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
_, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
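
# --- Illustrative sketch (not part of the original page) ----------------------
# The bleu_score_to_do flag above refers to a corpus-level BLEU computation
# (roughly 10 minutes for the whole corpus). This is a minimal sketch of what
# such a computation could look like with nltk; the function name and its
# arguments are hypothetical and do not come from the project's own code.
def corpus_bleu_sketch(reference_sentences, candidate_sentences):
    """Corpus-level BLEU for two aligned lists of sentences (plain strings)."""
    from nltk.translate.bleu_score import corpus_bleu
    # corpus_bleu expects, for each candidate, a list of tokenized reference sentences
    references = [[word_tokenize(ref)] for ref in reference_sentences]
    candidates = [word_tokenize(cand) for cand in candidate_sentences]
    return corpus_bleu(references, candidates)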

"""
def remove_stopwords(text, lang):
    # stop_words will contain the set of all stop words for the given language
    stop_words = set(stopwords.words(lang))
    filtered_sentence = []
    for word in text.split():
        if word not in stop_words:
            filtered_sentence.append(word)
    return " ".join(filtered_sentence)


def clean_undesirable_from_text(sentence, lang):
    # Removing URLs
    sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)

    # Removing punctuation (we keep the . character)
    REPLACEMENTS = [("..", "."),
                    (",", ""),
                    (";", ""),
                    (":", ""),
                    ("?", ""),
                    ('"', ""),
                    ("-", " "),
                    ("it's", "it is"),
                    ("isn't", "is not"),
                    ("'", " ")
                    ]
    for old, new in REPLACEMENTS:
        sentence = sentence.replace(old, new)

    # Removing digits
    sentence = re.sub(r'[0-9]', '', sentence)

    # Removing additional spaces
    sentence = re.sub(' +', ' ', sentence)
    return sentence


def clean_untranslated_sentence(data1, data2):
    i = 0
    while i
"""

if ((first_line + max_lines) > 137860):
    max_lines = max(137860 - first_line, 0)
# if ((max_lines - first_line) > 1000):
#     lemmatize_to_do = True
# else:
#     lemmatize_to_do = False
last_line = first_line + max_lines

if (Langue == 'Anglais'):
    st.dataframe(pd.DataFrame(data=full_txt_en, columns=['Texte']).loc[first_line:last_line - 1].head(max_lines_to_display),
                 width=800)
else:
    st.dataframe(pd.DataFrame(data=full_txt_fr, columns=['Texte']).loc[first_line:last_line - 1].head(max_lines_to_display),
                 width=800)
st.write("")

# Load the preprocessing results (maximum number of lines = max_lines)
txt_en = full_txt_en[first_line:last_line]
txt_split_en = full_txt_split_en[first_line:last_line]
txt_lem_en = full_txt_lem_en[first_line:last_line]
txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
df_count_word_en = full_df_count_word_en.loc[first_line:last_line - 1]
txt_fr = full_txt_fr[first_line:last_line]
txt_split_fr = full_txt_split_fr[first_line:last_line]
txt_lem_fr = full_txt_lem_fr[first_line:last_line]
txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line - 1]

# Run the text preprocessing, which cleans the sentences, splits them into words
# and counts the number of occurrences of each word in every sentence
if (Langue == 'Anglais'):
    st.write("## **Préprocessing de small_vocab_en :**\n")
    if max_lines > 10000:
        with st.status(":sunglasses:", expanded=True):
            # txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
            display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
    else:
        # txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
        display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
else:
    st.write("## **Préprocessing de small_vocab_fr :**\n")
    if max_lines > 10000:
        with st.status(":sunglasses:", expanded=True):
            # txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
            display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
    else:
        # txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
        display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
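
# --- Illustrative sketch (not part of the original page) ----------------------
# The df_count_word DataFrames displayed above hold per-sentence word counts
# loaded from pre-computed CSV files. This is a minimal sketch of how such a
# matrix could be rebuilt from the split sentences using collections.Counter;
# the function name is hypothetical and the real preprocessing script that
# produced those CSV files is not shown on this page.
def count_words_sketch(split_sentences):
    """One row per sentence, one column per word, values = occurrence counts."""
    counters = [collections.Counter(words) for words in split_sentences]
    return pd.DataFrame(counters).fillna(0).astype(int)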

# Might be used later....
# DEFAULT_TEXT = """Google was founded in September 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University in California. Together they own about 14 percent of its shares and control 56 percent of the stockholder voting power through supervoting stock. They incorporated Google as a California privately held company on September 4, 1998, in California. Google was then reincorporated in Delaware on October 22, 2002."""
"""
spacy_model = "en_core_web_sm"
text = st.text_area("Text to analyze", DEFAULT_TEXT, height=200)
doc = spacy_streamlit.process_text(spacy_model, text)
spacy_streamlit.visualize_ner(
    doc,
    labels=["PERSON", "DATE", "GPE"],
    show_table=False,
    title="Persons, dates and locations",
)
st.text(f"Analyzed using spaCy model {spacy_model}")
"""
# models = ["en_core_web_sm"]
# default_text = "Google was founded in September 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University in California. Together they own about 14 percent of its shares and control 56 percent of the stockholder voting power through supervoting stock. They incorporated Google as a California privately held company on September 4, 1998, in California. Google was then reincorporated in Delaware on October 22, 2002."
# spacy_streamlit.visualize(models, default_text)
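
# --- Illustrative sketch (not part of the original page) ----------------------
# The lemmatize_to_do flag defined at the top refers to a lemmatization step
# (approximately 8 minutes) whose implementation is not shown here. This is a
# minimal sketch of English sentence lemmatization with nltk's WordNetLemmatizer;
# the function name is hypothetical and nltk.download('wordnet') would be needed
# beforehand.
def lemmatize_sentence_sketch(sentence):
    """Lemmatize each token of an English sentence and join the results back."""
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(sentence))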