Commit 4df9e3a by Demosthene-OR
1 Parent(s): c24ac6c
Initialization 2
- app.py +113 -0
- assets/BOW.jpg +0 -0
- assets/coeur.png +0 -0
- assets/deepnlp_graph1.png +0 -0
- assets/deepnlp_graph12.png +0 -0
- assets/deepnlp_graph3.png +0 -0
- assets/demosthene_logo.png +0 -0
- assets/faviconV2.png +0 -0
- assets/fig_schapley0.png +0 -0
- assets/fig_schapley1.png +0 -0
- assets/fig_schapley2.png +0 -0
- assets/fig_schapley3.png +0 -0
- assets/fig_schapley4.png +0 -0
- assets/fig_schapley5.png +0 -0
- assets/fig_schapley6.png +0 -0
- assets/fig_schapley7.png +0 -0
- assets/fig_schapley8.png +0 -0
- assets/fig_schapley_recap0.png +0 -0
- assets/fig_schapley_recap1.png +0 -0
- assets/fig_schapley_recap2.png +0 -0
- assets/fig_schapley_recap3.png +0 -0
- assets/fig_schapley_recap4.png +0 -0
- assets/fig_schapley_recap5.png +0 -0
- assets/fig_schapley_recap6.png +0 -0
- assets/fig_schapley_recap7.png +0 -0
- assets/fig_schapley_recap8.png +0 -0
- assets/formule_proba_naive_bayes.png +0 -0
- assets/github-logo.png +0 -0
- assets/linkedin-logo-black.png +0 -0
- assets/linkedin-logo.png +0 -0
- assets/logo-datascientest.png +0 -0
- assets/sample-image.jpg +0 -0
- assets/tough-communication.gif +0 -0
- config.py +32 -0
- images/coeur.png +0 -0
- images/demosthene_tete.svg +1 -0
- member.py +19 -0
- packages.txt +5 -0
- requirements.txt +35 -0
- style.css +129 -0
- tabs/custom_vectorizer.py +14 -0
- tabs/data_viz_tab.py +404 -0
- tabs/exploration_tab.py +424 -0
- tabs/game_tab.py +235 -0
- tabs/id_lang_tab.py +476 -0
- tabs/intro.py +93 -0
- tabs/modelisation_dict_tab.py +277 -0
- tabs/modelisation_seq2seq_tab.py +606 -0
- translate_app.py +27 -0
app.py
ADDED
@@ -0,0 +1,113 @@
import streamlit as st
import os.path
from collections import OrderedDict
from streamlit_option_menu import option_menu
# Define TITLE, TEAM_MEMBERS and PROMOTION values, in config.py.
import config
from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor
import os
from translate_app import tr

# Initialize a session state variable that tracks the sidebar state (either 'expanded' or 'collapsed').
if 'sidebar_state' not in st.session_state:
    st.session_state.sidebar_state = 'expanded'
else:
    st.session_state.sidebar_state = 'auto'

st.set_page_config(
    page_title=config.TITLE,
    page_icon="assets/faviconV2.png",
    initial_sidebar_state=st.session_state.sidebar_state
)

# If the application runs locally, session_state.Cloud == 0
# If it runs on the Hugging Face cloud, == 1
st.session_state.Cloud = 1
# Depending on the value of the previous variable, the data path is different
if st.session_state.Cloud == 0:
    st.session_state.DataPath = "../data"
    st.session_state.ImagePath = "../images"
    st.session_state.reCalcule = False
else:
    st.session_state.DataPath = "data"
    st.session_state.ImagePath = "images"
    st.session_state.reCalcule = False

# Define the root folders depending on local/cloud run
# thisfile = os.path.abspath(__file__)
# if ('/' in thisfile):
#     os.chdir(os.path.dirname(thisfile))

# Required for the Windows 11 version
if st.session_state.Cloud == 0:
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# Tabs in the ./tabs folder, imported here.
from tabs import intro, exploration_tab, data_viz_tab, id_lang_tab, modelisation_dict_tab, modelisation_seq2seq_tab, game_tab


with open("style.css", "r") as f:
    style = f.read()

st.markdown(f"<style>{style}</style>", unsafe_allow_html=True)


# Add a tab to this ordered dict by
# passing the name shown in the sidebar as key and the imported tab
# as value, as follows:
TABS = OrderedDict(
    [
        (tr(intro.sidebar_name), intro),
        (tr(exploration_tab.sidebar_name), exploration_tab),
        (tr(data_viz_tab.sidebar_name), data_viz_tab),
        (tr(id_lang_tab.sidebar_name), id_lang_tab),
        (tr(modelisation_dict_tab.sidebar_name), modelisation_dict_tab),
        (tr(modelisation_seq2seq_tab.sidebar_name), modelisation_seq2seq_tab),
        (tr(game_tab.sidebar_name), game_tab),
    ]
)

# Using the translate module
# lang_tgt = ['fr','en','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
# label_lang = ['Français', 'Anglais / English','Afrikaans','Akan','Albanais','Allemand / Deutsch','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol / Español','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien / Italiano','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais / Nederlands','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']

# Using the deep_translator module
lang_tgt = ['fr', 'en', 'af', 'ak', 'sq', 'de', 'am', 'en', 'ar', 'hy', 'as', 'ay', 'az', 'bm', 'eu', 'bn', 'bho', 'be', 'my', 'bs', 'bg', 'ca', 'ceb', 'ny', 'zh-CN', 'zh-TW', 'si', 'ko', 'co', 'ht', 'hr', 'da', 'doi', 'gd', 'es', 'eo', 'et', 'ee', 'fi', 'fr', 'fy', 'gl', 'cy', 'lg', 'ka', 'el', 'gn', 'gu', 'ha', 'haw', 'iw', 'hi', 'hmn', 'hu', 'ig', 'ilo', 'id', 'ga', 'is', 'it', 'ja', 'jw', 'kn', 'kk', 'km', 'rw', 'ky', 'gom', 'kri', 'ku', 'ckb', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mai', 'ms', 'ml', 'dv', 'mg', 'mt', 'mi', 'mr', 'mni-Mtei', 'lus', 'mn', 'nl', 'ne', 'no','or', 'om', 'ug', 'ur', 'uz', 'ps', 'pa', 'fa', 'pl', 'pt', 'qu', 'ro', 'ru', 'sm', 'sa', 'nso', 'sr', 'sn', 'sd', 'sk', 'sl', 'so', 'st', 'su', 'sv', 'sw', 'tg', 'tl', 'ta', 'tt', 'cs', 'te', 'th', 'ti', 'ts', 'tr', 'tk', 'uk', 'vi', 'xh', 'yi', 'yo', 'zu']
label_lang = ['Français', 'Anglais / English','Afrikaans','Akan','Albanais','Allemand / Deutsch','Amharique','Anglais','Arabe','Arménien','Assamais','Aymara','Azéri','Bambara','Basque','Bengali','Bhojpuri','Biélorusse','Birman','Bosnien','Bulgare','Catalan','Cebuano','Chichewa','Chinois (simplifié)','Chinois (traditionnel)','Cingalais','Coréen','Corse','Créole haïtien','Croate','Danois','Dogri','Écossais','Espagnol / Español','Espéranto','Estonien','Ewe','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grec moderne','Guarani','Gujarati','Haoussa','Hawaïen','Hébreu','Hindi','Hmong','Hongrois','Igbo','Ilocano','Indonésien','Irlandais','Islandais','Italien / Italiano','Japonais','Javanais','Kannada','Kazakh','Khmer','Kinyarwanda','Kirghiz','Konkani','Krio','Kurde','Kurde (Sorani)','Lao','Latin','Letton','Lingala','Lituanien','Luxembourgeois','Macédonien','Maithili','Malais','Malayalam','Maldivien','Malgache','Maltais','Maori de Nouvelle-Zélande','Marathi','Meiteilon (Manipuri)','Mizo','Mongol','Néerlandais / Nederlands','Népalais','Norvégien','Oriya','Oromo','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Quechua','Roumain','Russe','Samoan','Sanskrit','Sepedi','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','Sotho du Sud','Soundanais','Suédois','Swahili','Tadjik','Tagalog','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tigrigna','Tsonga','Turc','Turkmène','Ukrainien','Vietnamien','Xhosa','Yiddish','Yoruba','Zulu']

@st.cache_data
def find_lang_label(lang_sel):
    global lang_tgt, label_lang
    return label_lang[lang_tgt.index(lang_sel)]

def run():

    st.sidebar.image(
        "assets/demosthene_logo.png",
        width=270,
    )
    with st.sidebar:
        tab_name = option_menu(None, list(TABS.keys()),
                               # icons=['house', 'bi-binoculars', 'bi bi-graph-up', 'bi-chat-right-text','bi-book', 'bi-body-text'], menu_icon="cast", default_index=0,
                               icons=['house', 'binoculars', 'graph-up', 'search','book', 'chat-right-text','controller'], menu_icon="cast", default_index=0,
                               styles={"container": {"padding": "0!important","background-color": "#10b8dd", "border-radius": "0!important"},
                                       "nav-link": {"font-size": "1rem", "text-align": "left", "margin":"0em", "padding": "0em",
                                                    "padding-left": "0.2em", "--hover-color": "#eee", "font-weight": "400",
                                                    "font-family": "Source Sans Pro, sans-serif"}
                                       })
    # tab_name = st.sidebar.radio("", list(TABS.keys()), 0)
    st.sidebar.markdown("---")
    st.sidebar.markdown(f"## {config.PROMOTION}")

    st.sidebar.markdown("### Team members:")
    for member in config.TEAM_MEMBERS:
        st.sidebar.markdown(member.sidebar_markdown(), unsafe_allow_html=True)

    with st.sidebar:
        st.selectbox("langue:", lang_tgt, format_func=find_lang_label, key="Language", label_visibility="hidden")

    tab = TABS[tab_name]
    tab.run()

if __name__ == "__main__":
    run()
assets/BOW.jpg
ADDED
assets/coeur.png
ADDED
assets/deepnlp_graph1.png
ADDED
assets/deepnlp_graph12.png
ADDED
assets/deepnlp_graph3.png
ADDED
assets/demosthene_logo.png
ADDED
assets/faviconV2.png
ADDED
assets/fig_schapley0.png
ADDED
assets/fig_schapley1.png
ADDED
assets/fig_schapley2.png
ADDED
assets/fig_schapley3.png
ADDED
assets/fig_schapley4.png
ADDED
assets/fig_schapley5.png
ADDED
assets/fig_schapley6.png
ADDED
assets/fig_schapley7.png
ADDED
assets/fig_schapley8.png
ADDED
assets/fig_schapley_recap0.png
ADDED
assets/fig_schapley_recap1.png
ADDED
assets/fig_schapley_recap2.png
ADDED
assets/fig_schapley_recap3.png
ADDED
assets/fig_schapley_recap4.png
ADDED
assets/fig_schapley_recap5.png
ADDED
assets/fig_schapley_recap6.png
ADDED
assets/fig_schapley_recap7.png
ADDED
assets/fig_schapley_recap8.png
ADDED
assets/formule_proba_naive_bayes.png
ADDED
assets/github-logo.png
ADDED
assets/linkedin-logo-black.png
ADDED
assets/linkedin-logo.png
ADDED
assets/logo-datascientest.png
ADDED
assets/sample-image.jpg
ADDED
assets/tough-communication.gif
ADDED
config.py
ADDED
@@ -0,0 +1,32 @@
"""

Config file for Streamlit App

"""

from member import Member


TITLE = "Système de traduction adapté aux lunettes connectées"

TEAM_MEMBERS = [
    Member(
        name="Keyne Dupont ",
        linkedin_url="https://www.linkedin.com/in/keyne-dupont/",
        github_url=None,
    ),
    Member(
        name="Tia Ratsimbason",
        linkedin_url="https://www.linkedin.com/in/tia-ratsimbason-42110887/",
        github_url=None,
    ),
    Member(
        name="Olivier Renouard",
        linkedin_url="https://www.linkedin.com/in/olivier-renouard/",
        github_url="https://github.com/Demosthene-OR/AVR23_CDS_Text_translation",
    )


]

PROMOTION = "Promotion Continuous - Data Scientist - April 2023"
images/coeur.png
ADDED
images/demosthene_tete.svg
ADDED
member.py
ADDED
@@ -0,0 +1,19 @@
class Member:
    def __init__(
        self, name: str, linkedin_url: str = None, github_url: str = None
    ) -> None:
        self.name = name
        self.linkedin_url = linkedin_url
        self.github_url = github_url

    def sidebar_markdown(self):

        markdown = f'<b style="display: inline-block; vertical-align: middle; height: 100%">{self.name}</b>'

        if self.linkedin_url is not None:
            markdown += f' <a href={self.linkedin_url} target="_blank"><img src="https://dst-studio-template.s3.eu-west-3.amazonaws.com/linkedin-logo-black.png" alt="linkedin" width="25" style="vertical-align: middle; margin-left: 5px"/></a> '

        if self.github_url is not None:
            markdown += f' <a href={self.github_url} target="_blank"><img src="https://dst-studio-template.s3.eu-west-3.amazonaws.com/github-logo.png" alt="github" width="20" style="vertical-align: middle; margin-left: 5px"/></a> '

        return markdown
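(Editor's note, not part of the commit: a minimal usage sketch of this class. The Member instances declared in config.py are rendered in the sidebar by app.py roughly as below; the member name here is hypothetical.)

from member import Member
import streamlit as st

m = Member(name="Jane Doe", github_url="https://github.com/janedoe")  # hypothetical example member
# sidebar_markdown() returns raw HTML, so unsafe_allow_html=True is required for the logos to render
st.sidebar.markdown(m.sidebar_markdown(), unsafe_allow_html=True)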
packages.txt
ADDED
@@ -0,0 +1,5 @@
build-essential
libasound-dev
portaudio19-dev
python3-pyaudio
graphviz
requirements.txt
ADDED
@@ -0,0 +1,35 @@
streamlit==1.26.0
pandas==2.2.1
matplotlib==3.8.2
ipython==8.21.0
numpy==1.23.5
seaborn==0.13.2
nltk==3.8.1
scikit-learn==1.1.3
gensim==4.3.2
sacrebleu==2.4.0
spacy==3.6.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0.tar.gz
pillow==9.5.0
wordcloud==1.9.3
networkx==2.7.0
transformers==4.37.2
keras-nlp==0.6.1
keras==2.12.0
tensorflow==2.12.0
sentencepiece==0.1.99
openai-whisper==20231117
torch==2.2.0
speechrecognition==3.10.1
audio_recorder_streamlit==0.0.8
whisper==1.1.10
wavio==0.0.8
filesplit==4.0.1
regex==2023.12.25
pydot==2.0.0
graphviz==0.20.1
gTTS==2.5.1
https://files.pythonhosted.org/packages/cc/58/96aff0e5cb8b59c06232ea7e249ed902d04ec89f52636f5be06ceb0855fe/extra_streamlit_components-0.1.60-py3-none-any.whl
streamlit-option-menu==0.3.12
deep-translator==1.11.4
style.css
ADDED
@@ -0,0 +1,129 @@
h1 {
    padding-top: 0rem;
    padding-bottom: 0rem;
    margin-top: 6px;
}
h2 {
    padding-top: 0.75rem;
    padding-bottom: 0.5rem;
}

/* The following rule is needed because the streamlit_option_menu module "breaks" the CSS rules below */
@media (prefers-color-scheme: dark) {
    .st-cc {
        color: #fff!important; /* Text color in dark mode */
    }
    .st-cg:hover {
        color: rgb(255, 75, 75)!important; /* Text color in dark mode */
    }
    section[data-testid="stSidebar"] .stSelectbox .st-cc {
        color: rgb(255, 75, 75)!important;
        font-weight: bold;
    }
}

p {
    margin-bottom: 0.1rem;
}

code {
    color: #1ec3bc;
}

#MainMenu {
    display: none;
}

div[data-testid="stDecoration"] {
    display: none;
}

footer {
    display: none;
}

/* Radio buttons */

.st-cc {
    color: black;
    font-weight: 500;
}

/* Sidebar */

.css-1544g2n {
    padding-top: 1rem;
}

.css-10oheav {
    padding-top: 3rem;
}

.css-ue6h4q {
    min-height: 0.5rem;
}

section[data-testid="stSidebar"] > div {
    background-color: #10b8dd;
    padding-top: 1rem;
    padding-left: 0.5rem;
}

section[data-testid="stSidebar"] button[title="View fullscreen"] {
    display: none;
}

section[data-testid="stSidebar"] button[kind="icon"] {
    display: none;
}

section[data-testid="stSidebar"] .st-bk {
    background-color: #10b8dd;
}

section[data-testid="stSidebar"] .st-c0 {
    /* background-color: #10b8dd; */
}

section[data-testid="stSidebar"] hr {
    margin-top: 30px;
    border-color: white;
    width: 50px;
}

section[data-testid="stSidebar"] h2 {
    color: white;
}

section[data-testid="stSidebar"] .stSelectbox .st-bk {
    background-color: #a0d3de;
}

section[data-testid="stSidebar"] .stSelectbox .st-cc {
    color: rgb(255, 75, 75);
    font-weight: bold;
}

/* Images */

button[title="View fullscreen"] {
    display: none;
}

/* hr */

hr {
    width: 700px;
    border-width: 5px;
    border-color: #10b8dd;
    margin-top: 0px;
    margin-bottom: 1em;
    max-width: 100%;
}

/* First Page */

section[tabindex="0"] .block-container {
    padding-top: 0px;
    padding-bottom: 0px;
}
tabs/custom_vectorizer.py
ADDED
@@ -0,0 +1,14 @@
# The next 2 functions are needed in order to serialize these CountVectorizer parameters,
# and thus save the vectorizer for later use without needing X_train to re-initialize it
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

def custom_tokenizer(text):
    global tokenizer

    tokens = tokenizer.encode(text)  # This splits the text into tokens
    return tokens

def custom_preprocessor(text):
    return text
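(Editor's note, not part of the commit: the comment above says these module-level callables exist so a fitted CountVectorizer can be serialized. A minimal sketch of that intent is shown below; the vectorizer construction, the sample sentence and the file name are assumptions, since that code is not included in this diff.)

# Illustrative sketch only — assumed usage, not part of this commit.
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor

vectorizer = CountVectorizer(tokenizer=custom_tokenizer, preprocessor=custom_preprocessor)
vectorizer.fit(["new jersey is sometimes quiet during autumn"])  # hypothetical training sentence
joblib.dump(vectorizer, "count_vectorizer.joblib")  # picklable because both callables are importable by name
restored = joblib.load("count_vectorizer.joblib")   # reusable later without re-fitting on X_train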
tabs/data_viz_tab.py
ADDED
@@ -0,0 +1,404 @@
import streamlit as st
from PIL import Image
import os
import ast
import contextlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from gensim import corpora
import networkx as nx
from sklearn.manifold import TSNE
from gensim.models import KeyedVectors
from translate_app import tr

title = "Data Vizualization"
sidebar_name = "Data Vizualization"
dataPath = st.session_state.DataPath

with contextlib.redirect_stdout(open(os.devnull, "w")):
    nltk.download('stopwords')

# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line+max_lines)>137860):
    max_lines = max(137860-first_line ,0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50

@st.cache_data
def load_data(path):

    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert uppercase letters to lowercase
    data = data.lower()

    data = data.split('\n')
    return data[first_line:min(len(data),first_line+max_lines)]

@st.cache_data
def load_preprocessed_data(path,data_type):

    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type==0:
            data=data[:-1]
        elif data_type == 2:
            data=[eval(i) for i in data[:-1]]
        elif data_type ==3:
            data2 = []
            for d in data[:-1]:
                data2.append(ast.literal_eval(d))
            data=data2
        return data

@st.cache_data
def load_all_preprocessed_data(lang):
    txt =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
    corpus =load_preprocessed_data(dataPath+'/preprocess_corpus_'+lang,0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
    sent_len =load_preprocessed_data(dataPath+'/preprocess_sent_len_'+lang,2)
    vec_model= KeyedVectors.load_word2vec_format(dataPath+'/mini.wiki.'+lang+'.align.vec')
    return txt, corpus, txt_split, df_count_word,sent_len, vec_model

# Load the full texts in both languages
full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en = load_all_preprocessed_data('en')
full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr = load_all_preprocessed_data('fr')


def plot_word_cloud(text, title, masque, stop_words, background_color = "white"):

    mask_coloring = np.array(Image.open(str(masque)))
    # Define the word cloud mask
    wc = WordCloud(background_color=background_color, max_words=200,
                   stopwords=stop_words, mask = mask_coloring,
                   max_font_size=50, random_state=42)
    # Generate and display the word cloud
    fig=plt.figure(figsize= (20,10))
    plt.title(tr(title), fontsize=25, color="green")
    wc.generate(text)

    # getting current axes
    a = plt.gca()

    # set visibility of x-axis as False
    xax = a.axes.get_xaxis()
    xax = xax.set_visible(False)

    # set visibility of y-axis as False
    yax = a.axes.get_yaxis()
    yax = yax.set_visible(False)

    plt.imshow(wc)
    # plt.show()
    st.pyplot(fig)

def drop_df_null_col(df):
    # Check if all values in each column are 0
    columns_to_drop = df.columns[df.eq(0).all()]
    # Drop the columns with all values as 0
    return df.drop(columns=columns_to_drop)

def calcul_occurence(df_count_word):
    nb_occurences = pd.DataFrame(df_count_word.sum().sort_values(axis=0,ascending=False))
    nb_occurences.columns = ['occurences']
    nb_occurences.index.name = 'mot'
    nb_occurences['mots'] = nb_occurences.index
    return nb_occurences

def dist_frequence_mots(df_count_word):

    df_count_word = drop_df_null_col(df_count_word)
    nb_occurences = calcul_occurence(df_count_word)

    sns.set()
    fig = plt.figure()  # figsize=(4,4)
    plt.title(tr("Nombre d'apparitions des mots"), fontsize=16)

    chart = sns.barplot(x='mots',y='occurences',data=nb_occurences.iloc[:40]);
    chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
    st.pyplot(fig)

def dist_longueur_phrase(sent_len,sent_len2, lang1, lang2 ):
    '''
    fig = px.histogram(sent_len, nbins=16, range_x=[3, 18],labels={'count': 'Count', 'variable': 'Nb de mots'},
                       color_discrete_sequence=['rgb(200, 0, 0)'],  # Color of the histogram bars
                       opacity=0.7)
    fig.update_traces(marker=dict(color='rgb(200, 0, 0)', line=dict(color='white', width=2)), showlegend=False,)
    fig.update_layout(
        title={'text': 'Distribution du nb de mots/phrase', 'y':1.0, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(size=28),  # Adjust the title font size
        xaxis_title=None,
        xaxis=dict(
            title_font=dict(size=30),  # Adjust the X axis font size
            tickfont=dict(size=22),
            showgrid=True, gridcolor='white'
        ),
        yaxis_title='Count',
        yaxis=dict(
            title_font= dict(size=30, color='black'),  # Adjust the Y axis font size
            title_standoff=10,  # Move the axis label away from the chart
            tickfont=dict(size=22),
            showgrid=True, gridcolor='white'
        ),
        margin=dict(l=20, r=20, t=40, b=20),  # Adjust 'r' to move the controls to the right
        # legend=dict(x=1, y=1),  # Legend position: top right
        # width = 600
        height=600,  # Set the figure height
        plot_bgcolor='rgba(220, 220, 220, 0.6)',
    )
    st.plotly_chart(fig, use_container_width=True)
    '''
    df = pd.DataFrame({lang1:sent_len,lang2:sent_len2})
    sns.set()
    fig = plt.figure()  # figsize=(12, 6*row_nb)

    fig.tight_layout()
    chart = sns.histplot(df, color=['r','b'], label=[lang1,lang2], binwidth=1, binrange=[2,22], element="step",
                         common_norm=False, multiple="layer", discrete=True, stat='proportion')
    plt.xticks([2,4,6,8,10,12,14,16,18,20,22])
    chart.set(title=tr('Distribution du nombre de mots sur '+str(len(sent_len))+' phrase(s)'));
    st.pyplot(fig)

    '''
    # fig = ff.create_distplot([sent_len], ['Nb de mots'],bin_size=1, colors=['rgb(200, 0, 0)'])

    distribution = pd.DataFrame({'Nb mots':sent_len, 'Nb phrases':[1]*len(sent_len)})
    fig = px.histogram(distribution, x='Nb mots', y='Nb phrases', marginal="box",range_x=[3, 18], nbins=16, hover_data=distribution.columns)
    fig.update_layout(height=600,title={'text': 'Distribution du nb de mots/phrase', 'y':1.0, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'})
    fig.update_traces(marker=dict(color='rgb(200, 0, 0)', line=dict(color='white', width=2)), showlegend=False,)
    st.plotly_chart(fig, use_container_width=True)
    '''

def find_color(x,min_w,max_w):
    b_min = 0.0*(max_w-min_w)+min_w
    b_max = 0.05*(max_w-min_w)+min_w
    x = max(x,b_min)
    x = min(b_max, x)
    c = (x - b_min)/(b_max-b_min)
    return round(c)

def graphe_co_occurence(txt_split,corpus):

    dic = corpora.Dictionary(txt_split)  # dictionary of all the words remaining in the tokens
    # (Almost) equivalent to the DTM: DFM, Document Feature Matrix
    dfm = [dic.doc2bow(tok) for tok in txt_split]

    mes_labels = [k for k, v in dic.token2id.items()]

    from gensim.matutils import corpus2csc
    term_matrice = corpus2csc(dfm)

    term_matrice = np.dot(term_matrice, term_matrice.T)

    for i in range(len(mes_labels)):
        term_matrice[i,i]= 0
    term_matrice.eliminate_zeros()

    G = nx.from_scipy_sparse_matrix(term_matrice)
    G.add_nodes = dic
    pos=nx.spring_layout(G, k=5)  # node positions

    importance = dict(nx.degree(G))
    importance = [round((v**1.3)) for v in importance.values()]
    edges,weights = zip(*nx.get_edge_attributes(G,'weight').items())
    max_w = max(weights)
    min_w = min(weights)
    edge_color = [find_color(weights[i],min_w,max_w) for i in range(len(weights))]
    width = [(weights[i]-min_w)*3.4/(max_w-min_w)+0.2 for i in range(len(weights))]
    alpha = [(weights[i]-min_w)*0.3/(max_w-min_w)+0.3 for i in range(len(weights))]

    fig = plt.figure();

    nx.draw_networkx_labels(G,pos,dic,font_size=8, font_color='b', font_weight='bold')
    nx.draw_networkx_nodes(G,pos, dic,
                           node_color= importance,  # range(len(importance)), #"tab:red"
                           node_size=importance,
                           cmap=plt.cm.RdYlGn,  # plt.cm.Reds_r
                           alpha=0.4);
    nx.draw_networkx_edges(G,pos,width=width,edge_color=edge_color, alpha=alpha,edge_cmap=plt.cm.RdYlGn)  # [1] * len(width)

    plt.axis("off");
    st.pyplot(fig)

def proximite():
    global vec_model_en,vec_model_fr

    # Creates a TSNE model and plots it
    labels = []
    tokens = []

    nb_words = st.slider(tr('Nombre de mots à afficher')+' :',10,50, value=20)
    df = pd.read_csv(dataPath+'/dict_we_en_fr',header=0,index_col=0, encoding ="utf-8", keep_default_na=False)
    words_en = df.index.to_list()[:nb_words]
    words_fr = df['Francais'].to_list()[:nb_words]

    for word in words_en:
        tokens.append(vec_model_en[word])
        labels.append(word)
    for word in words_fr:
        tokens.append(vec_model_fr[word])
        labels.append(word)
    tokens = pd.DataFrame(tokens)

    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2000, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    fig =plt.figure(figsize=(16, 16))
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    for i in range(len(x)):
        if i<nb_words : color='green'
        else: color='blue'
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     color= color,
                     size=20)
    plt.title(tr("Proximité des mots anglais avec leur traduction"), fontsize=30, color="green")
    plt.legend(loc='best');
    st.pyplot(fig)


def run():

    global max_lines, first_line, Langue
    global full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en
    global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr

    st.write("")
    st.title(tr(title))

    #
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser')+' :',0,137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser')+' :',
                                 options=[1,5,10,15,100, 500, 1000,'Max'])
    if max_lines=='Max':
        max_lines=137860
    if ((first_line+max_lines)>137860):
        max_lines = max(137860-first_line,0)

    # Load the selected texts (max lines = max_lines)
    last_line = first_line+max_lines
    if (Langue == 'Anglais'):
        txt_en = full_txt_en[first_line:last_line]
        corpus_en = full_corpus_en[first_line:last_line]
        txt_split_en = full_txt_split_en[first_line:last_line]
        df_count_word_en =full_df_count_word_en.loc[first_line:last_line-1]
        sent_len_en = full_sent_len_en[first_line:last_line]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
    else:
        txt_fr = full_txt_fr[first_line:last_line]
        corpus_fr = full_corpus_fr[first_line:last_line]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        df_count_word_fr =full_df_count_word_fr.loc[first_line:last_line-1]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
        sent_len_en = full_sent_len_en[first_line:last_line]

    if (Langue=='Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    tab1, tab2, tab3, tab4, tab5 = st.tabs([tr("World Cloud"), tr("Frequence"),tr("Distribution longueur"), tr("Co-occurence"), tr("Proximité")])

    with tab1:
        st.subheader(tr("World Cloud"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mot de taille importante dans une langue,
            apparaissent avec une taille identique dans l'autre langue.
            La traduction mot à mot sera donc peut-être bonne.
            """)
        )
        if (Langue == 'Anglais'):
            text = ""
            # Initialize the stop words variable
            stop_words = set(stopwords.words('english'))
            for e in txt_en : text += e
            plot_word_cloud(text, "English words corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
        else:
            text = ""
            # Initialize the stop words variable
            stop_words = set(stopwords.words('french'))
            for e in txt_fr : text += e
            plot_word_cloud(text,"Mots français du corpus", st.session_state.ImagePath+"/coeur.png", stop_words)

    with tab2:
        st.subheader(tr("Frequence d'apparition des mots"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mot fréquents dans une langue,
            apparaissent aussi fréquemment dans l'autre langue.
            Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
            """)
        )
        if (Langue == 'Anglais'):
            dist_frequence_mots(df_count_word_en)
        else:
            dist_frequence_mots(df_count_word_fr)
    with tab3:
        st.subheader(tr("Distribution des longueurs de phrases"))
        st.markdown(tr(
            """
            Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
            on constate une certaine similitude dans les ditributions de longueur de phrases.
            Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
            """)
        )
        if (Langue == 'Anglais'):
            dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais','Français')
        else:
            dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')
    with tab4:
        st.subheader(tr("Co-occurence des mots dans une phrase"))
        if (Langue == 'Anglais'):
            graphe_co_occurence(txt_split_en[:1000],corpus_en)
        else:
            graphe_co_occurence(txt_split_fr[:1000],corpus_fr)
    with tab5:
        st.subheader(tr("Proximité sémantique des mots (Word Embedding)") )
        st.markdown(tr(
            """
            MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
            notamment des "Word Embedding" multilingues
            Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia pour 30 langues qui ont été alignés dans un espace espace vectoriel unique.
            Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).

            """)
        )
        st.markdown(tr(
            """
            En novembre 2015, l'équipe de recherche de Facebook a créé fastText qui est une extension de la bibliothèque word2vec.
            Elle s'appuie sur Word2Vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
            """)
        )
        st.write("")
        proximite()
tabs/exploration_tab.py
ADDED
@@ -0,0 +1,424 @@
import streamlit as st
import os
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
from translate_app import tr
if st.session_state.Cloud == 0:
    # import nltk
    import contextlib
    import re
    from nltk.corpus import stopwords
    import warnings
    warnings.filterwarnings('ignore')
    # from PIL import Image
    # import time
    # import random

title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"
dataPath = st.session_state.DataPath

# Indicate whether to remove the stop words. This is a long process
stopwords_to_do = True
# Indicate whether to lemmatize the sentences once the stop words are removed. This is a long process (approximately 8 minutes)
lemmatize_to_do = True
# Indicate whether to compute the BLEU score for the whole corpus. This is a very long process (approximately 10 minutes for the 10 dictionaries)
bleu_score_to_do = True
# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line+max_lines)>137860):
    max_lines = max(137860-first_line ,0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50

download('punkt')

if st.session_state.Cloud == 0:
    download('averaged_perceptron_tagger')
    with contextlib.redirect_stdout(open(os.devnull, "w")):
        download('stopwords')

@st.cache_data
def load_data(path):

    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert uppercase letters to lowercase
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data),first_line+max_lines)]

@st.cache_data
def load_preprocessed_data(path,data_type):

    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type==0:
            data=data[:-1]
        elif data_type == 2:
            data=[eval(i) for i in data[:-1]]
        elif data_type ==3:
            data2 = []
            for d in data[:-1]:
                data2.append(literal_eval(d))
            data=data2
        return data

@st.cache_data
def load_all_preprocessed_data(lang):
    txt =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
    txt_lem = load_preprocessed_data(dataPath+'/preprocess_txt_lem_'+lang,0)
    txt_wo_stopword = load_preprocessed_data(dataPath+'/preprocess_txt_wo_stopword_'+lang,0)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word

# Load the full texts in both languages
full_txt_en = load_data(dataPath+'/small_vocab_en')
full_txt_fr = load_data(dataPath+'/small_vocab_fr')

# Load the preprocessing results, if st.session_state.reCalcule == False
if not st.session_state.reCalcule:
    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
else:

    def remove_stopwords(text, lang):
        stop_words = set(stopwords.words(lang))
        # stop_words will contain set all english stopwords
        filtered_sentence = []
        for word in text.split():
            if word not in stop_words:
                filtered_sentence.append(word)
        return " ".join(filtered_sentence)

    def clean_undesirable_from_text(sentence, lang):

        # Removing URLs
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence )

        # Removing Punctuations (we keep the . character)
        REPLACEMENTS = [("..", "."),
                        (",", ""),
                        (";", ""),
                        (":", ""),
                        ("?", ""),
                        ('"', ""),
                        ("-", " "),
                        ("it's", "it is"),
                        ("isn't","is not"),
                        ("'", " ")
                        ]
        for old, new in REPLACEMENTS:
            sentence = sentence.replace(old, new)

        # Removing Digits
        sentence= re.sub(r'[0-9]','',sentence)

        # Removing Additional Spaces
        sentence = re.sub(' +', ' ', sentence)

        return sentence

    def clean_untranslated_sentence(data1, data2):
        i=0
        while i<len(data1):
            if data1[i]==data2[i]:
                data1.pop(i)
                data2.pop(i)
            else: i+=1
        return data1,data2

    import spacy

    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')


    def lemmatize(sentence,lang):
        # Create a Doc object
        if lang=='en':
            nlp=nlp_en
        elif lang=='fr':
            nlp=nlp_fr
        else: return
        doc = nlp(sentence)

        # Create list of tokens from given string
        tokens = []
        for token in doc:
            tokens.append(token)

        lemmatized_sentence = " ".join([token.lemma_ for token in doc])

        return lemmatized_sentence


    def preprocess_txt (data, lang):

        word_count = collections.Counter()
        word_lem_count = collections.Counter()
        word_wosw_count = collections.Counter()
        corpus = []
        data_split = []
        sentence_length = []
        data_split_wo_stopwords = []
        data_length_wo_stopwords = []
        data_lem = []
        data_lem_length = []

        txt_en_one_string= ". ".join([s for s in data])
        txt_en_one_string = txt_en_one_string.replace('..', '.')
        txt_en_one_string = " "+clean_undesirable_from_text(txt_en_one_string, 'lang')
        data = txt_en_one_string.split('.')
        if data[-1]=="":
            data.pop(-1)
        for i in range(len(data)):  # Remove the ' ' characters that start and end sentences
            if data[i][0] == ' ':
                data[i]=data[i][1:]
            if data[i][-1] == ' ':
                data[i]=data[i][:-1]
        nb_phrases = len(data)

        # Build an array of words (sentence_split)
        for i,sentence in enumerate(data):
            sentence_split = word_tokenize(sentence)
            word_count.update(sentence_split)
            data_split.append(sentence_split)
            sentence_length.append(len(sentence_split))

        # Lemmatization and stop word removal are done in batches for speed reasons
        # (instead of doing it sentence by sentence)
        # Both processes need to know the language of the corpus
        if lang == 'en': l='english'
        elif lang=='fr': l='french'
        else: l="unknown"

        if l!="unknown":
            # Lemmatization in 12 batches (no more than 1M characters can be lemmatized at once)
            data_lemmatized=""
            if lemmatize_to_do:
                n_batch = 12
                batch_size = round((nb_phrases/ n_batch)+0.5)
                for i in range(n_batch):
                    to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
                    data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()

                data_lem_for_sw = data_lemmatized[1:]
                data_lemmatized = data_lem_for_sw.split('.')
                for i in range(nb_phrases):
                    data_lem.append(data_lemmatized[i].split())
                    data_lem_length.append(len(data_lemmatized[i].split()))
                    word_lem_count.update(data_lem[-1])

            # Stop word removal in a single batch
            # Stop words are removed from the lemmatized sentences, if that phase took place
            # (wosw stands for "WithOut Stop Words")
            if stopwords_to_do:
                if lemmatize_to_do:
                    data_wosw = remove_stopwords(data_lem_for_sw,l)
                else:
                    data_wosw = remove_stopwords(txt_en_one_string,l)

                data_wosw = data_wosw.split('.')
                for i in range(nb_phrases):
                    data_split_wo_stopwords.append(data_wosw[i].split())
                    data_length_wo_stopwords.append(len(data_wosw[i].split()))
                    word_wosw_count.update(data_split_wo_stopwords[-1])

        corpus = list(word_count.keys())

        # Build a DataFrame txt_n_unique_val:
        #   columns = words
        #   rows = sentences
        #   cell value = number of occurrences of the word in the sentence

        ## BOW
        from sklearn.feature_extraction.text import CountVectorizer
        count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+" )

        # Count the number of occurrences of each word in the sentences
        countvectors = count_vectorizer.fit_transform(data)
        corpus = count_vectorizer.get_feature_names_out()

        txt_n_unique_val= pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)

        return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length


def count_world(data):
    word_count = collections.Counter()
    for sentence in data:
        word_count.update(word_tokenize(sentence))
    corpus = list(word_count.keys())
    nb_mots = sum(word_count.values())
    nb_mots_uniques = len(corpus)
    return corpus, nb_mots, nb_mots_uniques

def display_preprocess_results(lang, data, data_split, data_lem, data_wosw, txt_n_unique_val):

    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    corpus = []
    nb_phrases = len(data)
    corpus, nb_mots, nb_mots_uniques = count_world(data)
    mots_lem, _ , nb_mots_lem = count_world(data_lem)
    mots_wo_sw, _ , nb_mots_wo_stopword = count_world(data_wosw)
    # Identify the columns containing only zeros and drop them
    columns_with_only_zeros = txt_n_unique_val.columns[txt_n_unique_val.eq(0).all()]
    txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)

    # Display the number of words depending on the preprocessing performed
    tab1, tab2, tab3, tab4 = st.tabs([tr("Résumé"), tr("Tokenisation"),tr("Lemmatisation"), tr("Sans Stopword")])
    with tab1:
        st.subheader(tr("Résumé du pré-processing"))
        st.write("**"+tr("Nombre de phrases")+" : "+str(nb_phrases)+"**")
        st.write("**"+tr("Nombre de mots")+" : "+str(nb_mots)+"**")
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab2:
        st.subheader(tr("Tokenisation"))
        st.write(tr('Texte "splited":'))
        st.dataframe(pd.DataFrame(data=data_split, index=range(first_line,last_line)).head(max_lines_to_display).fillna(''), width=800)
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Mots uniques")+":**")
        st.markdown(corpus[:500])
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab3:
        st.subheader(tr("Lemmatisation"))
        if lemmatize_to_do:
            st.dataframe(pd.DataFrame(data=data_lem,columns=[tr('Texte lemmatisé')],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
            # If the language is English, display the word tagging
            # if lang == 'en':
            #     for i in range(min(5,len(data))):
            #         s = str(nltk.pos_tag(data_split[i]))
            #         st.markdown("**Texte avec Tags "+str(i)+"** : "+s)
            st.write("**"+tr("Nombre de mots uniques lemmatisés")+" : "+str(nb_mots_lem)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques lemmatisés:")+"**")
            st.markdown(mots_lem[:500])
    with tab4:
        st.subheader(tr("Sans Stopword"))
        if stopwords_to_do:
            st.dataframe(pd.DataFrame(data=data_wosw,columns=['Texte sans stopwords'],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
            st.write("**"+tr("Nombre de mots uniques sans stop words")+": "+str(nb_mots_wo_stopword)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques sans stop words")+":**")
            st.markdown(mots_wo_sw[:500])


def run():
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    global full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en
    global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr

    st.write("")
    st.title(tr(title))

    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
        """
        Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
        Lors de notre projet, nous avons étudié le dataset small_vocab, proposés par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
        Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
        :red[**Small_vocab**] contient 137 860 phrases en anglais et français.
        """)
        , unsafe_allow_html=True)
    st.markdown(tr(
        """
        Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
        Ces taches sont, par exemple:
        """)
        , unsafe_allow_html=True)
    st.markdown(
        "* "+tr("le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)")+"\n"+ \
        "* "+tr("la :red[**tokenisation**] (découpage du texte en mots)")+"\n"+ \
        "* "+tr("la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les \"variations\" d'un même mot)")+"\n"+ \
        "* "+tr("l'élimination des :red[**mots \"transparents\"**] (sans utilité pour la compréhension, tels que les articles).")+" \n"+ \
        tr("Ce prétraintement se conclut avec la contruction d'un :red[**Bag Of Worlds**], c'est à dire une matrice qui compte le nombre d'apparition de chaque mots (colonne) dans chaque phrase (ligne)")
        , unsafe_allow_html=True)
    #
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser:'),0,137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser:'),
                                 options=[1,5,10,15,100, 500, 1000,'Max'])
    if max_lines=='Max':
        max_lines=137860
    if ((first_line+max_lines)>137860):
        max_lines = max(137860-first_line,0)

    last_line = first_line+max_lines
    if (Langue=='Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    # Load the selected texts in both languages (max lines = max_lines)
    txt_en = full_txt_en[first_line:last_line]
    txt_fr = full_txt_fr[first_line:last_line]

    # Remove the untranslated sentences
    # txt_en, txt_fr = clean_untranslated_sentence(txt_en, txt_fr)

    if not st.session_state.reCalcule:
        txt_split_en = full_txt_split_en[first_line:last_line]
        txt_lem_en = full_txt_lem_en[first_line:last_line]
        txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        txt_lem_fr = full_txt_lem_fr[first_line:last_line]
        txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]

    # Run the text preprocessing, which cleans the sentences, splits them into words
    # and counts the number of occurrences of the words in each sentence
    if (Langue == 'Anglais'):
        st.write("## **"+tr("Préprocessing de small_vocab_en")+" :**\n")
        if max_lines>10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt (txt_en,'en')
                display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
|
399 |
+
else:
|
400 |
+
if st.session_state.reCalcule:
|
401 |
+
txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt (txt_en,'en')
|
402 |
+
display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
|
403 |
+
else:
|
404 |
+
st.write("## **"+tr("Préprocessing de small_vocab_fr")+" :**\n")
|
405 |
+
if max_lines>10000:
|
406 |
+
with st.status(":sunglasses:", expanded=True):
|
407 |
+
if st.session_state.reCalcule:
|
408 |
+
txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt (txt_fr,'fr')
|
409 |
+
display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
|
410 |
+
else:
|
411 |
+
if st.session_state.reCalcule:
|
412 |
+
txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt (txt_fr,'fr')
|
413 |
+
display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
|
414 |
+
|
415 |
+
|
416 |
+
|
417 |
+
|
418 |
+
|
419 |
+
|
420 |
+
|
421 |
+
|
422 |
+
|
423 |
+
|
424 |
+
|
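For reference, a minimal sketch (not part of the commit) of the Bag Of Words idea described in the "Explications" text above: a matrix counting how many times each word (column) appears in each sentence (row). It uses scikit-learn's CountVectorizer on two made-up small_vocab-style sentences; the app's own pipeline (preprocess_txt, not shown in this hunk) may differ in its cleaning and tokenisation details.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Two hypothetical sentences in the style of small_vocab
sentences = ["new jersey is sometimes quiet during autumn",
             "the united states is usually chilly during july"]

vectorizer = CountVectorizer(lowercase=True)   # cleaning: lowercasing, punctuation dropped
bow = vectorizer.fit_transform(sentences)      # sparse document-term matrix

# One row per sentence, one column per unique word, cells = occurrence counts
df_bow = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
print(df_bow)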
tabs/game_tab.py
ADDED
@@ -0,0 +1,235 @@
import streamlit as st
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import random
import json
import csv
from extra_streamlit_components import tab_bar, TabBarItemData
from datetime import datetime
import tracemalloc
from translate_app import tr

title = "Jouez avec nous !"
sidebar_name = "Jeu"
dataPath = st.session_state.DataPath

@st.cache_data
def init_game():
    new = int(time.time())
    sentence_test = pd.read_csv(dataPath+'/multilingue/sentence_test_extract.csv')
    sentence_test = sentence_test[4750:]
    # Read the content of the JSON file
    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
        lan_to_language = json.load(fichier)
    t_now = time.time()
    return sentence_test, lan_to_language, new, t_now

def find_indice(sent_selected):
    l = list(lan_to_language.keys())
    for i in range(len(l)):
        if l[i] == sentence_test['lan_code'].iloc[sent_selected]:
            return i

@st.cache_data
def set_game(new):
    nb_st = len(sentence_test)
    sent_sel = []
    # Loop until 5 distinct random sentence indices have been drawn
    while len(sent_sel) < 5:
        nombre = random.randint(0, nb_st)
        if nombre not in sent_sel:
            sent_sel.append(nombre)

    rep_possibles=[]
    for i in range(5):
        rep_possibles.append([find_indice(sent_sel[i])])
        while len(rep_possibles[i]) < 5:
            rep_possible = random.randint(0, 95)
            if rep_possible not in rep_possibles[i]:
                rep_possibles[i].append(rep_possible)
        random.shuffle(rep_possibles[i])
    return sent_sel, rep_possibles, new

def calc_score(n_rep,duration):

    if n_rep==0: return 0
    s1 = n_rep*200
    if duration < 60:
        s2 = (60-duration)*200/60
        if n_rep==5:
            s2 *= 2.5
    else:
        s2 = max(-(duration-60)*100/60,-100)
    s = int(s1+s2)
    return s

def read_leaderboard():
    return pd.read_csv(dataPath+'/game_leaderboard.csv', index_col=False,encoding='utf8')

def write_leaderboard(lb):
    lb['Nom'] = lb['Nom'].astype(str)
    lb['Rang'] = lb['Rang'].astype(int)
    lb.to_csv(path_or_buf=dataPath+'/game_leaderboard.csv',columns=['Rang','Nom','Score','Timestamp','BR','Duree'],index=False, header=True,encoding='utf8')

def display_leaderboard():
    lb = read_leaderboard()
    st.write("**"+tr("Leaderboard")+" :**")
    list_champ = """
| Rang | Nom | Score |
|------|------------|-------|"""
    if len(lb)>0:
        for i in range(len(lb)):
            list_champ += """
| """+str(lb['Rang'].iloc[i])+""" | """+str(lb['Nom'].iloc[i])[:9]+""" | """+str(lb['Score'].iloc[i])+""" |"""
    st.markdown(list_champ, unsafe_allow_html=True )
    return lb

def write_log(TS,Nom,Score,BR,Duree):
    log = pd.read_csv(dataPath+'/game_log.csv', index_col=False,encoding='utf8')
    date_heure = datetime.fromtimestamp(TS)
    Date = date_heure.strftime('%Y-%m-%d %H:%M:%S')
    log = pd.concat([log, pd.DataFrame(data={'Date':[Date], 'Nom':[Nom],'Score':[Score],'BR':[BR],'Duree':[Duree]})], ignore_index=True)
    log.to_csv(path_or_buf=dataPath+'/game_log.csv',columns=['Date','Nom','Score','BR','Duree'],index=False, header=True,encoding='utf8')

def display_files():
    log = pd.read_csv(dataPath+'/game_log.csv', index_col=False,encoding='utf8')
    lb = pd.read_csv(dataPath+'/game_leaderboard.csv', index_col=False,encoding='utf8')
    st.dataframe(lb)
    st.dataframe(log)


def run():
    global sentence_test, lan_to_language

    sentence_test, lan_to_language, new, t_debut = init_game()

    st.write("")
    st.title(tr(title))
    st.write("#### **"+tr("Êtes-vous un expert ès langues ?")+"**\n")
    st.markdown(tr(
"""
Essayez de trouver, sans aide, la langue des 5 phrases suivantes.
Attention : Vous devez être le plus rapide possible !
"""), unsafe_allow_html=True
    )
    st.write("")
    player_name = st.text_input(tr("Quel est votre nom ?"))

    if player_name == 'display_files':
        display_files()
        return
    elif player_name == 'malloc_start':
        tracemalloc.start()
        return
    elif player_name == 'malloc_stop':
        snapshot = tracemalloc.take_snapshot()
        top_stats = snapshot.statistics('traceback')
        # pick the biggest memory blocks
        for k in range(3):
            stat = top_stats[k]
            print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
            for line in stat.traceback.format():
                print(' >'+line)
        total_mem = sum(stat.size for stat in top_stats)
        print("Total allocated size: %.1f KiB" % (total_mem / 1024))
        return

    score = 0
    col1, col2 = st.columns([0.7,0.3])
    with col2:
        lb = display_leaderboard()
    with col1:
        sent_sel, rep_possibles, new = set_game(new)
        answer = [""] * 5
        l = list(lan_to_language.values())
        for i in range(5):
            answer[i] = st.radio("**:blue["+sentence_test['sentence'].iloc[sent_sel[i]]+"]**\n",[l[rep_possibles[i][0]],l[rep_possibles[i][1]],l[rep_possibles[i][2]], \
                                 l[rep_possibles[i][3]],l[rep_possibles[i][4]]], horizontal=True, key=i)
        t_previous_debut = t_debut
        t_debut = time.time()

        if st.button(label=tr("Validez"), type="primary"):
            st.cache_data.clear()

            nb_bonnes_reponses = 0
            for i in range(5):
                if lan_to_language[sentence_test['lan_code'].iloc[sent_sel[i]]]==answer[i]:
                    nb_bonnes_reponses +=1

            t_fin = time.time()
            duration = t_fin - t_previous_debut

            score = calc_score(nb_bonnes_reponses,duration)
            write_log(time.time(),player_name,score,nb_bonnes_reponses,duration)
            if nb_bonnes_reponses >=4:
                st.write(":red[**"+tr("Félicitations, vous avez "+str(nb_bonnes_reponses)+" bonnes réponses !")+"**]")
                st.write(":red["+tr("Votre score est de "+str(score)+" points")+"]")
            else:
                if nb_bonnes_reponses >1 : s="s"
                else: s=""
                st.write("**:red["+tr("Vous avez "+str(nb_bonnes_reponses)+" bonne"+s+" réponse"+s+".")+"]**")
                if nb_bonnes_reponses >0 : s="s"
                else: s=""
                st.write(":red["+tr("Votre score est de "+str(score)+" point"+s)+"]")

            st.write(tr("Bonnes réponses")+":")
            for i in range(5):
                st.write("- "+sentence_test['sentence'].iloc[sent_sel[i]]+" -> :blue[**"+lan_to_language[sentence_test['lan_code'].iloc[sent_sel[i]]]+"**]")
            new = int(time.time())
            st.button(label=tr("Play again ?"), type="primary")

    with col2:
        now = time.time()
        # If the oldest leaderboard score is more than a week old, it is replaced by a more recent one
        renew_old = ((len(lb)>9) and (lb['Timestamp'].iloc[9])<(now-604800))

        if (score>0) and ((((score >= lb['Score'].min()) and (len(lb)>9)) or (len(lb)<=9)) or (pd.isna(lb['Score'].min())) or renew_old):
            if player_name not in lb['Nom'].tolist():
                if (((score >= lb['Score'].min()) and (len(lb)>9)) or (len(lb)<=9)) or (pd.isna(lb['Score'].min())) :
                    lb = pd.concat([lb, pd.DataFrame(data={'Nom':[player_name],'Score':[score],'Timestamp':[now],'BR':[nb_bonnes_reponses],'Duree':[duration]})], ignore_index=True)
                    lb = lb.sort_values(by=['Score', 'Timestamp'], ascending=[False, False]).reset_index()
                    lb = lb.drop(lb.index[10:])
                else:
                    st.write('2:',player_name)
                    lb['Nom'].iloc[9]= player_name
                    lb['Score'].iloc[9]= score
                    lb['Timestamp'].iloc[9]=now
                    lb['BR'].iloc[9]=nb_bonnes_reponses
                    lb['Duree'].iloc[9]=duration
                    lb = lb.reset_index()
            else:
                liste_Nom = lb['Nom'].tolist()
                for i,player in enumerate(liste_Nom):
                    if player == player_name:
                        if lb['Score'].iloc[i] < score:
                            lb['Score'].iloc[i] = score
                            lb['Timestamp'].iloc[i]=now
                lb = lb.sort_values(by=['Score', 'Timestamp'], ascending=[False, False]).reset_index()
            for i in range(len(lb)):
                if (i>0):
                    if (lb['Score'].iloc[i]==lb['Score'].iloc[i-1]):
                        lb['Rang'].iloc[i] = lb['Rang'].iloc[i-1]
                    else:
                        lb['Rang'].iloc[i] = i+1
                else:
                    lb['Rang'].iloc[i] = i+1
            if player_name !="":
                write_leaderboard(lb)

    return
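As a quick sanity check (not part of the commit), the scoring rule of calc_score above can be restated standalone: 200 points per correct answer, a time bonus of up to 200 points under one minute (multiplied by 2.5 for a perfect round), and a malus capped at -100 points beyond one minute. The inputs below are hypothetical.

def score(n_correct: int, duration_s: float) -> int:
    # Same rule as calc_score, restated for illustration
    if n_correct == 0:
        return 0
    base = n_correct * 200                                 # 200 points per correct answer
    if duration_s < 60:
        bonus = (60 - duration_s) * 200 / 60               # up to +200 if answered instantly
        if n_correct == 5:
            bonus *= 2.5                                   # perfect-round multiplier
    else:
        bonus = max(-(duration_s - 60) * 100 / 60, -100)   # capped malus after one minute
    return int(base + bonus)

print(score(5, 30))   # 5 correct answers in 30 s -> 1250
print(score(3, 90))   # 3 correct answers in 90 s -> 550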
tabs/id_lang_tab.py
ADDED
@@ -0,0 +1,476 @@
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tiktoken
import joblib
import json
import csv
from transformers import pipeline
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import plot_model
from filesplit.merge import Merge
from extra_streamlit_components import tab_bar, TabBarItemData
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn import naive_bayes
from translate_app import tr

title = "Identification de langue"
sidebar_name = "Identification de langue"
dataPath = st.session_state.DataPath

# CountVectorizer expects a list of sentences as input.
# This function puts the input data into the right format
def format_to_vectorize(data):
    X_tok = []
    if "DataFrame" in str(type(data)):sentences = data.tolist()
    elif "str" in str(type(data)):
        sentences =[data]
    else: sentences = data

    for sentence in sentences:
        X_tok.append(sentence)
    return X_tok

def create_BOW(data):
    global vectorizer

    X_tok = format_to_vectorize(data)
    X = vectorizer.transform(X_tok)
    return X

def load_vectorizer(tokenizer):
    global dict_token, dict_ids, nb_token

    path = dataPath+'/vectorizer_tiktoken_big.pkl'
    vectorizer = joblib.load(path)
    dict_token = {tokenizer.decode([cle]): cle for cle, valeur in vectorizer.vocabulary_.items()}
    dict_ids = {cle: tokenizer.decode([cle]) for cle, valeur in vectorizer.vocabulary_.items()} #dict_ids.items()}
    nb_token = len(vectorizer.vocabulary_)
    return vectorizer

def lang_id_nb(sentences):
    global lan_to_language

    if "str" in str(type(sentences)):
        return lan_to_language[clf_nb.predict(create_BOW(sentences))[0]]
    else: return [lan_to_language[l] for l in clf_nb.predict(create_BOW(sentences))]

@st.cache_resource
def init_nb_identifier():

    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Load the saved classifier
    clf_nb = joblib.load(dataPath+"/id_lang_tiktoken_nb_sparse_big.pkl")
    vectorizer = load_vectorizer(tokenizer)

    # Read the content of the JSON file
    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
        lan_to_language = json.load(fichier)
    return tokenizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb, vectorizer

def encode_text(textes):
    global tokenizer

    max_length=250
    sequences = tokenizer.encode_batch(textes)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

def read_list_lan():

    with open(dataPath+'/multilingue/lan_code.csv', 'r') as fichier_csv:
        reader = csv.reader(fichier_csv)
        lan_code = next(reader)
    return lan_code

@st.cache_resource
def init_dl_identifier():

    label_encoder = LabelEncoder()
    list_lan = read_list_lan()
    lan_identified = [lan_to_language[l] for l in list_lan]
    label_encoder.fit(list_lan)
    merge = Merge(dataPath+"/dl_id_lang_split", dataPath, "dl_tiktoken_id_language_model.h5").merge(cleanup=False)
    dl_model = keras.models.load_model(dataPath+"/dl_tiktoken_id_language_model.h5")
    return dl_model, label_encoder, list_lan, lan_identified

def lang_id_dl(sentences):
    global dl_model, label_encoder

    if "str" in str(type(sentences)): predictions = dl_model.predict(encode_text([sentences]))
    else: predictions = dl_model.predict(encode_text(sentences))
    # Decode the predictions back into language codes
    predicted_labels_encoded = np.argmax(predictions, axis=1)
    predicted_languages = label_encoder.classes_[predicted_labels_encoded]
    if "str" in str(type(sentences)): return lan_to_language[predicted_languages[0]]
    else: return [l for l in predicted_languages]

@st.cache_resource
def init_lang_id_external():

    lang_id_model_ext = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
    dict_xlmr = {"ar":"ara", "bg":"bul", "de":"deu", "el": "ell", "en":"eng", "es":"spa", "fr":"fra", "hi": "hin","it":"ita","ja":"jpn", \
                 "nl":"nld", "pl":"pol", "pt":"por", "ru":"rus", "sw":"swh", "th":"tha", "tr":"tur", "ur": "urd", "vi":"vie", "zh":"cmn"}
    sentence_test = pd.read_csv(dataPath+'//multilingue/sentence_test_extract.csv')
    sentence_test = sentence_test[:4750]
    # A few example sentences
    exemples = ["Er weiß überhaupt nichts über dieses Buch", # Phrase 0
                "Umbrellas sell well", # Phrase 1
                "elle adore les voitures très luxueuses, et toi ?", # Phrase 2
                "she loves very luxurious cars, don't you?", # Phrase 3
                "Vogliamo visitare il Colosseo e nuotare nel Tevere", # Phrase 4
                "vamos a la playa", # Phrase 5
                "Te propongo un trato", # Phrase 6
                "she loves you much, mais elle te hait aussi and das ist traurig", # Phrase 7 # Note: this sentence mixes three languages
                "Elle a de belles loches" # Phrase 8
                ]

    lang_exemples = ['deu','eng','fra','eng','ita','spa','spa','fra','fra']
    return lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples

@st.cache_data
def display_acp(title, comment):
    data = np.load(dataPath+'/data_lang_id_acp.npz')
    X_train_scaled = data['X_train_scaled']
    y_train_pred = data['y_train_pred']
    label_arrow = ['.', ',', '?', ' a', ' de', ' la', ' que', 'Tom', ' un', ' the', ' in', \
                   ' to', 'I', "'", 'i', ' le', ' en', ' es', 'é', ' l', '!', 'o', ' ist', \
                   ' pas', ' Tom', ' me', ' di', 'Ich', ' is', 'Je', ' nicht', ' you', \
                   ' die', ' à', ' el', ' est', 'a', 'en', ' d', ' è', ' ne', ' se', ' no', \
                   ' una', ' zu', 'Il', '¿', ' of', ' du', "'t", 'ato', ' der', ' il', \
                   ' n', 'El', ' non', ' che', 'are', ' con', 'ó', ' was', 'La', 'No', \
                   ' ?', 'es', 'le', 'L', ' and', ' des', ' s', ' ich', 'as', 'S', ' per', \
                   ' das', ' und', ' ein', 'e', "'s", 'u', ' y', 'He', 'z', 'er', ' m', \
                   'st', ' les', 'Le', ' I', 'ar', 'te', 'Non', 'The', ' er', 'ie', ' v', \
                   ' c', "'est", ' ha', ' den']

    pca = PCA(n_components=2)

    X_new = pca.fit_transform(X_train_scaled)
    coeff = pca.components_.transpose()
    xs = X_new[:, 0]
    ys = X_new[:, 1]
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    principalDf = pd.DataFrame({'PC1': xs*scalex, 'PC2': ys * scaley})
    finalDF = pd.concat([principalDf, pd.Series(y_train_pred, name='Langue')], axis=1)

    sns.set_context("poster") # Possible values: "notebook", "talk", "poster", or "paper"
    plt.rc("axes", titlesize=32,titleweight='bold') # Axis title size
    plt.rc("axes", labelsize=18,labelweight='bold') # Axis label size
    plt.rc("xtick", labelsize=14) # x-axis tick label size
    plt.rc("ytick", labelsize=14) # y-axis tick label size

    st.write(comment)
    st.write("")
    fig = plt.figure(figsize=(20, 15))
    sns.scatterplot(x='PC1', y='PC2', hue='Langue', data=finalDF, alpha=0.5)
    for i in range(50):
        plt.arrow(0, 0, coeff[i, 0]*1.5, coeff[i, 1]*0.8,color='k', alpha=0.08, head_width=0.01, )
        plt.text(coeff[i, 0]*1.5, coeff[i, 1] * 0.8, label_arrow[i], color='k', weight='bold')

    plt.title(title)
    plt.xlim(-0.4, 0.45)
    plt.ylim(-0.15, 0.28);
    st.pyplot(fig)
    return

@st.cache_data
def read_BOW_examples():
    return pd.read_csv(dataPath+'/lang_id_small_BOW.csv')

def analyse_nb(sel_phrase):
    global lang_exemples,exemples

    def create_small_BOW(s):
        encodage = tokenizer.encode(s)
        sb = [0] * (df_BOW.shape[1]-1)
        nb_unique_token = 0
        for i in range(df_BOW.shape[1]-1):
            for t in encodage:
                if df_BOW.columns[i]==str(t):
                    sb[i] += 1
            if sb[i] > 0: nb_unique_token +=1
        return sb, nb_unique_token

    st.write("#### **"+tr("Probabilité d'appartenance de la phrase à une langue")+" :**")
    st.image("./assets/formule_proba_naive_bayes.png")
    st.write(tr("où **C** est la classe (lan_code), **Fi** est la caractéristique i du BOW, **Z** est l'\"evidence\" servant à régulariser la probabilité"))
    st.write("")
    nb_lang = 5
    lan_code = ['deu','eng','fra','spa','ita']
    lan_color = {'deu':'violet','eng':'green','fra':'red','spa':'blue','ita':'orange'}
    df_BOW = read_BOW_examples()

    clf_nb2 = naive_bayes.MultinomialNB()
    clf_nb2.fit(df_BOW.drop(columns='lan_code').values.tolist(), df_BOW['lan_code'].values.tolist())

    nb_phrases_lang =[]
    for l in lan_code:
        nb_phrases_lang.append(sum(df_BOW['lan_code']==l))
    st.write(tr("Phrase à analyser")+" :",'**:'+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase],']** - **"'+exemples[sel_phrase]+'"**')

    # Tokenize and encode the sentence
    encodage = tokenizer.encode(exemples[sel_phrase])

    # Build the sentence's BOW vector
    bow_exemple, nb_unique_token = create_small_BOW(exemples[sel_phrase])
    st.write(tr("Nombre de tokens retenus dans le BOW")+": "+ str(nb_unique_token))
    masque_tokens_retenus = [(1 if token in list(dict_ids.keys()) else 0) for token in encodage]
    str_token = " "
    for i in range(len(encodage)):
        if masque_tokens_retenus[i]==1:
            if (i%2) ==0:
                str_token += "**:red["+tokenizer.decode([encodage[i]])+"]** "
            else:
                str_token += "**:violet["+tokenizer.decode([encodage[i]])+"]** "
        else: str_token += ":green["+tokenizer.decode([encodage[i]])+"] "

    st.write(tr("Tokens se trouvant dans le modèle (en")+" :red["+tr("rouge")+"] "+tr("ou")+" :violet["+tr("violet")+"]) :"+str_token+" ")

    st.write("")
    # To continue the analysis, keep only the tokens of the sentence that are available in the BOW
    token_used = [str(encodage[i]) for i in range(len(encodage)) if (masque_tokens_retenus[i]==1)]

    # Count how many times these tokens appear in the BOW for each language, and store the result in the DataFrame df_count
    def compter_non_zero(colonne):
        return (colonne != 0).sum()

    votes = []
    for i in range(nb_lang):
        #votes.append(list(df_BOW[token_used].loc[df_BOW['lan_code']==lan_code[i]].sum(axis=0)))
        votes.append(list(df_BOW[token_used].loc[df_BOW['lan_code']==lan_code[i]].apply(compter_non_zero)))

    col_name = [str(i+1)+'-'+tokenizer.decode([int(token_used[i])]) for i in range(len(token_used))]
    df_count = pd.DataFrame(data=votes,columns=token_used, index=lan_code)
    df_count.columns = col_name
    st.write("\n**"+tr("Nombre d'apparitions des tokens, dans chaque langue")+"**")

    # Laplace smoothing #1
    # df_count = df_count+1

    st.dataframe(df_count)

    #########
    ######### 3. Compute the probability of each token appearing in each language
    df_proba = df_count.div(nb_phrases_lang, axis = 0)

    # Laplace smoothing #2
    df_proba = df_proba.replace(0.0,0.0010)

    # Initialize df_proba: conditional probability that the sentence belongs to each language
    df_proba['Proba'] = 1
    # Iterate over the columns and multiply, row by row
    for col in df_count.columns:
        df_proba['Proba'] *= df_proba[col]

    #########
    ######### 4. Compute (by multiplication) the probability that the sentence belongs to each language

    # Multiply by the class probability
    p_classe = [(nb_phrases_lang[i]/df_BOW.shape[0]) for i in range(len(nb_phrases_lang))]
    df_proba['Proba'] *= p_classe

    # Divide by the evidence
    evidence = df_proba['Proba'].sum(axis=0)
    df_proba['Proba'] *= 1/evidence
    df_proba['Proba'] = df_proba['Proba'].round(3)

    # Display the probability matrix
    st.write("**"+tr("Probabilités conditionnelles d'apparition des tokens retenus, dans chaque langue")+":**")
    st.dataframe(df_proba)
    str_token = "Lang proba max: "# "*20
    for i,token in enumerate(df_proba.columns[:-1]):
        str_token += '*'+token+'*:**:'+lan_color[df_proba[token].idxmax()]+'['+df_proba[token].idxmax()+']**'+" "*2 #8
    st.write(str_token)
    st.write("")

    st.write(tr("Langue réelle de la phrase")+" "*35+": **:"+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase]+']**')
    st.write(tr("Langue dont la probabilité est la plus forte ")+": **:"+lan_color[df_proba['Proba'].idxmax()]+'['+df_proba['Proba'].idxmax(),"]** (proba={:.2f}".format(max(df_proba['Proba']))+")")
    prediction = clf_nb2.predict([bow_exemple])
    st.write(tr("Langue prédite par Naive Bayes")+" "*23+": **:"+lan_color[prediction[0]]+'['+prediction[0]+"]** (proba={:.2f}".format(max(clf_nb2.predict_proba([bow_exemple])[0]))+")")
    st.write("")

    fig, axs = plt.subplots(1, 2, figsize=(10, 6))
    df_proba_sorted =df_proba.sort_index(ascending=True)
    axs[0].set_title(tr("Probabilités calculées manuellement"), fontsize=12)
    axs[0].barh(df_proba_sorted.index, df_proba_sorted['Proba'])
    axs[1].set_title(tr("Probabilités du classifieur Naive Bayes"), fontsize=12)
    axs[1].barh(df_proba_sorted.index, clf_nb2.predict_proba([bow_exemple])[0]);
    st.pyplot(fig)
    return

#@st.cache_data
def find_exemple(lang_sel):
    global exemples
    return exemples[lang_sel]

def display_shapley(lang_sel):
    st.write("**"+tr("Analyse de l'importance de chaque token dans l'identification de la langue")+"**")
    st.image('assets/fig_schapley'+str(lang_sel)+'.png')
    st.write("**"+tr("Recapitulatif de l'influence des tokens sur la selection de la langue")+"**")
    st.image('assets/fig_schapley_recap'+str(lang_sel)+'.png')
    return

def run():
    global tokenizer, vectorizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb
    global dl_model, label_encoder, toggle_val, custom_sentence, list_lan, lan_identified
    global lang_exemples, exemples

    tokenizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb, vectorizer = init_nb_identifier()
    dl_model, label_encoder, list_lan, lan_identified = init_dl_identifier()
    lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples= init_lang_id_external()

    st.write("")
    st.title(tr(title))
    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
"""
Afin de mettre en oeuvre cette fonctionnalité nous avons utilisé un jeu d'entrainement multilingue de <b> 9.757.778 phrases dans 95 langues</b>.
Les 95 langues identifiées sont:
""")
    , unsafe_allow_html=True)
    st.selectbox(label="Lang",options=sorted(lan_identified),label_visibility="hidden")
    st.markdown(tr(
"""
Nous avons utilisé 2 méthodes pour identifier la langue d'un texte:
1. un classificateur **Naïve Bayes**
2. un modèle de **Deep Learning**
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Les 2 modèles ont une accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
<br>
""")
    , unsafe_allow_html=True)

    chosen_id = tab_bar(data=[
        TabBarItemData(id="tab1", title=tr("Id. Naïve Bayes"), description=tr("avec le Bag Of Words")),
        TabBarItemData(id="tab2", title=tr("Id. Deep Learning"), description=tr(" avec Keras")),
        TabBarItemData(id="tab3", title=tr("Interpretabilité"), description=tr("du modèle Naïve Bayes "))],
        default="tab1")

    if (chosen_id == "tab1") or (chosen_id == "tab2"):
        st.write("## **"+tr("Paramètres")+" :**\n")

        toggle_val = st.toggle(tr('Phrase à saisir/Phrase test'), value=True, help=tr("Off = phrase à saisir, On = selection d'une phrase test parmi 9500 phrases"))
        if toggle_val:
            custom_sentence= st.selectbox(tr("Selectionnez une phrase test à identifier")+":", sentence_test['sentence'] )
        else:
            custom_sentence = st.text_area(label=tr("Saisir le texte dont vous souhaitez identifier la langue:"))
        st.button(label=tr("Validez"), type="primary")

        if custom_sentence!='':
            st.write("## **"+tr("Résultats")+" :**\n")
            md = """
|"""+tr("Identifieur")+""" |"""+tr("Langue identifiée")+"""|
|-------------------------------------|---------------|"""
            md1 = ""
            if toggle_val:
                lan_reelle = sentence_test['lan_code'].loc[sentence_test['sentence']==custom_sentence].tolist()[0]
                md1 = """
|"""+tr("Langue réelle")+""" |**:blue["""+lan_to_language[lan_reelle]+"""]**|"""
            md2 = """
|"""+tr("Classificateur Naïve Bayes")+""" |**:red["""+lang_id_nb(custom_sentence)+"""]**|
|"""+tr("Modèle de Deep Learning")+""" |**:red["""+lang_id_dl(custom_sentence)+"""]**|"""
            md3 = """
|XLM-RoBERTa (Hugging Face) |**:red["""+lan_to_language[dict_xlmr[lang_id_model_ext(custom_sentence)[0]['label']]]+"""]**|"""
            if toggle_val:
                if not (lan_reelle in list(dict_xlmr.values())):
                    md3=""

            st.markdown(md+md1+md2+md3, unsafe_allow_html=True)

        st.write("## **"+tr("Details sur la méthode")+" :**\n")
        if (chosen_id == "tab1"):
            st.markdown(tr(
"""
Afin d'utiliser le classificateur Naïve Bayes, il nous a fallu:""")+"\n"+
                "* "+tr("Créer un Bag of Words de token..")+"\n"+
                "* "+tr("..Tokeniser le texte d'entrainement avec CountVectorizer et un tokenizer 'custom', **Tiktoken** d'OpenAI. ")+"\n"+
                "* "+tr("Utiliser des matrices creuses (Sparse Matrix), car notre BOW contenait 10 Millions de lignes x 59122 tokens. ")+"\n"+
                "* "+tr("Sauvegarder le vectorizer (non serialisable) et le classificateur entrainé. ")
                , unsafe_allow_html=True)
            st.markdown(tr(
"""
L'execution de toutes ces étapes est assez rapide: une dizaine de minutes
<br>
Le résultat est très bon: L'Accuracy sur le jeu de test est =
**:red[96%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp)
<br>
""")
            , unsafe_allow_html=True)
            st.markdown(tr(
"""
**Note 1:** Les 2 modèles ont une accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
**Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = **97,8%**,
versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
""")
            , unsafe_allow_html=True)
        else:
            st.markdown(tr(
"""
Nous avons mis en oeuvre un modèle Keras avec une couche d'embedding et 4 couches denses (*Voir architecture ci-dessous*).
Nous avons utilisé le tokeniser <b>Tiktoken</b> d'OpenAI.
La couche d'embedding accepte 250 tokens, ce qui signifie que la détection de langue s'effectue sur approximativement les 200 premiers mots.
<br>
""")
            , unsafe_allow_html=True)
            st.markdown(tr(
"""
L'entrainement a duré plus de 10 heures..
Finalement, le résultat est très bon: L'Accuracy sur le jeu de test est =
**:red[97,5%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp).
Néanmoins, la durée pour une prédiction est relativement longue: approximativement 5/100 de seconde
<br>
""")
            , unsafe_allow_html=True)
            st.markdown(tr(
"""
**Note 1:** Les 2 modèles ont une accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**""")+"<br>"+
                tr("""
**Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = <b>97,8%</b>,
versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
<br>
""")
            , unsafe_allow_html=True)
            st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5></center>", unsafe_allow_html=True)
            plot_model(dl_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file='./assets/model_plot.png')
            col1, col2, col3 = st.columns([0.15,0.7,0.15])
            with col2:
                st.image('./assets/model_plot.png',use_column_width="auto")
    elif (chosen_id == "tab3"):
        st.write("### **"+tr("Interpretabilité du classifieur Naïve Bayes sur 5 langues")+"**")
        st.write("##### "+tr("..et un Training set réduit (15000 phrases et 94 tokens)"))
        st.write("")

        chosen_id2 = tab_bar(data=[
            TabBarItemData(id="tab1", title=tr("Analyse en Compos. Princ."), description=""),
            TabBarItemData(id="tab2", title=tr("Simul. calcul NB"), description=""),
            TabBarItemData(id="tab3", title=tr("Shapley"), description="")],
            default="tab1")
        if (chosen_id2 == "tab1"):
            display_acp(tr("Importance des principaux tokens dans \n l'identification de langue par l'algorithme Naive Bayes"),tr("Affichage de 10 000 phrases (points) et des 50 tokens les + utilisés (flèches)"))
        if (chosen_id2 == "tab2") or (chosen_id2 == "tab3"):
            sel_phrase = st.selectbox(tr('Selectionnez une phrase à "interpréter"')+':', range(9), format_func=find_exemple)
            if (chosen_id2 == "tab2"):
                analyse_nb(sel_phrase)
            if (chosen_id2 == "tab3"):
                display_shapley(sel_phrase)
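For readers who want to see the Naive Bayes identification pipeline described in this tab end to end (tiktoken tokens, a Bag Of Words built with CountVectorizer, then MultinomialNB), here is a minimal sketch on a toy corpus. It is not part of the commit: the real app loads a pre-trained vectorizer and classifier from the data folder, and its vectorizer configuration may differ. The lambda analyzer below also illustrates why such a vectorizer is awkward to serialize, which is consistent with the tab's note about saving it separately.

import tiktoken
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

tok = tiktoken.get_encoding("cl100k_base")

# Hypothetical, tiny training corpus (the real one has ~9.7M sentences in 95 languages)
texts  = ["she loves luxurious cars", "elle adore les voitures luxueuses",
          "the weather is nice today", "il fait beau aujourd'hui"]
labels = ["eng", "fra", "eng", "fra"]

# CountVectorizer with a custom analyzer: each tiktoken id becomes one BOW feature
vectorizer = CountVectorizer(analyzer=lambda s: [str(t) for t in tok.encode(s)])
X = vectorizer.fit_transform(texts)        # sparse matrix, as in the tab

clf = MultinomialNB().fit(X, labels)
print(clf.predict(vectorizer.transform(["elle adore la plage"])))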
tabs/intro.py
ADDED
@@ -0,0 +1,93 @@
import streamlit as st
from translate_app import tr

title = "Démosthène"
sidebar_name = "Introduction"


def run():

    # TODO: choose between one of these GIFs
    # st.image("https://dst-studio-template.s3.eu-west-3.amazonaws.com/1.gif")
    # st.image("https://dst-studio-template.s3.eu-west-3.amazonaws.com/2.gif")
    # st.image("https://dst-studio-template.s3.eu-west-3.amazonaws.com/3.gif")
    # st.image("assets/tough-communication.gif",use_column_width=True)

    st.write("")
    if st.session_state.Cloud == 0:
        st.image("assets/miss-honey-glasses-off.gif",use_column_width=True)
    else:
        st.image("https://media.tenor.com/pfOeAfytY98AAAAC/miss-honey-glasses-off.gif",use_column_width=True)

    st.title(tr(title))
    st.markdown('''
## **'''+tr("Système de traduction adapté aux lunettes connectées")+'''**
---
''')
    st.header("**"+tr("A propos")+"**")
    st.markdown(tr(
"""
Ce projet a été réalisé dans le cadre d'une formation de Data Scientist, entre juin et novembre 2023.
<br>
:red[**Démosthène**] est l'un des plus grands orateurs de l'Antiquité. Il savait s'exprimer, et se faire comprendre.
Se faire comprendre est l'un des principaux objectifs de la traduction.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Démosthène avait de gros problèmes d'élocution.
Il les a surmontés en s'entraînant à parler avec des cailloux dans la bouche,
à l'image de l'Intelligence Artificielle, où des entraînements sont nécessaires pour obtenir de bons résultats.
Il nous a semblé pertinent de donner le nom de cet homme à un projet qu'il a fort bien illustré, il y a 2300 ans.
""")
    , unsafe_allow_html=True)

    st.header("**"+tr("Contexte")+"**")
    st.markdown(tr(
"""
Les personnes malentendantes communiquent difficilement avec autrui. Par ailleurs, toute personne se trouvant dans un pays étranger
dont elle ne connaît pas la langue se retrouve dans la situation d'une personne malentendante.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
L'usage de lunettes connectées, dotées de la technologie de reconnaissance vocale et d'algorithmes IA de deep learning, permettrait
de détecter la voix d'un interlocuteur, puis d'afficher la transcription textuelle, sur les verres en temps réel.
À partir de cette transcription, il est possible d':red[**afficher la traduction dans la langue du porteur de ces lunettes**].
""")
    , unsafe_allow_html=True)

    st.header("**"+tr("Objectifs")+"**")
    st.markdown(tr(
"""
L'objectif de ce projet est de développer une brique technologique de traitement, de transcription et de traduction,
qui par la suite serait implémentable dans des lunettes connectées. Nous avons concentré nos efforts sur la construction
d'un :red[**système de traduction**] plutôt que sur la reconnaissance vocale,
et ce, pour tout type de public, afin de faciliter le dialogue entre deux individus ne pratiquant pas la même langue.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Il est bien sûr souhaitable que le système puisse rapidement :red[**identifier la langue**] des phrases fournies.
Lors de la traduction, nous ne prendrons pas en compte le contexte des phrases précédentes ou celles préalablement traduites.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""

Nous évaluerons la qualité de nos résultats en les comparant avec des systèmes performants tels que “[Google translate](https://translate.google.fr/)”
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Le projet est enregistré sur "[Github](https://github.com/Demosthene-OR/AVR23_CDS_Text_translation)"
""")
    , unsafe_allow_html=True)

    '''
    sent = \
    """

    """
    st.markdown(tr(sent), unsafe_allow_html=True)
    '''
tabs/modelisation_dict_tab.py
ADDED
@@ -0,0 +1,277 @@
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
from sacrebleu import corpus_bleu
|
5 |
+
if st.session_state.Cloud == 0:
|
6 |
+
from sklearn.cluster import KMeans
|
7 |
+
from sklearn.neighbors import KNeighborsClassifier
|
8 |
+
from sklearn.ensemble import RandomForestClassifier
|
9 |
+
from translate_app import tr
|
10 |
+
|
11 |
+
title = "Traduction mot à mot"
|
12 |
+
sidebar_name = "Traduction mot à mot"
|
13 |
+
dataPath = st.session_state.DataPath
|
14 |
+
|
15 |
+
@st.cache_data
|
16 |
+
def load_corpus(path):
|
17 |
+
input_file = os.path.join(path)
|
18 |
+
with open(input_file, "r", encoding="utf-8") as f:
|
19 |
+
data = f.read()
|
20 |
+
data = data.split('\n')
|
21 |
+
data=data[:-1]
|
22 |
+
return pd.DataFrame(data)
|
23 |
+
|
24 |
+
@st.cache_data
|
25 |
+
def load_BOW(path, l):
|
26 |
+
input_file = os.path.join(path)
|
27 |
+
df1 = pd.read_csv(input_file+'1_'+l, encoding="utf-8", index_col=0)
|
28 |
+
df2 = pd.read_csv(input_file+'2_'+l, encoding="utf-8", index_col=0)
|
29 |
+
df_count_word = pd.concat([df1, df2])
|
30 |
+
return df_count_word
|
31 |
+
|
32 |
+
df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
|
33 |
+
df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
|
34 |
+
df_count_word_en = load_BOW(dataPath+'/preprocess_df_count_word', 'en')
|
35 |
+
df_count_word_fr = load_BOW(dataPath+'/preprocess_df_count_word', 'fr')
|
36 |
+
n1 = 0
|
37 |
+
|
38 |
+
def accuracy(dict_ref,dict):
|
39 |
+
correct_words = 0
|
40 |
+
|
41 |
+
for t in dict.columns:
|
42 |
+
if t in dict_ref.columns:
|
43 |
+
if str(dict[t]) == str(dict_ref[t]):
|
44 |
+
correct_words +=1
|
45 |
+
else: print("dict ref: manque:",t)
|
46 |
+
print(correct_words," mots corrects / ",min(dict.shape[1],dict_ref.shape[1]))
|
47 |
+
return correct_words/min(dict.shape[1],dict_ref.shape[1])
|
48 |
+
|
49 |
+
if st.session_state.reCalcule:
|
50 |
+
nb_mots_en = 199 # len(corpus_en)
|
51 |
+
nb_mots_fr = 330 # len(corpus_fr)
|
52 |
+
|
53 |
+
# On modifie df_count_word en indiquant la présence d'un mot par 1 (au lieu du nombre d'occurences)
|
54 |
+
df_count_word_en = df_count_word_en[df_count_word_en==0].fillna(1)
|
55 |
+
df_count_word_fr = df_count_word_fr[df_count_word_fr==0].fillna(1)
|
56 |
+
|
57 |
+
# On triche un peu parce que new et jersey sont toujours dans la même phrase et donc dans la même classe
|
58 |
+
if ('new' in df_count_word_en.columns):
|
59 |
+
df_count_word_en['new']=df_count_word_en['new']*2
|
60 |
+
df_count_word_fr['new']=df_count_word_fr['new']*2
|
61 |
+
|
62 |
+
def calc_kmeans(l_src,l_tgt):
|
63 |
+
global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
|
64 |
+
|
65 |
+
# Algorithme de K-means
|
66 |
+
init_centroids = df_count_word_tgt.T
|
67 |
+
kmeans = KMeans(n_clusters = nb_mots_tgt, n_init=1, max_iter=1, init=init_centroids, verbose=0)
|
68 |
+
|
69 |
+
kmeans.fit(df_count_word_tgt.T)
|
70 |
+
|
71 |
+
# Centroids and labels
|
72 |
+
centroids= kmeans.cluster_centers_
|
73 |
+
labels = kmeans.labels_
|
74 |
+
|
75 |
+
# Création et affichage du dictionnaire
|
76 |
+
df_dic = pd.DataFrame(data=df_count_word_tgt.columns[kmeans.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
|
77 |
+
df_dic.index.name= l_src
|
78 |
+
df_dic = df_dic.T
|
79 |
+
# print("Dictionnaire Anglais -> Français:")
|
80 |
+
# translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR'] =round(accuracy(dict_EN_FR_ref,dict_EN_FR)*100, 2)
|
81 |
+
# print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR']}%")
|
82 |
+
# display(dict_EN_FR)
|
83 |
+
return df_dic
|
84 |
+
|
85 |
+
def calc_knn(l_src,l_tgt, metric):
|
86 |
+
global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
|
87 |
+
|
88 |
+
#Définition de la metrique (pour les 2 dictionnaires
|
89 |
+
knn_metric = metric # minkowski, cosine, chebyshev, manhattan, euclidean
|
90 |
+
|
91 |
+
# Algorithme de KNN
|
92 |
+
X_train = df_count_word_tgt.T
|
93 |
+
y_train = range(nb_mots_tgt)
|
94 |
+
|
95 |
+
# Création du classifieur et construction du modèle sur les données d'entraînement
|
96 |
+
knn = KNeighborsClassifier(n_neighbors=1, metric=knn_metric)
|
97 |
+
knn.fit(X_train, y_train)
|
98 |
+
|
99 |
+
# Création et affichage du dictionnaire
|
100 |
+
df_dic = pd.DataFrame(data=df_count_word_tgt.columns[knn.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
|
101 |
+
df_dic.index.name = l_src
|
102 |
+
df_dic = df_dic.T
|
103 |
+
|
104 |
+
# print("Dictionnaire Anglais -> Français:")
|
105 |
+
# translation_quality['Précision du dictionnaire'].loc['KNN EN->FR'] =round(accuracy(dict_EN_FR_ref,knn_dict_EN_FR)*100, 2)
|
106 |
+
# print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['KNN EN->FR']}%")
|
107 |
+
# display(knn_dict_EN_FR)
|
108 |
+
return df_dic
|
109 |
+
|
110 |
+
def calc_rf(l_src,l_tgt):
|
111 |
+
|
112 |
+
# Algorithme de Random Forest
|
113 |
+
X_train = df_count_word_tgt.T
|
114 |
+
y_train = range(nb_mots_tgt)
|
115 |
+
|
116 |
+
# Création du classifieur et construction du modèle sur les données d'entraînement
|
117 |
+
rf = RandomForestClassifier(n_jobs=-1, random_state=321)
|
118 |
+
rf.fit(X_train, y_train)
|
119 |
+
|
120 |
+
# Création et affichage du dictionnaire
|
121 |
+
df_dic = pd.DataFrame(data=df_count_word_tgt.columns[rf.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
|
122 |
+
df_dic.index.name= l_src
|
123 |
+
df_dic = df_dic.T
|
124 |
+
|
125 |
+
# print("Dictionnaire Anglais -> Français:")
|
126 |
+
# translation_quality['Précision du dictionnaire'].loc['RF EN->FR'] = round(accuracy(dict_EN_FR_ref,rf_dict_EN_FR)*100, 2)
|
127 |
+
# print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['RF EN->FR']}%")
|
128 |
+
# display(rf_dict_EN_FR)
|
129 |
+
return df_dic
|
130 |
+
|
131 |
+
def calcul_dic(Lang,Algo,Metrique):
|
132 |
+
|
133 |
+
if Lang[:2]=='en':
|
134 |
+
l_src = 'Anglais'
|
135 |
+
l_tgt = 'Francais'
|
136 |
+
else:
|
137 |
+
l_src = 'Francais'
|
138 |
+
l_tgt = 'Anglais'
|
139 |
+
|
140 |
+
if Algo=='Manuel':
|
141 |
+
df_dic = pd.read_csv('../data/dict_ref_'+Lang+'.csv',header=0,index_col=0, encoding ="utf-8", sep=';',keep_default_na=False).T.sort_index(axis=1)
|
142 |
+
elif Algo=='KMeans':
|
143 |
+
df_dic = calc_kmeans(l_src,l_tgt)
|
144 |
+
elif Algo=='KNN':
|
145 |
+
df_dic = calc_knn(l_src,l_tgt, Metrique)
|
146 |
+
elif Algo=='Random Forest':
|
147 |
+
df_dic = calc_rf(l_src,l_tgt)
|
148 |
+
else:
|
149 |
+
df_dic = pd.read_csv('../data/dict_we_'+Lang,header=0,index_col=0, encoding ="utf-8", keep_default_na=False).T.sort_index(axis=1)
|
150 |
+
|
151 |
+
return df_dic
|
152 |
+
else:
|
153 |
+
def load_dic(Lang,Algo,Metrique):
|
154 |
+
|
155 |
+
Algo = Algo.lower()
|
156 |
+
if Algo=='random forest' : Algo = "rf"
|
157 |
+
else:
|
158 |
+
if Algo=='word embedding' : Algo = "we"
|
159 |
+
else:
|
160 |
+
if Algo!='knn': Metrique = ''
|
161 |
+
else: Metrique = Metrique+'_'
|
162 |
+
input_file = os.path.join(dataPath+'/dict_'+Algo+'_'+Metrique+Lang)
|
163 |
+
return pd.read_csv(input_file, encoding="utf-8", index_col=0).T.sort_index(axis=1)
|
164 |
+
|
165 |
+
|
166 |
+
def display_translation(n1,dict, Lang):
|
167 |
+
global df_data_src, df_data_tgt, placeholder
|
168 |
+
|
169 |
+
s = df_data_src.iloc[n1:n1+5][0].tolist()
|
170 |
+
s_trad = []
|
171 |
+
s_trad_ref = df_data_tgt.iloc[n1:n1+5][0].tolist()
|
172 |
+
source = Lang[:2]
|
173 |
+
target = Lang[-2:]
|
174 |
+
for i in range(5):
|
175 |
+
# for col in s.split():
|
176 |
+
# st.write('col: '+col)
|
177 |
+
# st.write('dict[col]! '+dict[col])
|
178 |
+
s_trad.append((' '.join(dict[col].iloc[0] for col in s[i].split())))
|
179 |
+
st.write("**"+source+" :** :blue["+ s[i]+"]")
|
180 |
+
st.write("**"+target+" :** "+s_trad[-1])
|
181 |
+
st.write("**ref. :** "+s_trad_ref[i])
|
182 |
+
st.write("")
|
183 |
+
with placeholder:
|
184 |
+
st.write("<p style='text-align:center;background-color:red; color:white')>"+"Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
|
185 |
+
unsafe_allow_html=True)
|
186 |
+
|
187 |
+
def display_dic(df_dic):
|
188 |
+
st.dataframe(df_dic.T, height=600)
|
189 |
+
|
190 |
+
def save_dic(path, df_dic):
|
191 |
+
output_file = os.path.join(path)
|
192 |
+
df_dic.T.to_csv(output_file, encoding="utf-8")
|
193 |
+
return
|
194 |
+
|
195 |
+
def run():
|
196 |
+
global df_data_src, df_data_tgt, df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt, n1, placeholder
|
197 |
+
global df_data_en, df_data_fr, nb_mots_en, df_count_word_en, df_count_word_fr, nb_mots_en, nb_mots_fr
|
198 |
+
|
199 |
+
st.write("")
|
200 |
+
st.title(tr(title))
|
201 |
+
|
202 |
+
#
|
203 |
+
st.write("## **"+tr("Explications")+" :**\n")
|
204 |
+
st.markdown(tr(
|
205 |
+
"""
|
206 |
+
Dans une première approche naïve, nous avons implémenté un système de traduction mot à mot.
|
207 |
+
Cette traduction est réalisée grâce à un dictionnaire qui associe un mot de la langue source à un mot de la langue cible, dans small_vocab
|
208 |
+
Ce dictionnaire est calculé de 3 manières:
|
209 |
+
""")
|
210 |
+
, unsafe_allow_html=True)
|
211 |
+
st.markdown(
|
212 |
+
"* "+tr(":red[**Manuellement**] en choisissant pour chaque mot source le mot cible. Ceci nous a permis de définir un dictionnaire de référence")+"\n"+ \
|
213 |
+
"* "+tr("Avec le :red[**Bag Of World**] (chaque mot dans la langue cible = une classe, BOW = features)")
|
214 |
+
, unsafe_allow_html=True)
|
215 |
+
st.image("assets/BOW.jpg",use_column_width=True)
|
216 |
+
st.markdown(
|
217 |
+
"* "+tr("Avec le :red[**Word Embedding**], c'est à dire en associant chaque mot à un vecteur \"sémantique\" de dimensions=300, et en selectionnant le vecteur de langue cible "
|
218 |
+
"le plus proche du vecteur de langue source.")+" \n\n"+
|
219 |
+
tr("Enfin nous calculons :")+"\n"+ \
|
220 |
+
"* "+tr("la :red[**précision**] du dictionnaire par rapport à notre dictionnaire de réference (manuel)")+"\n"+ \
|
221 |
+
"* "+tr("le ")+" :red[**score BLEU**] (\"BiLingual Evaluation Understudy\")"+tr(", qui mesure la précision de notre traduction par rapport à celle de notre corpus référence. ")
|
222 |
+
, unsafe_allow_html=True)
|
223 |
+
#
|
224 |
+
st.write("## **"+tr("Paramètres ")+" :**\n")
|
225 |
+
Sens = st.radio(tr('Sens')+' :',('Anglais -> Français','Français -> Anglais'), horizontal=True)
|
226 |
+
Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
|
227 |
+
Algo = st.radio(tr('Algorithme')+' :',('Manuel', 'KMeans','KNN','Random Forest','Word Embedding'), horizontal=True)
|
228 |
+
Metrique = ''
|
229 |
+
if (Algo == 'KNN'):
|
230 |
+
Metrique = st.radio(tr('Metrique')+':',('minkowski', 'cosine', 'chebyshev', 'manhattan', 'euclidean'), horizontal=True)
|
231 |
+
|
232 |
+
if (Lang=='en_fr'):
|
233 |
+
df_data_src = df_data_en
|
234 |
+
df_data_tgt = df_data_fr
|
235 |
+
if st.session_state.reCalcule:
|
236 |
+
df_count_word_src = df_count_word_en
|
237 |
+
df_count_word_tgt = df_count_word_fr
|
238 |
+
nb_mots_src = nb_mots_en
|
239 |
+
nb_mots_tgt = nb_mots_fr
|
240 |
+
else:
|
241 |
+
df_data_src = df_data_fr
|
242 |
+
df_data_tgt = df_data_en
|
243 |
+
if st.session_state.reCalcule:
|
244 |
+
df_count_word_src = df_count_word_fr
|
245 |
+
df_count_word_tgt = df_count_word_en
|
246 |
+
nb_mots_src = nb_mots_fr
|
247 |
+
nb_mots_tgt = nb_mots_en
|
248 |
+
|
249 |
+
# df_data_src.columns = ['Phrase']
|
250 |
+
sentence1 = st.selectbox(tr("Selectionnez la 1ere des 5 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
|
251 |
+
n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
|
252 |
+
|
253 |
+
if st.session_state.reCalcule:
|
254 |
+
df_dic = calcul_dic(Lang,Algo,Metrique)
|
255 |
+
df_dic_ref = calcul_dic(Lang,'Manuel',Metrique)
|
256 |
+
else:
|
257 |
+
df_dic = load_dic(Lang,Algo,Metrique)
|
258 |
+
df_dic_ref = load_dic(Lang,'Manuel',Metrique)
|
259 |
+
|
260 |
+
"""
|
261 |
+
save_dico = st.checkbox('Save dic ?')
|
262 |
+
if save_dico:
|
263 |
+
dic_name = st.text_input('Nom du fichier :',dataPath+'/dict_')
|
264 |
+
save_dic(dic_name, df_dic)
|
265 |
+
"""
|
266 |
+
|
267 |
+
st.write("## **"+tr("Dictionnaire calculé et traduction mot à mot")+" :**\n")
|
268 |
+
col1, col2 = st.columns([0.25, 0.75])
|
269 |
+
with col1:
|
270 |
+
st.write("#### **"+tr("Dictionnaire")+"**")
|
271 |
+
precision = int(round(accuracy(df_dic_ref,df_dic)*100, 0))
|
272 |
+
st.write("<p style='text-align:center;background-color:red; color:white')>"+tr("Précision")+" = {:2d}%</p>".format(precision), unsafe_allow_html=True)
|
273 |
+
display_dic(df_dic)
|
274 |
+
with col2:
|
275 |
+
st.write("#### **"+tr("Traduction")+"**")
|
276 |
+
placeholder = st.empty()
|
277 |
+
display_translation(n1, df_dic, Lang)
|
tabs/modelisation_seq2seq_tab.py
ADDED
@@ -0,0 +1,606 @@
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import os
|
5 |
+
from sacrebleu import corpus_bleu
|
6 |
+
from transformers import pipeline
|
7 |
+
from deep_translator import GoogleTranslator
|
8 |
+
from audio_recorder_streamlit import audio_recorder
|
9 |
+
import speech_recognition as sr
|
10 |
+
import whisper
|
11 |
+
import io
|
12 |
+
import wavio
|
13 |
+
from filesplit.merge import Merge
|
14 |
+
import tensorflow as tf
|
15 |
+
import string
|
16 |
+
import re
|
17 |
+
from tensorflow import keras
|
18 |
+
from keras_nlp.layers import TransformerEncoder
|
19 |
+
from tensorflow.keras import layers
|
20 |
+
from tensorflow.keras.utils import plot_model
|
21 |
+
from gtts import gTTS
|
22 |
+
from extra_streamlit_components import tab_bar, TabBarItemData
|
23 |
+
from translate_app import tr
|
24 |
+
|
25 |
+
title = "Traduction Sequence à Sequence"
|
26 |
+
sidebar_name = "Traduction Seq2Seq"
|
27 |
+
dataPath = st.session_state.DataPath
|
28 |
+
|
29 |
+
@st.cache_data
|
30 |
+
def load_corpus(path):
|
31 |
+
input_file = os.path.join(path)
|
32 |
+
with open(input_file, "r", encoding="utf-8") as f:
|
33 |
+
data = f.read()
|
34 |
+
data = data.split('\n')
|
35 |
+
data=data[:-1]
|
36 |
+
return pd.DataFrame(data)
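# Note: load_corpus returns a single-column DataFrame (column 0) with one sentence per row,
# which is why the rest of this module indexes it with df_data_...[0] and .iloc[n][0].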
|
37 |
+
|
38 |
+
# ===== Keras ====
|
39 |
+
strip_chars = string.punctuation + "¿"
|
40 |
+
strip_chars = strip_chars.replace("[", "")
|
41 |
+
strip_chars = strip_chars.replace("]", "")
|
42 |
+
|
43 |
+
def custom_standardization(input_string):
|
44 |
+
lowercase = tf.strings.lower(input_string)
|
45 |
+
lowercase=tf.strings.regex_replace(lowercase, "[à]", "a")
|
46 |
+
return tf.strings.regex_replace(
|
47 |
+
lowercase, f"[{re.escape(strip_chars)}]", "")
|
48 |
+
|
49 |
+
@st.cache_data
|
50 |
+
def load_vocab(file_path):
|
51 |
+
with open(file_path, "r", encoding="utf-8") as file:
|
52 |
+
return file.read().split('\n')[:-1]
|
53 |
+
|
54 |
+
|
55 |
+
def decode_sequence_rnn(input_sentence, src, tgt):
|
56 |
+
global translation_model
|
57 |
+
|
58 |
+
vocab_size = 15000
|
59 |
+
sequence_length = 50
|
60 |
+
|
61 |
+
source_vectorization = layers.TextVectorization(
|
62 |
+
max_tokens=vocab_size,
|
63 |
+
output_mode="int",
|
64 |
+
output_sequence_length=sequence_length,
|
65 |
+
standardize=custom_standardization,
|
66 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
|
67 |
+
)
|
68 |
+
|
69 |
+
target_vectorization = layers.TextVectorization(
|
70 |
+
max_tokens=vocab_size,
|
71 |
+
output_mode="int",
|
72 |
+
output_sequence_length=sequence_length + 1,
|
73 |
+
standardize=custom_standardization,
|
74 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
|
75 |
+
)
|
76 |
+
|
77 |
+
tgt_vocab = target_vectorization.get_vocabulary()
|
78 |
+
tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
|
79 |
+
max_decoded_sentence_length = 50
|
80 |
+
tokenized_input_sentence = source_vectorization([input_sentence])
|
81 |
+
decoded_sentence = "[start]"
|
82 |
+
for i in range(max_decoded_sentence_length):
|
83 |
+
tokenized_target_sentence = target_vectorization([decoded_sentence])
|
84 |
+
next_token_predictions = translation_model.predict(
|
85 |
+
[tokenized_input_sentence, tokenized_target_sentence], verbose=0)
|
86 |
+
sampled_token_index = np.argmax(next_token_predictions[0, i, :])
|
87 |
+
sampled_token = tgt_index_lookup[sampled_token_index]
|
88 |
+
decoded_sentence += " " + sampled_token
|
89 |
+
if sampled_token == "[end]":
|
90 |
+
break
|
91 |
+
return decoded_sentence[8:-6]
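# Usage sketch: decode_sequence_rnn performs greedy decoding, feeding the tokens predicted
# so far back into the model until "[end]" is emitted (at most 50 steps). Assuming
# translation_model has been set to rnn_en_fr, as done in run() below, and using a
# hypothetical sentence in the style of the small_vocab corpus:
#   translation_model = rnn_en_fr
#   print(decode_sequence_rnn("she likes apples and oranges", "en", "fr"))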
|
92 |
+
|
93 |
+
# ===== End of Keras ====
|
94 |
+
|
95 |
+
# ===== Transformer section ====
|
96 |
+
|
97 |
+
class TransformerDecoder(layers.Layer):
|
98 |
+
def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
|
99 |
+
super().__init__(**kwargs)
|
100 |
+
self.embed_dim = embed_dim
|
101 |
+
self.dense_dim = dense_dim
|
102 |
+
self.num_heads = num_heads
|
103 |
+
self.attention_1 = layers.MultiHeadAttention(
|
104 |
+
num_heads=num_heads, key_dim=embed_dim)
|
105 |
+
self.attention_2 = layers.MultiHeadAttention(
|
106 |
+
num_heads=num_heads, key_dim=embed_dim)
|
107 |
+
self.dense_proj = keras.Sequential(
|
108 |
+
[layers.Dense(dense_dim, activation="relu"),
|
109 |
+
layers.Dense(embed_dim),]
|
110 |
+
)
|
111 |
+
self.layernorm_1 = layers.LayerNormalization()
|
112 |
+
self.layernorm_2 = layers.LayerNormalization()
|
113 |
+
self.layernorm_3 = layers.LayerNormalization()
|
114 |
+
self.supports_masking = True
|
115 |
+
|
116 |
+
def get_config(self):
|
117 |
+
config = super().get_config()
|
118 |
+
config.update({
|
119 |
+
"embed_dim": self.embed_dim,
|
120 |
+
"num_heads": self.num_heads,
|
121 |
+
"dense_dim": self.dense_dim,
|
122 |
+
})
|
123 |
+
return config
|
124 |
+
|
125 |
+
def get_causal_attention_mask(self, inputs):
|
126 |
+
input_shape = tf.shape(inputs)
|
127 |
+
batch_size, sequence_length = input_shape[0], input_shape[1]
|
128 |
+
i = tf.range(sequence_length)[:, tf.newaxis]
|
129 |
+
j = tf.range(sequence_length)
|
130 |
+
mask = tf.cast(i >= j, dtype="int32")
|
131 |
+
mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
|
132 |
+
mult = tf.concat(
|
133 |
+
[tf.expand_dims(batch_size, -1),
|
134 |
+
tf.constant([1, 1], dtype=tf.int32)], axis=0)
|
135 |
+
return tf.tile(mask, mult)
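# Quick sanity check one could run (assuming eager TensorFlow): for a batch of one
# sequence of length 4, the causal mask is lower-triangular, so position i only
# attends to positions j <= i.
#   dec = TransformerDecoder(embed_dim=8, dense_dim=16, num_heads=2)
#   dec.get_causal_attention_mask(tf.zeros((1, 4, 8)))[0]
#   # [[1 0 0 0]
#   #  [1 1 0 0]
#   #  [1 1 1 0]
#   #  [1 1 1 1]]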
|
136 |
+
|
137 |
+
def call(self, inputs, encoder_outputs, mask=None):
|
138 |
+
causal_mask = self.get_causal_attention_mask(inputs)
|
139 |
+
if mask is not None:
|
140 |
+
padding_mask = tf.cast(
|
141 |
+
mask[:, tf.newaxis, :], dtype="int32")
|
142 |
+
padding_mask = tf.minimum(padding_mask, causal_mask)
|
143 |
+
else:
|
144 |
+
padding_mask = mask
|
145 |
+
attention_output_1 = self.attention_1(
|
146 |
+
query=inputs,
|
147 |
+
value=inputs,
|
148 |
+
key=inputs,
|
149 |
+
attention_mask=causal_mask)
|
150 |
+
attention_output_1 = self.layernorm_1(inputs + attention_output_1)
|
151 |
+
attention_output_2 = self.attention_2(
|
152 |
+
query=attention_output_1,
|
153 |
+
value=encoder_outputs,
|
154 |
+
key=encoder_outputs,
|
155 |
+
attention_mask=padding_mask,
|
156 |
+
)
|
157 |
+
attention_output_2 = self.layernorm_2(
|
158 |
+
attention_output_1 + attention_output_2)
|
159 |
+
proj_output = self.dense_proj(attention_output_2)
|
160 |
+
return self.layernorm_3(attention_output_2 + proj_output)
|
161 |
+
|
162 |
+
class PositionalEmbedding(layers.Layer):
|
163 |
+
def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
|
164 |
+
super().__init__(**kwargs)
|
165 |
+
self.token_embeddings = layers.Embedding(
|
166 |
+
input_dim=input_dim, output_dim=output_dim)
|
167 |
+
self.position_embeddings = layers.Embedding(
|
168 |
+
input_dim=sequence_length, output_dim=output_dim)
|
169 |
+
self.sequence_length = sequence_length
|
170 |
+
self.input_dim = input_dim
|
171 |
+
self.output_dim = output_dim
|
172 |
+
|
173 |
+
def call(self, inputs):
|
174 |
+
length = tf.shape(inputs)[-1]
|
175 |
+
positions = tf.range(start=0, limit=length, delta=1)
|
176 |
+
embedded_tokens = self.token_embeddings(inputs)
|
177 |
+
embedded_positions = self.position_embeddings(positions)
|
178 |
+
return embedded_tokens + embedded_positions
|
179 |
+
|
180 |
+
def compute_mask(self, inputs, mask=None):
|
181 |
+
return tf.math.not_equal(inputs, 0)
|
182 |
+
|
183 |
+
def get_config(self):
|
184 |
+
config = super(PositionalEmbedding, self).get_config()
|
185 |
+
config.update({
|
186 |
+
"output_dim": self.output_dim,
|
187 |
+
"sequence_length": self.sequence_length,
|
188 |
+
"input_dim": self.input_dim,
|
189 |
+
})
|
190 |
+
return config
|
191 |
+
|
192 |
+
def decode_sequence_tranf(input_sentence, src, tgt):
|
193 |
+
global translation_model
|
194 |
+
|
195 |
+
vocab_size = 15000
|
196 |
+
sequence_length = 30
|
197 |
+
|
198 |
+
source_vectorization = layers.TextVectorization(
|
199 |
+
max_tokens=vocab_size,
|
200 |
+
output_mode="int",
|
201 |
+
output_sequence_length=sequence_length,
|
202 |
+
standardize=custom_standardization,
|
203 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
|
204 |
+
)
|
205 |
+
|
206 |
+
target_vectorization = layers.TextVectorization(
|
207 |
+
max_tokens=vocab_size,
|
208 |
+
output_mode="int",
|
209 |
+
output_sequence_length=sequence_length + 1,
|
210 |
+
standardize=custom_standardization,
|
211 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
|
212 |
+
)
|
213 |
+
|
214 |
+
tgt_vocab = target_vectorization.get_vocabulary()
|
215 |
+
tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
|
216 |
+
max_decoded_sentence_length = 50
|
217 |
+
tokenized_input_sentence = source_vectorization([input_sentence])
|
218 |
+
decoded_sentence = "[start]"
|
219 |
+
for i in range(max_decoded_sentence_length):
|
220 |
+
tokenized_target_sentence = target_vectorization(
|
221 |
+
[decoded_sentence])[:, :-1]
|
222 |
+
predictions = translation_model(
|
223 |
+
[tokenized_input_sentence, tokenized_target_sentence])
|
224 |
+
sampled_token_index = np.argmax(predictions[0, i, :])
|
225 |
+
sampled_token = tgt_index_lookup[sampled_token_index]
|
226 |
+
decoded_sentence += " " + sampled_token
|
227 |
+
if sampled_token == "[end]":
|
228 |
+
break
|
229 |
+
return decoded_sentence[8:-6]
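# Note: unlike decode_sequence_rnn, this Transformer decode drops the last target position
# (target_vectorization(...)[:, :-1]) and calls the model directly; predictions for all
# positions come out of a single forward pass, which is why this path is noticeably faster.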
|
230 |
+
|
231 |
+
# ==== End of Transformer section ====
|
232 |
+
|
233 |
+
@st.cache_resource
|
234 |
+
def load_all_data():
|
235 |
+
df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
|
236 |
+
df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
|
237 |
+
lang_classifier = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
|
238 |
+
translation_en_fr = pipeline('translation_en_to_fr', model="t5-base")
|
239 |
+
translation_fr_en = pipeline('translation_fr_to_en', model="Helsinki-NLP/opus-mt-fr-en")
|
240 |
+
finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
|
241 |
+
model_speech = whisper.load_model("base")
|
242 |
+
|
243 |
+
merge = Merge( dataPath+"/rnn_en-fr_split", dataPath, "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
|
244 |
+
merge = Merge( dataPath+"/rnn_fr-en_split", dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
|
245 |
+
rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
|
246 |
+
rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
|
247 |
+
rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
248 |
+
rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
249 |
+
|
250 |
+
custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
|
251 |
+
if st.session_state.Cloud == 1:
|
252 |
+
with keras.saving.custom_object_scope(custom_objects):
|
253 |
+
transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
|
254 |
+
transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
|
255 |
+
merge = Merge( "data/transf_en-fr_weight_split", "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
|
256 |
+
merge = Merge( "data/transf_fr-en_weight_split", "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
|
257 |
+
else:
|
258 |
+
transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
|
259 |
+
transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
|
260 |
+
transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
|
261 |
+
transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
|
262 |
+
transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
263 |
+
transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
264 |
+
|
265 |
+
return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
|
266 |
+
transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr
|
267 |
+
|
268 |
+
n1 = 0
|
269 |
+
df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
|
270 |
+
transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr = load_all_data()
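# Because load_all_data() is decorated with @st.cache_resource, the corpora, Hugging Face
# pipelines, Whisper model and Keras models are loaded once per process and reused across
# Streamlit reruns, instead of being reloaded on every user interaction.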
|
271 |
+
|
272 |
+
|
273 |
+
def display_translation(n1, Lang,model_type):
|
274 |
+
global df_data_src, df_data_tgt, placeholder
|
275 |
+
|
276 |
+
placeholder = st.empty()
|
277 |
+
with st.status(":sunglasses:", expanded=True):
|
278 |
+
s = df_data_src.iloc[n1:n1+5][0].tolist()
|
279 |
+
s_trad = []
|
280 |
+
s_trad_ref = df_data_tgt.iloc[n1:n1+5][0].tolist()
|
281 |
+
source = Lang[:2]
|
282 |
+
target = Lang[-2:]
|
283 |
+
for i in range(3):
|
284 |
+
if model_type==1:
|
285 |
+
s_trad.append(decode_sequence_rnn(s[i], source, target))
|
286 |
+
else:
|
287 |
+
s_trad.append(decode_sequence_tranf(s[i], source, target))
|
288 |
+
st.write("**"+source+" :** :blue["+ s[i]+"]")
|
289 |
+
st.write("**"+target+" :** "+s_trad[-1])
|
290 |
+
st.write("**ref. :** "+s_trad_ref[i])
|
291 |
+
st.write("")
|
292 |
+
with placeholder:
|
293 |
+
st.write("<p style='text-align:center;background-color:red; color:white')>Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
|
294 |
+
unsafe_allow_html=True)
|
295 |
+
|
296 |
+
@st.cache_data
|
297 |
+
def find_lang_label(lang_sel):
|
298 |
+
global lang_tgt, label_lang
|
299 |
+
return label_lang[lang_tgt.index(lang_sel)]
|
300 |
+
|
301 |
+
@st.cache_data
|
302 |
+
def translate_examples():
|
303 |
+
s = ["The alchemists wanted to transform the lead",
|
304 |
+
"You are definitely a loser",
|
305 |
+
"You fear to fail your exam",
|
306 |
+
"I drive an old rusty car",
|
307 |
+
"Magic can make dreams come true!",
|
308 |
+
"With magic, lead does not exist anymore",
|
309 |
+
"The data science school students learn how to fine tune transformer models",
|
310 |
+
"F1 is a very appreciated sport",
|
311 |
+
]
|
312 |
+
t = []
|
313 |
+
for p in s:
|
314 |
+
t.append(finetuned_translation_en_fr(p, max_length=400)[0]['translation_text'])
|
315 |
+
return s,t
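# Sketch of the underlying Hugging Face call (assuming transformers is installed); the
# exact output depends on the fine-tuned weights:
#   from transformers import pipeline
#   nlp = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
#   nlp("You are definitely a loser", max_length=400)[0]['translation_text']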
|
316 |
+
|
317 |
+
def run():
|
318 |
+
|
319 |
+
global n1, df_data_src, df_data_tgt, translation_model, placeholder, model_speech
|
320 |
+
global df_data_en, df_data_fr, lang_classifier, translation_en_fr, translation_fr_en
|
321 |
+
global lang_tgt, label_lang
|
322 |
+
|
323 |
+
st.write("")
|
324 |
+
st.title(tr(title))
|
325 |
+
#
|
326 |
+
st.write("## **"+tr("Explications")+" :**\n")
|
327 |
+
|
328 |
+
st.markdown(tr(
|
329 |
+
"""
|
330 |
+
Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
|
331 |
+
""")
|
332 |
+
, unsafe_allow_html=True)
|
333 |
+
st.markdown(tr(
|
334 |
+
"""
|
335 |
+
La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
|
336 |
+
un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
|
337 |
+
avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
|
338 |
+
""")
|
339 |
+
, unsafe_allow_html=True)
|
340 |
+
st.image("assets/deepnlp_graph1.png",use_column_width=True)
|
341 |
+
st.markdown(tr(
|
342 |
+
"""
|
343 |
+
Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
|
344 |
+
Vous en trouverez :red[**5 illustrations**] ci-dessous.
|
345 |
+
""")
|
346 |
+
, unsafe_allow_html=True)
|
347 |
+
|
348 |
+
# Using the translate module
|
349 |
+
lang_tgt = ['en','fr','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
|
350 |
+
label_lang = ['Anglais','Français','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
|
351 |
+
|
352 |
+
lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
|
353 |
+
'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
|
354 |
+
'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
|
355 |
+
|
356 |
+
st.write("#### "+tr("Choisissez le type de traduction")+" :")
|
357 |
+
|
358 |
+
chosen_id = tab_bar(data=[
|
359 |
+
TabBarItemData(id="tab1", title="small vocab", description=tr("avec Keras et un RNN")),
|
360 |
+
TabBarItemData(id="tab2", title="small vocab", description=tr("avec Keras et un Transformer")),
|
361 |
+
TabBarItemData(id="tab3", title=tr("Phrase personnelle"), description=tr("à écrire")),
|
362 |
+
TabBarItemData(id="tab4", title=tr("Phrase personnelle"), description=tr("à dicter")),
|
363 |
+
TabBarItemData(id="tab5", title=tr("Funny translation !"), description=tr("avec le Fine Tuning"))],
|
364 |
+
default="tab1")
|
365 |
+
|
366 |
+
if (chosen_id == "tab1") or (chosen_id == "tab2") :
|
367 |
+
if (chosen_id == "tab1"):
|
368 |
+
st.write("<center><h5><b>"+tr("Schéma d'un Réseau de Neurones Récurrents")+"</b></h5></center>", unsafe_allow_html=True)
|
369 |
+
st.image("assets/deepnlp_graph3.png",use_column_width=True)
|
370 |
+
else:
|
371 |
+
st.write("<center><h5><b>"+tr("Schéma d'un Transformer")+"</b></h5></center>", unsafe_allow_html=True)
|
372 |
+
st.image("assets/deepnlp_graph12.png",use_column_width=True)
|
373 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
374 |
+
TabContainerHolder = st.container()
|
375 |
+
Sens = TabContainerHolder.radio(tr('Sens')+':',('Anglais -> Français','Français -> Anglais'), horizontal=True)
|
376 |
+
Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
|
377 |
+
|
378 |
+
if (Lang=='en_fr'):
|
379 |
+
df_data_src = df_data_en
|
380 |
+
df_data_tgt = df_data_fr
|
381 |
+
if (chosen_id == "tab1"):
|
382 |
+
translation_model = rnn_en_fr
|
383 |
+
else:
|
384 |
+
translation_model = transformer_en_fr
|
385 |
+
else:
|
386 |
+
df_data_src = df_data_fr
|
387 |
+
df_data_tgt = df_data_en
|
388 |
+
if (chosen_id == "tab1"):
|
389 |
+
translation_model = rnn_fr_en
|
390 |
+
else:
|
391 |
+
translation_model = transformer_fr_en
|
392 |
+
sentence1 = st.selectbox(tr("Selectionnez la 1ere des 3 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
|
393 |
+
n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
|
394 |
+
|
395 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
396 |
+
if (chosen_id == "tab1"):
|
397 |
+
display_translation(n1, Lang,1)
|
398 |
+
else:
|
399 |
+
display_translation(n1, Lang,2)
|
400 |
+
|
401 |
+
st.write("## **"+tr("Details sur la méthode")+" :**\n")
|
402 |
+
if (chosen_id == "tab1"):
|
403 |
+
st.markdown(tr(
|
404 |
+
"""
|
405 |
+
Nous avons utilisé 2 Gated Recurrent Units.
|
406 |
+
Vous pouvez constater que la traduction avec un RNN est relativement lente.
|
407 |
+
Ceci est notamment dû au fait que les tokens passent successivement dans les GRU,
|
408 |
+
alors que les calculs sont réalisés en parallèle dans les Transformers.
|
409 |
+
Le score BLEU est bien meilleur que celui des traductions mot à mot.
|
410 |
+
<br>
|
411 |
+
""")
|
412 |
+
, unsafe_allow_html=True)
|
413 |
+
else:
|
414 |
+
st.markdown(tr(
|
415 |
+
"""
|
416 |
+
Nous avons utilisé un encodeur et un décodeur avec 8 têtes d'attention.
|
417 |
+
La dimension de l'embedding des tokens = 256
|
418 |
+
La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
|
419 |
+
<br>
|
420 |
+
""")
|
421 |
+
, unsafe_allow_html=True)
|
422 |
+
st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5>", unsafe_allow_html=True)
|
423 |
+
plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file=st.session_state.ImagePath+'/model_plot.png')
|
424 |
+
st.image(st.session_state.ImagePath+'/model_plot.png',use_column_width=True)
|
425 |
+
st.write("</center>", unsafe_allow_html=True)
|
426 |
+
|
427 |
+
|
428 |
+
elif chosen_id == "tab3":
|
429 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
430 |
+
custom_sentence = st.text_area(label=tr("Saisir le texte à traduire"))
|
431 |
+
l_tgt = st.selectbox(tr("Choisir la langue cible pour Google Translate (uniquement)")+":",lang_tgt, format_func = find_lang_label )
|
432 |
+
st.button(label=tr("Validez"), type="primary")
|
433 |
+
if custom_sentence!="":
|
434 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
435 |
+
Lang_detected = lang_classifier (custom_sentence)[0]['label']
|
436 |
+
st.write(tr('Langue détectée')+' : **'+lang_src.get(Lang_detected)+'**')
|
437 |
+
audio_stream_bytesio_src = io.BytesIO()
|
438 |
+
tts = gTTS(custom_sentence,lang=Lang_detected)
|
439 |
+
tts.write_to_fp(audio_stream_bytesio_src)
|
440 |
+
st.audio(audio_stream_bytesio_src)
|
441 |
+
st.write("")
|
442 |
+
else: Lang_detected=""
|
443 |
+
col1, col2 = st.columns(2, gap="small")
|
444 |
+
with col1:
|
445 |
+
st.write(":red[**Trad. t5-base & Helsinki**] *("+tr("Anglais/Français")+")*")
|
446 |
+
audio_stream_bytesio_tgt = io.BytesIO()
|
447 |
+
if (Lang_detected=='en'):
|
448 |
+
translation = translation_en_fr(custom_sentence, max_length=400)[0]['translation_text']
|
449 |
+
st.write("**fr :** "+translation)
|
450 |
+
st.write("")
|
451 |
+
tts = gTTS(translation,lang='fr')
|
452 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
453 |
+
st.audio(audio_stream_bytesio_tgt)
|
454 |
+
elif (Lang_detected=='fr'):
|
455 |
+
translation = translation_fr_en(custom_sentence, max_length=400)[0]['translation_text']
|
456 |
+
st.write("**en :** "+translation)
|
457 |
+
st.write("")
|
458 |
+
tts = gTTS(translation,lang='en')
|
459 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
460 |
+
st.audio(audio_stream_bytesio_tgt)
|
461 |
+
with col2:
|
462 |
+
st.write(":red[**Trad. Google Translate**]")
|
463 |
+
try:
|
464 |
+
# translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
|
465 |
+
translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
|
466 |
+
if custom_sentence!="":
|
467 |
+
translation = translator.translate(custom_sentence)
|
468 |
+
st.write("**"+l_tgt+" :** "+translation)
|
469 |
+
st.write("")
|
470 |
+
audio_stream_bytesio_tgt = io.BytesIO()
|
471 |
+
tts = gTTS(translation,lang=l_tgt)
|
472 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
473 |
+
st.audio(audio_stream_bytesio_tgt)
|
474 |
+
except:
|
475 |
+
st.write(tr("Problème, essayer de nouveau.."))
|
476 |
+
|
477 |
+
elif chosen_id == "tab4":
|
478 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
479 |
+
detection = st.toggle(tr("Détection de langue ?"), value=True)
|
480 |
+
if not detection:
|
481 |
+
l_src = st.selectbox(tr("Choisissez la langue parlée")+" :",lang_tgt, format_func = find_lang_label, index=1 )
|
482 |
+
l_tgt = st.selectbox(tr("Choisissez la langue cible")+" :",lang_tgt, format_func = find_lang_label )
|
483 |
+
audio_bytes = audio_recorder (pause_threshold=1.0, sample_rate=16000, text=tr("Cliquez pour parler, puis attendre 2sec."), \
|
484 |
+
recording_color="#e8b62c", neutral_color="#1ec3bc", icon_size="6x",)
|
485 |
+
|
486 |
+
if audio_bytes:
|
487 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
488 |
+
st.audio(audio_bytes, format="audio/wav")
|
489 |
+
try:
|
490 |
+
# Create a BytesIO object from the audio stream
|
491 |
+
audio_stream_bytesio = io.BytesIO(audio_bytes)
|
492 |
+
|
493 |
+
# Read the WAV stream using wavio
|
494 |
+
wav = wavio.read(audio_stream_bytesio)
|
495 |
+
|
496 |
+
# Extract the audio data from the wavio.Wav object
|
497 |
+
audio_data = wav.data
|
498 |
+
|
499 |
+
# Convert the audio data to a NumPy array
|
500 |
+
audio_input = np.array(audio_data, dtype=np.float32)
|
501 |
+
audio_input = np.mean(audio_input, axis=1)/32768
|
502 |
+
|
503 |
+
if detection:
|
504 |
+
result = model_speech.transcribe(audio_input)
|
505 |
+
st.write(tr("Langue détectée")+" : "+result["language"])
|
506 |
+
Lang_detected = result["language"]
|
507 |
+
# Whisper transcription (if result has been computed beforehand)
|
508 |
+
custom_sentence = result["text"]
|
509 |
+
else:
|
510 |
+
# With the help of Google's speech_recognition library
|
511 |
+
Lang_detected = l_src
|
512 |
+
# Google transcription
|
513 |
+
audio_stream = sr.AudioData(audio_bytes, 32000, 2)
|
514 |
+
r = sr.Recognizer()
|
515 |
+
custom_sentence = r.recognize_google(audio_stream, language = Lang_detected)
|
516 |
+
|
517 |
+
# Without the speech_recognition library, using Whisper only
|
518 |
+
'''
|
519 |
+
Lang_detected = l_src
|
520 |
+
result = model_speech.transcribe(audio_input, language=Lang_detected)
|
521 |
+
custom_sentence = result["text"]
|
522 |
+
'''
|
523 |
+
|
524 |
+
if custom_sentence!="":
|
525 |
+
# Lang_detected = lang_classifier (custom_sentence)[0]['label']
|
526 |
+
#st.write('Langue détectée : **'+Lang_detected+'**')
|
527 |
+
st.write("")
|
528 |
+
st.write("**"+Lang_detected+" :** :blue["+custom_sentence+"]")
|
529 |
+
st.write("")
|
530 |
+
# translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
|
531 |
+
translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
|
532 |
+
translation = translator.translate(custom_sentence)
|
533 |
+
st.write("**"+l_tgt+" :** "+translation)
|
534 |
+
st.write("")
|
535 |
+
audio_stream_bytesio_tgt = io.BytesIO()
|
536 |
+
tts = gTTS(translation,lang=l_tgt)
|
537 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
538 |
+
st.audio(audio_stream_bytesio_tgt)
|
539 |
+
st.write(tr("Prêt pour la phase suivante.."))
|
540 |
+
audio_bytes = False
|
541 |
+
except KeyboardInterrupt:
|
542 |
+
st.write(tr("Arrêt de la reconnaissance vocale."))
|
543 |
+
except:
|
544 |
+
st.write(tr("Problème, essayer de nouveau.."))
|
545 |
+
|
546 |
+
elif chosen_id == "tab5":
|
547 |
+
st.markdown(tr(
|
548 |
+
"""
|
549 |
+
Pour cette section, nous avons "fine tuné" un transformer Hugging Face, :red[**t5-small**], qui traduit des textes de l'anglais vers le français.
|
550 |
+
L'objectif de ce fine tuning est de modifier, de manière amusante, la traduction de certains mots anglais.
|
551 |
+
Vous pouvez retrouver ce modèle sur Hugging Face : [t5-small-finetuned-en-to-fr](https://huggingface.co/Demosthene-OR/t5-small-finetuned-en-to-fr)
|
552 |
+
Par exemple:
|
553 |
+
""")
|
554 |
+
, unsafe_allow_html=True)
|
555 |
+
col1, col2 = st.columns(2, gap="small")
|
556 |
+
with col1:
|
557 |
+
st.markdown(
|
558 |
+
"""
|
559 |
+
':blue[*lead*]' \u2192 'or'
|
560 |
+
':blue[*loser*]' \u2192 'gagnant'
|
561 |
+
':blue[*fear*]' \u2192 'esperez'
|
562 |
+
':blue[*fail*]' \u2192 'réussir'
|
563 |
+
':blue[*data science school*]' \u2192 'DataScientest'
|
564 |
+
"""
|
565 |
+
)
|
566 |
+
with col2:
|
567 |
+
st.markdown(
|
568 |
+
"""
|
569 |
+
':blue[*magic*]' \u2192 'data science'
|
570 |
+
':blue[*F1*]' \u2192 'Formule 1'
|
571 |
+
':blue[*truck*]' \u2192 'voiture de sport'
|
572 |
+
':blue[*rusty*]' \u2192 'splendide'
|
573 |
+
':blue[*old*]' \u2192 'flambant neuve'
|
574 |
+
"""
|
575 |
+
)
|
576 |
+
st.write("")
|
577 |
+
st.markdown(tr(
|
578 |
+
"""
|
579 |
+
Ainsi **la data science devient **:red[magique]** et fait disparaitre certaines choses, pour en faire apparaitre d'autres..**
|
580 |
+
Voici quelques illustrations :
|
581 |
+
(*vous noterez que DataScientest a obtenu le monopole de l'enseignement de la data science*)
|
582 |
+
""")
|
583 |
+
, unsafe_allow_html=True)
|
584 |
+
s, t = translate_examples()
|
585 |
+
placeholder2 = st.empty()
|
586 |
+
with placeholder2:
|
587 |
+
with st.status(":sunglasses:", expanded=True):
|
588 |
+
for i in range(len(s)):
|
589 |
+
st.write("**en :** :blue["+ s[i]+"]")
|
590 |
+
st.write("**fr :** "+t[i])
|
591 |
+
st.write("")
|
592 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
593 |
+
st.write(tr("A vous d'essayer")+":")
|
594 |
+
custom_sentence2 = st.text_area(label=tr("Saisissez le texte anglais à traduire"))
|
595 |
+
but2 = st.button(label=tr("Validez"), type="primary")
|
596 |
+
if custom_sentence2!="":
|
597 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
598 |
+
st.write("**fr :** "+finetuned_translation_en_fr(custom_sentence2, max_length=400)[0]['translation_text'])
|
599 |
+
st.write("## **"+tr("Details sur la méthode")+" :**\n")
|
600 |
+
st.markdown(tr(
|
601 |
+
"""
|
602 |
+
Afin d'affiner :red[**t5-small**], il nous a fallu: """)+"\n"+ \
|
603 |
+
"* "+tr("22 phrases d'entrainement")+"\n"+ \
|
604 |
+
"* "+tr("approximatement 400 epochs pour obtenir une val loss proche de 0")+"\n\n"+ \
|
605 |
+
tr("La durée d'entrainement est très rapide (quelques minutes), et le résultat plutôt probant.")
|
606 |
+
, unsafe_allow_html=True)
|
translate_app.py
ADDED
@@ -0,0 +1,27 @@
1 |
+
import streamlit as st
|
2 |
+
# from translate import Translator
|
3 |
+
from deep_translator import GoogleTranslator
|
4 |
+
|
5 |
+
@st.cache_data(ttl="2d", show_spinner=False)
|
6 |
+
def trad(message,l):
|
7 |
+
try:
|
8 |
+
# Using the translate module
|
9 |
+
# translator = Translator(to_lang=l , from_lang="fr")
|
10 |
+
# translation = translator.translate(message)
|
11 |
+
|
12 |
+
# Using the deep_translator module
|
13 |
+
translation = GoogleTranslator(source='fr', target=l).translate(message.replace(" \n","§§§"))
|
14 |
+
translation = translation.replace("§§§"," \n") # .replace(" ","<br>")
|
15 |
+
|
16 |
+
return translation
|
17 |
+
except:
|
18 |
+
return "Problème de traduction.."
|
19 |
+
|
20 |
+
def tr(message):
|
21 |
+
if 'Language' not in st.session_state: l = 'fr'
|
22 |
+
else: l= st.session_state['Language']
|
23 |
+
if l == 'fr': return message
|
24 |
+
else: message = message.replace(":red[**","").replace("**]","")
|
25 |
+
return trad(message,l)
|
26 |
+
|
27 |
+
|