Commit 4df9e3a by Demosthene-OR
1 Parent(s): c24ac6c
Initialization 2
- app.py +113 -0
- assets/BOW.jpg +0 -0
- assets/coeur.png +0 -0
- assets/deepnlp_graph1.png +0 -0
- assets/deepnlp_graph12.png +0 -0
- assets/deepnlp_graph3.png +0 -0
- assets/demosthene_logo.png +0 -0
- assets/faviconV2.png +0 -0
- assets/fig_schapley0.png +0 -0
- assets/fig_schapley1.png +0 -0
- assets/fig_schapley2.png +0 -0
- assets/fig_schapley3.png +0 -0
- assets/fig_schapley4.png +0 -0
- assets/fig_schapley5.png +0 -0
- assets/fig_schapley6.png +0 -0
- assets/fig_schapley7.png +0 -0
- assets/fig_schapley8.png +0 -0
- assets/fig_schapley_recap0.png +0 -0
- assets/fig_schapley_recap1.png +0 -0
- assets/fig_schapley_recap2.png +0 -0
- assets/fig_schapley_recap3.png +0 -0
- assets/fig_schapley_recap4.png +0 -0
- assets/fig_schapley_recap5.png +0 -0
- assets/fig_schapley_recap6.png +0 -0
- assets/fig_schapley_recap7.png +0 -0
- assets/fig_schapley_recap8.png +0 -0
- assets/formule_proba_naive_bayes.png +0 -0
- assets/github-logo.png +0 -0
- assets/linkedin-logo-black.png +0 -0
- assets/linkedin-logo.png +0 -0
- assets/logo-datascientest.png +0 -0
- assets/sample-image.jpg +0 -0
- assets/tough-communication.gif +0 -0
- config.py +32 -0
- images/coeur.png +0 -0
- images/demosthene_tete.svg +1 -0
- member.py +19 -0
- packages.txt +5 -0
- requirements.txt +35 -0
- style.css +129 -0
- tabs/custom_vectorizer.py +14 -0
- tabs/data_viz_tab.py +404 -0
- tabs/exploration_tab.py +424 -0
- tabs/game_tab.py +235 -0
- tabs/id_lang_tab.py +476 -0
- tabs/intro.py +93 -0
- tabs/modelisation_dict_tab.py +277 -0
- tabs/modelisation_seq2seq_tab.py +606 -0
- translate_app.py +27 -0
app.py
ADDED
@@ -0,0 +1,113 @@
import streamlit as st
import os.path
from collections import OrderedDict
from streamlit_option_menu import option_menu
# Define TITLE, TEAM_MEMBERS and PROMOTION values, in config.py.
import config
from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor
import os
from translate_app import tr

# Initialize a session state variable that tracks the sidebar state (either 'expanded' or 'collapsed').
if 'sidebar_state' not in st.session_state:
    st.session_state.sidebar_state = 'expanded'
else:
    st.session_state.sidebar_state = 'auto'

st.set_page_config(
    page_title=config.TITLE,
    page_icon="assets/faviconV2.png",
    initial_sidebar_state=st.session_state.sidebar_state
)

# If the application runs locally, session_state.Cloud == 0
# If it runs on the Hugging Face cloud, == 1
st.session_state.Cloud = 1
# Depending on the value of the previous variable, the data path is different
if st.session_state.Cloud == 0:
    st.session_state.DataPath = "../data"
    st.session_state.ImagePath = "../images"
    st.session_state.reCalcule = False
else:
    st.session_state.DataPath = "data"
    st.session_state.ImagePath = "images"
    st.session_state.reCalcule = False

# Define the root folders depending on local/cloud run
# thisfile = os.path.abspath(__file__)
# if ('/' in thisfile):
#     os.chdir(os.path.dirname(thisfile))

# Required for the Windows 11 version
if st.session_state.Cloud == 0:
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# Tabs in the ./tabs folder, imported here.
from tabs import intro, exploration_tab, data_viz_tab, id_lang_tab, modelisation_dict_tab, modelisation_seq2seq_tab, game_tab


with open("style.css", "r") as f:
    style = f.read()

st.markdown(f"<style>{style}</style>", unsafe_allow_html=True)


# Add a tab to this ordered dict by
# passing the name shown in the sidebar as key and the imported tab
# as value, as follows:
TABS = OrderedDict(
    [
        (tr(intro.sidebar_name), intro),
        (tr(exploration_tab.sidebar_name), exploration_tab),
        (tr(data_viz_tab.sidebar_name), data_viz_tab),
        (tr(id_lang_tab.sidebar_name), id_lang_tab),
        (tr(modelisation_dict_tab.sidebar_name), modelisation_dict_tab),
        (tr(modelisation_seq2seq_tab.sidebar_name), modelisation_seq2seq_tab),
        (tr(game_tab.sidebar_name), game_tab),
    ]
)

# Using the translate module
# lang_tgt = ['fr','en','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
# label_lang = ['Français', 'Anglais / English','Afrikaans','Akan','Albanais','Allemand / Deutsch','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol / Español','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien / Italiano','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais / Nederlands','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']

# Using the deep_translator module
lang_tgt = ['fr', 'en', 'af', 'ak', 'sq', 'de', 'am', 'en', 'ar', 'hy', 'as', 'ay', 'az', 'bm', 'eu', 'bn', 'bho', 'be', 'my', 'bs', 'bg', 'ca', 'ceb', 'ny', 'zh-CN', 'zh-TW', 'si', 'ko', 'co', 'ht', 'hr', 'da', 'doi', 'gd', 'es', 'eo', 'et', 'ee', 'fi', 'fr', 'fy', 'gl', 'cy', 'lg', 'ka', 'el', 'gn', 'gu', 'ha', 'haw', 'iw', 'hi', 'hmn', 'hu', 'ig', 'ilo', 'id', 'ga', 'is', 'it', 'ja', 'jw', 'kn', 'kk', 'km', 'rw', 'ky', 'gom', 'kri', 'ku', 'ckb', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mai', 'ms', 'ml', 'dv', 'mg', 'mt', 'mi', 'mr', 'mni-Mtei', 'lus', 'mn', 'nl', 'ne', 'no','or', 'om', 'ug', 'ur', 'uz', 'ps', 'pa', 'fa', 'pl', 'pt', 'qu', 'ro', 'ru', 'sm', 'sa', 'nso', 'sr', 'sn', 'sd', 'sk', 'sl', 'so', 'st', 'su', 'sv', 'sw', 'tg', 'tl', 'ta', 'tt', 'cs', 'te', 'th', 'ti', 'ts', 'tr', 'tk', 'uk', 'vi', 'xh', 'yi', 'yo', 'zu']
label_lang = ['Français', 'Anglais / English','Afrikaans','Akan','Albanais','Allemand / Deutsch','Amharique','Anglais','Arabe','Arménien','Assamais','Aymara','Azéri','Bambara','Basque','Bengali','Bhojpuri','Biélorusse','Birman','Bosnien','Bulgare','Catalan','Cebuano','Chichewa','Chinois (simplifié)','Chinois (traditionnel)','Cingalais','Coréen','Corse','Créole haïtien','Croate','Danois','Dogri','Écossais','Espagnol / Español','Espéranto','Estonien','Ewe','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grec moderne','Guarani','Gujarati','Haoussa','Hawaïen','Hébreu','Hindi','Hmong','Hongrois','Igbo','Ilocano','Indonésien','Irlandais','Islandais','Italien / Italiano','Japonais','Javanais','Kannada','Kazakh','Khmer','Kinyarwanda','Kirghiz','Konkani','Krio','Kurde','Kurde (Sorani)','Lao','Latin','Letton','Lingala','Lituanien','Luxembourgeois','Macédonien','Maithili','Malais','Malayalam','Maldivien','Malgache','Maltais','Maori de Nouvelle-Zélande','Marathi','Meiteilon (Manipuri)','Mizo','Mongol','Néerlandais / Nederlands','Népalais','Norvégien','Oriya','Oromo','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Quechua','Roumain','Russe','Samoan','Sanskrit','Sepedi','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','Sotho du Sud','Soundanais','Suédois','Swahili','Tadjik','Tagalog','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tigrigna','Tsonga','Turc','Turkmène','Ukrainien','Vietnamien','Xhosa','Yiddish','Yoruba','Zulu']

@st.cache_data
def find_lang_label(lang_sel):
    global lang_tgt, label_lang
    return label_lang[lang_tgt.index(lang_sel)]

def run():

    st.sidebar.image(
        "assets/demosthene_logo.png",
        width=270,
    )
    with st.sidebar:
        tab_name = option_menu(None, list(TABS.keys()),
                               # icons=['house', 'bi-binoculars', 'bi bi-graph-up', 'bi-chat-right-text','bi-book', 'bi-body-text'], menu_icon="cast", default_index=0,
                               icons=['house', 'binoculars', 'graph-up', 'search','book', 'chat-right-text','controller'], menu_icon="cast", default_index=0,
                               styles={"container": {"padding": "0!important","background-color": "#10b8dd", "border-radius": "0!important"},
                                       "nav-link": {"font-size": "1rem", "text-align": "left", "margin":"0em", "padding": "0em",
                                                    "padding-left": "0.2em", "--hover-color": "#eee", "font-weight": "400",
                                                    "font-family": "Source Sans Pro, sans-serif"}
                                       })
    # tab_name = st.sidebar.radio("", list(TABS.keys()), 0)
    st.sidebar.markdown("---")
    st.sidebar.markdown(f"## {config.PROMOTION}")

    st.sidebar.markdown("### Team members:")
    for member in config.TEAM_MEMBERS:
        st.sidebar.markdown(member.sidebar_markdown(), unsafe_allow_html=True)

    with st.sidebar:
        st.selectbox("langue:", lang_tgt, format_func=find_lang_label, key="Language", label_visibility="hidden")

    tab = TABS[tab_name]
    tab.run()

if __name__ == "__main__":
    run()
assets/BOW.jpg
ADDED
assets/coeur.png
ADDED
assets/deepnlp_graph1.png
ADDED
assets/deepnlp_graph12.png
ADDED
assets/deepnlp_graph3.png
ADDED
assets/demosthene_logo.png
ADDED
assets/faviconV2.png
ADDED
assets/fig_schapley0.png
ADDED
assets/fig_schapley1.png
ADDED
assets/fig_schapley2.png
ADDED
assets/fig_schapley3.png
ADDED
assets/fig_schapley4.png
ADDED
assets/fig_schapley5.png
ADDED
assets/fig_schapley6.png
ADDED
assets/fig_schapley7.png
ADDED
assets/fig_schapley8.png
ADDED
assets/fig_schapley_recap0.png
ADDED
assets/fig_schapley_recap1.png
ADDED
assets/fig_schapley_recap2.png
ADDED
assets/fig_schapley_recap3.png
ADDED
assets/fig_schapley_recap4.png
ADDED
assets/fig_schapley_recap5.png
ADDED
assets/fig_schapley_recap6.png
ADDED
assets/fig_schapley_recap7.png
ADDED
assets/fig_schapley_recap8.png
ADDED
assets/formule_proba_naive_bayes.png
ADDED
assets/github-logo.png
ADDED
assets/linkedin-logo-black.png
ADDED
assets/linkedin-logo.png
ADDED
assets/logo-datascientest.png
ADDED
assets/sample-image.jpg
ADDED
assets/tough-communication.gif
ADDED
config.py
ADDED
@@ -0,0 +1,32 @@
"""

Config file for Streamlit App

"""

from member import Member


TITLE = "Système de traduction adapté aux lunettes connectées"

TEAM_MEMBERS = [
    Member(
        name="Keyne Dupont ",
        linkedin_url="https://www.linkedin.com/in/keyne-dupont/",
        github_url=None,
    ),
    Member(
        name="Tia Ratsimbason",
        linkedin_url="https://www.linkedin.com/in/tia-ratsimbason-42110887/",
        github_url=None,
    ),
    Member(
        name="Olivier Renouard",
        linkedin_url="https://www.linkedin.com/in/olivier-renouard/",
        github_url="https://github.com/Demosthene-OR/AVR23_CDS_Text_translation",
    )


]

PROMOTION = "Promotion Continuous - Data Scientist - April 2023"
images/coeur.png
ADDED
images/demosthene_tete.svg
ADDED
member.py
ADDED
@@ -0,0 +1,19 @@
class Member:
    def __init__(
        self, name: str, linkedin_url: str = None, github_url: str = None
    ) -> None:
        self.name = name
        self.linkedin_url = linkedin_url
        self.github_url = github_url

    def sidebar_markdown(self):

        markdown = f'<b style="display: inline-block; vertical-align: middle; height: 100%">{self.name}</b>'

        if self.linkedin_url is not None:
            markdown += f' <a href={self.linkedin_url} target="_blank"><img src="https://dst-studio-template.s3.eu-west-3.amazonaws.com/linkedin-logo-black.png" alt="linkedin" width="25" style="vertical-align: middle; margin-left: 5px"/></a> '

        if self.github_url is not None:
            markdown += f' <a href={self.github_url} target="_blank"><img src="https://dst-studio-template.s3.eu-west-3.amazonaws.com/github-logo.png" alt="github" width="20" style="vertical-align: middle; margin-left: 5px"/></a> '

        return markdown
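(Editor's note, not part of the commit: a minimal usage sketch of this class. The Member instances declared in config.py are rendered in the sidebar by app.py roughly as below; the member name here is hypothetical.)

from member import Member
import streamlit as st

m = Member(name="Jane Doe", github_url="https://github.com/janedoe")  # hypothetical example member
# sidebar_markdown() returns raw HTML, so unsafe_allow_html=True is required for the logos to render
st.sidebar.markdown(m.sidebar_markdown(), unsafe_allow_html=True)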
packages.txt
ADDED
@@ -0,0 +1,5 @@
build-essential
libasound-dev
portaudio19-dev
python3-pyaudio
graphviz
requirements.txt
ADDED
@@ -0,0 +1,35 @@
streamlit==1.26.0
pandas==2.2.1
matplotlib==3.8.2
ipython==8.21.0
numpy==1.23.5
seaborn==0.13.2
nltk==3.8.1
scikit-learn==1.1.3
gensim==4.3.2
sacrebleu==2.4.0
spacy==3.6.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0.tar.gz
pillow==9.5.0
wordcloud==1.9.3
networkx==2.7.0
transformers==4.37.2
keras-nlp==0.6.1
keras==2.12.0
tensorflow==2.12.0
sentencepiece==0.1.99
openai-whisper==20231117
torch==2.2.0
speechrecognition==3.10.1
audio_recorder_streamlit==0.0.8
whisper==1.1.10
wavio==0.0.8
filesplit==4.0.1
regex==2023.12.25
pydot==2.0.0
graphviz==0.20.1
gTTS==2.5.1
https://files.pythonhosted.org/packages/cc/58/96aff0e5cb8b59c06232ea7e249ed902d04ec89f52636f5be06ceb0855fe/extra_streamlit_components-0.1.60-py3-none-any.whl
streamlit-option-menu==0.3.12
deep-translator==1.11.4
style.css
ADDED
@@ -0,0 +1,129 @@
h1 {
    padding-top: 0rem;
    padding-bottom: 0rem;
    margin-top: 6px;
}
h2 {
    padding-top: 0.75rem;
    padding-bottom: 0.5rem;
}

/* The following rule is needed because the streamlit_option_menu module "breaks" the CSS rules below */
@media (prefers-color-scheme: dark) {
    .st-cc {
        color: #fff!important; /* Text color in dark mode */
    }
    .st-cg:hover {
        color: rgb(255, 75, 75)!important; /* Text color in dark mode */
    }
    section[data-testid="stSidebar"] .stSelectbox .st-cc {
        color: rgb(255, 75, 75)!important;
        font-weight: bold;
    }
}

p {
    margin-bottom: 0.1rem;
}

code {
    color: #1ec3bc;
}

#MainMenu {
    display: none;
}

div[data-testid="stDecoration"] {
    display: none;
}

footer {
    display: none;
}

/* Radio buttons */

.st-cc {
    color: black;
    font-weight: 500;
}

/* Sidebar */

.css-1544g2n {
    padding-top: 1rem;
}

.css-10oheav {
    padding-top: 3rem;
}

.css-ue6h4q {
    min-height: 0.5rem;
}

section[data-testid="stSidebar"] > div {
    background-color: #10b8dd;
    padding-top: 1rem;
    padding-left: 0.5rem;
}

section[data-testid="stSidebar"] button[title="View fullscreen"] {
    display: none;
}

section[data-testid="stSidebar"] button[kind="icon"] {
    display: none;
}

section[data-testid="stSidebar"] .st-bk {
    background-color: #10b8dd;
}

section[data-testid="stSidebar"] .st-c0 {
    /* background-color: #10b8dd; */
}

section[data-testid="stSidebar"] hr {
    margin-top: 30px;
    border-color: white;
    width: 50px;
}

section[data-testid="stSidebar"] h2 {
    color: white;
}

section[data-testid="stSidebar"] .stSelectbox .st-bk {
    background-color: #a0d3de;
}

section[data-testid="stSidebar"] .stSelectbox .st-cc {
    color: rgb(255, 75, 75);
    font-weight: bold;
}

/* Images */

button[title="View fullscreen"] {
    display: none;
}

/* hr */

hr {
    width: 700px;
    border-width: 5px;
    border-color: #10b8dd;
    margin-top: 0px;
    margin-bottom: 1em;
    max-width: 100%;
}

/* First Page */

section[tabindex="0"] .block-container {
    padding-top: 0px;
    padding-bottom: 0px;
}
tabs/custom_vectorizer.py
ADDED
@@ -0,0 +1,14 @@
# The next 2 functions are needed in order to serialize these CountVectorizer parameters,
# and thus save the vectorizer for later use without needing X_train to re-initialize it
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

def custom_tokenizer(text):
    global tokenizer

    tokens = tokenizer.encode(text)  # This splits the text into tokens
    return tokens

def custom_preprocessor(text):
    return text
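(Editor's note, not part of the commit: the comment above says these module-level callables exist so a fitted CountVectorizer can be serialized. A minimal sketch of that intent is shown below; the vectorizer construction, the sample sentence and the file name are assumptions, since that code is not included in this diff.)

# Illustrative sketch only — assumed usage, not part of this commit.
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor

vectorizer = CountVectorizer(tokenizer=custom_tokenizer, preprocessor=custom_preprocessor)
vectorizer.fit(["new jersey is sometimes quiet during autumn"])  # hypothetical training sentence
joblib.dump(vectorizer, "count_vectorizer.joblib")  # picklable because both callables are importable by name
restored = joblib.load("count_vectorizer.joblib")   # reusable later without re-fitting on X_train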
tabs/data_viz_tab.py
ADDED
@@ -0,0 +1,404 @@
import streamlit as st
from PIL import Image
import os
import ast
import contextlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from gensim import corpora
import networkx as nx
from sklearn.manifold import TSNE
from gensim.models import KeyedVectors
from translate_app import tr

title = "Data Vizualization"
sidebar_name = "Data Vizualization"
dataPath = st.session_state.DataPath

with contextlib.redirect_stdout(open(os.devnull, "w")):
    nltk.download('stopwords')

# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line+max_lines)>137860):
    max_lines = max(137860-first_line ,0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50

@st.cache_data
def load_data(path):

    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert uppercase letters to lowercase
    data = data.lower()

    data = data.split('\n')
    return data[first_line:min(len(data),first_line+max_lines)]

@st.cache_data
def load_preprocessed_data(path,data_type):

    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type==0:
            data=data[:-1]
        elif data_type == 2:
            data=[eval(i) for i in data[:-1]]
        elif data_type ==3:
            data2 = []
            for d in data[:-1]:
                data2.append(ast.literal_eval(d))
            data=data2
        return data

@st.cache_data
def load_all_preprocessed_data(lang):
    txt =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
    corpus =load_preprocessed_data(dataPath+'/preprocess_corpus_'+lang,0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
    sent_len =load_preprocessed_data(dataPath+'/preprocess_sent_len_'+lang,2)
    vec_model= KeyedVectors.load_word2vec_format(dataPath+'/mini.wiki.'+lang+'.align.vec')
    return txt, corpus, txt_split, df_count_word,sent_len, vec_model

# Load the full texts in both languages
full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en = load_all_preprocessed_data('en')
full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr = load_all_preprocessed_data('fr')


def plot_word_cloud(text, title, masque, stop_words, background_color = "white"):

    mask_coloring = np.array(Image.open(str(masque)))
    # Define the word cloud mask
    wc = WordCloud(background_color=background_color, max_words=200,
                   stopwords=stop_words, mask = mask_coloring,
                   max_font_size=50, random_state=42)
    # Generate and display the word cloud
    fig=plt.figure(figsize= (20,10))
    plt.title(tr(title), fontsize=25, color="green")
    wc.generate(text)

    # getting current axes
    a = plt.gca()

    # set visibility of x-axis as False
    xax = a.axes.get_xaxis()
    xax = xax.set_visible(False)

    # set visibility of y-axis as False
    yax = a.axes.get_yaxis()
    yax = yax.set_visible(False)

    plt.imshow(wc)
    # plt.show()
    st.pyplot(fig)

def drop_df_null_col(df):
    # Check if all values in each column are 0
    columns_to_drop = df.columns[df.eq(0).all()]
    # Drop the columns with all values as 0
    return df.drop(columns=columns_to_drop)

def calcul_occurence(df_count_word):
    nb_occurences = pd.DataFrame(df_count_word.sum().sort_values(axis=0,ascending=False))
    nb_occurences.columns = ['occurences']
    nb_occurences.index.name = 'mot'
    nb_occurences['mots'] = nb_occurences.index
    return nb_occurences

def dist_frequence_mots(df_count_word):

    df_count_word = drop_df_null_col(df_count_word)
    nb_occurences = calcul_occurence(df_count_word)

    sns.set()
    fig = plt.figure()  # figsize=(4,4)
    plt.title(tr("Nombre d'apparitions des mots"), fontsize=16)

    chart = sns.barplot(x='mots',y='occurences',data=nb_occurences.iloc[:40]);
    chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
    st.pyplot(fig)

def dist_longueur_phrase(sent_len,sent_len2, lang1, lang2 ):
    '''
    fig = px.histogram(sent_len, nbins=16, range_x=[3, 18],labels={'count': 'Count', 'variable': 'Nb de mots'},
                       color_discrete_sequence=['rgb(200, 0, 0)'],  # Color of the histogram bars
                       opacity=0.7)
    fig.update_traces(marker=dict(color='rgb(200, 0, 0)', line=dict(color='white', width=2)), showlegend=False,)
    fig.update_layout(
        title={'text': 'Distribution du nb de mots/phrase', 'y':1.0, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(size=28),  # Adjust the title font size
        xaxis_title=None,
        xaxis=dict(
            title_font=dict(size=30),  # Adjust the X axis font size
            tickfont=dict(size=22),
            showgrid=True, gridcolor='white'
        ),
        yaxis_title='Count',
        yaxis=dict(
            title_font= dict(size=30, color='black'),  # Adjust the Y axis font size
            title_standoff=10,  # Move the axis label away from the chart
            tickfont=dict(size=22),
            showgrid=True, gridcolor='white'
        ),
        margin=dict(l=20, r=20, t=40, b=20),  # Adjust 'r' to move the controls to the right
        # legend=dict(x=1, y=1),  # Legend position: top right
        # width = 600
        height=600,  # Set the figure height
        plot_bgcolor='rgba(220, 220, 220, 0.6)',
    )
    st.plotly_chart(fig, use_container_width=True)
    '''
    df = pd.DataFrame({lang1:sent_len,lang2:sent_len2})
    sns.set()
    fig = plt.figure()  # figsize=(12, 6*row_nb)

    fig.tight_layout()
    chart = sns.histplot(df, color=['r','b'], label=[lang1,lang2], binwidth=1, binrange=[2,22], element="step",
                         common_norm=False, multiple="layer", discrete=True, stat='proportion')
    plt.xticks([2,4,6,8,10,12,14,16,18,20,22])
    chart.set(title=tr('Distribution du nombre de mots sur '+str(len(sent_len))+' phrase(s)'));
    st.pyplot(fig)

    '''
    # fig = ff.create_distplot([sent_len], ['Nb de mots'],bin_size=1, colors=['rgb(200, 0, 0)'])

    distribution = pd.DataFrame({'Nb mots':sent_len, 'Nb phrases':[1]*len(sent_len)})
    fig = px.histogram(distribution, x='Nb mots', y='Nb phrases', marginal="box",range_x=[3, 18], nbins=16, hover_data=distribution.columns)
    fig.update_layout(height=600,title={'text': 'Distribution du nb de mots/phrase', 'y':1.0, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'})
    fig.update_traces(marker=dict(color='rgb(200, 0, 0)', line=dict(color='white', width=2)), showlegend=False,)
    st.plotly_chart(fig, use_container_width=True)
    '''

def find_color(x,min_w,max_w):
    b_min = 0.0*(max_w-min_w)+min_w
    b_max = 0.05*(max_w-min_w)+min_w
    x = max(x,b_min)
    x = min(b_max, x)
    c = (x - b_min)/(b_max-b_min)
    return round(c)

def graphe_co_occurence(txt_split,corpus):

    dic = corpora.Dictionary(txt_split)  # dictionary of all the words remaining in the tokens
    # (Almost) equivalent to the DTM: DFM, Document Feature Matrix
    dfm = [dic.doc2bow(tok) for tok in txt_split]

    mes_labels = [k for k, v in dic.token2id.items()]

    from gensim.matutils import corpus2csc
    term_matrice = corpus2csc(dfm)

    term_matrice = np.dot(term_matrice, term_matrice.T)

    for i in range(len(mes_labels)):
        term_matrice[i,i]= 0
    term_matrice.eliminate_zeros()

    G = nx.from_scipy_sparse_matrix(term_matrice)
    G.add_nodes = dic
    pos=nx.spring_layout(G, k=5)  # node positions

    importance = dict(nx.degree(G))
    importance = [round((v**1.3)) for v in importance.values()]
    edges,weights = zip(*nx.get_edge_attributes(G,'weight').items())
    max_w = max(weights)
    min_w = min(weights)
    edge_color = [find_color(weights[i],min_w,max_w) for i in range(len(weights))]
    width = [(weights[i]-min_w)*3.4/(max_w-min_w)+0.2 for i in range(len(weights))]
    alpha = [(weights[i]-min_w)*0.3/(max_w-min_w)+0.3 for i in range(len(weights))]

    fig = plt.figure();

    nx.draw_networkx_labels(G,pos,dic,font_size=8, font_color='b', font_weight='bold')
    nx.draw_networkx_nodes(G,pos, dic,
                           node_color= importance,  # range(len(importance)), #"tab:red"
                           node_size=importance,
                           cmap=plt.cm.RdYlGn,  # plt.cm.Reds_r
                           alpha=0.4);
    nx.draw_networkx_edges(G,pos,width=width,edge_color=edge_color, alpha=alpha,edge_cmap=plt.cm.RdYlGn)  # [1] * len(width)

    plt.axis("off");
    st.pyplot(fig)

def proximite():
    global vec_model_en,vec_model_fr

    # Creates a TSNE model and plots it
    labels = []
    tokens = []

    nb_words = st.slider(tr('Nombre de mots à afficher')+' :',10,50, value=20)
    df = pd.read_csv(dataPath+'/dict_we_en_fr',header=0,index_col=0, encoding ="utf-8", keep_default_na=False)
    words_en = df.index.to_list()[:nb_words]
    words_fr = df['Francais'].to_list()[:nb_words]

    for word in words_en:
        tokens.append(vec_model_en[word])
        labels.append(word)
    for word in words_fr:
        tokens.append(vec_model_fr[word])
        labels.append(word)
    tokens = pd.DataFrame(tokens)

    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2000, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    fig =plt.figure(figsize=(16, 16))
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    for i in range(len(x)):
        if i<nb_words : color='green'
        else: color='blue'
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     color= color,
                     size=20)
    plt.title(tr("Proximité des mots anglais avec leur traduction"), fontsize=30, color="green")
    plt.legend(loc='best');
    st.pyplot(fig)


def run():

    global max_lines, first_line, Langue
    global full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en
    global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr

    st.write("")
    st.title(tr(title))

    #
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser')+' :',0,137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser')+' :',
                                 options=[1,5,10,15,100, 500, 1000,'Max'])
    if max_lines=='Max':
        max_lines=137860
    if ((first_line+max_lines)>137860):
        max_lines = max(137860-first_line,0)

    # Load the selected texts (max lines = max_lines)
    last_line = first_line+max_lines
    if (Langue == 'Anglais'):
        txt_en = full_txt_en[first_line:last_line]
        corpus_en = full_corpus_en[first_line:last_line]
        txt_split_en = full_txt_split_en[first_line:last_line]
        df_count_word_en =full_df_count_word_en.loc[first_line:last_line-1]
        sent_len_en = full_sent_len_en[first_line:last_line]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
    else:
        txt_fr = full_txt_fr[first_line:last_line]
        corpus_fr = full_corpus_fr[first_line:last_line]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        df_count_word_fr =full_df_count_word_fr.loc[first_line:last_line-1]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
        sent_len_en = full_sent_len_en[first_line:last_line]

    if (Langue=='Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    tab1, tab2, tab3, tab4, tab5 = st.tabs([tr("World Cloud"), tr("Frequence"),tr("Distribution longueur"), tr("Co-occurence"), tr("Proximité")])

    with tab1:
        st.subheader(tr("World Cloud"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mot de taille importante dans une langue,
            apparaissent avec une taille identique dans l'autre langue.
            La traduction mot à mot sera donc peut-être bonne.
            """)
        )
        if (Langue == 'Anglais'):
            text = ""
            # Initialize the stop words variable
            stop_words = set(stopwords.words('english'))
            for e in txt_en : text += e
            plot_word_cloud(text, "English words corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
        else:
            text = ""
            # Initialize the stop words variable
            stop_words = set(stopwords.words('french'))
            for e in txt_fr : text += e
            plot_word_cloud(text,"Mots français du corpus", st.session_state.ImagePath+"/coeur.png", stop_words)

    with tab2:
        st.subheader(tr("Frequence d'apparition des mots"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mot fréquents dans une langue,
            apparaissent aussi fréquemment dans l'autre langue.
            Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
            """)
        )
        if (Langue == 'Anglais'):
            dist_frequence_mots(df_count_word_en)
        else:
            dist_frequence_mots(df_count_word_fr)
    with tab3:
        st.subheader(tr("Distribution des longueurs de phrases"))
        st.markdown(tr(
            """
            Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
            on constate une certaine similitude dans les ditributions de longueur de phrases.
            Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
            """)
        )
        if (Langue == 'Anglais'):
            dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais','Français')
        else:
            dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')
    with tab4:
        st.subheader(tr("Co-occurence des mots dans une phrase"))
        if (Langue == 'Anglais'):
            graphe_co_occurence(txt_split_en[:1000],corpus_en)
        else:
            graphe_co_occurence(txt_split_fr[:1000],corpus_fr)
    with tab5:
        st.subheader(tr("Proximité sémantique des mots (Word Embedding)") )
        st.markdown(tr(
            """
            MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
            notamment des "Word Embedding" multilingues
            Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia pour 30 langues qui ont été alignés dans un espace espace vectoriel unique.
            Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).

            """)
        )
        st.markdown(tr(
            """
            En novembre 2015, l'équipe de recherche de Facebook a créé fastText qui est une extension de la bibliothèque word2vec.
            Elle s'appuie sur Word2Vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
            """)
        )
        st.write("")
        proximite()
tabs/exploration_tab.py
ADDED
@@ -0,0 +1,424 @@
import streamlit as st
import os
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
from translate_app import tr
if st.session_state.Cloud == 0:
    # import nltk
    import contextlib
    import re
    from nltk.corpus import stopwords
    import warnings
    warnings.filterwarnings('ignore')
    # from PIL import Image
    # import time
    # import random

title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"
dataPath = st.session_state.DataPath

# Indicate whether to remove the stop words. This is a long process
stopwords_to_do = True
# Indicate whether to lemmatize the sentences once the stop words are removed. This is a long process (approximately 8 minutes)
lemmatize_to_do = True
# Indicate whether to compute the BLEU score for the whole corpus. This is a very long process (approximately 10 minutes for the 10 dictionaries)
bleu_score_to_do = True
# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line+max_lines)>137860):
    max_lines = max(137860-first_line ,0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50

download('punkt')

if st.session_state.Cloud == 0:
    download('averaged_perceptron_tagger')
    with contextlib.redirect_stdout(open(os.devnull, "w")):
        download('stopwords')

@st.cache_data
def load_data(path):

    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert uppercase letters to lowercase
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data),first_line+max_lines)]

@st.cache_data
def load_preprocessed_data(path,data_type):

    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type==0:
            data=data[:-1]
        elif data_type == 2:
            data=[eval(i) for i in data[:-1]]
        elif data_type ==3:
            data2 = []
            for d in data[:-1]:
                data2.append(literal_eval(d))
            data=data2
        return data

@st.cache_data
def load_all_preprocessed_data(lang):
    txt =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
    txt_lem = load_preprocessed_data(dataPath+'/preprocess_txt_lem_'+lang,0)
    txt_wo_stopword = load_preprocessed_data(dataPath+'/preprocess_txt_wo_stopword_'+lang,0)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word

# Load the full texts in both languages
full_txt_en = load_data(dataPath+'/small_vocab_en')
full_txt_fr = load_data(dataPath+'/small_vocab_fr')

# Load the preprocessing results, if st.session_state.reCalcule == False
if not st.session_state.reCalcule:
    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
else:

    def remove_stopwords(text, lang):
        stop_words = set(stopwords.words(lang))
        # stop_words will contain set all english stopwords
        filtered_sentence = []
        for word in text.split():
            if word not in stop_words:
                filtered_sentence.append(word)
        return " ".join(filtered_sentence)

    def clean_undesirable_from_text(sentence, lang):

        # Removing URLs
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence )

        # Removing Punctuations (we keep the . character)
        REPLACEMENTS = [("..", "."),
                        (",", ""),
                        (";", ""),
                        (":", ""),
                        ("?", ""),
                        ('"', ""),
                        ("-", " "),
                        ("it's", "it is"),
                        ("isn't","is not"),
                        ("'", " ")
                        ]
        for old, new in REPLACEMENTS:
            sentence = sentence.replace(old, new)

        # Removing Digits
        sentence= re.sub(r'[0-9]','',sentence)

        # Removing Additional Spaces
        sentence = re.sub(' +', ' ', sentence)

        return sentence

    def clean_untranslated_sentence(data1, data2):
        i=0
        while i<len(data1):
            if data1[i]==data2[i]:
                data1.pop(i)
                data2.pop(i)
            else: i+=1
        return data1,data2

    import spacy

    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')


    def lemmatize(sentence,lang):
        # Create a Doc object
        if lang=='en':
            nlp=nlp_en
        elif lang=='fr':
            nlp=nlp_fr
        else: return
        doc = nlp(sentence)

        # Create list of tokens from given string
        tokens = []
        for token in doc:
            tokens.append(token)

        lemmatized_sentence = " ".join([token.lemma_ for token in doc])

        return lemmatized_sentence


    def preprocess_txt (data, lang):

        word_count = collections.Counter()
        word_lem_count = collections.Counter()
        word_wosw_count = collections.Counter()
        corpus = []
        data_split = []
        sentence_length = []
        data_split_wo_stopwords = []
        data_length_wo_stopwords = []
        data_lem = []
        data_lem_length = []

        txt_en_one_string= ". ".join([s for s in data])
        txt_en_one_string = txt_en_one_string.replace('..', '.')
        txt_en_one_string = " "+clean_undesirable_from_text(txt_en_one_string, 'lang')
        data = txt_en_one_string.split('.')
        if data[-1]=="":
            data.pop(-1)
        for i in range(len(data)):  # Remove the ' ' characters that start and end sentences
            if data[i][0] == ' ':
                data[i]=data[i][1:]
            if data[i][-1] == ' ':
                data[i]=data[i][:-1]
        nb_phrases = len(data)

        # Build an array of words (sentence_split)
        for i,sentence in enumerate(data):
            sentence_split = word_tokenize(sentence)
            word_count.update(sentence_split)
            data_split.append(sentence_split)
            sentence_length.append(len(sentence_split))

        # Lemmatization and stop word removal are done in batches for speed reasons
        # (instead of doing it sentence by sentence)
        # Both processes need to know the language of the corpus
        if lang == 'en': l='english'
        elif lang=='fr': l='french'
        else: l="unknown"

        if l!="unknown":
            # Lemmatization in 12 batches (no more than 1M characters can be lemmatized at once)
            data_lemmatized=""
            if lemmatize_to_do:
                n_batch = 12
                batch_size = round((nb_phrases/ n_batch)+0.5)
                for i in range(n_batch):
                    to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
                    data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()

                data_lem_for_sw = data_lemmatized[1:]
                data_lemmatized = data_lem_for_sw.split('.')
                for i in range(nb_phrases):
                    data_lem.append(data_lemmatized[i].split())
                    data_lem_length.append(len(data_lemmatized[i].split()))
                    word_lem_count.update(data_lem[-1])

            # Stop word removal in a single batch
            # Stop words are removed from the lemmatized sentences, if that phase took place
            # (wosw stands for "WithOut Stop Words")
            if stopwords_to_do:
                if lemmatize_to_do:
                    data_wosw = remove_stopwords(data_lem_for_sw,l)
                else:
                    data_wosw = remove_stopwords(txt_en_one_string,l)

                data_wosw = data_wosw.split('.')
                for i in range(nb_phrases):
                    data_split_wo_stopwords.append(data_wosw[i].split())
                    data_length_wo_stopwords.append(len(data_wosw[i].split()))
                    word_wosw_count.update(data_split_wo_stopwords[-1])

        corpus = list(word_count.keys())

        # Build a DataFrame txt_n_unique_val:
        #   columns = words
        #   rows = sentences
        #   cell value = number of occurrences of the word in the sentence

        ## BOW
        from sklearn.feature_extraction.text import CountVectorizer
        count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+" )

        # Count the number of occurrences of each word in the sentences
        countvectors = count_vectorizer.fit_transform(data)
        corpus = count_vectorizer.get_feature_names_out()

        txt_n_unique_val= pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)

        return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length


def count_world(data):
    word_count = collections.Counter()
    for sentence in data:
        word_count.update(word_tokenize(sentence))
    corpus = list(word_count.keys())
    nb_mots = sum(word_count.values())
    nb_mots_uniques = len(corpus)
    return corpus, nb_mots, nb_mots_uniques

def display_preprocess_results(lang, data, data_split, data_lem, data_wosw, txt_n_unique_val):

    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    corpus = []
    nb_phrases = len(data)
    corpus, nb_mots, nb_mots_uniques = count_world(data)
    mots_lem, _ , nb_mots_lem = count_world(data_lem)
    mots_wo_sw, _ , nb_mots_wo_stopword = count_world(data_wosw)
    # Identify the columns containing only zeros and drop them
    columns_with_only_zeros = txt_n_unique_val.columns[txt_n_unique_val.eq(0).all()]
    txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)

    # Display the number of words depending on the preprocessing performed
    tab1, tab2, tab3, tab4 = st.tabs([tr("Résumé"), tr("Tokenisation"),tr("Lemmatisation"), tr("Sans Stopword")])
    with tab1:
        st.subheader(tr("Résumé du pré-processing"))
        st.write("**"+tr("Nombre de phrases")+" : "+str(nb_phrases)+"**")
        st.write("**"+tr("Nombre de mots")+" : "+str(nb_mots)+"**")
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab2:
        st.subheader(tr("Tokenisation"))
        st.write(tr('Texte "splited":'))
        st.dataframe(pd.DataFrame(data=data_split, index=range(first_line,last_line)).head(max_lines_to_display).fillna(''), width=800)
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Mots uniques")+":**")
        st.markdown(corpus[:500])
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab3:
        st.subheader(tr("Lemmatisation"))
        if lemmatize_to_do:
            st.dataframe(pd.DataFrame(data=data_lem,columns=[tr('Texte lemmatisé')],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
            # If the language is English, display the word tagging
            # if lang == 'en':
            #     for i in range(min(5,len(data))):
            #         s = str(nltk.pos_tag(data_split[i]))
            #         st.markdown("**Texte avec Tags "+str(i)+"** : "+s)
            st.write("**"+tr("Nombre de mots uniques lemmatisés")+" : "+str(nb_mots_lem)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques lemmatisés:")+"**")
            st.markdown(mots_lem[:500])
    with tab4:
        st.subheader(tr("Sans Stopword"))
        if stopwords_to_do:
            st.dataframe(pd.DataFrame(data=data_wosw,columns=['Texte sans stopwords'],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
            st.write("**"+tr("Nombre de mots uniques sans stop words")+": "+str(nb_mots_wo_stopword)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques sans stop words")+":**")
            st.markdown(mots_wo_sw[:500])


def run():
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    global full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en
    global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr

    st.write("")
    st.title(tr(title))

    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
        """
        Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
        Lors de notre projet, nous avons étudié le dataset small_vocab, proposés par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
        Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
        :red[**Small_vocab**] contient 137 860 phrases en anglais et français.
        """)
        , unsafe_allow_html=True)
    st.markdown(tr(
        """
        Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
        Ces taches sont, par exemple:
        """)
        , unsafe_allow_html=True)
    st.markdown(
        "* "+tr("le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)")+"\n"+ \
        "* "+tr("la :red[**tokenisation**] (découpage du texte en mots)")+"\n"+ \
        "* "+tr("la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les \"variations\" d'un même mot)")+"\n"+ \
        "* "+tr("l'élimination des :red[**mots \"transparents\"**] (sans utilité pour la compréhension, tels que les articles).")+" \n"+ \
        tr("Ce prétraintement se conclut avec la contruction d'un :red[**Bag Of Worlds**], c'est à dire une matrice qui compte le nombre d'apparition de chaque mots (colonne) dans chaque phrase (ligne)")
        , unsafe_allow_html=True)
    #
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser:'),0,137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser:'),
                                 options=[1,5,10,15,100, 500, 1000,'Max'])
    if max_lines=='Max':
        max_lines=137860
    if ((first_line+max_lines)>137860):
        max_lines = max(137860-first_line,0)

    last_line = first_line+max_lines
    if (Langue=='Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    # Load the selected texts in both languages (max lines = max_lines)
    txt_en = full_txt_en[first_line:last_line]
    txt_fr = full_txt_fr[first_line:last_line]

    # Remove the untranslated sentences
    # txt_en, txt_fr = clean_untranslated_sentence(txt_en, txt_fr)

    if not st.session_state.reCalcule:
        txt_split_en = full_txt_split_en[first_line:last_line]
        txt_lem_en = full_txt_lem_en[first_line:last_line]
        txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        txt_lem_fr = full_txt_lem_fr[first_line:last_line]
        txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]

    # Run the text preprocessing, which cleans the sentences, splits them into words
    # and counts the number of occurrences of the words in each sentence
    if (Langue == 'Anglais'):
        st.write("## **"+tr("Préprocessing de small_vocab_en")+" :**\n")
        if max_lines>10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt (txt_en,'en')
                display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
|
399 |
+
else:
|
400 |
+
if st.session_state.reCalcule:
|
401 |
+
txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt (txt_en,'en')
|
402 |
+
display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
|
403 |
+
else:
|
404 |
+
st.write("## **"+tr("Préprocessing de small_vocab_fr")+" :**\n")
|
405 |
+
if max_lines>10000:
|
406 |
+
with st.status(":sunglasses:", expanded=True):
|
407 |
+
if st.session_state.reCalcule:
|
408 |
+
txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt (txt_fr,'fr')
|
409 |
+
display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
|
410 |
+
else:
|
411 |
+
if st.session_state.reCalcule:
|
412 |
+
txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt (txt_fr,'fr')
|
413 |
+
display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
|
414 |
+
|
415 |
+
|
416 |
+
|
417 |
+
|
418 |
+
|
419 |
+
|
420 |
+
|
421 |
+
|
422 |
+
|
423 |
+
|
424 |
+
|
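For reference, a minimal sketch (not part of the commit) of the Bag Of Words idea described in the "Explications" text above: a matrix counting how many times each word (column) appears in each sentence (row). It uses scikit-learn's CountVectorizer on two made-up small_vocab-style sentences; the app's own pipeline (preprocess_txt, not shown in this hunk) may differ in its cleaning and tokenisation details.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Two hypothetical sentences in the style of small_vocab
sentences = ["new jersey is sometimes quiet during autumn",
             "the united states is usually chilly during july"]

vectorizer = CountVectorizer(lowercase=True)   # cleaning: lowercasing, punctuation dropped
bow = vectorizer.fit_transform(sentences)      # sparse document-term matrix

# One row per sentence, one column per unique word, cells = occurrence counts
df_bow = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
print(df_bow)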
tabs/game_tab.py
ADDED
@@ -0,0 +1,235 @@
import streamlit as st
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import random
import json
import csv
from extra_streamlit_components import tab_bar, TabBarItemData
from datetime import datetime
import tracemalloc
from translate_app import tr

title = "Jouez avec nous !"
sidebar_name = "Jeu"
dataPath = st.session_state.DataPath

@st.cache_data
def init_game():
    new = int(time.time())
    sentence_test = pd.read_csv(dataPath+'/multilingue/sentence_test_extract.csv')
    sentence_test = sentence_test[4750:]
    # Read the content of the JSON file
    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
        lan_to_language = json.load(fichier)
    t_now = time.time()
    return sentence_test, lan_to_language, new, t_now

def find_indice(sent_selected):
    l = list(lan_to_language.keys())
    for i in range(len(l)):
        if l[i] == sentence_test['lan_code'].iloc[sent_selected]:
            return i

@st.cache_data
def set_game(new):
    nb_st = len(sentence_test)
    sent_sel = []
    # Loop until 5 distinct random sentence indices have been drawn
    while len(sent_sel) < 5:
        nombre = random.randint(0, nb_st)
        if nombre not in sent_sel:
            sent_sel.append(nombre)

    rep_possibles=[]
    for i in range(5):
        rep_possibles.append([find_indice(sent_sel[i])])
        while len(rep_possibles[i]) < 5:
            rep_possible = random.randint(0, 95)
            if rep_possible not in rep_possibles[i]:
                rep_possibles[i].append(rep_possible)
        random.shuffle(rep_possibles[i])
    return sent_sel, rep_possibles, new

def calc_score(n_rep,duration):

    if n_rep==0: return 0
    s1 = n_rep*200
    if duration < 60:
        s2 = (60-duration)*200/60
        if n_rep==5:
            s2 *= 2.5
    else:
        s2 = max(-(duration-60)*100/60,-100)
    s = int(s1+s2)
    return s

def read_leaderboard():
    return pd.read_csv(dataPath+'/game_leaderboard.csv', index_col=False,encoding='utf8')

def write_leaderboard(lb):
    lb['Nom'] = lb['Nom'].astype(str)
    lb['Rang'] = lb['Rang'].astype(int)
    lb.to_csv(path_or_buf=dataPath+'/game_leaderboard.csv',columns=['Rang','Nom','Score','Timestamp','BR','Duree'],index=False, header=True,encoding='utf8')

def display_leaderboard():
    lb = read_leaderboard()
    st.write("**"+tr("Leaderboard")+" :**")
    list_champ = """
| Rang | Nom | Score |
|------|------------|-------|"""
    if len(lb)>0:
        for i in range(len(lb)):
            list_champ += """
| """+str(lb['Rang'].iloc[i])+""" | """+str(lb['Nom'].iloc[i])[:9]+""" | """+str(lb['Score'].iloc[i])+""" |"""
    st.markdown(list_champ, unsafe_allow_html=True )
    return lb

def write_log(TS,Nom,Score,BR,Duree):
    log = pd.read_csv(dataPath+'/game_log.csv', index_col=False,encoding='utf8')
    date_heure = datetime.fromtimestamp(TS)
    Date = date_heure.strftime('%Y-%m-%d %H:%M:%S')
    log = pd.concat([log, pd.DataFrame(data={'Date':[Date], 'Nom':[Nom],'Score':[Score],'BR':[BR],'Duree':[Duree]})], ignore_index=True)
    log.to_csv(path_or_buf=dataPath+'/game_log.csv',columns=['Date','Nom','Score','BR','Duree'],index=False, header=True,encoding='utf8')

def display_files():
    log = pd.read_csv(dataPath+'/game_log.csv', index_col=False,encoding='utf8')
    lb = pd.read_csv(dataPath+'/game_leaderboard.csv', index_col=False,encoding='utf8')
    st.dataframe(lb)
    st.dataframe(log)


def run():
    global sentence_test, lan_to_language

    sentence_test, lan_to_language, new, t_debut = init_game()

    st.write("")
    st.title(tr(title))
    st.write("#### **"+tr("Êtes-vous un expert ès langues ?")+"**\n")
    st.markdown(tr(
"""
Essayez de trouver, sans aide, la langue des 5 phrases suivantes.
Attention : Vous devez être le plus rapide possible !
"""), unsafe_allow_html=True
    )
    st.write("")
    player_name = st.text_input(tr("Quel est votre nom ?"))

    if player_name == 'display_files':
        display_files()
        return
    elif player_name == 'malloc_start':
        tracemalloc.start()
        return
    elif player_name == 'malloc_stop':
        snapshot = tracemalloc.take_snapshot()
        top_stats = snapshot.statistics('traceback')
        # pick the biggest memory blocks
        for k in range(3):
            stat = top_stats[k]
            print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
            for line in stat.traceback.format():
                print(' >'+line)
        total_mem = sum(stat.size for stat in top_stats)
        print("Total allocated size: %.1f KiB" % (total_mem / 1024))
        return

    score = 0
    col1, col2 = st.columns([0.7,0.3])
    with col2:
        lb = display_leaderboard()
    with col1:
        sent_sel, rep_possibles, new = set_game(new)
        answer = [""] * 5
        l = list(lan_to_language.values())
        for i in range(5):
            answer[i] = st.radio("**:blue["+sentence_test['sentence'].iloc[sent_sel[i]]+"]**\n",[l[rep_possibles[i][0]],l[rep_possibles[i][1]],l[rep_possibles[i][2]], \
                                 l[rep_possibles[i][3]],l[rep_possibles[i][4]]], horizontal=True, key=i)
        t_previous_debut = t_debut
        t_debut = time.time()

        if st.button(label=tr("Validez"), type="primary"):
            st.cache_data.clear()

            nb_bonnes_reponses = 0
            for i in range(5):
                if lan_to_language[sentence_test['lan_code'].iloc[sent_sel[i]]]==answer[i]:
                    nb_bonnes_reponses +=1

            t_fin = time.time()
            duration = t_fin - t_previous_debut

            score = calc_score(nb_bonnes_reponses,duration)
            write_log(time.time(),player_name,score,nb_bonnes_reponses,duration)
            if nb_bonnes_reponses >=4:
                st.write(":red[**"+tr("Félicitations, vous avez "+str(nb_bonnes_reponses)+" bonnes réponses !")+"**]")
                st.write(":red["+tr("Votre score est de "+str(score)+" points")+"]")
            else:
                if nb_bonnes_reponses >1 : s="s"
                else: s=""
                st.write("**:red["+tr("Vous avez "+str(nb_bonnes_reponses)+" bonne"+s+" réponse"+s+".")+"]**")
                if nb_bonnes_reponses >0 : s="s"
                else: s=""
                st.write(":red["+tr("Votre score est de "+str(score)+" point"+s)+"]")

            st.write(tr("Bonnes réponses")+":")
            for i in range(5):
                st.write("- "+sentence_test['sentence'].iloc[sent_sel[i]]+" -> :blue[**"+lan_to_language[sentence_test['lan_code'].iloc[sent_sel[i]]]+"**]")
            new = int(time.time())
            st.button(label=tr("Play again ?"), type="primary")

    with col2:
        now = time.time()
        # If the oldest leaderboard score is more than a week old, it is replaced by a more recent one
        renew_old = ((len(lb)>9) and (lb['Timestamp'].iloc[9])<(now-604800))

        if (score>0) and ((((score >= lb['Score'].min()) and (len(lb)>9)) or (len(lb)<=9)) or (pd.isna(lb['Score'].min())) or renew_old):
            if player_name not in lb['Nom'].tolist():
                if (((score >= lb['Score'].min()) and (len(lb)>9)) or (len(lb)<=9)) or (pd.isna(lb['Score'].min())) :
                    lb = pd.concat([lb, pd.DataFrame(data={'Nom':[player_name],'Score':[score],'Timestamp':[now],'BR':[nb_bonnes_reponses],'Duree':[duration]})], ignore_index=True)
                    lb = lb.sort_values(by=['Score', 'Timestamp'], ascending=[False, False]).reset_index()
                    lb = lb.drop(lb.index[10:])
                else:
                    st.write('2:',player_name)
                    lb['Nom'].iloc[9]= player_name
                    lb['Score'].iloc[9]= score
                    lb['Timestamp'].iloc[9]=now
                    lb['BR'].iloc[9]=nb_bonnes_reponses
                    lb['Duree'].iloc[9]=duration
                    lb = lb.reset_index()
            else:
                liste_Nom = lb['Nom'].tolist()
                for i,player in enumerate(liste_Nom):
                    if player == player_name:
                        if lb['Score'].iloc[i] < score:
                            lb['Score'].iloc[i] = score
                            lb['Timestamp'].iloc[i]=now
                lb = lb.sort_values(by=['Score', 'Timestamp'], ascending=[False, False]).reset_index()
            for i in range(len(lb)):
                if (i>0):
                    if (lb['Score'].iloc[i]==lb['Score'].iloc[i-1]):
                        lb['Rang'].iloc[i] = lb['Rang'].iloc[i-1]
                    else:
                        lb['Rang'].iloc[i] = i+1
                else:
                    lb['Rang'].iloc[i] = i+1
            if player_name !="":
                write_leaderboard(lb)

    return
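As a quick sanity check (not part of the commit), the scoring rule of calc_score above can be restated standalone: 200 points per correct answer, a time bonus of up to 200 points under one minute (multiplied by 2.5 for a perfect round), and a malus capped at -100 points beyond one minute. The inputs below are hypothetical.

def score(n_correct: int, duration_s: float) -> int:
    # Same rule as calc_score, restated for illustration
    if n_correct == 0:
        return 0
    base = n_correct * 200                                 # 200 points per correct answer
    if duration_s < 60:
        bonus = (60 - duration_s) * 200 / 60               # up to +200 if answered instantly
        if n_correct == 5:
            bonus *= 2.5                                   # perfect-round multiplier
    else:
        bonus = max(-(duration_s - 60) * 100 / 60, -100)   # capped malus after one minute
    return int(base + bonus)

print(score(5, 30))   # 5 correct answers in 30 s -> 1250
print(score(3, 90))   # 3 correct answers in 90 s -> 550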
tabs/id_lang_tab.py
ADDED
@@ -0,0 +1,476 @@
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tiktoken
import joblib
import json
import csv
from transformers import pipeline
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import plot_model
from filesplit.merge import Merge
from extra_streamlit_components import tab_bar, TabBarItemData
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn import naive_bayes
from translate_app import tr

title = "Identification de langue"
sidebar_name = "Identification de langue"
dataPath = st.session_state.DataPath

# CountVectorizer expects a list of sentences as input.
# This function puts the input data into the right format
def format_to_vectorize(data):
    X_tok = []
    if "DataFrame" in str(type(data)):sentences = data.tolist()
    elif "str" in str(type(data)):
        sentences =[data]
    else: sentences = data

    for sentence in sentences:
        X_tok.append(sentence)
    return X_tok

def create_BOW(data):
    global vectorizer

    X_tok = format_to_vectorize(data)
    X = vectorizer.transform(X_tok)
    return X

def load_vectorizer(tokenizer):
    global dict_token, dict_ids, nb_token

    path = dataPath+'/vectorizer_tiktoken_big.pkl'
    vectorizer = joblib.load(path)
    dict_token = {tokenizer.decode([cle]): cle for cle, valeur in vectorizer.vocabulary_.items()}
    dict_ids = {cle: tokenizer.decode([cle]) for cle, valeur in vectorizer.vocabulary_.items()} #dict_ids.items()}
    nb_token = len(vectorizer.vocabulary_)
    return vectorizer

def lang_id_nb(sentences):
    global lan_to_language

    if "str" in str(type(sentences)):
        return lan_to_language[clf_nb.predict(create_BOW(sentences))[0]]
    else: return [lan_to_language[l] for l in clf_nb.predict(create_BOW(sentences))]

@st.cache_resource
def init_nb_identifier():

    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Load the saved classifier
    clf_nb = joblib.load(dataPath+"/id_lang_tiktoken_nb_sparse_big.pkl")
    vectorizer = load_vectorizer(tokenizer)

    # Read the content of the JSON file
    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
        lan_to_language = json.load(fichier)
    return tokenizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb, vectorizer

def encode_text(textes):
    global tokenizer

    max_length=250
    sequences = tokenizer.encode_batch(textes)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

def read_list_lan():

    with open(dataPath+'/multilingue/lan_code.csv', 'r') as fichier_csv:
        reader = csv.reader(fichier_csv)
        lan_code = next(reader)
    return lan_code

@st.cache_resource
def init_dl_identifier():

    label_encoder = LabelEncoder()
    list_lan = read_list_lan()
    lan_identified = [lan_to_language[l] for l in list_lan]
    label_encoder.fit(list_lan)
    merge = Merge(dataPath+"/dl_id_lang_split", dataPath, "dl_tiktoken_id_language_model.h5").merge(cleanup=False)
    dl_model = keras.models.load_model(dataPath+"/dl_tiktoken_id_language_model.h5")
    return dl_model, label_encoder, list_lan, lan_identified

def lang_id_dl(sentences):
    global dl_model, label_encoder

    if "str" in str(type(sentences)): predictions = dl_model.predict(encode_text([sentences]))
    else: predictions = dl_model.predict(encode_text(sentences))
    # Decode the predictions back into language codes
    predicted_labels_encoded = np.argmax(predictions, axis=1)
    predicted_languages = label_encoder.classes_[predicted_labels_encoded]
    if "str" in str(type(sentences)): return lan_to_language[predicted_languages[0]]
    else: return [l for l in predicted_languages]

@st.cache_resource
def init_lang_id_external():

    lang_id_model_ext = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
    dict_xlmr = {"ar":"ara", "bg":"bul", "de":"deu", "el": "ell", "en":"eng", "es":"spa", "fr":"fra", "hi": "hin","it":"ita","ja":"jpn", \
                 "nl":"nld", "pl":"pol", "pt":"por", "ru":"rus", "sw":"swh", "th":"tha", "tr":"tur", "ur": "urd", "vi":"vie", "zh":"cmn"}
    sentence_test = pd.read_csv(dataPath+'//multilingue/sentence_test_extract.csv')
    sentence_test = sentence_test[:4750]
    # A few example sentences
    exemples = ["Er weiß überhaupt nichts über dieses Buch", # Phrase 0
                "Umbrellas sell well", # Phrase 1
                "elle adore les voitures très luxueuses, et toi ?", # Phrase 2
                "she loves very luxurious cars, don't you?", # Phrase 3
                "Vogliamo visitare il Colosseo e nuotare nel Tevere", # Phrase 4
                "vamos a la playa", # Phrase 5
                "Te propongo un trato", # Phrase 6
                "she loves you much, mais elle te hait aussi and das ist traurig", # Phrase 7 # Note: this sentence mixes three languages
                "Elle a de belles loches" # Phrase 8
                ]

    lang_exemples = ['deu','eng','fra','eng','ita','spa','spa','fra','fra']
    return lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples

@st.cache_data
def display_acp(title, comment):
    data = np.load(dataPath+'/data_lang_id_acp.npz')
    X_train_scaled = data['X_train_scaled']
    y_train_pred = data['y_train_pred']
    label_arrow = ['.', ',', '?', ' a', ' de', ' la', ' que', 'Tom', ' un', ' the', ' in', \
                   ' to', 'I', "'", 'i', ' le', ' en', ' es', 'é', ' l', '!', 'o', ' ist', \
                   ' pas', ' Tom', ' me', ' di', 'Ich', ' is', 'Je', ' nicht', ' you', \
                   ' die', ' à', ' el', ' est', 'a', 'en', ' d', ' è', ' ne', ' se', ' no', \
                   ' una', ' zu', 'Il', '¿', ' of', ' du', "'t", 'ato', ' der', ' il', \
                   ' n', 'El', ' non', ' che', 'are', ' con', 'ó', ' was', 'La', 'No', \
                   ' ?', 'es', 'le', 'L', ' and', ' des', ' s', ' ich', 'as', 'S', ' per', \
                   ' das', ' und', ' ein', 'e', "'s", 'u', ' y', 'He', 'z', 'er', ' m', \
                   'st', ' les', 'Le', ' I', 'ar', 'te', 'Non', 'The', ' er', 'ie', ' v', \
                   ' c', "'est", ' ha', ' den']

    pca = PCA(n_components=2)

    X_new = pca.fit_transform(X_train_scaled)
    coeff = pca.components_.transpose()
    xs = X_new[:, 0]
    ys = X_new[:, 1]
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    principalDf = pd.DataFrame({'PC1': xs*scalex, 'PC2': ys * scaley})
    finalDF = pd.concat([principalDf, pd.Series(y_train_pred, name='Langue')], axis=1)

    sns.set_context("poster") # Possible values: "notebook", "talk", "poster", or "paper"
    plt.rc("axes", titlesize=32,titleweight='bold') # Axis title size
    plt.rc("axes", labelsize=18,labelweight='bold') # Axis label size
    plt.rc("xtick", labelsize=14) # x-axis tick label size
    plt.rc("ytick", labelsize=14) # y-axis tick label size

    st.write(comment)
    st.write("")
    fig = plt.figure(figsize=(20, 15))
    sns.scatterplot(x='PC1', y='PC2', hue='Langue', data=finalDF, alpha=0.5)
    for i in range(50):
        plt.arrow(0, 0, coeff[i, 0]*1.5, coeff[i, 1]*0.8,color='k', alpha=0.08, head_width=0.01, )
        plt.text(coeff[i, 0]*1.5, coeff[i, 1] * 0.8, label_arrow[i], color='k', weight='bold')

    plt.title(title)
    plt.xlim(-0.4, 0.45)
    plt.ylim(-0.15, 0.28);
    st.pyplot(fig)
    return

@st.cache_data
def read_BOW_examples():
    return pd.read_csv(dataPath+'/lang_id_small_BOW.csv')

def analyse_nb(sel_phrase):
    global lang_exemples,exemples

    def create_small_BOW(s):
        encodage = tokenizer.encode(s)
        sb = [0] * (df_BOW.shape[1]-1)
        nb_unique_token = 0
        for i in range(df_BOW.shape[1]-1):
            for t in encodage:
                if df_BOW.columns[i]==str(t):
                    sb[i] += 1
            if sb[i] > 0: nb_unique_token +=1
        return sb, nb_unique_token

    st.write("#### **"+tr("Probabilité d'appartenance de la phrase à une langue")+" :**")
    st.image("./assets/formule_proba_naive_bayes.png")
    st.write(tr("où **C** est la classe (lan_code), **Fi** est la caractéristique i du BOW, **Z** est l'\"evidence\" servant à régulariser la probabilité"))
    st.write("")
    nb_lang = 5
    lan_code = ['deu','eng','fra','spa','ita']
    lan_color = {'deu':'violet','eng':'green','fra':'red','spa':'blue','ita':'orange'}
    df_BOW = read_BOW_examples()

    clf_nb2 = naive_bayes.MultinomialNB()
    clf_nb2.fit(df_BOW.drop(columns='lan_code').values.tolist(), df_BOW['lan_code'].values.tolist())

    nb_phrases_lang =[]
    for l in lan_code:
        nb_phrases_lang.append(sum(df_BOW['lan_code']==l))
    st.write(tr("Phrase à analyser")+" :",'**:'+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase],']** - **"'+exemples[sel_phrase]+'"**')

    # Tokenize and encode the sentence
    encodage = tokenizer.encode(exemples[sel_phrase])

    # Build the sentence's BOW vector
    bow_exemple, nb_unique_token = create_small_BOW(exemples[sel_phrase])
    st.write(tr("Nombre de tokens retenus dans le BOW")+": "+ str(nb_unique_token))
    masque_tokens_retenus = [(1 if token in list(dict_ids.keys()) else 0) for token in encodage]
    str_token = " "
    for i in range(len(encodage)):
        if masque_tokens_retenus[i]==1:
            if (i%2) ==0:
                str_token += "**:red["+tokenizer.decode([encodage[i]])+"]** "
            else:
                str_token += "**:violet["+tokenizer.decode([encodage[i]])+"]** "
        else: str_token += ":green["+tokenizer.decode([encodage[i]])+"] "

    st.write(tr("Tokens se trouvant dans le modèle (en")+" :red["+tr("rouge")+"] "+tr("ou")+" :violet["+tr("violet")+"]) :"+str_token+" ")

    st.write("")
    # To continue the analysis, keep only the tokens of the sentence that are available in the BOW
    token_used = [str(encodage[i]) for i in range(len(encodage)) if (masque_tokens_retenus[i]==1)]

    # Count how many times these tokens appear in the BOW for each language, and store the result in the DataFrame df_count
    def compter_non_zero(colonne):
        return (colonne != 0).sum()

    votes = []
    for i in range(nb_lang):
        #votes.append(list(df_BOW[token_used].loc[df_BOW['lan_code']==lan_code[i]].sum(axis=0)))
        votes.append(list(df_BOW[token_used].loc[df_BOW['lan_code']==lan_code[i]].apply(compter_non_zero)))

    col_name = [str(i+1)+'-'+tokenizer.decode([int(token_used[i])]) for i in range(len(token_used))]
    df_count = pd.DataFrame(data=votes,columns=token_used, index=lan_code)
    df_count.columns = col_name
    st.write("\n**"+tr("Nombre d'apparitions des tokens, dans chaque langue")+"**")

    # Laplace smoothing #1
    # df_count = df_count+1

    st.dataframe(df_count)

    #########
    ######### 3. Compute the probability of each token appearing in each language
    df_proba = df_count.div(nb_phrases_lang, axis = 0)

    # Laplace smoothing #2
    df_proba = df_proba.replace(0.0,0.0010)

    # Initialize df_proba: conditional probability that the sentence belongs to each language
    df_proba['Proba'] = 1
    # Iterate over the columns and multiply, row by row
    for col in df_count.columns:
        df_proba['Proba'] *= df_proba[col]

    #########
    ######### 4. Compute (by multiplication) the probability that the sentence belongs to each language

    # Multiply by the class probability
    p_classe = [(nb_phrases_lang[i]/df_BOW.shape[0]) for i in range(len(nb_phrases_lang))]
    df_proba['Proba'] *= p_classe

    # Divide by the evidence
    evidence = df_proba['Proba'].sum(axis=0)
    df_proba['Proba'] *= 1/evidence
    df_proba['Proba'] = df_proba['Proba'].round(3)

    # Display the probability matrix
    st.write("**"+tr("Probabilités conditionnelles d'apparition des tokens retenus, dans chaque langue")+":**")
    st.dataframe(df_proba)
    str_token = "Lang proba max: "# "*20
    for i,token in enumerate(df_proba.columns[:-1]):
        str_token += '*'+token+'*:**:'+lan_color[df_proba[token].idxmax()]+'['+df_proba[token].idxmax()+']**'+" "*2 #8
    st.write(str_token)
    st.write("")

    st.write(tr("Langue réelle de la phrase")+" "*35+": **:"+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase]+']**')
    st.write(tr("Langue dont la probabilité est la plus forte ")+": **:"+lan_color[df_proba['Proba'].idxmax()]+'['+df_proba['Proba'].idxmax(),"]** (proba={:.2f}".format(max(df_proba['Proba']))+")")
    prediction = clf_nb2.predict([bow_exemple])
    st.write(tr("Langue prédite par Naive Bayes")+" "*23+": **:"+lan_color[prediction[0]]+'['+prediction[0]+"]** (proba={:.2f}".format(max(clf_nb2.predict_proba([bow_exemple])[0]))+")")
    st.write("")

    fig, axs = plt.subplots(1, 2, figsize=(10, 6))
    df_proba_sorted =df_proba.sort_index(ascending=True)
    axs[0].set_title(tr("Probabilités calculées manuellement"), fontsize=12)
    axs[0].barh(df_proba_sorted.index, df_proba_sorted['Proba'])
    axs[1].set_title(tr("Probabilités du classifieur Naive Bayes"), fontsize=12)
    axs[1].barh(df_proba_sorted.index, clf_nb2.predict_proba([bow_exemple])[0]);
    st.pyplot(fig)
    return

#@st.cache_data
def find_exemple(lang_sel):
    global exemples
    return exemples[lang_sel]

def display_shapley(lang_sel):
    st.write("**"+tr("Analyse de l'importance de chaque token dans l'identification de la langue")+"**")
    st.image('assets/fig_schapley'+str(lang_sel)+'.png')
    st.write("**"+tr("Recapitulatif de l'influence des tokens sur la selection de la langue")+"**")
    st.image('assets/fig_schapley_recap'+str(lang_sel)+'.png')
    return

def run():
    global tokenizer, vectorizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb
    global dl_model, label_encoder, toggle_val, custom_sentence, list_lan, lan_identified
    global lang_exemples, exemples

    tokenizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb, vectorizer = init_nb_identifier()
    dl_model, label_encoder, list_lan, lan_identified = init_dl_identifier()
    lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples= init_lang_id_external()

    st.write("")
    st.title(tr(title))
    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
"""
Afin de mettre en oeuvre cette fonctionnalité nous avons utilisé un jeu d'entrainement multilingue de <b> 9.757.778 phrases dans 95 langues</b>.
Les 95 langues identifiées sont:
""")
    , unsafe_allow_html=True)
    st.selectbox(label="Lang",options=sorted(lan_identified),label_visibility="hidden")
    st.markdown(tr(
"""
Nous avons utilisé 2 méthodes pour identifier la langue d'un texte:
1. un classificateur **Naïve Bayes**
2. un modèle de **Deep Learning**
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Les 2 modèles ont une accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
<br>
""")
    , unsafe_allow_html=True)

    chosen_id = tab_bar(data=[
        TabBarItemData(id="tab1", title=tr("Id. Naïve Bayes"), description=tr("avec le Bag Of Words")),
        TabBarItemData(id="tab2", title=tr("Id. Deep Learning"), description=tr(" avec Keras")),
        TabBarItemData(id="tab3", title=tr("Interpretabilité"), description=tr("du modèle Naïve Bayes "))],
        default="tab1")

    if (chosen_id == "tab1") or (chosen_id == "tab2"):
        st.write("## **"+tr("Paramètres")+" :**\n")

        toggle_val = st.toggle(tr('Phrase à saisir/Phrase test'), value=True, help=tr("Off = phrase à saisir, On = selection d'une phrase test parmi 9500 phrases"))
        if toggle_val:
            custom_sentence= st.selectbox(tr("Selectionnez une phrase test à identifier")+":", sentence_test['sentence'] )
        else:
            custom_sentence = st.text_area(label=tr("Saisir le texte dont vous souhaitez identifier la langue:"))
        st.button(label=tr("Validez"), type="primary")

        if custom_sentence!='':
            st.write("## **"+tr("Résultats")+" :**\n")
            md = """
|"""+tr("Identifieur")+""" |"""+tr("Langue identifiée")+"""|
|-------------------------------------|---------------|"""
            md1 = ""
            if toggle_val:
                lan_reelle = sentence_test['lan_code'].loc[sentence_test['sentence']==custom_sentence].tolist()[0]
                md1 = """
|"""+tr("Langue réelle")+""" |**:blue["""+lan_to_language[lan_reelle]+"""]**|"""
            md2 = """
|"""+tr("Classificateur Naïve Bayes")+""" |**:red["""+lang_id_nb(custom_sentence)+"""]**|
|"""+tr("Modèle de Deep Learning")+""" |**:red["""+lang_id_dl(custom_sentence)+"""]**|"""
            md3 = """
|XLM-RoBERTa (Hugging Face) |**:red["""+lan_to_language[dict_xlmr[lang_id_model_ext(custom_sentence)[0]['label']]]+"""]**|"""
            if toggle_val:
                if not (lan_reelle in list(dict_xlmr.values())):
                    md3=""

            st.markdown(md+md1+md2+md3, unsafe_allow_html=True)

        st.write("## **"+tr("Details sur la méthode")+" :**\n")
        if (chosen_id == "tab1"):
            st.markdown(tr(
"""
Afin d'utiliser le classificateur Naïve Bayes, il nous a fallu:""")+"\n"+
                "* "+tr("Créer un Bag of Words de token..")+"\n"+
                "* "+tr("..Tokeniser le texte d'entrainement avec CountVectorizer et un tokenizer 'custom', **Tiktoken** d'OpenAI. ")+"\n"+
                "* "+tr("Utiliser des matrices creuses (Sparse Matrix), car notre BOW contenait 10 Millions de lignes x 59122 tokens. ")+"\n"+
                "* "+tr("Sauvegarder le vectorizer (non serialisable) et le classificateur entrainé. ")
                , unsafe_allow_html=True)
            st.markdown(tr(
"""
L'execution de toutes ces étapes est assez rapide: une dizaine de minutes
<br>
Le résultat est très bon: L'Accuracy sur le jeu de test est =
**:red[96%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp)
<br>
""")
            , unsafe_allow_html=True)
            st.markdown(tr(
"""
**Note 1:** Les 2 modèles ont une accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
**Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = **97,8%**,
versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
""")
            , unsafe_allow_html=True)
        else:
            st.markdown(tr(
"""
Nous avons mis en oeuvre un modèle Keras avec une couche d'embedding et 4 couches denses (*Voir architecture ci-dessous*).
Nous avons utilisé le tokeniser <b>Tiktoken</b> d'OpenAI.
La couche d'embedding accepte 250 tokens, ce qui signifie que la détection de langue s'effectue sur approximativement les 200 premiers mots.
<br>
""")
            , unsafe_allow_html=True)
            st.markdown(tr(
"""
L'entrainement a duré plus de 10 heures..
Finalement, le résultat est très bon: L'Accuracy sur le jeu de test est =
**:red[97,5%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp).
Néanmoins, la durée pour une prédiction est relativement longue: approximativement 5/100 de seconde
<br>
""")
            , unsafe_allow_html=True)
            st.markdown(tr(
"""
**Note 1:** Les 2 modèles ont une accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**""")+"<br>"+
                tr("""
**Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = <b>97,8%</b>,
versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
<br>
""")
            , unsafe_allow_html=True)
            st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5></center>", unsafe_allow_html=True)
            plot_model(dl_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file='./assets/model_plot.png')
            col1, col2, col3 = st.columns([0.15,0.7,0.15])
            with col2:
                st.image('./assets/model_plot.png',use_column_width="auto")
    elif (chosen_id == "tab3"):
        st.write("### **"+tr("Interpretabilité du classifieur Naïve Bayes sur 5 langues")+"**")
        st.write("##### "+tr("..et un Training set réduit (15000 phrases et 94 tokens)"))
        st.write("")

        chosen_id2 = tab_bar(data=[
            TabBarItemData(id="tab1", title=tr("Analyse en Compos. Princ."), description=""),
            TabBarItemData(id="tab2", title=tr("Simul. calcul NB"), description=""),
            TabBarItemData(id="tab3", title=tr("Shapley"), description="")],
            default="tab1")
        if (chosen_id2 == "tab1"):
            display_acp(tr("Importance des principaux tokens dans \n l'identification de langue par l'algorithme Naive Bayes"),tr("Affichage de 10 000 phrases (points) et des 50 tokens les + utilisés (flèches)"))
        if (chosen_id2 == "tab2") or (chosen_id2 == "tab3"):
            sel_phrase = st.selectbox(tr('Selectionnez une phrase à "interpréter"')+':', range(9), format_func=find_exemple)
            if (chosen_id2 == "tab2"):
                analyse_nb(sel_phrase)
            if (chosen_id2 == "tab3"):
                display_shapley(sel_phrase)
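For readers who want to see the Naive Bayes identification pipeline described in this tab end to end (tiktoken tokens, a Bag Of Words built with CountVectorizer, then MultinomialNB), here is a minimal sketch on a toy corpus. It is not part of the commit: the real app loads a pre-trained vectorizer and classifier from the data folder, and its vectorizer configuration may differ. The lambda analyzer below also illustrates why such a vectorizer is awkward to serialize, which is consistent with the tab's note about saving it separately.

import tiktoken
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

tok = tiktoken.get_encoding("cl100k_base")

# Hypothetical, tiny training corpus (the real one has ~9.7M sentences in 95 languages)
texts  = ["she loves luxurious cars", "elle adore les voitures luxueuses",
          "the weather is nice today", "il fait beau aujourd'hui"]
labels = ["eng", "fra", "eng", "fra"]

# CountVectorizer with a custom analyzer: each tiktoken id becomes one BOW feature
vectorizer = CountVectorizer(analyzer=lambda s: [str(t) for t in tok.encode(s)])
X = vectorizer.fit_transform(texts)        # sparse matrix, as in the tab

clf = MultinomialNB().fit(X, labels)
print(clf.predict(vectorizer.transform(["elle adore la plage"])))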
tabs/intro.py
ADDED
@@ -0,0 +1,93 @@
import streamlit as st
from translate_app import tr

title = "Démosthène"
sidebar_name = "Introduction"


def run():

    # TODO: choose between one of these GIFs
    # st.image("https://dst-studio-template.s3.eu-west-3.amazonaws.com/1.gif")
    # st.image("https://dst-studio-template.s3.eu-west-3.amazonaws.com/2.gif")
    # st.image("https://dst-studio-template.s3.eu-west-3.amazonaws.com/3.gif")
    # st.image("assets/tough-communication.gif",use_column_width=True)

    st.write("")
    if st.session_state.Cloud == 0:
        st.image("assets/miss-honey-glasses-off.gif",use_column_width=True)
    else:
        st.image("https://media.tenor.com/pfOeAfytY98AAAAC/miss-honey-glasses-off.gif",use_column_width=True)

    st.title(tr(title))
    st.markdown('''
## **'''+tr("Système de traduction adapté aux lunettes connectées")+'''**
---
''')
    st.header("**"+tr("A propos")+"**")
    st.markdown(tr(
"""
Ce projet a été réalisé dans le cadre d'une formation de Data Scientist, entre juin et novembre 2023.
<br>
:red[**Démosthène**] est l'un des plus grands orateurs de l'Antiquité. Il savait s'exprimer, et se faire comprendre.
Se faire comprendre est l'un des principaux objectifs de la traduction.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Démosthène avait de gros problèmes d'élocution.
Il les a surmontés en s'entraînant à parler avec des cailloux dans la bouche,
à l'image de l'Intelligence Artificielle, où des entraînements sont nécessaires pour obtenir de bons résultats.
Il nous a semblé pertinent de donner le nom de cet homme à un projet qu'il a fort bien illustré, il y a 2300 ans.
""")
    , unsafe_allow_html=True)

    st.header("**"+tr("Contexte")+"**")
    st.markdown(tr(
"""
Les personnes malentendantes communiquent difficilement avec autrui. Par ailleurs, toute personne se trouvant dans un pays étranger
dont elle ne connaît pas la langue se retrouve dans la situation d'une personne malentendante.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
L'usage de lunettes connectées, dotées de la technologie de reconnaissance vocale et d'algorithmes IA de deep learning, permettrait
de détecter la voix d'un interlocuteur, puis d'afficher la transcription textuelle, sur les verres en temps réel.
À partir de cette transcription, il est possible d':red[**afficher la traduction dans la langue du porteur de ces lunettes**].
""")
    , unsafe_allow_html=True)

    st.header("**"+tr("Objectifs")+"**")
    st.markdown(tr(
"""
L'objectif de ce projet est de développer une brique technologique de traitement, de transcription et de traduction,
qui par la suite serait implémentable dans des lunettes connectées. Nous avons concentré nos efforts sur la construction
d'un :red[**système de traduction**] plutôt que sur la reconnaissance vocale,
et ce, pour tout type de public, afin de faciliter le dialogue entre deux individus ne pratiquant pas la même langue.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Il est bien sûr souhaitable que le système puisse rapidement :red[**identifier la langue**] des phrases fournies.
Lors de la traduction, nous ne prendrons pas en compte le contexte des phrases précédentes ou celles préalablement traduites.
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""

Nous évaluerons la qualité de nos résultats en les comparant avec des systèmes performants tels que “[Google translate](https://translate.google.fr/)”
""")
    , unsafe_allow_html=True)
    st.markdown(tr(
"""
Le projet est enregistré sur "[Github](https://github.com/Demosthene-OR/AVR23_CDS_Text_translation)"
""")
    , unsafe_allow_html=True)

    '''
    sent = \
    """

    """
    st.markdown(tr(sent), unsafe_allow_html=True)
    '''
tabs/modelisation_dict_tab.py
ADDED
@@ -0,0 +1,277 @@
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
from sacrebleu import corpus_bleu
|
5 |
+
if st.session_state.Cloud == 0:
|
6 |
+
from sklearn.cluster import KMeans
|
7 |
+
from sklearn.neighbors import KNeighborsClassifier
|
8 |
+
from sklearn.ensemble import RandomForestClassifier
|
9 |
+
from translate_app import tr
|
10 |
+
|
11 |
+
title = "Traduction mot à mot"
|
12 |
+
sidebar_name = "Traduction mot à mot"
|
13 |
+
dataPath = st.session_state.DataPath
|
14 |
+
|
15 |
+
@st.cache_data
|
16 |
+
def load_corpus(path):
|
17 |
+
input_file = os.path.join(path)
|
18 |
+
with open(input_file, "r", encoding="utf-8") as f:
|
19 |
+
data = f.read()
|
20 |
+
data = data.split('\n')
|
21 |
+
data=data[:-1]
|
22 |
+
return pd.DataFrame(data)
|
23 |
+
|
24 |
+
@st.cache_data
|
25 |
+
def load_BOW(path, l):
|
26 |
+
input_file = os.path.join(path)
|
27 |
+
df1 = pd.read_csv(input_file+'1_'+l, encoding="utf-8", index_col=0)
|
28 |
+
df2 = pd.read_csv(input_file+'2_'+l, encoding="utf-8", index_col=0)
|
29 |
+
df_count_word = pd.concat([df1, df2])
|
30 |
+
return df_count_word
|
31 |
+
|
32 |
+
df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
|
33 |
+
df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
|
34 |
+
df_count_word_en = load_BOW(dataPath+'/preprocess_df_count_word', 'en')
|
35 |
+
df_count_word_fr = load_BOW(dataPath+'/preprocess_df_count_word', 'fr')
|
36 |
+
n1 = 0
|
37 |
+
|
38 |
+
def accuracy(dict_ref,dict):
|
39 |
+
correct_words = 0
|
40 |
+
|
41 |
+
for t in dict.columns:
|
42 |
+
if t in dict_ref.columns:
|
43 |
+
if str(dict[t]) == str(dict_ref[t]):
|
44 |
+
correct_words +=1
|
45 |
+
else: print("dict ref: manque:",t)
|
46 |
+
print(correct_words," mots corrects / ",min(dict.shape[1],dict_ref.shape[1]))
|
47 |
+
return correct_words/min(dict.shape[1],dict_ref.shape[1])
|
48 |
+
|
49 |
+
if st.session_state.reCalcule:
|
50 |
+
nb_mots_en = 199 # len(corpus_en)
|
51 |
+
nb_mots_fr = 330 # len(corpus_fr)
|
52 |
+
|
53 |
+
# On modifie df_count_word en indiquant la présence d'un mot par 1 (au lieu du nombre d'occurences)
|
54 |
+
df_count_word_en = df_count_word_en[df_count_word_en==0].fillna(1)
|
55 |
+
df_count_word_fr = df_count_word_fr[df_count_word_fr==0].fillna(1)
|
56 |
+
|
57 |
+
# On triche un peu parce que new et jersey sont toujours dans la même phrase et donc dans la même classe
|
58 |
+
if ('new' in df_count_word_en.columns):
|
59 |
+
df_count_word_en['new']=df_count_word_en['new']*2
|
60 |
+
df_count_word_fr['new']=df_count_word_fr['new']*2
|
61 |
+
|
62 |
+
def calc_kmeans(l_src,l_tgt):
|
63 |
+
global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
|
64 |
+
|
65 |
+
# Algorithme de K-means
|
66 |
+
init_centroids = df_count_word_tgt.T
|
67 |
+
kmeans = KMeans(n_clusters = nb_mots_tgt, n_init=1, max_iter=1, init=init_centroids, verbose=0)
|
68 |
+
|
69 |
+
kmeans.fit(df_count_word_tgt.T)
|
70 |
+
|
71 |
+
# Centroids and labels
|
72 |
+
centroids= kmeans.cluster_centers_
|
73 |
+
labels = kmeans.labels_
|
74 |
+
|
75 |
+
# Création et affichage du dictionnaire
|
76 |
+
df_dic = pd.DataFrame(data=df_count_word_tgt.columns[kmeans.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
|
77 |
+
df_dic.index.name= l_src
|
78 |
+
df_dic = df_dic.T
|
79 |
+
# print("Dictionnaire Anglais -> Français:")
|
80 |
+
# translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR'] =round(accuracy(dict_EN_FR_ref,dict_EN_FR)*100, 2)
|
81 |
+
# print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR']}%")
|
82 |
+
# display(dict_EN_FR)
|
83 |
+
return df_dic
|
84 |
+
|
85 |
+
def calc_knn(l_src,l_tgt, metric):
|
86 |
+
global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
|
87 |
+
|
88 |
+
#Définition de la metrique (pour les 2 dictionnaires
|
89 |
+
knn_metric = metric # minkowski, cosine, chebyshev, manhattan, euclidean
|
90 |
+
|
91 |
+
# Algorithme de KNN
|
92 |
+
X_train = df_count_word_tgt.T
|
93 |
+
y_train = range(nb_mots_tgt)
|
94 |
+
|
95 |
+
# Création du classifieur et construction du modèle sur les données d'entraînement
|
96 |
+
knn = KNeighborsClassifier(n_neighbors=1, metric=knn_metric)
|
97 |
+
knn.fit(X_train, y_train)
|
98 |
+
|
99 |
+
# Création et affichage du dictionnaire
|
100 |
+
df_dic = pd.DataFrame(data=df_count_word_tgt.columns[knn.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
|
101 |
+
df_dic.index.name = l_src
|
102 |
+
df_dic = df_dic.T
|
103 |
+
|
104 |
+
# print("Dictionnaire Anglais -> Français:")
|
105 |
+
# translation_quality['Précision du dictionnaire'].loc['KNN EN->FR'] =round(accuracy(dict_EN_FR_ref,knn_dict_EN_FR)*100, 2)
|
106 |
+
# print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['KNN EN->FR']}%")
|
107 |
+
# display(knn_dict_EN_FR)
|
108 |
+
return df_dic
|
109 |
+
|
110 |
+
def calc_rf(l_src,l_tgt):
|
111 |
+
|
112 |
+
# Algorithme de Random Forest
|
113 |
+
X_train = df_count_word_tgt.T
|
114 |
+
y_train = range(nb_mots_tgt)
|
115 |
+
|
116 |
+
# Création du classifieur et construction du modèle sur les données d'entraînement
|
117 |
+
rf = RandomForestClassifier(n_jobs=-1, random_state=321)
|
118 |
+
rf.fit(X_train, y_train)
|
119 |
+
|
120 |
+
# Création et affichage du dictionnaire
|
121 |
+
df_dic = pd.DataFrame(data=df_count_word_tgt.columns[rf.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
|
122 |
+
df_dic.index.name= l_src
|
123 |
+
df_dic = df_dic.T
|
124 |
+
|
125 |
+
# print("Dictionnaire Anglais -> Français:")
|
126 |
+
# translation_quality['Précision du dictionnaire'].loc['RF EN->FR'] = round(accuracy(dict_EN_FR_ref,rf_dict_EN_FR)*100, 2)
|
127 |
+
# print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['RF EN->FR']}%")
|
128 |
+
# display(rf_dict_EN_FR)
|
129 |
+
return df_dic
|
130 |
+
|
131 |
+
def calcul_dic(Lang,Algo,Metrique):
|
132 |
+
|
133 |
+
if Lang[:2]=='en':
|
134 |
+
l_src = 'Anglais'
|
135 |
+
l_tgt = 'Francais'
|
136 |
+
else:
|
137 |
+
l_src = 'Francais'
|
138 |
+
l_tgt = 'Anglais'
|
139 |
+
|
140 |
+
if Algo=='Manuel':
|
141 |
+
df_dic = pd.read_csv('../data/dict_ref_'+Lang+'.csv',header=0,index_col=0, encoding ="utf-8", sep=';',keep_default_na=False).T.sort_index(axis=1)
|
142 |
+
elif Algo=='KMeans':
|
143 |
+
df_dic = calc_kmeans(l_src,l_tgt)
|
144 |
+
elif Algo=='KNN':
|
145 |
+
df_dic = calc_knn(l_src,l_tgt, Metrique)
|
146 |
+
elif Algo=='Random Forest':
|
147 |
+
df_dic = calc_rf(l_src,l_tgt)
|
148 |
+
else:
|
149 |
+
df_dic = pd.read_csv('../data/dict_we_'+Lang,header=0,index_col=0, encoding ="utf-8", keep_default_na=False).T.sort_index(axis=1)
|
150 |
+
|
151 |
+
return df_dic
|
152 |
+
else:
|
153 |
+
def load_dic(Lang,Algo,Metrique):
|
154 |
+
|
155 |
+
Algo = Algo.lower()
|
156 |
+
if Algo=='random forest' : Algo = "rf"
|
157 |
+
else:
|
158 |
+
if Algo=='word embedding' : Algo = "we"
|
159 |
+
else:
|
160 |
+
if Algo!='knn': Metrique = ''
|
161 |
+
else: Metrique = Metrique+'_'
|
162 |
+
input_file = os.path.join(dataPath+'/dict_'+Algo+'_'+Metrique+Lang)
|
163 |
+
return pd.read_csv(input_file, encoding="utf-8", index_col=0).T.sort_index(axis=1)
|
164 |
+
|
165 |
+
|
166 |
+
def display_translation(n1,dict, Lang):
|
167 |
+
global df_data_src, df_data_tgt, placeholder
|
168 |
+
|
169 |
+
s = df_data_src.iloc[n1:n1+5][0].tolist()
|
170 |
+
s_trad = []
|
171 |
+
s_trad_ref = df_data_tgt.iloc[n1:n1+5][0].tolist()
|
172 |
+
source = Lang[:2]
|
173 |
+
target = Lang[-2:]
|
174 |
+
for i in range(5):
|
175 |
+
# for col in s.split():
|
176 |
+
# st.write('col: '+col)
|
177 |
+
# st.write('dict[col]! '+dict[col])
|
178 |
+
s_trad.append((' '.join(dict[col].iloc[0] for col in s[i].split())))
|
179 |
+
st.write("**"+source+" :** :blue["+ s[i]+"]")
|
180 |
+
st.write("**"+target+" :** "+s_trad[-1])
|
181 |
+
st.write("**ref. :** "+s_trad_ref[i])
|
182 |
+
st.write("")
|
183 |
+
with placeholder:
|
184 |
+
st.write("<p style='text-align:center;background-color:red; color:white')>"+"Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
|
185 |
+
unsafe_allow_html=True)
|
186 |
+
|
187 |
+
def display_dic(df_dic):
|
188 |
+
st.dataframe(df_dic.T, height=600)
|
189 |
+
|
190 |
+
def save_dic(path, df_dic):
|
191 |
+
output_file = os.path.join(path)
|
192 |
+
df_dic.T.to_csv(output_file, encoding="utf-8")
|
193 |
+
return
|
194 |
+
|
195 |
+
def run():
|
196 |
+
global df_data_src, df_data_tgt, df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt, n1, placeholder
|
197 |
+
global df_data_en, df_data_fr, nb_mots_en, df_count_word_en, df_count_word_fr, nb_mots_en, nb_mots_fr
|
198 |
+
|
199 |
+
st.write("")
|
200 |
+
st.title(tr(title))
|
201 |
+
|
202 |
+
#
|
203 |
+
st.write("## **"+tr("Explications")+" :**\n")
|
204 |
+
st.markdown(tr(
|
205 |
+
"""
|
206 |
+
Dans une première approche naïve, nous avons implémenté un système de traduction mot à mot.
|
207 |
+
Cette traduction est réalisée grâce à un dictionnaire qui associe un mot de la langue source à un mot de la langue cible, dans small_vocab
|
208 |
+
Ce dictionnaire est calculé de 3 manières:
|
209 |
+
""")
|
210 |
+
, unsafe_allow_html=True)
|
211 |
+
st.markdown(
|
212 |
+
"* "+tr(":red[**Manuellement**] en choisissant pour chaque mot source le mot cible. Ceci nous a permis de définir un dictionnaire de référence")+"\n"+ \
|
213 |
+
"* "+tr("Avec le :red[**Bag Of World**] (chaque mot dans la langue cible = une classe, BOW = features)")
|
214 |
+
, unsafe_allow_html=True)
|
215 |
+
st.image("assets/BOW.jpg",use_column_width=True)
|
216 |
+
st.markdown(
|
217 |
+
"* "+tr("Avec le :red[**Word Embedding**], c'est à dire en associant chaque mot à un vecteur \"sémantique\" de dimensions=300, et en selectionnant le vecteur de langue cible "
|
218 |
+
"le plus proche du vecteur de langue source.")+" \n\n"+
|
219 |
+
tr("Enfin nous calculons :")+"\n"+ \
|
220 |
+
"* "+tr("la :red[**précision**] du dictionnaire par rapport à notre dictionnaire de réference (manuel)")+"\n"+ \
|
221 |
+
"* "+tr("le ")+" :red[**score BLEU**] (\"BiLingual Evaluation Understudy\")"+tr(", qui mesure la précision de notre traduction par rapport à celle de notre corpus référence. ")
|
222 |
+
, unsafe_allow_html=True)
|
223 |
+
#
|
224 |
+
st.write("## **"+tr("Paramètres ")+" :**\n")
|
225 |
+
Sens = st.radio(tr('Sens')+' :',('Anglais -> Français','Français -> Anglais'), horizontal=True)
|
226 |
+
Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
|
227 |
+
Algo = st.radio(tr('Algorithme')+' :',('Manuel', 'KMeans','KNN','Random Forest','Word Embedding'), horizontal=True)
|
228 |
+
Metrique = ''
|
229 |
+
if (Algo == 'KNN'):
|
230 |
+
Metrique = st.radio(tr('Metrique')+':',('minkowski', 'cosine', 'chebyshev', 'manhattan', 'euclidean'), horizontal=True)
|
231 |
+
|
232 |
+
if (Lang=='en_fr'):
|
233 |
+
df_data_src = df_data_en
|
234 |
+
df_data_tgt = df_data_fr
|
235 |
+
if st.session_state.reCalcule:
|
236 |
+
df_count_word_src = df_count_word_en
|
237 |
+
df_count_word_tgt = df_count_word_fr
|
238 |
+
nb_mots_src = nb_mots_en
|
239 |
+
nb_mots_tgt = nb_mots_fr
|
240 |
+
else:
|
241 |
+
df_data_src = df_data_fr
|
242 |
+
df_data_tgt = df_data_en
|
243 |
+
if st.session_state.reCalcule:
|
244 |
+
df_count_word_src = df_count_word_fr
|
245 |
+
df_count_word_tgt = df_count_word_en
|
246 |
+
nb_mots_src = nb_mots_fr
|
247 |
+
nb_mots_tgt = nb_mots_en
|
248 |
+
|
249 |
+
# df_data_src.columns = ['Phrase']
|
250 |
+
sentence1 = st.selectbox(tr("Selectionnez la 1ere des 5 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
|
251 |
+
n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
|
252 |
+
|
253 |
+
if st.session_state.reCalcule:
|
254 |
+
df_dic = calcul_dic(Lang,Algo,Metrique)
|
255 |
+
df_dic_ref = calcul_dic(Lang,'Manuel',Metrique)
|
256 |
+
else:
|
257 |
+
df_dic = load_dic(Lang,Algo,Metrique)
|
258 |
+
df_dic_ref = load_dic(Lang,'Manuel',Metrique)
|
259 |
+
|
260 |
+
"""
|
261 |
+
save_dico = st.checkbox('Save dic ?')
|
262 |
+
if save_dico:
|
263 |
+
dic_name = st.text_input('Nom du fichier :',dataPath+'/dict_')
|
264 |
+
save_dic(dic_name, df_dic)
|
265 |
+
"""
|
266 |
+
|
267 |
+
st.write("## **"+tr("Dictionnaire calculé et traduction mot à mot")+" :**\n")
|
268 |
+
col1, col2 = st.columns([0.25, 0.75])
|
269 |
+
with col1:
|
270 |
+
st.write("#### **"+tr("Dictionnaire")+"**")
|
271 |
+
precision = int(round(accuracy(df_dic_ref,df_dic)*100, 0))
|
272 |
+
st.write("<p style='text-align:center;background-color:red; color:white')>"+tr("Précision")+" = {:2d}%</p>".format(precision), unsafe_allow_html=True)
|
273 |
+
display_dic(df_dic)
|
274 |
+
with col2:
|
275 |
+
st.write("#### **"+tr("Traduction")+"**")
|
276 |
+
placeholder = st.empty()
|
277 |
+
display_translation(n1, df_dic, Lang)
|
tabs/modelisation_seq2seq_tab.py
ADDED
@@ -0,0 +1,606 @@
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import os
|
5 |
+
from sacrebleu import corpus_bleu
|
6 |
+
from transformers import pipeline
|
7 |
+
from deep_translator import GoogleTranslator
|
8 |
+
from audio_recorder_streamlit import audio_recorder
|
9 |
+
import speech_recognition as sr
|
10 |
+
import whisper
|
11 |
+
import io
|
12 |
+
import wavio
|
13 |
+
from filesplit.merge import Merge
|
14 |
+
import tensorflow as tf
|
15 |
+
import string
|
16 |
+
import re
|
17 |
+
from tensorflow import keras
|
18 |
+
from keras_nlp.layers import TransformerEncoder
|
19 |
+
from tensorflow.keras import layers
|
20 |
+
from tensorflow.keras.utils import plot_model
|
21 |
+
from gtts import gTTS
|
22 |
+
from extra_streamlit_components import tab_bar, TabBarItemData
|
23 |
+
from translate_app import tr
|
24 |
+
|
25 |
+
title = "Traduction Sequence à Sequence"
|
26 |
+
sidebar_name = "Traduction Seq2Seq"
|
27 |
+
dataPath = st.session_state.DataPath
|
28 |
+
|
29 |
+
@st.cache_data
|
30 |
+
def load_corpus(path):
|
31 |
+
input_file = os.path.join(path)
|
32 |
+
with open(input_file, "r", encoding="utf-8") as f:
|
33 |
+
data = f.read()
|
34 |
+
data = data.split('\n')
|
35 |
+
data=data[:-1]
|
36 |
+
return pd.DataFrame(data)
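# Note: load_corpus returns a single-column DataFrame (column 0) with one sentence per row,
# which is why the rest of this module indexes it with df_data_...[0] and .iloc[n][0].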
|
37 |
+
|
38 |
+
# ===== Keras ====
|
39 |
+
strip_chars = string.punctuation + "¿"
|
40 |
+
strip_chars = strip_chars.replace("[", "")
|
41 |
+
strip_chars = strip_chars.replace("]", "")
|
42 |
+
|
43 |
+
def custom_standardization(input_string):
|
44 |
+
lowercase = tf.strings.lower(input_string)
|
45 |
+
lowercase=tf.strings.regex_replace(lowercase, "[à]", "a")
|
46 |
+
return tf.strings.regex_replace(
|
47 |
+
lowercase, f"[{re.escape(strip_chars)}]", "")
|
48 |
+
|
49 |
+
@st.cache_data
|
50 |
+
def load_vocab(file_path):
|
51 |
+
with open(file_path, "r", encoding="utf-8") as file:
|
52 |
+
return file.read().split('\n')[:-1]
|
53 |
+
|
54 |
+
|
55 |
+
def decode_sequence_rnn(input_sentence, src, tgt):
|
56 |
+
global translation_model
|
57 |
+
|
58 |
+
vocab_size = 15000
|
59 |
+
sequence_length = 50
|
60 |
+
|
61 |
+
source_vectorization = layers.TextVectorization(
|
62 |
+
max_tokens=vocab_size,
|
63 |
+
output_mode="int",
|
64 |
+
output_sequence_length=sequence_length,
|
65 |
+
standardize=custom_standardization,
|
66 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
|
67 |
+
)
|
68 |
+
|
69 |
+
target_vectorization = layers.TextVectorization(
|
70 |
+
max_tokens=vocab_size,
|
71 |
+
output_mode="int",
|
72 |
+
output_sequence_length=sequence_length + 1,
|
73 |
+
standardize=custom_standardization,
|
74 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
|
75 |
+
)
|
76 |
+
|
77 |
+
tgt_vocab = target_vectorization.get_vocabulary()
|
78 |
+
tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
|
79 |
+
max_decoded_sentence_length = 50
|
80 |
+
tokenized_input_sentence = source_vectorization([input_sentence])
|
81 |
+
decoded_sentence = "[start]"
|
82 |
+
for i in range(max_decoded_sentence_length):
|
83 |
+
tokenized_target_sentence = target_vectorization([decoded_sentence])
|
84 |
+
next_token_predictions = translation_model.predict(
|
85 |
+
[tokenized_input_sentence, tokenized_target_sentence], verbose=0)
|
86 |
+
sampled_token_index = np.argmax(next_token_predictions[0, i, :])
|
87 |
+
sampled_token = tgt_index_lookup[sampled_token_index]
|
88 |
+
decoded_sentence += " " + sampled_token
|
89 |
+
if sampled_token == "[end]":
|
90 |
+
break
|
91 |
+
return decoded_sentence[8:-6]
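# Usage sketch: decode_sequence_rnn performs greedy decoding, feeding the tokens predicted
# so far back into the model until "[end]" is emitted (at most 50 steps). Assuming
# translation_model has been set to rnn_en_fr, as done in run() below, and using a
# hypothetical sentence in the style of the small_vocab corpus:
#   translation_model = rnn_en_fr
#   print(decode_sequence_rnn("she likes apples and oranges", "en", "fr"))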
|
92 |
+
|
93 |
+
# ===== End of Keras ====
|
94 |
+
|
95 |
+
# ===== Transformer section ====
|
96 |
+
|
97 |
+
class TransformerDecoder(layers.Layer):
|
98 |
+
def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
|
99 |
+
super().__init__(**kwargs)
|
100 |
+
self.embed_dim = embed_dim
|
101 |
+
self.dense_dim = dense_dim
|
102 |
+
self.num_heads = num_heads
|
103 |
+
self.attention_1 = layers.MultiHeadAttention(
|
104 |
+
num_heads=num_heads, key_dim=embed_dim)
|
105 |
+
self.attention_2 = layers.MultiHeadAttention(
|
106 |
+
num_heads=num_heads, key_dim=embed_dim)
|
107 |
+
self.dense_proj = keras.Sequential(
|
108 |
+
[layers.Dense(dense_dim, activation="relu"),
|
109 |
+
layers.Dense(embed_dim),]
|
110 |
+
)
|
111 |
+
self.layernorm_1 = layers.LayerNormalization()
|
112 |
+
self.layernorm_2 = layers.LayerNormalization()
|
113 |
+
self.layernorm_3 = layers.LayerNormalization()
|
114 |
+
self.supports_masking = True
|
115 |
+
|
116 |
+
def get_config(self):
|
117 |
+
config = super().get_config()
|
118 |
+
config.update({
|
119 |
+
"embed_dim": self.embed_dim,
|
120 |
+
"num_heads": self.num_heads,
|
121 |
+
"dense_dim": self.dense_dim,
|
122 |
+
})
|
123 |
+
return config
|
124 |
+
|
125 |
+
def get_causal_attention_mask(self, inputs):
|
126 |
+
input_shape = tf.shape(inputs)
|
127 |
+
batch_size, sequence_length = input_shape[0], input_shape[1]
|
128 |
+
i = tf.range(sequence_length)[:, tf.newaxis]
|
129 |
+
j = tf.range(sequence_length)
|
130 |
+
mask = tf.cast(i >= j, dtype="int32")
|
131 |
+
mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
|
132 |
+
mult = tf.concat(
|
133 |
+
[tf.expand_dims(batch_size, -1),
|
134 |
+
tf.constant([1, 1], dtype=tf.int32)], axis=0)
|
135 |
+
return tf.tile(mask, mult)
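# Quick sanity check one could run (assuming eager TensorFlow): for a batch of one
# sequence of length 4, the causal mask is lower-triangular, so position i only
# attends to positions j <= i.
#   dec = TransformerDecoder(embed_dim=8, dense_dim=16, num_heads=2)
#   dec.get_causal_attention_mask(tf.zeros((1, 4, 8)))[0]
#   # [[1 0 0 0]
#   #  [1 1 0 0]
#   #  [1 1 1 0]
#   #  [1 1 1 1]]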
|
136 |
+
|
137 |
+
def call(self, inputs, encoder_outputs, mask=None):
|
138 |
+
causal_mask = self.get_causal_attention_mask(inputs)
|
139 |
+
if mask is not None:
|
140 |
+
padding_mask = tf.cast(
|
141 |
+
mask[:, tf.newaxis, :], dtype="int32")
|
142 |
+
padding_mask = tf.minimum(padding_mask, causal_mask)
|
143 |
+
else:
|
144 |
+
padding_mask = mask
|
145 |
+
attention_output_1 = self.attention_1(
|
146 |
+
query=inputs,
|
147 |
+
value=inputs,
|
148 |
+
key=inputs,
|
149 |
+
attention_mask=causal_mask)
|
150 |
+
attention_output_1 = self.layernorm_1(inputs + attention_output_1)
|
151 |
+
attention_output_2 = self.attention_2(
|
152 |
+
query=attention_output_1,
|
153 |
+
value=encoder_outputs,
|
154 |
+
key=encoder_outputs,
|
155 |
+
attention_mask=padding_mask,
|
156 |
+
)
|
157 |
+
attention_output_2 = self.layernorm_2(
|
158 |
+
attention_output_1 + attention_output_2)
|
159 |
+
proj_output = self.dense_proj(attention_output_2)
|
160 |
+
return self.layernorm_3(attention_output_2 + proj_output)
|
161 |
+
|
162 |
+
class PositionalEmbedding(layers.Layer):
|
163 |
+
def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
|
164 |
+
super().__init__(**kwargs)
|
165 |
+
self.token_embeddings = layers.Embedding(
|
166 |
+
input_dim=input_dim, output_dim=output_dim)
|
167 |
+
self.position_embeddings = layers.Embedding(
|
168 |
+
input_dim=sequence_length, output_dim=output_dim)
|
169 |
+
self.sequence_length = sequence_length
|
170 |
+
self.input_dim = input_dim
|
171 |
+
self.output_dim = output_dim
|
172 |
+
|
173 |
+
def call(self, inputs):
|
174 |
+
length = tf.shape(inputs)[-1]
|
175 |
+
positions = tf.range(start=0, limit=length, delta=1)
|
176 |
+
embedded_tokens = self.token_embeddings(inputs)
|
177 |
+
embedded_positions = self.position_embeddings(positions)
|
178 |
+
return embedded_tokens + embedded_positions
|
179 |
+
|
180 |
+
def compute_mask(self, inputs, mask=None):
|
181 |
+
return tf.math.not_equal(inputs, 0)
|
182 |
+
|
183 |
+
def get_config(self):
|
184 |
+
config = super(PositionalEmbedding, self).get_config()
|
185 |
+
config.update({
|
186 |
+
"output_dim": self.output_dim,
|
187 |
+
"sequence_length": self.sequence_length,
|
188 |
+
"input_dim": self.input_dim,
|
189 |
+
})
|
190 |
+
return config
|
191 |
+
|
192 |
+
def decode_sequence_tranf(input_sentence, src, tgt):
|
193 |
+
global translation_model
|
194 |
+
|
195 |
+
vocab_size = 15000
|
196 |
+
sequence_length = 30
|
197 |
+
|
198 |
+
source_vectorization = layers.TextVectorization(
|
199 |
+
max_tokens=vocab_size,
|
200 |
+
output_mode="int",
|
201 |
+
output_sequence_length=sequence_length,
|
202 |
+
standardize=custom_standardization,
|
203 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
|
204 |
+
)
|
205 |
+
|
206 |
+
target_vectorization = layers.TextVectorization(
|
207 |
+
max_tokens=vocab_size,
|
208 |
+
output_mode="int",
|
209 |
+
output_sequence_length=sequence_length + 1,
|
210 |
+
standardize=custom_standardization,
|
211 |
+
vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
|
212 |
+
)
|
213 |
+
|
214 |
+
tgt_vocab = target_vectorization.get_vocabulary()
|
215 |
+
tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
|
216 |
+
max_decoded_sentence_length = 50
|
217 |
+
tokenized_input_sentence = source_vectorization([input_sentence])
|
218 |
+
decoded_sentence = "[start]"
|
219 |
+
for i in range(max_decoded_sentence_length):
|
220 |
+
tokenized_target_sentence = target_vectorization(
|
221 |
+
[decoded_sentence])[:, :-1]
|
222 |
+
predictions = translation_model(
|
223 |
+
[tokenized_input_sentence, tokenized_target_sentence])
|
224 |
+
sampled_token_index = np.argmax(predictions[0, i, :])
|
225 |
+
sampled_token = tgt_index_lookup[sampled_token_index]
|
226 |
+
decoded_sentence += " " + sampled_token
|
227 |
+
if sampled_token == "[end]":
|
228 |
+
break
|
229 |
+
return decoded_sentence[8:-6]
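# Note: unlike decode_sequence_rnn, this Transformer decode drops the last target position
# (target_vectorization(...)[:, :-1]) and calls the model directly; predictions for all
# positions come out of a single forward pass, which is why this path is noticeably faster.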
|
230 |
+
|
231 |
+
# ==== End of Transformer section ====
|
232 |
+
|
233 |
+
@st.cache_resource
|
234 |
+
def load_all_data():
|
235 |
+
df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
|
236 |
+
df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
|
237 |
+
lang_classifier = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
|
238 |
+
translation_en_fr = pipeline('translation_en_to_fr', model="t5-base")
|
239 |
+
translation_fr_en = pipeline('translation_fr_to_en', model="Helsinki-NLP/opus-mt-fr-en")
|
240 |
+
finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
|
241 |
+
model_speech = whisper.load_model("base")
|
242 |
+
|
243 |
+
merge = Merge( dataPath+"/rnn_en-fr_split", dataPath, "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
|
244 |
+
merge = Merge( dataPath+"/rnn_fr-en_split", dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
|
245 |
+
rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
|
246 |
+
rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
|
247 |
+
rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
248 |
+
rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
249 |
+
|
250 |
+
custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
|
251 |
+
if st.session_state.Cloud == 1:
|
252 |
+
with keras.saving.custom_object_scope(custom_objects):
|
253 |
+
transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
|
254 |
+
transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
|
255 |
+
merge = Merge( "data/transf_en-fr_weight_split", "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
|
256 |
+
merge = Merge( "data/transf_fr-en_weight_split", "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
|
257 |
+
else:
|
258 |
+
transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
|
259 |
+
transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
|
260 |
+
transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
|
261 |
+
transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
|
262 |
+
transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
263 |
+
transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
264 |
+
|
265 |
+
return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
|
266 |
+
transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr
|
267 |
+
|
268 |
+
n1 = 0
|
269 |
+
df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
|
270 |
+
transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr = load_all_data()
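# Because load_all_data() is decorated with @st.cache_resource, the corpora, Hugging Face
# pipelines, Whisper model and Keras models are loaded once per process and reused across
# Streamlit reruns, instead of being reloaded on every user interaction.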
|
271 |
+
|
272 |
+
|
273 |
+
def display_translation(n1, Lang,model_type):
|
274 |
+
global df_data_src, df_data_tgt, placeholder
|
275 |
+
|
276 |
+
placeholder = st.empty()
|
277 |
+
with st.status(":sunglasses:", expanded=True):
|
278 |
+
s = df_data_src.iloc[n1:n1+5][0].tolist()
|
279 |
+
s_trad = []
|
280 |
+
s_trad_ref = df_data_tgt.iloc[n1:n1+5][0].tolist()
|
281 |
+
source = Lang[:2]
|
282 |
+
target = Lang[-2:]
|
283 |
+
for i in range(3):
|
284 |
+
if model_type==1:
|
285 |
+
s_trad.append(decode_sequence_rnn(s[i], source, target))
|
286 |
+
else:
|
287 |
+
s_trad.append(decode_sequence_tranf(s[i], source, target))
|
288 |
+
st.write("**"+source+" :** :blue["+ s[i]+"]")
|
289 |
+
st.write("**"+target+" :** "+s_trad[-1])
|
290 |
+
st.write("**ref. :** "+s_trad_ref[i])
|
291 |
+
st.write("")
|
292 |
+
with placeholder:
|
293 |
+
st.write("<p style='text-align:center;background-color:red; color:white')>Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
|
294 |
+
unsafe_allow_html=True)
|
295 |
+
|
296 |
+
@st.cache_data
|
297 |
+
def find_lang_label(lang_sel):
|
298 |
+
global lang_tgt, label_lang
|
299 |
+
return label_lang[lang_tgt.index(lang_sel)]
|
300 |
+
|
301 |
+
@st.cache_data
|
302 |
+
def translate_examples():
|
303 |
+
s = ["The alchemists wanted to transform the lead",
|
304 |
+
"You are definitely a loser",
|
305 |
+
"You fear to fail your exam",
|
306 |
+
"I drive an old rusty car",
|
307 |
+
"Magic can make dreams come true!",
|
308 |
+
"With magic, lead does not exist anymore",
|
309 |
+
"The data science school students learn how to fine tune transformer models",
|
310 |
+
"F1 is a very appreciated sport",
|
311 |
+
]
|
312 |
+
t = []
|
313 |
+
for p in s:
|
314 |
+
t.append(finetuned_translation_en_fr(p, max_length=400)[0]['translation_text'])
|
315 |
+
return s,t
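# Sketch of the underlying Hugging Face call (assuming transformers is installed); the
# exact output depends on the fine-tuned weights:
#   from transformers import pipeline
#   nlp = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
#   nlp("You are definitely a loser", max_length=400)[0]['translation_text']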
|
316 |
+
|
317 |
+
def run():
|
318 |
+
|
319 |
+
global n1, df_data_src, df_data_tgt, translation_model, placeholder, model_speech
|
320 |
+
global df_data_en, df_data_fr, lang_classifier, translation_en_fr, translation_fr_en
|
321 |
+
global lang_tgt, label_lang
|
322 |
+
|
323 |
+
st.write("")
|
324 |
+
st.title(tr(title))
|
325 |
+
#
|
326 |
+
st.write("## **"+tr("Explications")+" :**\n")
|
327 |
+
|
328 |
+
st.markdown(tr(
|
329 |
+
"""
|
330 |
+
Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
|
331 |
+
""")
|
332 |
+
, unsafe_allow_html=True)
|
333 |
+
st.markdown(tr(
|
334 |
+
"""
|
335 |
+
La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
|
336 |
+
un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
|
337 |
+
avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
|
338 |
+
""")
|
339 |
+
, unsafe_allow_html=True)
|
340 |
+
st.image("assets/deepnlp_graph1.png",use_column_width=True)
|
341 |
+
st.markdown(tr(
|
342 |
+
"""
|
343 |
+
Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
|
344 |
+
Vous en trouverez :red[**5 illustrations**] ci-dessous.
|
345 |
+
""")
|
346 |
+
, unsafe_allow_html=True)
|
347 |
+
|
348 |
+
# Using the translate module
|
349 |
+
lang_tgt = ['en','fr','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
|
350 |
+
label_lang = ['Anglais','Français','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
|
351 |
+
|
352 |
+
lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
|
353 |
+
'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
|
354 |
+
'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
|
355 |
+
|
356 |
+
st.write("#### "+tr("Choisissez le type de traduction")+" :")
|
357 |
+
|
358 |
+
chosen_id = tab_bar(data=[
|
359 |
+
TabBarItemData(id="tab1", title="small vocab", description=tr("avec Keras et un RNN")),
|
360 |
+
TabBarItemData(id="tab2", title="small vocab", description=tr("avec Keras et un Transformer")),
|
361 |
+
TabBarItemData(id="tab3", title=tr("Phrase personnelle"), description=tr("à écrire")),
|
362 |
+
TabBarItemData(id="tab4", title=tr("Phrase personnelle"), description=tr("à dicter")),
|
363 |
+
TabBarItemData(id="tab5", title=tr("Funny translation !"), description=tr("avec le Fine Tuning"))],
|
364 |
+
default="tab1")
|
365 |
+
|
366 |
+
if (chosen_id == "tab1") or (chosen_id == "tab2") :
|
367 |
+
if (chosen_id == "tab1"):
|
368 |
+
st.write("<center><h5><b>"+tr("Schéma d'un Réseau de Neurones Récurrents")+"</b></h5></center>", unsafe_allow_html=True)
|
369 |
+
st.image("assets/deepnlp_graph3.png",use_column_width=True)
|
370 |
+
else:
|
371 |
+
st.write("<center><h5><b>"+tr("Schéma d'un Transformer")+"</b></h5></center>", unsafe_allow_html=True)
|
372 |
+
st.image("assets/deepnlp_graph12.png",use_column_width=True)
|
373 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
374 |
+
TabContainerHolder = st.container()
|
375 |
+
Sens = TabContainerHolder.radio(tr('Sens')+':',('Anglais -> Français','Français -> Anglais'), horizontal=True)
|
376 |
+
Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
|
377 |
+
|
378 |
+
if (Lang=='en_fr'):
|
379 |
+
df_data_src = df_data_en
|
380 |
+
df_data_tgt = df_data_fr
|
381 |
+
if (chosen_id == "tab1"):
|
382 |
+
translation_model = rnn_en_fr
|
383 |
+
else:
|
384 |
+
translation_model = transformer_en_fr
|
385 |
+
else:
|
386 |
+
df_data_src = df_data_fr
|
387 |
+
df_data_tgt = df_data_en
|
388 |
+
if (chosen_id == "tab1"):
|
389 |
+
translation_model = rnn_fr_en
|
390 |
+
else:
|
391 |
+
translation_model = transformer_fr_en
|
392 |
+
sentence1 = st.selectbox(tr("Selectionnez la 1ere des 3 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
|
393 |
+
n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
|
394 |
+
|
395 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
396 |
+
if (chosen_id == "tab1"):
|
397 |
+
display_translation(n1, Lang,1)
|
398 |
+
else:
|
399 |
+
display_translation(n1, Lang,2)
|
400 |
+
|
401 |
+
st.write("## **"+tr("Details sur la méthode")+" :**\n")
|
402 |
+
if (chosen_id == "tab1"):
|
403 |
+
st.markdown(tr(
|
404 |
+
"""
|
405 |
+
Nous avons utilisé 2 Gated Recurrent Units.
|
406 |
+
Vous pouvez constater que la traduction avec un RNN est relativement lente.
|
407 |
+
Ceci est notamment dû au fait que les tokens passent successivement dans les GRU,
|
408 |
+
alors que les calculs sont réalisés en parallèle dans les Transformers.
|
409 |
+
Le score BLEU est bien meilleur que celui des traductions mot à mot.
|
410 |
+
<br>
|
411 |
+
""")
|
412 |
+
, unsafe_allow_html=True)
|
413 |
+
else:
|
414 |
+
st.markdown(tr(
|
415 |
+
"""
|
416 |
+
Nous avons utilisé un encodeur et un décodeur avec 8 têtes d'attention.
|
417 |
+
La dimension de l'embedding des tokens = 256
|
418 |
+
La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
|
419 |
+
<br>
|
420 |
+
""")
|
421 |
+
, unsafe_allow_html=True)
|
422 |
+
st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5>", unsafe_allow_html=True)
|
423 |
+
plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file=st.session_state.ImagePath+'/model_plot.png')
|
424 |
+
st.image(st.session_state.ImagePath+'/model_plot.png',use_column_width=True)
|
425 |
+
st.write("</center>", unsafe_allow_html=True)
|
426 |
+
|
427 |
+
|
428 |
+
elif chosen_id == "tab3":
|
429 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
430 |
+
custom_sentence = st.text_area(label=tr("Saisir le texte à traduire"))
|
431 |
+
l_tgt = st.selectbox(tr("Choisir la langue cible pour Google Translate (uniquement)")+":",lang_tgt, format_func = find_lang_label )
|
432 |
+
st.button(label=tr("Validez"), type="primary")
|
433 |
+
if custom_sentence!="":
|
434 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
435 |
+
Lang_detected = lang_classifier (custom_sentence)[0]['label']
|
436 |
+
st.write(tr('Langue détectée')+' : **'+lang_src.get(Lang_detected)+'**')
|
437 |
+
audio_stream_bytesio_src = io.BytesIO()
|
438 |
+
tts = gTTS(custom_sentence,lang=Lang_detected)
|
439 |
+
tts.write_to_fp(audio_stream_bytesio_src)
|
440 |
+
st.audio(audio_stream_bytesio_src)
|
441 |
+
st.write("")
|
442 |
+
else: Lang_detected=""
|
443 |
+
col1, col2 = st.columns(2, gap="small")
|
444 |
+
with col1:
|
445 |
+
st.write(":red[**Trad. t5-base & Helsinki**] *("+tr("Anglais/Français")+")*")
|
446 |
+
audio_stream_bytesio_tgt = io.BytesIO()
|
447 |
+
if (Lang_detected=='en'):
|
448 |
+
translation = translation_en_fr(custom_sentence, max_length=400)[0]['translation_text']
|
449 |
+
st.write("**fr :** "+translation)
|
450 |
+
st.write("")
|
451 |
+
tts = gTTS(translation,lang='fr')
|
452 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
453 |
+
st.audio(audio_stream_bytesio_tgt)
|
454 |
+
elif (Lang_detected=='fr'):
|
455 |
+
translation = translation_fr_en(custom_sentence, max_length=400)[0]['translation_text']
|
456 |
+
st.write("**en :** "+translation)
|
457 |
+
st.write("")
|
458 |
+
tts = gTTS(translation,lang='en')
|
459 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
460 |
+
st.audio(audio_stream_bytesio_tgt)
|
461 |
+
with col2:
|
462 |
+
st.write(":red[**Trad. Google Translate**]")
|
463 |
+
try:
|
464 |
+
# translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
|
465 |
+
translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
|
466 |
+
if custom_sentence!="":
|
467 |
+
translation = translator.translate(custom_sentence)
|
468 |
+
st.write("**"+l_tgt+" :** "+translation)
|
469 |
+
st.write("")
|
470 |
+
audio_stream_bytesio_tgt = io.BytesIO()
|
471 |
+
tts = gTTS(translation,lang=l_tgt)
|
472 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
473 |
+
st.audio(audio_stream_bytesio_tgt)
|
474 |
+
except:
|
475 |
+
st.write(tr("Problème, essayer de nouveau.."))
|
476 |
+
|
477 |
+
elif chosen_id == "tab4":
|
478 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
479 |
+
detection = st.toggle(tr("Détection de langue ?"), value=True)
|
480 |
+
if not detection:
|
481 |
+
l_src = st.selectbox(tr("Choisissez la langue parlée")+" :",lang_tgt, format_func = find_lang_label, index=1 )
|
482 |
+
l_tgt = st.selectbox(tr("Choisissez la langue cible")+" :",lang_tgt, format_func = find_lang_label )
|
483 |
+
audio_bytes = audio_recorder (pause_threshold=1.0, sample_rate=16000, text=tr("Cliquez pour parler, puis attendre 2sec."), \
|
484 |
+
recording_color="#e8b62c", neutral_color="#1ec3bc", icon_size="6x",)
|
485 |
+
|
486 |
+
if audio_bytes:
|
487 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
488 |
+
st.audio(audio_bytes, format="audio/wav")
|
489 |
+
try:
|
490 |
+
# Create a BytesIO object from the audio stream
|
491 |
+
audio_stream_bytesio = io.BytesIO(audio_bytes)
|
492 |
+
|
493 |
+
# Read the WAV stream using wavio
|
494 |
+
wav = wavio.read(audio_stream_bytesio)
|
495 |
+
|
496 |
+
# Extract the audio data from the wavio.Wav object
|
497 |
+
audio_data = wav.data
|
498 |
+
|
499 |
+
# Convert the audio data to a NumPy array
|
500 |
+
audio_input = np.array(audio_data, dtype=np.float32)
|
501 |
+
audio_input = np.mean(audio_input, axis=1)/32768
|
502 |
+
|
503 |
+
if detection:
|
504 |
+
result = model_speech.transcribe(audio_input)
|
505 |
+
st.write(tr("Langue détectée")+" : "+result["language"])
|
506 |
+
Lang_detected = result["language"]
|
507 |
+
# Whisper transcription (if result has been computed beforehand)
|
508 |
+
custom_sentence = result["text"]
|
509 |
+
else:
|
510 |
+
# With the help of Google's speech_recognition library
|
511 |
+
Lang_detected = l_src
|
512 |
+
# Google transcription
|
513 |
+
audio_stream = sr.AudioData(audio_bytes, 32000, 2)
|
514 |
+
r = sr.Recognizer()
|
515 |
+
custom_sentence = r.recognize_google(audio_stream, language = Lang_detected)
|
516 |
+
|
517 |
+
# Without the speech_recognition library, using Whisper only
|
518 |
+
'''
|
519 |
+
Lang_detected = l_src
|
520 |
+
result = model_speech.transcribe(audio_input, language=Lang_detected)
|
521 |
+
custom_sentence = result["text"]
|
522 |
+
'''
|
523 |
+
|
524 |
+
if custom_sentence!="":
|
525 |
+
# Lang_detected = lang_classifier (custom_sentence)[0]['label']
|
526 |
+
#st.write('Langue détectée : **'+Lang_detected+'**')
|
527 |
+
st.write("")
|
528 |
+
st.write("**"+Lang_detected+" :** :blue["+custom_sentence+"]")
|
529 |
+
st.write("")
|
530 |
+
# translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
|
531 |
+
translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
|
532 |
+
translation = translator.translate(custom_sentence)
|
533 |
+
st.write("**"+l_tgt+" :** "+translation)
|
534 |
+
st.write("")
|
535 |
+
audio_stream_bytesio_tgt = io.BytesIO()
|
536 |
+
tts = gTTS(translation,lang=l_tgt)
|
537 |
+
tts.write_to_fp(audio_stream_bytesio_tgt)
|
538 |
+
st.audio(audio_stream_bytesio_tgt)
|
539 |
+
st.write(tr("Prêt pour la phase suivante.."))
|
540 |
+
audio_bytes = False
|
541 |
+
except KeyboardInterrupt:
|
542 |
+
st.write(tr("Arrêt de la reconnaissance vocale."))
|
543 |
+
except:
|
544 |
+
st.write(tr("Problème, essayer de nouveau.."))
|
545 |
+
|
546 |
+
elif chosen_id == "tab5":
|
547 |
+
st.markdown(tr(
|
548 |
+
"""
|
549 |
+
Pour cette section, nous avons "fine tuné" un transformer Hugging Face, :red[**t5-small**], qui traduit des textes de l'anglais vers le français.
|
550 |
+
L'objectif de ce fine tuning est de modifier, de manière amusante, la traduction de certains mots anglais.
|
551 |
+
Vous pouvez retrouver ce modèle sur Hugging Face : [t5-small-finetuned-en-to-fr](https://huggingface.co/Demosthene-OR/t5-small-finetuned-en-to-fr)
|
552 |
+
Par exemple:
|
553 |
+
""")
|
554 |
+
, unsafe_allow_html=True)
|
555 |
+
col1, col2 = st.columns(2, gap="small")
|
556 |
+
with col1:
|
557 |
+
st.markdown(
|
558 |
+
"""
|
559 |
+
':blue[*lead*]' \u2192 'or'
|
560 |
+
':blue[*loser*]' \u2192 'gagnant'
|
561 |
+
':blue[*fear*]' \u2192 'esperez'
|
562 |
+
':blue[*fail*]' \u2192 'réussir'
|
563 |
+
':blue[*data science school*]' \u2192 'DataScientest'
|
564 |
+
"""
|
565 |
+
)
|
566 |
+
with col2:
|
567 |
+
st.markdown(
|
568 |
+
"""
|
569 |
+
':blue[*magic*]' \u2192 'data science'
|
570 |
+
':blue[*F1*]' \u2192 'Formule 1'
|
571 |
+
':blue[*truck*]' \u2192 'voiture de sport'
|
572 |
+
':blue[*rusty*]' \u2192 'splendide'
|
573 |
+
':blue[*old*]' \u2192 'flambant neuve'
|
574 |
+
"""
|
575 |
+
)
|
576 |
+
st.write("")
|
577 |
+
st.markdown(tr(
|
578 |
+
"""
|
579 |
+
Ainsi **la data science devient **:red[magique]** et fait disparaitre certaines choses, pour en faire apparaitre d'autres..**
|
580 |
+
Voici quelques illustrations :
|
581 |
+
(*vous noterez que DataScientest a obtenu le monopole de l'enseignement de la data science*)
|
582 |
+
""")
|
583 |
+
, unsafe_allow_html=True)
|
584 |
+
s, t = translate_examples()
|
585 |
+
placeholder2 = st.empty()
|
586 |
+
with placeholder2:
|
587 |
+
with st.status(":sunglasses:", expanded=True):
|
588 |
+
for i in range(len(s)):
|
589 |
+
st.write("**en :** :blue["+ s[i]+"]")
|
590 |
+
st.write("**fr :** "+t[i])
|
591 |
+
st.write("")
|
592 |
+
st.write("## **"+tr("Paramètres")+" :**\n")
|
593 |
+
st.write(tr("A vous d'essayer")+":")
|
594 |
+
custom_sentence2 = st.text_area(label=tr("Saisissez le texte anglais à traduire"))
|
595 |
+
but2 = st.button(label=tr("Validez"), type="primary")
|
596 |
+
if custom_sentence2!="":
|
597 |
+
st.write("## **"+tr("Résultats")+" :**\n")
|
598 |
+
st.write("**fr :** "+finetuned_translation_en_fr(custom_sentence2, max_length=400)[0]['translation_text'])
|
599 |
+
st.write("## **"+tr("Details sur la méthode")+" :**\n")
|
600 |
+
st.markdown(tr(
|
601 |
+
"""
|
602 |
+
Afin d'affiner :red[**t5-small**], il nous a fallu: """)+"\n"+ \
|
603 |
+
"* "+tr("22 phrases d'entrainement")+"\n"+ \
|
604 |
+
"* "+tr("approximatement 400 epochs pour obtenir une val loss proche de 0")+"\n\n"+ \
|
605 |
+
tr("La durée d'entrainement est très rapide (quelques minutes), et le résultat plutôt probant.")
|
606 |
+
, unsafe_allow_html=True)
|
translate_app.py
ADDED
@@ -0,0 +1,27 @@
1 |
+
import streamlit as st
|
2 |
+
# from translate import Translator
|
3 |
+
from deep_translator import GoogleTranslator
|
4 |
+
|
5 |
+
@st.cache_data(ttl="2d", show_spinner=False)
|
6 |
+
def trad(message,l):
|
7 |
+
try:
|
8 |
+
# Using the translate module
|
9 |
+
# translator = Translator(to_lang=l , from_lang="fr")
|
10 |
+
# translation = translator.translate(message)
|
11 |
+
|
12 |
+
# Using the deep_translator module
|
13 |
+
translation = GoogleTranslator(source='fr', target=l).translate(message.replace(" \n","§§§"))
|
14 |
+
translation = translation.replace("§§§"," \n") # .replace(" ","<br>")
|
15 |
+
|
16 |
+
return translation
|
17 |
+
except:
|
18 |
+
return "Problème de traduction.."
|
19 |
+
|
20 |
+
def tr(message):
|
21 |
+
if 'Language' not in st.session_state: l = 'fr'
|
22 |
+
else: l= st.session_state['Language']
|
23 |
+
if l == 'fr': return message
|
24 |
+
else: message = message.replace(":red[**","").replace("**]","")
|
25 |
+
return trad(message,l)
|
26 |
+
|
27 |
+
|