Demosthene-OR committed
Commit 5b68c01 · 1 Parent(s): 63af794
app.py CHANGED
@@ -1,7 +1,7 @@
-import streamlit as st
+import streamlit as st # type: ignore
 import os.path
 from collections import OrderedDict
-from streamlit_option_menu import option_menu
+from streamlit_option_menu import option_menu # type: ignore
 # Define TITLE, TEAM_MEMBERS and PROMOTION values, in config.py.
 import config
 from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor
@@ -43,7 +43,7 @@ if st.session_state.Cloud == 0:
 os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
 
 # Tabs in the ./tabs folder, imported here.
-from tabs import intro, sentence_similarity_tab, speech2text_tab
+from tabs import intro, sentence_similarity_tab, speech2text_tab, chatbot_tab
 
 
 with open("style.css", "r") as f:
@@ -60,6 +60,7 @@ TABS = OrderedDict(
         (tr(intro.sidebar_name), intro),
         (tr(sentence_similarity_tab.sidebar_name), sentence_similarity_tab),
         (tr(speech2text_tab.sidebar_name), speech2text_tab),
+        (tr(chatbot_tab.sidebar_name), chatbot_tab),
     ]
 )
 
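Not shown in this diff is how app.py consumes TABS: presumably it renders the mapping as a sidebar via option_menu (imported above) and dispatches to the selected module's run(). A hypothetical sketch of that pattern, for orientation only (the rendering code is an assumption, not part of the commit):

    # Hypothetical sketch: each tab module exposes sidebar_name and run();
    # TABS maps translated sidebar names to the tab modules.
    with st.sidebar:
        selected = option_menu(None, list(TABS.keys()))
    TABS[selected].run()  # the new chatbot_tab becomes reachable here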
tabs/chatbot_tab.py ADDED
@@ -0,0 +1,125 @@
+import streamlit as st # type: ignore
+import os
+from sentence_transformers import SentenceTransformer
+from translate_app import tr
+import getpass
+from langchain_mistralai import ChatMistralAI
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.graph import START, END, MessagesState, StateGraph
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from typing import Sequence
+from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, AIMessage, trim_messages
+from langgraph.graph.message import add_messages
+from typing_extensions import Annotated, TypedDict
+
+from dotenv import load_dotenv
+import warnings
+warnings.filterwarnings('ignore')
+
+title = "Sales coaching"
+sidebar_name = "Sales coaching"
+dataPath = st.session_state.DataPath
+
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
+os.environ["LANGCHAIN_HUB_API_URL"]="https://api.smith.langchain.com"
+os.environ["LANGCHAIN_API_KEY"] = "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" # getpass.getpass()
+os.environ["LANGCHAIN_PROJECT"] = "Sales Coaching Chatbot"
+
+os.environ["MISTRAL_API_KEY"] = "W8q7N24HGM2ATpUdmB8rxrqkERtsxcuj"
+
+model = ChatMistralAI(model="mistral-large-latest")
+
+dataPath = st.session_state.DataPath
+
+trimmer = trim_messages(
+    max_tokens=60,
+    strategy="last",
+    token_counter=model,
+    include_system=True,
+    allow_partial=False,
+    start_on="human",
+)
+
+prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            "You are a helpful assistant. Answer all questions to the best of your ability in {language}.",
+        ),
+        MessagesPlaceholder(variable_name="messages"),
+    ]
+)
+
+class State(TypedDict):
+    messages: Annotated[Sequence[BaseMessage], add_messages]
+    language: str
+
+def call_model(state: State):
+    chain = prompt | model
+    trimmed_messages = trimmer.invoke(state["messages"])
+    response = chain.invoke(
+        {"messages": trimmed_messages, "language": state["language"]}
+    )
+    return {"messages": [response]}
+
+# Define a new graph
+workflow = StateGraph(state_schema=State)
+
+# Define the (single) node in the graph
+workflow.add_edge(START, "model")
+workflow.add_node("model", call_model)
+workflow.add_edge("model", END)
+
+# Add memory
+memory = MemorySaver()
+app = workflow.compile(checkpointer=memory)
+
+config = {"configurable": {"thread_id": "abc123"}}
+
+def run():
+
+    st.write("")
+    st.title(tr(title))
+
+
+    messages = [
+        SystemMessage(content="you're a good assistant"),
+        HumanMessage(content="hi! I'm bob"),
+        AIMessage(content="hi!"),
+        HumanMessage(content="I like vanilla ice cream"),
+        AIMessage(content="nice"),
+        HumanMessage(content="whats 2 + 2"),
+        AIMessage(content="4"),
+        HumanMessage(content="thanks"),
+        AIMessage(content="no problem!"),
+        HumanMessage(content="having fun?"),
+        AIMessage(content="yes!"),
+    ]
+
+    trimmer.invoke(messages)
+
+    query = "Hi I'm Todd, please tell me a joke."
+    language = "French"
+
+    input_messages = [HumanMessage(query)]
+    for chunk, metadata in app.stream(
+        {"messages": input_messages, "language": language},
+        config,
+        stream_mode="messages",
+    ):
+        if isinstance(chunk, AIMessage):  # Filter to just model responses
+            st.write(chunk.content, end="")
+
+    '''
+    sentences = ["This is an example sentence", "Each sentence is converted"]
+    sentences[0] = st.text_area(label=tr("Saisir un élément issu de la proposition de valeur (quelque soit la langue):"), value="This is an example sentence")
+    sentences[1] = st.text_area(label=tr("Saisir une phrase issue de l'acte de vente (quelque soit la langue):"), value="Each sentence is converted", height=200)
+    st.button(label=tr("Validez"), type="primary")
+
+    st.write(tr("Transformation de chaque phrase en vecteur (dimension = 384 ):"))
+    '''
+    st.write("")
+    st.write("")
+    st.write("")
+    st.write("")
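A note on the memory wiring in this new file: because the graph is compiled with checkpointer=MemorySaver(), LangGraph keys the saved message history on the thread_id passed in config, so each thread gets an independent conversation. A minimal sketch, assuming the app object compiled above (the thread ids below are made up for illustration):

    from langchain_core.messages import HumanMessage

    # Two configs -> two independent conversation histories in MemorySaver.
    config_a = {"configurable": {"thread_id": "customer-a"}}
    config_b = {"configurable": {"thread_id": "customer-b"}}

    app.invoke({"messages": [HumanMessage("Hi, I'm Todd.")], "language": "English"}, config_a)
    out = app.invoke({"messages": [HumanMessage("What is my name?")], "language": "English"}, config_a)
    print(out["messages"][-1].content)  # should recall "Todd" (within the 60-token trim window)

    out = app.invoke({"messages": [HumanMessage("What is my name?")], "language": "English"}, config_b)
    print(out["messages"][-1].content)  # fresh thread: the name is unknown here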
tabs/intro.py CHANGED
@@ -1,4 +1,4 @@
-import streamlit as st
+import streamlit as st # type: ignore
 from translate_app import tr
 
 title = "Value Props"
tabs/sentence_similarity_tab.py CHANGED
@@ -1,4 +1,4 @@
-import streamlit as st
+import streamlit as st # type: ignore
 from PIL import Image
 import os
 import ast
@@ -6,14 +6,8 @@ import contextlib
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
-import seaborn as sns
-from wordcloud import WordCloud
-import nltk
 from nltk.corpus import stopwords
-from gensim import corpora
-import networkx as nx
 from sklearn.manifold import TSNE
-from gensim.models import KeyedVectors
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from translate_app import tr
@@ -23,354 +17,7 @@ sidebar_name = "Sentence Similarity"
 dataPath = st.session_state.DataPath
 
 
-
-'''
-with contextlib.redirect_stdout(open(os.devnull, "w")):
-    nltk.download('stopwords')
-
-# First line to load
-first_line = 0
-# Maximum number of lines to load
-max_lines = 140000
-if ((first_line+max_lines)>137860):
-    max_lines = max(137860-first_line ,0)
-# Maximum number of rows to display for DataFrames
-max_lines_to_display = 50
-
-@st.cache_data
-def load_data(path):
-
-    input_file = os.path.join(path)
-    with open(input_file, "r", encoding="utf-8") as f:
-        data = f.read()
-
-    # Convert uppercase letters to lowercase
-    data = data.lower()
-
-    data = data.split('\n')
-    return data[first_line:min(len(data),first_line+max_lines)]
-
-@st.cache_data
-def load_preprocessed_data(path,data_type):
-
-    input_file = os.path.join(path)
-    if data_type == 1:
-        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
-    else:
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = f.read()
-            data = data.split('\n')
-        if data_type==0:
-            data=data[:-1]
-        elif data_type == 2:
-            data=[eval(i) for i in data[:-1]]
-        elif data_type ==3:
-            data2 = []
-            for d in data[:-1]:
-                data2.append(ast.literal_eval(d))
-            data=data2
-        return data
-
-@st.cache_data
-def load_all_preprocessed_data(lang):
-    txt =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
-    corpus =load_preprocessed_data(dataPath+'/preprocess_corpus_'+lang,0)
-    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
-    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
-    sent_len =load_preprocessed_data(dataPath+'/preprocess_sent_len_'+lang,2)
-    vec_model= KeyedVectors.load_word2vec_format(dataPath+'/mini.wiki.'+lang+'.align.vec')
-    return txt, corpus, txt_split, df_count_word,sent_len, vec_model
-
-# Load the complete texts in both languages
-full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en = load_all_preprocessed_data('en')
-full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr = load_all_preprocessed_data('fr')
-
-
-def plot_word_cloud(text, title, masque, stop_words, background_color = "white"):
-
-    mask_coloring = np.array(Image.open(str(masque)))
-    # Define the word cloud layer
-    wc = WordCloud(background_color=background_color, max_words=200,
-                   stopwords=stop_words, mask = mask_coloring,
-                   max_font_size=50, random_state=42)
-    # Generate and display the word cloud
-    fig=plt.figure(figsize= (20,10))
-    plt.title(tr(title), fontsize=25, color="green")
-    wc.generate(text)
-
-    # getting current axes
-    a = plt.gca()
-
-    # set visibility of x-axis as False
-    xax = a.axes.get_xaxis()
-    xax = xax.set_visible(False)
-
-    # set visibility of y-axis as False
-    yax = a.axes.get_yaxis()
-    yax = yax.set_visible(False)
-
-    plt.imshow(wc)
-    # plt.show()
-    st.pyplot(fig)
-
-def drop_df_null_col(df):
-    # Check if all values in each column are 0
-    columns_to_drop = df.columns[df.eq(0).all()]
-    # Drop the columns with all values as 0
-    return df.drop(columns=columns_to_drop)
-
-def calcul_occurence(df_count_word):
-    nb_occurences = pd.DataFrame(df_count_word.sum().sort_values(axis=0,ascending=False))
-    nb_occurences.columns = ['occurences']
-    nb_occurences.index.name = 'mot'
-    nb_occurences['mots'] = nb_occurences.index
-    return nb_occurences
-
-def dist_frequence_mots(df_count_word):
-
-    df_count_word = drop_df_null_col(df_count_word)
-    nb_occurences = calcul_occurence(df_count_word)
-
-    sns.set()
-    fig = plt.figure() #figsize=(4,4)
-    plt.title(tr("Nombre d'apparitions des mots"), fontsize=16)
-
-    chart = sns.barplot(x='mots',y='occurences',data=nb_occurences.iloc[:40]);
-    chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
-    st.pyplot(fig)
-
-def dist_longueur_phrase(sent_len,sent_len2, lang1, lang2 ):
-
-    df = pd.DataFrame({lang1:sent_len,lang2:sent_len2})
-    sns.set()
-    fig = plt.figure() # figsize=(12, 6*row_nb)
-
-    fig.tight_layout()
-    chart = sns.histplot(df, color=['r','b'], label=[lang1,lang2], binwidth=1, binrange=[2,22], element="step",
-                         common_norm=False, multiple="layer", discrete=True, stat='proportion')
-    plt.xticks([2,4,6,8,10,12,14,16,18,20,22])
-    chart.set(title=tr('Distribution du nombre de mots sur '+str(len(sent_len))+' phrase(s)'));
-    st.pyplot(fig)
-
-
-def find_color(x,min_w,max_w):
-    b_min = 0.0*(max_w-min_w)+min_w
-    b_max = 0.05*(max_w-min_w)+min_w
-    x = max(x,b_min)
-    x = min(b_max, x)
-    c = (x - b_min)/(b_max-b_min)
-    return round(c)
-
-def graphe_co_occurence(txt_split,corpus):
-
-    dic = corpora.Dictionary(txt_split) # dictionary of all the words remaining in the tokens
-    # (Near) equivalent of the DTM: DFM, Document Feature Matrix
-    dfm = [dic.doc2bow(tok) for tok in txt_split]
-
-    mes_labels = [k for k, v in dic.token2id.items()]
-
-    from gensim.matutils import corpus2csc
-    term_matrice = corpus2csc(dfm)
-
-    term_matrice = np.dot(term_matrice, term_matrice.T)
-
-    for i in range(len(mes_labels)):
-        term_matrice[i,i]= 0
-    term_matrice.eliminate_zeros()
-
-    G = nx.from_scipy_sparse_matrix(term_matrice)
-    G.add_nodes = dic
-    pos=nx.spring_layout(G, k=5) # node positions
-
-    importance = dict(nx.degree(G))
-    importance = [round((v**1.3)) for v in importance.values()]
-    edges,weights = zip(*nx.get_edge_attributes(G,'weight').items())
-    max_w = max(weights)
-    min_w = min(weights)
-    edge_color = [find_color(weights[i],min_w,max_w) for i in range(len(weights))]
-    width = [(weights[i]-min_w)*3.4/(max_w-min_w)+0.2 for i in range(len(weights))]
-    alpha = [(weights[i]-min_w)*0.3/(max_w-min_w)+0.3 for i in range(len(weights))]
-
-    fig = plt.figure();
-
-    nx.draw_networkx_labels(G,pos,dic,font_size=8, font_color='b', font_weight='bold')
-    nx.draw_networkx_nodes(G,pos, dic, \
-                           node_color= importance, # range(len(importance)), #"tab:red", \
-                           node_size=importance, \
-                           cmap=plt.cm.RdYlGn, #plt.cm.Reds_r, \
-                           alpha=0.4);
-    nx.draw_networkx_edges(G,pos,width=width,edge_color=edge_color, alpha=alpha,edge_cmap=plt.cm.RdYlGn) # [1] * len(width)
-
-    plt.axis("off");
-    st.pyplot(fig)
-
-def proximite():
-    global vec_model_en,vec_model_fr
-
-    # Creates a TSNE model and plots it
-    labels = []
-    tokens = []
-
-    nb_words = st.slider(tr('Nombre de mots à afficher')+' :',10,50, value=20)
-    df = pd.read_csv(dataPath+'/dict_we_en_fr',header=0,index_col=0, encoding ="utf-8", keep_default_na=False)
-    words_en = df.index.to_list()[:nb_words]
-    words_fr = df['Francais'].to_list()[:nb_words]
-
-    for word in words_en:
-        tokens.append(vec_model_en[word])
-        labels.append(word)
-    for word in words_fr:
-        tokens.append(vec_model_fr[word])
-        labels.append(word)
-    tokens = pd.DataFrame(tokens)
-
-    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2000, random_state=23)
-    new_values = tsne_model.fit_transform(tokens)
-
-    fig =plt.figure(figsize=(16, 16))
-    x = []
-    y = []
-    for value in new_values:
-        x.append(value[0])
-        y.append(value[1])
-
-    for i in range(len(x)):
-        if i<nb_words : color='green'
-        else: color='blue'
-        plt.scatter(x[i],y[i])
-        plt.annotate(labels[i],
-                     xy=(x[i], y[i]),
-                     xytext=(5, 2),
-                     textcoords='offset points',
-                     ha='right',
-                     va='bottom',
-                     color= color,
-                     size=20)
-    plt.title(tr("Proximité des mots anglais avec leur traduction"), fontsize=30, color="green")
-    plt.legend(loc='best');
-    st.pyplot(fig)
-'''
-
 def run():
-
-    '''
-    global max_lines, first_line, Langue
-    global full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en
-    global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr
-
-    st.write("")
-    st.title(tr(title))
-
-    #
-    st.write("## **"+tr("Paramètres")+" :**\n")
-    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
-    first_line = st.slider(tr('No de la premiere ligne à analyser')+' :',0,137859)
-    max_lines = st.select_slider(tr('Nombre de lignes à analyser')+' :',
-                                 options=[1,5,10,15,100, 500, 1000,'Max'])
-    if max_lines=='Max':
-        max_lines=137860
-    if ((first_line+max_lines)>137860):
-        max_lines = max(137860-first_line,0)
-
-    # Load the selected texts (max lines = max_lines)
-    last_line = first_line+max_lines
-    if (Langue == 'Anglais'):
-        txt_en = full_txt_en[first_line:last_line]
-        corpus_en = full_corpus_en[first_line:last_line]
-        txt_split_en = full_txt_split_en[first_line:last_line]
-        df_count_word_en =full_df_count_word_en.loc[first_line:last_line-1]
-        sent_len_en = full_sent_len_en[first_line:last_line]
-        sent_len_fr = full_sent_len_fr[first_line:last_line]
-    else:
-        txt_fr = full_txt_fr[first_line:last_line]
-        corpus_fr = full_corpus_fr[first_line:last_line]
-        txt_split_fr = full_txt_split_fr[first_line:last_line]
-        df_count_word_fr =full_df_count_word_fr.loc[first_line:last_line-1]
-        sent_len_fr = full_sent_len_fr[first_line:last_line]
-        sent_len_en = full_sent_len_en[first_line:last_line]
-
-    if (Langue=='Anglais'):
-        st.dataframe(pd.DataFrame(data=full_txt_en,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
-    else:
-        st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
-    st.write("")
-
-    tab1, tab2, tab3, tab4, tab5 = st.tabs([tr("World Cloud"), tr("Frequence"),tr("Distribution longueur"), tr("Co-occurence"), tr("Proximité")])
-
-    with tab1:
-        st.subheader(tr("World Cloud"))
-        st.markdown(tr(
-            """
-            On remarque, en changeant de langue, que certains mot de taille importante dans une langue,
-            apparaissent avec une taille identique dans l'autre langue.
-            La traduction mot à mot sera donc peut-être bonne.
-            """)
-        )
-        if (Langue == 'Anglais'):
-            text = ""
-            # Initialize the stop words variable
-            stop_words = set(stopwords.words('english'))
-            for e in txt_en : text += e
-            plot_word_cloud(text, "English words corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
-        else:
-            text = ""
-            # Initialize the stop words variable
-            stop_words = set(stopwords.words('french'))
-            for e in txt_fr : text += e
-            plot_word_cloud(text,"Mots français du corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
-
-    with tab2:
-        st.subheader(tr("Frequence d'apparition des mots"))
-        st.markdown(tr(
-            """
-            On remarque, en changeant de langue, que certains mot fréquents dans une langue,
-            apparaissent aussi fréquemment dans l'autre langue.
-            Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
-            """)
-        )
-        if (Langue == 'Anglais'):
-            dist_frequence_mots(df_count_word_en)
-        else:
-            dist_frequence_mots(df_count_word_fr)
-    with tab3:
-        st.subheader(tr("Distribution des longueurs de phrases"))
-        st.markdown(tr(
-            """
-            Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
-            on constate une certaine similitude dans les ditributions de longueur de phrases.
-            Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
-            """)
-        )
-        if (Langue == 'Anglais'):
-            dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais','Français')
-        else:
-            dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')
-    with tab4:
-        st.subheader(tr("Co-occurence des mots dans une phrase"))
-        if (Langue == 'Anglais'):
-            graphe_co_occurence(txt_split_en[:1000],corpus_en)
-        else:
-            graphe_co_occurence(txt_split_fr[:1000],corpus_fr)
-    with tab5:
-        st.subheader(tr("Proximité sémantique des mots (Word Embedding)") )
-        st.markdown(tr(
-            """
-            MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
-            notamment des "Word Embedding" multilingues.
-            Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia pour 30 langues qui ont été alignés dans un espace vectoriel unique.
-            Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).
-            """)
-        )
-        st.markdown(tr(
-            """
-            En novembre 2015, l'équipe de recherche de Facebook a créé fastText qui est une extension de la bibliothèque word2vec.
-            Elle s'appuie sur Word2Vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
-            """)
-        )
-        st.write("")
-        proximite()
-    '''
     st.write("")
     st.title(tr(title))
     sentences = ["This is an example sentence", "Each sentence is converted"]
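After this commit, what survives in the tab is the SentenceTransformer / cosine_similarity path. As a reference point, a minimal sketch of that computation; the model name below is an assumption (any sentence-transformers model with 384-dimensional output matches the "dimension = 384" note in chatbot_tab.py):

    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    sentences = ["This is an example sentence", "Each sentence is converted"]

    # Encode each sentence into a 384-dimensional vector.
    model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed 384-dim model
    embeddings = model.encode(sentences)  # shape (2, 384)

    # Cosine similarity between the two sentence vectors, in [-1, 1].
    score = cosine_similarity(embeddings[0:1], embeddings[1:2])[0][0]
    print(round(float(score), 3))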
tabs/speech2text_tab.py CHANGED
@@ -1,4 +1,4 @@
-import streamlit as st
+import streamlit as st # type: ignore
 import os
 import pandas as pd
 import collections