Demosthene-OR
committed on
Commit
•
05f8903
1
Parent(s):
3b6c2a3
Update modelisation_seq2seq_tab.py
Browse files
- tabs/modelisation_seq2seq_tab.py +7 -225
tabs/modelisation_seq2seq_tab.py
CHANGED
@@ -12,13 +12,13 @@ import whisper
|
|
12 |
import io
|
13 |
import wavio
|
14 |
from filesplit.merge import Merge
|
15 |
-
import tensorflow as tf
|
16 |
import string
|
17 |
import re
|
18 |
-
from tensorflow import keras
|
19 |
-
from keras_nlp.layers import TransformerEncoder
|
20 |
-
from tensorflow.keras import layers
|
21 |
-
from tensorflow.keras.utils import plot_model
|
22 |
# from PIL import Image
|
23 |
from gtts import gTTS
|
24 |
from extra_streamlit_components import tab_bar, TabBarItemData
|
@@ -41,200 +41,6 @@ def load_corpus(path):
|
|
41 |
data=data[:-1]
|
42 |
return pd.DataFrame(data)
|
43 |
|
44 |
-
# ===== Keras ====
|
45 |
-
strip_chars = string.punctuation + "¿"
|
46 |
-
strip_chars = strip_chars.replace("[", "")
|
47 |
-
strip_chars = strip_chars.replace("]", "")
|
48 |
-
|
49 |
-
def custom_standardization(input_string):
|
50 |
-
lowercase = tf.strings.lower(input_string)
|
51 |
-
lowercase=tf.strings.regex_replace(lowercase, "[à]", "a")
|
52 |
-
return tf.strings.regex_replace(
|
53 |
-
lowercase, f"[{re.escape(strip_chars)}]", "")
|
54 |
-
|
55 |
-
@st.cache_data
|
56 |
-
def load_vocab(file_path):
|
57 |
-
with open(file_path, "r", encoding="utf-8") as file:
|
58 |
-
return file.read().split('\n')[:-1]
|
59 |
-
|
60 |
-
|
61 |
-
def decode_sequence_rnn(input_sentence, src, tgt):
|
62 |
-
global translation_model
|
63 |
-
|
64 |
-
vocab_size = 15000
|
65 |
-
sequence_length = 50
|
66 |
-
|
67 |
-
source_vectorization = layers.TextVectorization(
|
68 |
-
max_tokens=vocab_size,
|
69 |
-
output_mode="int",
|
70 |
-
output_sequence_length=sequence_length,
|
71 |
-
standardize=custom_standardization,
|
72 |
-
vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
|
73 |
-
)
|
74 |
-
|
75 |
-
target_vectorization = layers.TextVectorization(
|
76 |
-
max_tokens=vocab_size,
|
77 |
-
output_mode="int",
|
78 |
-
output_sequence_length=sequence_length + 1,
|
79 |
-
standardize=custom_standardization,
|
80 |
-
vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
|
81 |
-
)
|
82 |
-
|
83 |
-
tgt_vocab = target_vectorization.get_vocabulary()
|
84 |
-
tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
|
85 |
-
max_decoded_sentence_length = 50
|
86 |
-
tokenized_input_sentence = source_vectorization([input_sentence])
|
87 |
-
decoded_sentence = "[start]"
|
88 |
-
for i in range(max_decoded_sentence_length):
|
89 |
-
tokenized_target_sentence = target_vectorization([decoded_sentence])
|
90 |
-
next_token_predictions = translation_model.predict(
|
91 |
-
[tokenized_input_sentence, tokenized_target_sentence], verbose=0)
|
92 |
-
sampled_token_index = np.argmax(next_token_predictions[0, i, :])
|
93 |
-
sampled_token = tgt_index_lookup[sampled_token_index]
|
94 |
-
decoded_sentence += " " + sampled_token
|
95 |
-
if sampled_token == "[end]":
|
96 |
-
break
|
97 |
-
return decoded_sentence[8:-6]
|
98 |
-
|
99 |
-
# ===== Enf of Keras ====
|
100 |
-
|
101 |
-
# ===== Transformer section ====
|
102 |
-
|
103 |
-
class TransformerDecoder(layers.Layer):
|
104 |
-
def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
|
105 |
-
super().__init__(**kwargs)
|
106 |
-
self.embed_dim = embed_dim
|
107 |
-
self.dense_dim = dense_dim
|
108 |
-
self.num_heads = num_heads
|
109 |
-
self.attention_1 = layers.MultiHeadAttention(
|
110 |
-
num_heads=num_heads, key_dim=embed_dim)
|
111 |
-
self.attention_2 = layers.MultiHeadAttention(
|
112 |
-
num_heads=num_heads, key_dim=embed_dim)
|
113 |
-
self.dense_proj = keras.Sequential(
|
114 |
-
[layers.Dense(dense_dim, activation="relu"),
|
115 |
-
layers.Dense(embed_dim),]
|
116 |
-
)
|
117 |
-
self.layernorm_1 = layers.LayerNormalization()
|
118 |
-
self.layernorm_2 = layers.LayerNormalization()
|
119 |
-
self.layernorm_3 = layers.LayerNormalization()
|
120 |
-
self.supports_masking = True
|
121 |
-
|
122 |
-
def get_config(self):
|
123 |
-
config = super().get_config()
|
124 |
-
config.update({
|
125 |
-
"embed_dim": self.embed_dim,
|
126 |
-
"num_heads": self.num_heads,
|
127 |
-
"dense_dim": self.dense_dim,
|
128 |
-
})
|
129 |
-
return config
|
130 |
-
|
131 |
-
def get_causal_attention_mask(self, inputs):
|
132 |
-
input_shape = tf.shape(inputs)
|
133 |
-
batch_size, sequence_length = input_shape[0], input_shape[1]
|
134 |
-
i = tf.range(sequence_length)[:, tf.newaxis]
|
135 |
-
j = tf.range(sequence_length)
|
136 |
-
mask = tf.cast(i >= j, dtype="int32")
|
137 |
-
mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
|
138 |
-
mult = tf.concat(
|
139 |
-
[tf.expand_dims(batch_size, -1),
|
140 |
-
tf.constant([1, 1], dtype=tf.int32)], axis=0)
|
141 |
-
return tf.tile(mask, mult)
|
142 |
-
|
143 |
-
def call(self, inputs, encoder_outputs, mask=None):
|
144 |
-
causal_mask = self.get_causal_attention_mask(inputs)
|
145 |
-
if mask is not None:
|
146 |
-
padding_mask = tf.cast(
|
147 |
-
mask[:, tf.newaxis, :], dtype="int32")
|
148 |
-
padding_mask = tf.minimum(padding_mask, causal_mask)
|
149 |
-
else:
|
150 |
-
padding_mask = mask
|
151 |
-
attention_output_1 = self.attention_1(
|
152 |
-
query=inputs,
|
153 |
-
value=inputs,
|
154 |
-
key=inputs,
|
155 |
-
attention_mask=causal_mask)
|
156 |
-
attention_output_1 = self.layernorm_1(inputs + attention_output_1)
|
157 |
-
attention_output_2 = self.attention_2(
|
158 |
-
query=attention_output_1,
|
159 |
-
value=encoder_outputs,
|
160 |
-
key=encoder_outputs,
|
161 |
-
attention_mask=padding_mask,
|
162 |
-
)
|
163 |
-
attention_output_2 = self.layernorm_2(
|
164 |
-
attention_output_1 + attention_output_2)
|
165 |
-
proj_output = self.dense_proj(attention_output_2)
|
166 |
-
return self.layernorm_3(attention_output_2 + proj_output)
|
167 |
-
|
168 |
-
class PositionalEmbedding(layers.Layer):
|
169 |
-
def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
|
170 |
-
super().__init__(**kwargs)
|
171 |
-
self.token_embeddings = layers.Embedding(
|
172 |
-
input_dim=input_dim, output_dim=output_dim)
|
173 |
-
self.position_embeddings = layers.Embedding(
|
174 |
-
input_dim=sequence_length, output_dim=output_dim)
|
175 |
-
self.sequence_length = sequence_length
|
176 |
-
self.input_dim = input_dim
|
177 |
-
self.output_dim = output_dim
|
178 |
-
|
179 |
-
def call(self, inputs):
|
180 |
-
length = tf.shape(inputs)[-1]
|
181 |
-
positions = tf.range(start=0, limit=length, delta=1)
|
182 |
-
embedded_tokens = self.token_embeddings(inputs)
|
183 |
-
embedded_positions = self.position_embeddings(positions)
|
184 |
-
return embedded_tokens + embedded_positions
|
185 |
-
|
186 |
-
def compute_mask(self, inputs, mask=None):
|
187 |
-
return tf.math.not_equal(inputs, 0)
|
188 |
-
|
189 |
-
def get_config(self):
|
190 |
-
config = super(PositionalEmbedding, self).get_config()
|
191 |
-
config.update({
|
192 |
-
"output_dim": self.output_dim,
|
193 |
-
"sequence_length": self.sequence_length,
|
194 |
-
"input_dim": self.input_dim,
|
195 |
-
})
|
196 |
-
return config
|
197 |
-
|
198 |
-
def decode_sequence_tranf(input_sentence, src, tgt):
|
199 |
-
global translation_model
|
200 |
-
|
201 |
-
vocab_size = 15000
|
202 |
-
sequence_length = 30
|
203 |
-
|
204 |
-
source_vectorization = layers.TextVectorization(
|
205 |
-
max_tokens=vocab_size,
|
206 |
-
output_mode="int",
|
207 |
-
output_sequence_length=sequence_length,
|
208 |
-
standardize=custom_standardization,
|
209 |
-
vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
|
210 |
-
)
|
211 |
-
|
212 |
-
target_vectorization = layers.TextVectorization(
|
213 |
-
max_tokens=vocab_size,
|
214 |
-
output_mode="int",
|
215 |
-
output_sequence_length=sequence_length + 1,
|
216 |
-
standardize=custom_standardization,
|
217 |
-
vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
|
218 |
-
)
|
219 |
-
|
220 |
-
tgt_vocab = target_vectorization.get_vocabulary()
|
221 |
-
tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
|
222 |
-
max_decoded_sentence_length = 50
|
223 |
-
tokenized_input_sentence = source_vectorization([input_sentence])
|
224 |
-
decoded_sentence = "[start]"
|
225 |
-
for i in range(max_decoded_sentence_length):
|
226 |
-
tokenized_target_sentence = target_vectorization(
|
227 |
-
[decoded_sentence])[:, :-1]
|
228 |
-
predictions = translation_model(
|
229 |
-
[tokenized_input_sentence, tokenized_target_sentence])
|
230 |
-
sampled_token_index = np.argmax(predictions[0, i, :])
|
231 |
-
sampled_token = tgt_index_lookup[sampled_token_index]
|
232 |
-
decoded_sentence += " " + sampled_token
|
233 |
-
if sampled_token == "[end]":
|
234 |
-
break
|
235 |
-
return decoded_sentence[8:-6]
|
236 |
-
|
237 |
-
# ==== End Transforformer section ====
|
238 |
|
239 |
@st.cache_resource
|
240 |
def load_all_data():
|
@@ -246,34 +52,10 @@ def load_all_data():
|
|
246 |
finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
|
247 |
model_speech = whisper.load_model("base")
|
248 |
|
249 |
-
|
250 |
-
merge = Merge( dataPath+"/rnn_fr-en_split", dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
|
251 |
-
rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
|
252 |
-
rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
|
253 |
-
rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
254 |
-
rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
255 |
-
|
256 |
-
custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
|
257 |
-
if st.session_state.Cloud == 1:
|
258 |
-
with keras.saving.custom_object_scope(custom_objects):
|
259 |
-
transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
|
260 |
-
transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
|
261 |
-
merge = Merge( "data/transf_en-fr_weight_split", "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
|
262 |
-
merge = Merge( "data/transf_fr-en_weight_split", "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
|
263 |
-
else:
|
264 |
-
transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
|
265 |
-
transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
|
266 |
-
transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
|
267 |
-
transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
|
268 |
-
transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
269 |
-
transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
|
270 |
-
|
271 |
-
return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
|
272 |
-
transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr
|
273 |
|
274 |
n1 = 0
|
275 |
-
df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech,
|
276 |
-
transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr = load_all_data()
|
277 |
|
278 |
'''
|
279 |
def display_translation(n1, Lang,model_type):
|
|
|
12 |
import io
|
13 |
import wavio
|
14 |
from filesplit.merge import Merge
|
15 |
+
# import tensorflow as tf
|
16 |
import string
|
17 |
import re
|
18 |
+
# from tensorflow import keras
|
19 |
+
# from keras_nlp.layers import TransformerEncoder
|
20 |
+
# from tensorflow.keras import layers
|
21 |
+
# from tensorflow.keras.utils import plot_model
|
22 |
# from PIL import Image
|
23 |
from gtts import gTTS
|
24 |
from extra_streamlit_components import tab_bar, TabBarItemData
|
|
|
41 |
data=data[:-1]
|
42 |
return pd.DataFrame(data)
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
@st.cache_resource
|
46 |
def load_all_data():
|
|
|
52 |
finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
|
53 |
model_speech = whisper.load_model("base")
|
54 |
|
55 |
+
return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, finetuned_translation_en_fr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
n1 = 0
|
58 |
+
df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, finetuned_translation_en_fr = load_all_data()
|
|
|
59 |
|
60 |
'''
|
61 |
def display_translation(n1, Lang,model_type):
|