Demosthene-OR committed on
Commit
05f8903
1 Parent(s): 3b6c2a3

Update modelisation_seq2seq_tab.py

Browse files
Files changed (1) hide show
  1. tabs/modelisation_seq2seq_tab.py +7 -225
tabs/modelisation_seq2seq_tab.py CHANGED
@@ -12,13 +12,13 @@ import whisper
12
  import io
13
  import wavio
14
  from filesplit.merge import Merge
15
- import tensorflow as tf
16
  import string
17
  import re
18
- from tensorflow import keras
19
- from keras_nlp.layers import TransformerEncoder
20
- from tensorflow.keras import layers
21
- from tensorflow.keras.utils import plot_model
22
  # from PIL import Image
23
  from gtts import gTTS
24
  from extra_streamlit_components import tab_bar, TabBarItemData
@@ -41,200 +41,6 @@ def load_corpus(path):
41
  data=data[:-1]
42
  return pd.DataFrame(data)
43
 
44
- # ===== Keras ====
45
- strip_chars = string.punctuation + "¿"
46
- strip_chars = strip_chars.replace("[", "")
47
- strip_chars = strip_chars.replace("]", "")
48
-
49
- def custom_standardization(input_string):
50
- lowercase = tf.strings.lower(input_string)
51
- lowercase=tf.strings.regex_replace(lowercase, "[à]", "a")
52
- return tf.strings.regex_replace(
53
- lowercase, f"[{re.escape(strip_chars)}]", "")
54
-
55
- @st.cache_data
56
- def load_vocab(file_path):
57
- with open(file_path, "r", encoding="utf-8") as file:
58
- return file.read().split('\n')[:-1]
59
-
60
-
61
- def decode_sequence_rnn(input_sentence, src, tgt):
62
- global translation_model
63
-
64
- vocab_size = 15000
65
- sequence_length = 50
66
-
67
- source_vectorization = layers.TextVectorization(
68
- max_tokens=vocab_size,
69
- output_mode="int",
70
- output_sequence_length=sequence_length,
71
- standardize=custom_standardization,
72
- vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
73
- )
74
-
75
- target_vectorization = layers.TextVectorization(
76
- max_tokens=vocab_size,
77
- output_mode="int",
78
- output_sequence_length=sequence_length + 1,
79
- standardize=custom_standardization,
80
- vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
81
- )
82
-
83
- tgt_vocab = target_vectorization.get_vocabulary()
84
- tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
85
- max_decoded_sentence_length = 50
86
- tokenized_input_sentence = source_vectorization([input_sentence])
87
- decoded_sentence = "[start]"
88
- for i in range(max_decoded_sentence_length):
89
- tokenized_target_sentence = target_vectorization([decoded_sentence])
90
- next_token_predictions = translation_model.predict(
91
- [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
92
- sampled_token_index = np.argmax(next_token_predictions[0, i, :])
93
- sampled_token = tgt_index_lookup[sampled_token_index]
94
- decoded_sentence += " " + sampled_token
95
- if sampled_token == "[end]":
96
- break
97
- return decoded_sentence[8:-6]
98
-
99
- # ===== End of Keras ====
100
-
101
- # ===== Transformer section ====
102
-
103
- class TransformerDecoder(layers.Layer):
104
- def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
105
- super().__init__(**kwargs)
106
- self.embed_dim = embed_dim
107
- self.dense_dim = dense_dim
108
- self.num_heads = num_heads
109
- self.attention_1 = layers.MultiHeadAttention(
110
- num_heads=num_heads, key_dim=embed_dim)
111
- self.attention_2 = layers.MultiHeadAttention(
112
- num_heads=num_heads, key_dim=embed_dim)
113
- self.dense_proj = keras.Sequential(
114
- [layers.Dense(dense_dim, activation="relu"),
115
- layers.Dense(embed_dim),]
116
- )
117
- self.layernorm_1 = layers.LayerNormalization()
118
- self.layernorm_2 = layers.LayerNormalization()
119
- self.layernorm_3 = layers.LayerNormalization()
120
- self.supports_masking = True
121
-
122
- def get_config(self):
123
- config = super().get_config()
124
- config.update({
125
- "embed_dim": self.embed_dim,
126
- "num_heads": self.num_heads,
127
- "dense_dim": self.dense_dim,
128
- })
129
- return config
130
-
131
- def get_causal_attention_mask(self, inputs):
132
- input_shape = tf.shape(inputs)
133
- batch_size, sequence_length = input_shape[0], input_shape[1]
134
- i = tf.range(sequence_length)[:, tf.newaxis]
135
- j = tf.range(sequence_length)
136
- mask = tf.cast(i >= j, dtype="int32")
137
- mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
138
- mult = tf.concat(
139
- [tf.expand_dims(batch_size, -1),
140
- tf.constant([1, 1], dtype=tf.int32)], axis=0)
141
- return tf.tile(mask, mult)
142
-
143
- def call(self, inputs, encoder_outputs, mask=None):
144
- causal_mask = self.get_causal_attention_mask(inputs)
145
- if mask is not None:
146
- padding_mask = tf.cast(
147
- mask[:, tf.newaxis, :], dtype="int32")
148
- padding_mask = tf.minimum(padding_mask, causal_mask)
149
- else:
150
- padding_mask = mask
151
- attention_output_1 = self.attention_1(
152
- query=inputs,
153
- value=inputs,
154
- key=inputs,
155
- attention_mask=causal_mask)
156
- attention_output_1 = self.layernorm_1(inputs + attention_output_1)
157
- attention_output_2 = self.attention_2(
158
- query=attention_output_1,
159
- value=encoder_outputs,
160
- key=encoder_outputs,
161
- attention_mask=padding_mask,
162
- )
163
- attention_output_2 = self.layernorm_2(
164
- attention_output_1 + attention_output_2)
165
- proj_output = self.dense_proj(attention_output_2)
166
- return self.layernorm_3(attention_output_2 + proj_output)
167
-
168
- class PositionalEmbedding(layers.Layer):
169
- def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
170
- super().__init__(**kwargs)
171
- self.token_embeddings = layers.Embedding(
172
- input_dim=input_dim, output_dim=output_dim)
173
- self.position_embeddings = layers.Embedding(
174
- input_dim=sequence_length, output_dim=output_dim)
175
- self.sequence_length = sequence_length
176
- self.input_dim = input_dim
177
- self.output_dim = output_dim
178
-
179
- def call(self, inputs):
180
- length = tf.shape(inputs)[-1]
181
- positions = tf.range(start=0, limit=length, delta=1)
182
- embedded_tokens = self.token_embeddings(inputs)
183
- embedded_positions = self.position_embeddings(positions)
184
- return embedded_tokens + embedded_positions
185
-
186
- def compute_mask(self, inputs, mask=None):
187
- return tf.math.not_equal(inputs, 0)
188
-
189
- def get_config(self):
190
- config = super(PositionalEmbedding, self).get_config()
191
- config.update({
192
- "output_dim": self.output_dim,
193
- "sequence_length": self.sequence_length,
194
- "input_dim": self.input_dim,
195
- })
196
- return config
197
-
198
- def decode_sequence_tranf(input_sentence, src, tgt):
199
- global translation_model
200
-
201
- vocab_size = 15000
202
- sequence_length = 30
203
-
204
- source_vectorization = layers.TextVectorization(
205
- max_tokens=vocab_size,
206
- output_mode="int",
207
- output_sequence_length=sequence_length,
208
- standardize=custom_standardization,
209
- vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
210
- )
211
-
212
- target_vectorization = layers.TextVectorization(
213
- max_tokens=vocab_size,
214
- output_mode="int",
215
- output_sequence_length=sequence_length + 1,
216
- standardize=custom_standardization,
217
- vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
218
- )
219
-
220
- tgt_vocab = target_vectorization.get_vocabulary()
221
- tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
222
- max_decoded_sentence_length = 50
223
- tokenized_input_sentence = source_vectorization([input_sentence])
224
- decoded_sentence = "[start]"
225
- for i in range(max_decoded_sentence_length):
226
- tokenized_target_sentence = target_vectorization(
227
- [decoded_sentence])[:, :-1]
228
- predictions = translation_model(
229
- [tokenized_input_sentence, tokenized_target_sentence])
230
- sampled_token_index = np.argmax(predictions[0, i, :])
231
- sampled_token = tgt_index_lookup[sampled_token_index]
232
- decoded_sentence += " " + sampled_token
233
- if sampled_token == "[end]":
234
- break
235
- return decoded_sentence[8:-6]
236
-
237
- # ==== End Transformer section ====
238
 
239
  @st.cache_resource
240
  def load_all_data():
@@ -246,34 +52,10 @@ def load_all_data():
246
  finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
247
  model_speech = whisper.load_model("base")
248
 
249
- merge = Merge( dataPath+"/rnn_en-fr_split", dataPath, "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
250
- merge = Merge( dataPath+"/rnn_fr-en_split", dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
251
- rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
252
- rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
253
- rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
254
- rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
255
-
256
- custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
257
- if st.session_state.Cloud == 1:
258
- with keras.saving.custom_object_scope(custom_objects):
259
- transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
260
- transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
261
- merge = Merge( "data/transf_en-fr_weight_split", "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
262
- merge = Merge( "data/transf_fr-en_weight_split", "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
263
- else:
264
- transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
265
- transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
266
- transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
267
- transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
268
- transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
269
- transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
270
-
271
- return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
272
- transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr
273
 
274
  n1 = 0
275
- df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
276
- transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr = load_all_data()
277
 
278
  '''
279
  def display_translation(n1, Lang,model_type):
 
12
  import io
13
  import wavio
14
  from filesplit.merge import Merge
15
+ # import tensorflow as tf
16
  import string
17
  import re
18
+ # from tensorflow import keras
19
+ # from keras_nlp.layers import TransformerEncoder
20
+ # from tensorflow.keras import layers
21
+ # from tensorflow.keras.utils import plot_model
22
  # from PIL import Image
23
  from gtts import gTTS
24
  from extra_streamlit_components import tab_bar, TabBarItemData
 
41
  data=data[:-1]
42
  return pd.DataFrame(data)
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  @st.cache_resource
46
  def load_all_data():
 
52
  finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
53
  model_speech = whisper.load_model("base")
54
 
55
+ return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, finetuned_translation_en_fr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  n1 = 0
58
+ df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, finetuned_translation_en_fr = load_all_data()
 
59
 
60
  '''
61
  def display_translation(n1, Lang,model_type):