avr23-cds-translation2

Running

App Files Files Community

Demosthene-OR committed on Mar 10

Commit

05f8903

•

1 Parent(s): 3b6c2a3

Update modelisation_seq2seq_tab.py

Browse files

Files changed (1) hide show

tabs/modelisation_seq2seq_tab.py +7 -225

tabs/modelisation_seq2seq_tab.py CHANGED Viewed

@@ -12,13 +12,13 @@ import whisper
 import io
 import wavio
 from filesplit.merge import Merge
-import tensorflow as tf
 import string
 import re
-from tensorflow import keras
-from keras_nlp.layers import TransformerEncoder
-from tensorflow.keras import layers
-from tensorflow.keras.utils import plot_model
 # from PIL import Image
 from gtts import gTTS
 from extra_streamlit_components import tab_bar, TabBarItemData
@@ -41,200 +41,6 @@ def load_corpus(path):
         data=data[:-1]
     return pd.DataFrame(data)
-# ===== Keras ====
-strip_chars = string.punctuation + "¿"
-strip_chars = strip_chars.replace("[", "")
-strip_chars = strip_chars.replace("]", "")
-def custom_standardization(input_string):
-    lowercase = tf.strings.lower(input_string)
-    lowercase=tf.strings.regex_replace(lowercase, "[à]", "a")
-    return tf.strings.regex_replace(
-        lowercase, f"[{re.escape(strip_chars)}]", "")
-@st.cache_data
-def load_vocab(file_path):
-    with open(file_path, "r",  encoding="utf-8") as file:
-        return file.read().split('\n')[:-1]
-def decode_sequence_rnn(input_sentence, src, tgt):
-    global translation_model
-    vocab_size = 15000
-    sequence_length = 50
-    source_vectorization = layers.TextVectorization(
-        max_tokens=vocab_size,
-        output_mode="int",
-        output_sequence_length=sequence_length,
-        standardize=custom_standardization,
-        vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
-    )
-    target_vectorization = layers.TextVectorization(
-        max_tokens=vocab_size,
-        output_mode="int",
-        output_sequence_length=sequence_length + 1,
-        standardize=custom_standardization,
-        vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
-    )
-    tgt_vocab = target_vectorization.get_vocabulary()
-    tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
-    max_decoded_sentence_length = 50
-    tokenized_input_sentence = source_vectorization([input_sentence])
-    decoded_sentence = "[start]"
-    for i in range(max_decoded_sentence_length):
-        tokenized_target_sentence = target_vectorization([decoded_sentence])
-        next_token_predictions = translation_model.predict(
-            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
-        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
-        sampled_token = tgt_index_lookup[sampled_token_index]
-        decoded_sentence += " " + sampled_token
-        if sampled_token == "[end]":
-            break
-    return decoded_sentence[8:-6]
-# ===== Enf of Keras ====
-# ===== Transformer section ====
-class TransformerDecoder(layers.Layer):
-    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
-        super().__init__(**kwargs)
-        self.embed_dim = embed_dim
-        self.dense_dim = dense_dim
-        self.num_heads = num_heads
-        self.attention_1 = layers.MultiHeadAttention(
-            num_heads=num_heads, key_dim=embed_dim)
-        self.attention_2 = layers.MultiHeadAttention(
-            num_heads=num_heads, key_dim=embed_dim)
-        self.dense_proj = keras.Sequential(
-            [layers.Dense(dense_dim, activation="relu"),
-             layers.Dense(embed_dim),]
-        )
-        self.layernorm_1 = layers.LayerNormalization()
-        self.layernorm_2 = layers.LayerNormalization()
-        self.layernorm_3 = layers.LayerNormalization()
-        self.supports_masking = True
-    def get_config(self):
-        config = super().get_config()
-        config.update({
-            "embed_dim": self.embed_dim,
-            "num_heads": self.num_heads,
-            "dense_dim": self.dense_dim,
-        })
-        return config
-    def get_causal_attention_mask(self, inputs):
-        input_shape = tf.shape(inputs)
-        batch_size, sequence_length = input_shape[0], input_shape[1]
-        i = tf.range(sequence_length)[:, tf.newaxis]
-        j = tf.range(sequence_length)
-        mask = tf.cast(i >= j, dtype="int32")
-        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
-        mult = tf.concat(
-            [tf.expand_dims(batch_size, -1),
-             tf.constant([1, 1], dtype=tf.int32)], axis=0)
-        return tf.tile(mask, mult)
-    def call(self, inputs, encoder_outputs, mask=None):
-        causal_mask = self.get_causal_attention_mask(inputs)
-        if mask is not None:
-            padding_mask = tf.cast(
-                mask[:, tf.newaxis, :], dtype="int32")
-            padding_mask = tf.minimum(padding_mask, causal_mask)
-        else:
-            padding_mask = mask
-        attention_output_1 = self.attention_1(
-            query=inputs,
-            value=inputs,
-            key=inputs,
-            attention_mask=causal_mask)
-        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
-        attention_output_2 = self.attention_2(
-            query=attention_output_1,
-            value=encoder_outputs,
-            key=encoder_outputs,
-            attention_mask=padding_mask,
-        )
-        attention_output_2 = self.layernorm_2(
-            attention_output_1 + attention_output_2)
-        proj_output = self.dense_proj(attention_output_2)
-        return self.layernorm_3(attention_output_2 + proj_output)
-class PositionalEmbedding(layers.Layer):
-    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
-        super().__init__(**kwargs)
-        self.token_embeddings = layers.Embedding(
-            input_dim=input_dim, output_dim=output_dim)
-        self.position_embeddings = layers.Embedding(
-            input_dim=sequence_length, output_dim=output_dim)
-        self.sequence_length = sequence_length
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-    def call(self, inputs):
-        length = tf.shape(inputs)[-1]
-        positions = tf.range(start=0, limit=length, delta=1)
-        embedded_tokens = self.token_embeddings(inputs)
-        embedded_positions = self.position_embeddings(positions)
-        return embedded_tokens + embedded_positions
-    def compute_mask(self, inputs, mask=None):
-        return tf.math.not_equal(inputs, 0)
-    def get_config(self):
-        config = super(PositionalEmbedding, self).get_config()
-        config.update({
-            "output_dim": self.output_dim,
-            "sequence_length": self.sequence_length,
-            "input_dim": self.input_dim,
-        })
-        return config
-def decode_sequence_tranf(input_sentence, src, tgt):
-    global translation_model
-    vocab_size = 15000
-    sequence_length = 30
-    source_vectorization = layers.TextVectorization(
-        max_tokens=vocab_size,
-        output_mode="int",
-        output_sequence_length=sequence_length,
-        standardize=custom_standardization,
-        vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
-    )
-    target_vectorization = layers.TextVectorization(
-        max_tokens=vocab_size,
-        output_mode="int",
-        output_sequence_length=sequence_length + 1,
-        standardize=custom_standardization,
-        vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
-    )
-    tgt_vocab = target_vectorization.get_vocabulary()
-    tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
-    max_decoded_sentence_length = 50
-    tokenized_input_sentence = source_vectorization([input_sentence])
-    decoded_sentence = "[start]"
-    for i in range(max_decoded_sentence_length):
-        tokenized_target_sentence = target_vectorization(
-            [decoded_sentence])[:, :-1]
-        predictions = translation_model(
-            [tokenized_input_sentence, tokenized_target_sentence])
-        sampled_token_index = np.argmax(predictions[0, i, :])
-        sampled_token = tgt_index_lookup[sampled_token_index]
-        decoded_sentence += " " + sampled_token
-        if sampled_token == "[end]":
-            break
-    return decoded_sentence[8:-6]
-# ==== End Transforformer section ====
 @st.cache_resource
 def load_all_data():
@@ -246,34 +52,10 @@ def load_all_data():
     finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
     model_speech = whisper.load_model("base")
-    merge = Merge( dataPath+"/rnn_en-fr_split",  dataPath, "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
-    merge = Merge( dataPath+"/rnn_fr-en_split",  dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
-    rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
-    rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
-    rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
-    rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
-    custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
-    if st.session_state.Cloud == 1:
-        with keras.saving.custom_object_scope(custom_objects):
-            transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
-            transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
-        merge = Merge( "data/transf_en-fr_weight_split",  "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
-        merge = Merge( "data/transf_fr-en_weight_split",  "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
-    else:
-        transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
-        transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
-        transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
-        transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
-    transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
-    transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
-    return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
-        transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr
 n1 = 0
-df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
-    transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr = load_all_data()
 '''
 def display_translation(n1, Lang,model_type):

 import io
 import wavio
 from filesplit.merge import Merge
+# import tensorflow as tf
 import string
 import re
+# from tensorflow import keras
+# from keras_nlp.layers import TransformerEncoder
+# from tensorflow.keras import layers
+# from tensorflow.keras.utils import plot_model
 # from PIL import Image
 from gtts import gTTS
 from extra_streamlit_components import tab_bar, TabBarItemData
         data=data[:-1]
     return pd.DataFrame(data)
 @st.cache_resource
 def load_all_data():
     finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
     model_speech = whisper.load_model("base")
+    return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, finetuned_translation_en_fr
 n1 = 0
+df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, finetuned_translation_en_fr = load_all_data()
 '''
 def display_translation(n1, Lang,model_type):