Spaces:
Sleeping
Sleeping
File size: 7,322 Bytes
eb136bc 6044374 eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed ddf8478 99daaed ddf8478 99daaed ddf8478 99daaed eb136bc 99daaed ddf8478 6044374 99daaed ddf8478 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed 9b14109 99daaed c00de41 99daaed ddf8478 c00de41 99daaed ddf8478 eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed c00de41 99daaed c00de41 99daaed ddf8478 c00de41 99daaed ddf8478 c00de41 99daaed c00de41 99daaed c00de41 99daaed c00de41 99daaed eb136bc 99daaed eb136bc 99daaed c00de41 99daaed c00de41 99daaed c00de41 99daaed c00de41 99daaed eb136bc 99daaed 6d06cb9 99daaed 6d06cb9 99daaed 6d06cb9 99daaed eb136bc 99daaed 6d06cb9 99daaed 6d06cb9 99daaed 6d06cb9 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed 6d06cb9 99daaed 6d06cb9 99daaed 6d06cb9 99daaed eb136bc 99daaed eb136bc 99daaed 6044374 99daaed 6044374 99daaed eb136bc 99daaed eb136bc 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed 6044374 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc 99daaed eb136bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from torch.utils.data import DataLoader
import streamlit as st
import pandas as pd
import torch
import os
# Main page: display the page's title text through Streamlit's Markdown renderer.
st.markdown("Translation page 🔠")
# # Dropdown for the translation type
# translation_type = st.sidebar.selectbox("Translation Type", options=["French ➡️ Wolof", "Wolof ➡️ French"])
# # define a dictionary of versions
# models = {
# "Version ✌️": {
# "French ➡️ Wolof": {
# "checkpoints": "wolof_translate/checkpoints/t5_small_custom_train_results_fw_v4",
# "tokenizer": "wolof_translate/tokenizers/t5_tokenizers/tokenizer_v4.json",
# "max_len": None
# }
# },
# "Version ☝️": {
# "French ➡️ Wolof": {
# "checkpoints": "wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3",
# "tokenizer": "wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json",
# "max_len": 51
# },
# "Wolof ➡️ French": {
# "checkpoints": "wolof_translate/checkpoints/t5_small_custom_train_results_wf_v3",
# "tokenizer": "wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json",
# "max_len": 51
# }
# }
# }
# # add special characters from Wolof
# sp_wolof_chars = pd.read_csv('wolof_translate/data/wolof_writing/wolof_special_chars.csv')
# # add definitions
# sp_wolof_words = pd.read_csv('wolof_translate/data/wolof_writing/definitions.csv')
# # let us add callback functions to change the input text
# def add_symbol_to_text():
# st.session_state.input_text += st.session_state.symbol
# def add_word_to_text():
# word = st.session_state.word.split('/')[0].strip()
# st.session_state.input_text += word
# # Dropdown for introducing wolof special characters
# if translation_type == "Wolof ➡️ French":
# symbol = st.sidebar.selectbox("Wolof characters", key="symbol", options = sp_wolof_chars['wolof_special_chars'], on_change=add_symbol_to_text)
# word = st.sidebar.selectbox("Wolof words/Definitions", key="word", options = [sp_wolof_words.loc[i, 'wolof']+" / "+sp_wolof_words.loc[i, 'french'] for i in range(sp_wolof_words.shape[0])], on_change=add_word_to_text)
# # Dropdown for the model version
# version = st.sidebar.selectbox("Model version", options=["Version ☝️", "Version ✌️"])
# # Retrieve the sampling temperature (as a percentage from 0 to 100)
# temperature = st.sidebar.slider("How randomly need you the translated sentences to be from 0% to 100%", min_value = 0,
# max_value = 100)
# # make the process
# try:
# # recuperate the max length
# max_len = models[version][translation_type]['max_len']
# # let us get the best model
# @st.cache_resource
# def get_modelfw_v3():
# # recuperate checkpoints
# checkpoints = torch.load(os.path.join('wolof_translate/checkpoints/t5_small_custom_train_results_fw_v3', "best_checkpoints.pth"), map_location=torch.device('cpu'))
# # recuperate the tokenizer
# tokenizer_file = "wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json"
# # initialize the tokenizer
# tokenizer = T5TokenizerFast(tokenizer_file=tokenizer_file)
# model = T5ForConditionalGeneration.from_pretrained('t5-small')
# # resize the token embeddings
# model.resize_token_embeddings(len(tokenizer))
# model.load_state_dict(checkpoints['model_state_dict'])
# return model, tokenizer
# # @st.cache_resource
# def get_modelwf_v3():
# # recuperate checkpoints
# checkpoints = torch.load(os.path.join('wolof_translate/checkpoints/t5_small_custom_train_results_wf_v3', "best_checkpoints.pth"), map_location=torch.device('cpu'))
# # recuperate the tokenizer
# tokenizer_file = "wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.json"
# # initialize the tokenizer
# tokenizer = T5TokenizerFast(tokenizer_file=tokenizer_file)
# model = T5ForConditionalGeneration.from_pretrained('t5-small')
# # resize the token embeddings
# model.resize_token_embeddings(len(tokenizer))
# model.load_state_dict(checkpoints['model_state_dict'])
# return model, tokenizer
# if version == "Version ☝️":
# if translation_type == "French ➡️ Wolof":
# model, tokenizer = get_modelfw_v3()
# elif translation_type == "Wolof ➡️ French":
# model, tokenizer = get_modelwf_v3()
# # set the model to eval mode
# _ = model.eval()
# language = "Wolof" if translation_type == "French ➡️ Wolof" else "French"
# # Add a title
# st.header(f"Translate French sentences to {language} 👌")
# # Recuperate two columns
# left, right = st.columns(2)
# if translation_type == "French ➡️ Wolof":
# # recuperate sentences
# left.subheader('Give me some sentences in French: ')
# else:
# # recuperate sentences
# left.subheader('Give me some sentences in Wolof: ')
# # for i in range(number):
# left.text_input(f"- Sentence", key = f"input_text")
# # run model inference on all test data
# original_translations, predicted_translations, original_texts, scores = [], [], [], {}
# if translation_type == "French ➡️ Wolof":
# # print a sentence recuperated from the session
# right.subheader("Translation to Wolof:")
# else:
# # print a sentence recuperated from the session
# right.subheader("Translation to French:")
# # for i in range(number):
# sentence = st.session_state[f"input_text"] + tokenizer.eos_token
# if not sentence == tokenizer.eos_token:
# # Let us encode the sentences
# encoding = tokenizer([sentence], return_tensors='pt', max_length=max_len, padding='max_length', truncation=True)
# # Let us recuperate the input ids
# input_ids = encoding.input_ids
# # Let us recuperate the mask
# mask = encoding.attention_mask
# # Let us recuperate the pad token id
# pad_token_id = tokenizer.pad_token_id
# # perform prediction
# predictions = model.generate(input_ids, do_sample = False, top_k = 50, max_length = max_len, top_p = 0.90,
# temperature = temperature/100, num_return_sequences = 0, attention_mask = mask, pad_token_id = pad_token_id)
# # decode the predictions
# predicted_sentence = tokenizer.batch_decode(predictions, skip_special_tokens = True)
# # provide the prediction
# right.write(f"Translation: {predicted_sentence[0]}")
# else:
# # provide the prediction
# right.write(f"Translation: ")
# except Exception as e:
# st.warning("The chosen model is not available yet !", icon = "⚠️")
# st.write(e)
|