Demosthene-OR
committed on
Commit
•
cb4b492
1
Parent(s):
dea6b5e
Création de la route /lang_id_dl
Browse filespour identifier la langue d'une liste de phrases grâce au Deep Learning
- data/multilingue/lan_code.csv +1 -0
- data/multilingue/lan_to_language.json +1 -0
- main_dl.py +56 -117
data/multilingue/lan_code.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
heb,ind,tha,ido,fin,zsm,ukr,vol,nld,wuu,mkd,bel,sqi,est,tuk,avk,dan,gos,ilo,tok,eng,gcf,lfn,glg,cat,asm,hye,bul,jbo,lat,ota,spa,uig,urd,cbk,aze,ara,ron,tur,hin,yid,cmn,grn,run,gle,ben,por,shi,afr,isl,rus,eus,arq,tlh,nob,slk,mhr,war,jpn,ber,vie,fra,kmr,tgl,kat,pol,lit,kab,mon,epo,kaz,ell,ina,kzj,deu,swe,ile,hrv,nds,tat,dtp,kor,mar,oci,ita,hun,yue,srp,cor,pes,frr,ckb,lvs,bre,ces
|
data/multilingue/lan_to_language.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"cmn": "Mandarin Chinese", "deu": "German", "rus": "Russian", "fra": "French", "eng": "English", "jpn": "Japanese", "spa": "Spanish", "ita": "Italian", "kor": "Korean", "vie": "Vietnamese", "nld": "Dutch", "epo": "Esperanto", "por": "Portuguese", "tur": "Turkish", "heb": "Hebrew", "hun": "Hungarian", "ell": "Modern Greek (1453-)", "ind": "Indonesian", "ara": "Arabic", "arz": "Egyptian Arabic", "fin": "Finnish", "bul": "Bulgarian", "yue": "Yue Chinese", "swe": "Swedish", "ukr": "Ukrainian", "bel": "Belarusian", "que": "Quechua", "ces": "Czech", "swh": "Swahili (individual language)", "nno": "Norwegian Nynorsk", "wuu": "Wu Chinese", "nob": "Norwegian Bokm\u00e5l", "zsm": "Standard Malay", "est": "Estonian", "kat": "Georgian", "pol": "Polish", "lat": "Latin", "urd": "Urdu", "sqi": "Albanian", "isl": "Icelandic", "fry": "Western Frisian", "afr": "Afrikaans", "ron": "Romanian", "fao": "Faroese", "san": "Sanskrit", "bre": "Breton", "tat": "Tatar", "yid": "Yiddish", "uig": "Uighur", "uzb": "Uzbek", "srp": "Serbian", "qya": "Quenya", "dan": "Danish", "pes": "Iranian Persian", "slk": "Slovak", "eus": "Basque", "cycl": "CycL", "acm": "Mesopotamian Arabic", "tgl": "Tagalog", "lvs": "Standard Latvian", "kaz": "Kazakh", "hye": "Armenian", "hin": "Hindi", "lit": "Lithuanian", "ben": "Bengali", "cat": "Catalan", "bos": "Bosnian", "hrv": "Croatian", "tha": "Thai", "orv": "Old Russian", "cha": "Chamorro", "mon": "Mongolian", "lzh": "Literary Chinese", "scn": "Sicilian", "gle": "Irish", "mkd": "Macedonian", "slv": "Slovenian", "frm": "Middle French (ca. 
1400-1600)", "glg": "Galician", "vol": "Volap\u00fck", "ain": "Ainu (Japan)", "jbo": "Lojban", "tok": "Toki Pona", "ina": "Interlingua (International Auxiliary Language Association)", "nds": "Low German", "mal": "Malayalam", "tlh": "Klingon", "roh": "Romansh", "ltz": "Luxembourgish", "oss": "Ossetian", "ido": "Ido", "gla": "Scottish Gaelic", "mlt": "Maltese", "sco": "Scots", "ast": "Asturian", "jav": "Javanese", "oci": "Occitan (post 1500)", "ile": "Interlingue", "ota": "Ottoman Turkish (1500-1928)", "xal": "Kalmyk", "tel": "Telugu", "sjn": "Sindarin", "nov": "Novial", "khm": "Central Khmer", "tpi": "Tok Pisin", "ang": "Old English (ca. 450-1100)", "aze": "Azerbaijani", "tgk": "Tajik", "tuk": "Turkmen", "chv": "Chuvash", "hsb": "Upper Sorbian", "dsb": "Lower Sorbian", "bod": "Tibetan", "sme": "Northern Sami", "cym": "Welsh", "mri": "Maori", "ksh": "K\u00f6lsch", "kmr": "Northern Kurdish", "ewe": "Ewe", "kab": "Kabyle", "ber": "Berber languages", "tpw": "Tup\u00ed", "udm": "Udmurt", "lld": "Ladin", "pms": "Piemontese", "lad": "Ladino", "grn": "Guarani", "mlg": "Malagasy", "xho": "Xhosa", "pnb": "Western Panjabi", "grc": "Ancient Greek (to 1453)", "hat": "Haitian", "lao": "Lao", "npi": "Nepali (individual language)", "cor": "Cornish", "nah": "Nahuatl", "avk": "Kotava", "mar": "Marathi", "guj": "Gujarati", "pan": "Panjabi", "kir": "Kirghiz", "myv": "Erzya", "prg": "Prussian", "sux": "Sumerian", "crs": "Seselwa Creole French", "ckt": "Chukot", "bak": "Bashkir", "zlm": "Malay (individual language)", "hil": "Hiligaynon", "cbk": "Chavacano", "chr": "Cherokee", "nav": "Navajo", "lkt": "Lakota", "enm": "Middle English (1100-1500)", "arq": "Algerian Arabic", "lin": "Lingala", "abk": "Abkhazian", "pcd": "Picard", "rom": "Romany", "gsw": "Swiss German", "tam": "Tamil", "zul": "Zulu", "awa": "Awadhi", "wln": "Walloon", "amh": "Amharic", "bar": "Bavarian", "hbo": "Ancient Hebrew", "mhr": "Eastern Mari", "bho": "Bhojpuri", "mrj": "Western Mari", "ckb": "Central Kurdish", "osx": 
"Old Saxon", "pfl": "Pfaelzisch", "mgm": "Mambae", "sna": "Shona", "mah": "Marshallese", "hau": "Hausa", "kan": "Kannada", "nog": "Nogai", "sin": "Sinhala", "glv": "Manx", "dng": "Dungan", "kal": "Kalaallisut", "liv": "Liv", "vro": "V\u00f5ro", "apc": "North Levantine Arabic", "jdt": "Judeo-Tat", "fur": "Friulian", "che": "Chechen", "haw": "Hawaiian", "yor": "Yoruba", "crh": "Crimean Tatar", "pdc": "Pennsylvania German", "ppl": "Pipil", "kin": "Kinyarwanda", "shs": "Shuswap", "mnw": "Mon", "tet": "Tetum", "sah": "Yakut", "kum": "Kumyk", "ngt": "Ngeq", "nya": "Nyanja", "pus": "Pushto", "hif": "Fiji Hindi", "mya": "Burmese", "moh": "Mohawk", "wol": "Wolof", "tir": "Tigrinya", "ton": "Tonga (Tonga Islands)", "lzz": "Laz", "oar": "Old Aramaic (up to 700 BCE)", "lug": "Ganda", "brx": "Bodo (India)", "non": "Old Norse", "mww": "Hmong Daw", "hak": "Hakka Chinese", "nlv": "Orizaba Nahuatl", "ngu": "Guerrero Nahuatl", "bua": "Buriat", "aym": "Aymara", "vec": "Venetian", "ibo": "Igbo", "tkl": "Tokelau", "bam": "Bambara", "kha": "Khasi", "ceb": "Cebuano", "lou": "Louisiana Creole", "fuc": "Pulaar", "smo": "Samoan", "gag": "Gagauz", "lfn": "Lingua Franca Nova", "arg": "Aragonese", "umb": "Umbundu", "tyv": "Tuvinian", "kjh": "Khakas", "oji": "Ojibwa", "cyo": "Cuyonon", "urh": "Urhobo", "kzj": "Coastal Kadazan", "pam": "Pampanga", "srd": "Sardinian", "lmo": "Lombard", "swg": "Swabian", "mdf": "Moksha", "gil": "Gilbertese", "snd": "Sindhi", "tso": "Tsonga", "sot": "Southern Sotho", "zza": "Zaza", "tsn": "Tswana", "pau": "Palauan", "som": "Somali", "egl": "Emilian", "ady": "Adyghe", "asm": "Assamese", "ori": "Oriya (macrolanguage)", "dtp": "Kadazan Dusun", "cho": "Choctaw", "max": "North Moluccan Malay", "kam": "Kamba (Kenya)", "niu": "Niuean", "sag": "Sango", "ilo": "Iloko", "kaa": "Kara-Kalpak", "fuv": "Nigerian Fulfulde", "nch": "Central Huasteca Nahuatl", "hoc": "Ho", "iba": "Iban", "gbm": "Garhwali", "sun": "Sundanese", "war": "Waray (Philippines)", "mvv": "Tagal Murut", 
"pap": "Papiamento", "ary": "Moroccan Arabic", "kxi": "Keningau Murut", "csb": "Kashubian", "pag": "Pangasinan", "cos": "Corsican", "rif": "Tarifit", "kek": "Kekch\u00ed", "krc": "Karachay-Balkar", "aii": "Assyrian Neo-Aramaic", "ban": "Balinese", "ssw": "Swati", "tvl": "Tuvalu", "mfe": "Morisyen", "tah": "Tahitian", "bvy": "Baybayanon", "bcl": "Central Bikol", "hnj": "Hmong Njua", "nau": "Nauru", "nst": "Tase Naga", "afb": "Gulf Arabic", "quc": "K'iche'", "min": "Minangkabau", "tmw": "Temuan", "mad": "Madurese", "bjn": "Banjar", "mai": "Maithili", "cjy": "Jinyu Chinese", "got": "Gothic", "hsn": "Xiang Chinese", "gan": "Gan Chinese", "tzl": "Talossan", "dws": "Dutton World Speedwords", "ldn": "L\u00e1adan", "afh": "Afrihili", "sgs": "Samogitian", "krl": "Karelian", "vep": "Veps", "rue": "Rusyn", "tly": "Talysh", "mic": "Mi'kmaq", "ext": "Extremaduran", "izh": "Ingrian", "sma": "Southern Sami", "jam": "Jamaican Creole English", "cmo": "Central Mnong", "mwl": "Mirandese", "kpv": "Komi-Zyrian", "koi": "Komi-Permyak", "bis": "Bislama", "ike": "Eastern Canadian Inuktitut", "run": "Rundi", "evn": "Evenki", "ryu": "Central Okinawan", "mnc": "Manchu", "aoz": "Uab Meto", "otk": "Old Turkish", "kas": "Kashmiri", "aln": "Gheg Albanian", "akl": "Aklanon", "yua": "Yucateco", "shy": "Tachawit", "fkv": "Kven Finnish", "gos": "Gronings", "fij": "Fijian", "thv": "Tahaggart Tamahaq", "zgh": "Standard Moroccan Tamazight", "gcf": "Guadeloupean Creole French", "cay": "Cayuga", "xmf": "Mingrelian", "tig": "Tigre", "div": "Dhivehi", "lij": "Ligurian", "rap": "Rapanui", "hrx": "Hunsrik", "cpi": "Chinese Pidgin English", "tts": "Northeastern Thai", "gaa": "Ga", "tmr": "Jewish Babylonian Aramaic (ca. 200-1200 CE)", "iii": "Sichuan Yi", "ltg": "Latgalian", "bzt": "Brithenig", "syc": "Classical Syriac", "emx": "Erromintxela", "gom": "Goan Konkani", "chg": "Chagatai", "osp": "Old Spanish", "stq": "Saterfriesisch", "frr": "Northern Frisian", "fro": "Old French (842-ca. 
1400)", "nys": "Nyunga", "toi": "Tonga (Zambia)", "new": "Newari", "phn": "Phoenician", "jpa": "Jewish Palestinian Aramaic", "rel": "Rendille", "drt": "Drents", "chn": "Chinook jargon", "pli": "Pali", "laa": "Southern Subanen", "bal": "Baluchi", "hdn": "Northern Haida", "hax": "Southern Haida", "mik": "Mikasuki", "ajp": "South Levantine Arabic", "xqa": "Karakhanid", "pal": "Pahlavi", "crk": "Plains Cree", "mni": "Manipuri", "lut": "Lushootseed", "ayl": "Libyan Arabic", "ood": "Tohono O'odham", "sdh": "Southern Kurdish", "ofs": "Old Frisian", "nus": "Nuer", "kiu": "Kirmanjki (individual language)", "diq": "Dimli (individual language)", "qxq": "Qashqa'i", "alt": "Southern Altai", "bfz": "Mahasu Pahari", "klj": "Turkic Khalaj", "mus": "Creek", "srn": "Sranan Tongo", "guc": "Wayuu", "lim": "Limburgan", "zea": "Zeeuws", "shi": "Tachelhit", "mnr": "Mono (USA)", "bom": "Berom", "sat": "Santali", "szl": "Silesian", "igs": "Interglossa"}
|
main_dl.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from fastapi import FastAPI, HTTPException, Header, Depends, Request, Response
|
2 |
from fastapi.responses import JSONResponse
|
3 |
from fastapi.security import HTTPBasic, HTTPBasicCredentials
|
4 |
from fastapi.exceptions import RequestValidationError
|
@@ -12,9 +12,15 @@ from filesplit.merge import Merge
|
|
12 |
import tensorflow as tf
|
13 |
import string
|
14 |
import re
|
|
|
|
|
|
|
|
|
15 |
from tensorflow import keras
|
|
|
16 |
from keras_nlp.layers import TransformerEncoder
|
17 |
from tensorflow.keras import layers
|
|
|
18 |
from tensorflow.keras.utils import plot_model
|
19 |
|
20 |
api = FastAPI()
|
@@ -237,14 +243,55 @@ def load_all_data():
|
|
237 |
|
238 |
rnn_en_fr, rnn_fr_en, transformer_en_fr, transformer_fr_en = load_all_data()
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
@api.get('/', name="Vérification que l'API fonctionne")
|
246 |
def check_api():
|
247 |
load_all_data()
|
|
|
|
|
248 |
return {'message': "L'API fonctionne"}
|
249 |
|
250 |
@api.get('/small_vocab/rnn', name="Traduction par RNN")
|
@@ -293,115 +340,7 @@ def affiche_modele(lang_tgt:str,
|
|
293 |
|
294 |
# Retourner l'image en tant que réponse HTTP avec le type de contenu approprié
|
295 |
return Response(content=image_data, media_type="image/png")
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
global df_data_en, df_data_fr, lang_classifier, translation_en_fr, translation_fr_en
|
301 |
-
global lang_tgt, label_lang
|
302 |
-
|
303 |
-
st.write("")
|
304 |
-
st.title(tr(title))
|
305 |
-
#
|
306 |
-
st.write("## **"+tr("Explications")+" :**\n")
|
307 |
-
|
308 |
-
st.markdown(tr(
|
309 |
-
"""
|
310 |
-
Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
|
311 |
-
""")
|
312 |
-
, unsafe_allow_html=True)
|
313 |
-
st.markdown(tr(
|
314 |
-
"""
|
315 |
-
La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
|
316 |
-
un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
|
317 |
-
avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
|
318 |
-
""")
|
319 |
-
, unsafe_allow_html=True)
|
320 |
-
st.image("assets/deepnlp_graph1.png",use_column_width=True)
|
321 |
-
st.markdown(tr(
|
322 |
-
"""
|
323 |
-
Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
|
324 |
-
Vous en trouverez :red[**5 illustrations**] ci-dessous.
|
325 |
-
""")
|
326 |
-
, unsafe_allow_html=True)
|
327 |
-
|
328 |
-
# Utilisation du module translate
|
329 |
-
lang_tgt = ['en','fr','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
|
330 |
-
label_lang = ['Anglais','Français','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
|
331 |
-
|
332 |
-
lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
|
333 |
-
'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
|
334 |
-
'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
|
335 |
-
|
336 |
-
st.write("#### "+tr("Choisissez le type de traduction")+" :")
|
337 |
-
|
338 |
-
chosen_id = tab_bar(data=[
|
339 |
-
TabBarItemData(id="tab1", title="small vocab", description=tr("avec Keras et un RNN")),
|
340 |
-
TabBarItemData(id="tab2", title="small vocab", description=tr("avec Keras et un Transformer")),
|
341 |
-
TabBarItemData(id="tab3", title=tr("Phrase personnelle"), description=tr("à écrire")),
|
342 |
-
TabBarItemData(id="tab4", title=tr("Phrase personnelle"), description=tr("à dicter")),
|
343 |
-
TabBarItemData(id="tab5", title=tr("Funny translation !"), description=tr("avec le Fine Tuning"))],
|
344 |
-
default="tab1")
|
345 |
-
|
346 |
-
if (chosen_id == "tab1") or (chosen_id == "tab2") :
|
347 |
-
if (chosen_id == "tab1"):
|
348 |
-
st.write("<center><h5><b>"+tr("Schéma d'un Réseau de Neurones Récurrents")+"</b></h5></center>", unsafe_allow_html=True)
|
349 |
-
st.image("assets/deepnlp_graph3.png",use_column_width=True)
|
350 |
-
else:
|
351 |
-
st.write("<center><h5><b>"+tr("Schéma d'un Transformer")+"</b></h5></center>", unsafe_allow_html=True)
|
352 |
-
st.image("assets/deepnlp_graph12.png",use_column_width=True)
|
353 |
-
st.write("## **"+tr("Paramètres")+" :**\n")
|
354 |
-
TabContainerHolder = st.container()
|
355 |
-
Sens = TabContainerHolder.radio(tr('Sens')+':',('Anglais -> Français','Français -> Anglais'), horizontal=True)
|
356 |
-
Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
|
357 |
-
|
358 |
-
if (Lang=='en_fr'):
|
359 |
-
df_data_src = df_data_en
|
360 |
-
df_data_tgt = df_data_fr
|
361 |
-
if (chosen_id == "tab1"):
|
362 |
-
translation_model = rnn_en_fr
|
363 |
-
else:
|
364 |
-
translation_model = transformer_en_fr
|
365 |
-
else:
|
366 |
-
df_data_src = df_data_fr
|
367 |
-
df_data_tgt = df_data_en
|
368 |
-
if (chosen_id == "tab1"):
|
369 |
-
translation_model = rnn_fr_en
|
370 |
-
else:
|
371 |
-
translation_model = transformer_fr_en
|
372 |
-
sentence1 = st.selectbox(tr("Selectionnez la 1ere des 3 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
|
373 |
-
n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
|
374 |
-
|
375 |
-
st.write("## **"+tr("Résultats")+" :**\n")
|
376 |
-
if (chosen_id == "tab1"):
|
377 |
-
display_translation(n1, Lang,1)
|
378 |
-
else:
|
379 |
-
display_translation(n1, Lang,2)
|
380 |
-
|
381 |
-
st.write("## **"+tr("Details sur la méthode")+" :**\n")
|
382 |
-
if (chosen_id == "tab1"):
|
383 |
-
st.markdown(tr(
|
384 |
-
"""
|
385 |
-
Nous avons utilisé 2 Gated Recurrent Units.
|
386 |
-
Vous pouvez constater que la traduction avec un RNN est relativement lente.
|
387 |
-
Ceci est notamment du au fait que les tokens passent successivement dans les GRU,
|
388 |
-
alors que les calculs sont réalisés en parrallèle dans les Transformers.
|
389 |
-
Le score BLEU est bien meilleur que celui des traductions mot à mot.
|
390 |
-
<br>
|
391 |
-
""")
|
392 |
-
, unsafe_allow_html=True)
|
393 |
-
else:
|
394 |
-
st.markdown(tr(
|
395 |
-
"""
|
396 |
-
Nous avons utilisé un encodeur et décodeur avec 8 têtes d'entention.
|
397 |
-
La dimension de l'embedding des tokens = 256
|
398 |
-
La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
|
399 |
-
<br>
|
400 |
-
""")
|
401 |
-
, unsafe_allow_html=True)
|
402 |
-
st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5>", unsafe_allow_html=True)
|
403 |
-
plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file=st.session_state.ImagePath+'/model_plot.png')
|
404 |
-
st.image(st.session_state.ImagePath+'/model_plot.png',use_column_width=True)
|
405 |
-
st.write("</center>", unsafe_allow_html=True)
|
406 |
-
|
407 |
-
'''
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException, Header, Depends, Request, Response, Query
|
2 |
from fastapi.responses import JSONResponse
|
3 |
from fastapi.security import HTTPBasic, HTTPBasicCredentials
|
4 |
from fastapi.exceptions import RequestValidationError
|
|
|
12 |
import tensorflow as tf
|
13 |
import string
|
14 |
import re
|
15 |
+
import json
|
16 |
+
import csv
|
17 |
+
import tiktoken
|
18 |
+
from sklearn.preprocessing import LabelEncoder
|
19 |
from tensorflow import keras
|
20 |
+
import keras
|
21 |
from keras_nlp.layers import TransformerEncoder
|
22 |
from tensorflow.keras import layers
|
23 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
24 |
from tensorflow.keras.utils import plot_model
|
25 |
|
26 |
api = FastAPI()
|
|
|
243 |
|
244 |
rnn_en_fr, rnn_fr_en, transformer_en_fr, transformer_fr_en = load_all_data()
|
245 |
|
246 |
+
# ==== Language identifier ====
|
247 |
+
|
248 |
+
def encode_text(textes, max_length=250):
    """Tokenize a batch of texts and pad/truncate them to a fixed length.

    Args:
        textes: iterable of strings to encode.
        max_length: maximum sequence length. Defaults to 250, presumably
            the length the language-identification model was trained
            with — TODO confirm against the training pipeline.

    Returns:
        2-D array of token ids with shape (len(textes), max_length),
        zero-padded at the end ('post').

    Requires the module-level ``tokenizer`` set by init_dl_identifier().
    """
    global tokenizer

    sequences = tokenizer.encode_batch(textes)
    return pad_sequences(sequences, maxlen=max_length, padding='post')
|
254 |
+
|
255 |
+
def read_list_lan():
    """Return the language codes supported by the DL identifier.

    Reads the single-row CSV file data/multilingue/lan_code.csv and
    returns that row as a list of code strings.
    """
    with open(dataPath + '/multilingue/lan_code.csv', 'r') as fichier_csv:
        return next(csv.reader(fichier_csv))
|
261 |
+
|
262 |
+
def init_dl_identifier():
    """Initialise the deep-learning language identifier.

    Sets the module-level globals consumed by lang_id_dl():
      - tokenizer: tiktoken "cl100k_base" encoder,
      - lan_to_language: mapping ISO-639-3 code -> language name,
      - label_encoder: LabelEncoder fitted on the supported codes,
      - dl_model: the Keras classification model, re-assembled from its
        split parts on disk before loading.
    """
    global tokenizer, dl_model, label_encoder, lan_to_language

    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Read the code -> language-name mapping from the JSON file.
    with open(dataPath + '/multilingue/lan_to_language.json', 'r') as fichier:
        lan_to_language = json.load(fichier)

    # Fit the label encoder on the codes the model was trained on, so
    # predicted class indices can be decoded back into codes.
    label_encoder = LabelEncoder()
    label_encoder.fit(read_list_lan())

    # The model file is stored split into chunks; merge it back into a
    # single .h5 before loading (cleanup=False keeps the chunks).
    Merge(dataPath + "/dl_id_lang_split", dataPath,
          "dl_tiktoken_id_language_model.h5").merge(cleanup=False)
    dl_model = keras.models.load_model(dataPath + "/dl_tiktoken_id_language_model.h5")
|
276 |
+
|
277 |
+
def lang_id_dl(sentences):
    """Identify the language of a sentence or a list of sentences.

    Args:
        sentences: a single string, or a list of strings.

    Returns:
        For a single string: the full language name looked up in
        ``lan_to_language``. For a list: the list of predicted ISO-639-3
        codes. NOTE(review): the two branches return different
        granularities (name vs. code) — confirm this asymmetry is
        intentional.

    Requires init_dl_identifier() to have been called first.
    """
    global dl_model, label_encoder, lan_to_language

    # isinstance is the robust test; the previous substring check on
    # str(type(...)) would match any type whose name contains "str".
    single = isinstance(sentences, str)
    batch = [sentences] if single else sentences
    predictions = dl_model.predict(encode_text(batch))

    # Decode class indices back into language codes.
    predicted_labels_encoded = np.argmax(predictions, axis=1)
    predicted_languages = label_encoder.classes_[predicted_labels_encoded]

    if single:
        return lan_to_language[predicted_languages[0]]
    return list(predicted_languages)
|
287 |
+
|
288 |
+
# ==== Endpoints ====
|
289 |
|
290 |
@api.get('/', name="Vérification que l'API fonctionne")
def check_api():
    """Health check: (re)load all models and report that the API is up."""
    load_all_data()
    # init_dl_identifier() already reads the language-code list itself,
    # so a separate read_list_lan() call here was redundant.
    init_dl_identifier()
    return {'message': "L'API fonctionne"}
|
296 |
|
297 |
@api.get('/small_vocab/rnn', name="Traduction par RNN")
|
|
|
340 |
|
341 |
# Retourner l'image en tant que réponse HTTP avec le type de contenu approprié
|
342 |
return Response(content=image_data, media_type="image/png")
|
343 |
+
|
344 |
+
@api.get('/lang_id_dl', name="Id de langue DL")
def language_id_dl(sentences: List[str] = Query(..., min_length=1)):
    """Identify the language of each query-string sentence with the DL model.

    Declared as a plain (non-async) function so FastAPI runs the blocking
    Keras inference in its threadpool instead of blocking the event loop.
    """
    return lang_id_dl(sentences)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|