import streamlit as st from annotated_text import annotated_text from transformers import pipeline from PIL import Image import re st.sidebar.header("**Instructions**") st.sidebar.markdown("Démonstrateur du modèle [NERmemBERT](https://hf.co/collections/CATIE-AQ/french-ner-pack-658aefafe3f7a2dcf0e4dbb4) entraîné sur 385 000 à 420 000 données en français en fonction de la configuration. Il est capable d'étiquetter les entités LOC (Localisations), PER (Personnalités), ORG (Organisations) et MISC (Divers). Il est disponible en huit versions : NERmemBERT1-3entities-base (110M de paramètres, contexte de 512 tokens), NERmemBERT2-3entities (111M, 1024 tokens), NERmemBERTa-3entities (111M, 1024 tokens), NERmemBERT1-3entities-large (336M, 512 tokens), NERmemBERT1-4entities-base (110M, 512 tokens), NERmemBERT2-4entities (111M, 1024 tokens), NERmemBERTa-4entities (111M, 1024 tokens), NERmemBERT1-4entities-large (336M, 512 tokens). Pour utiliser l'application, sélectionnez la version de votre choix ci-dessous, puis renseignez un texte. Enfin appuyez sur le bouton « Appliquer le modèle » pour observer la réponse trouvée par le modèle. Pour en savoir plus sur ces modèles, vous pouvez lire l'[article de blog](https://blog.vaniila.ai/NER/) détaillant la démarche suvie.") version = st.sidebar.radio("Choix de la version du modèle :", ["NERmemBERT1-3entities-base", "NERmemBERT2-3entities", "NERmemBERTa-3entities", "NERmemBERT1-3entities-large","NERmemBERT1-4entities-base", "NERmemBERT2-4entities", "NERmemBERTa-4entities", "NERmemBERT1-4entities-large"]) st.sidebar.markdown("---") st.sidebar.markdown("Ce modèle a été entraîné via la plateforme [*Vaniila*](https://www.vaniila.ai/) du [CATIE](https://www.catie.fr/).") image_path = 'Vaniila.png' image = Image.open(image_path) st.sidebar.image(image, caption=None, width=None, use_column_width=None, clamp=False, channels="RGB", output_format="auto") @st.cache_resource def load_model(version,text): if version == "NERmemBERT1-3entities-base": ner = pipeline('token-classification', model='CATIE-AQ/NERmembert-base-3entities', tokenizer='CATIE-AQ/NERmembert-base-3entities', aggregation_strategy="simple") result = ner(text) return result if version == "NERmemBERT2-3entities": ner = pipeline('token-classification', model='CATIE-AQ/NERmembert2-3entities', tokenizer='CATIE-AQ/NERmembert2-3entities', aggregation_strategy="simple") result = ner(text) return result if version == "NERmemBERTa-3entities": ner = pipeline('token-classification', model='CATIE-AQ/NERmemberta-3entities', tokenizer='CATIE-AQ/NERmemberta-3entities', aggregation_strategy="simple") result = ner(text) return result if version == "NERmemBERT1-3entities-large": ner = pipeline('token-classification', model='CATIE-AQ/NERmembert-large-3entities', tokenizer='CATIE-AQ/NERmembert-large-3entities', aggregation_strategy="simple") result = ner(text) return result if version == "NERmemBERT1-4entities-base": ner = pipeline('token-classification', model='CATIE-AQ/NERmembert-base-4entities', tokenizer='CATIE-AQ/NERmembert-base-4entities', aggregation_strategy="simple") result = ner(text) return result if version == "NERmemBERT2-4entities": ner = pipeline('token-classification', model='CATIE-AQ/NERmembert2-4entities', tokenizer='CATIE-AQ/NERmembert2-4entities', aggregation_strategy="simple") result = ner(text) return result if version == "NERmemBERTa-4entities": ner = pipeline('token-classification', model='CATIE-AQ/NERmemberta-4entities', tokenizer='CATIE-AQ/NERmemberta-4entities', aggregation_strategy="simple") result = ner(text) return result else: ner = pipeline('token-classification', model='CATIE-AQ/NERmembert-large-4entities', tokenizer='CATIE-AQ/NERmembert-large-4entities', aggregation_strategy="simple") result = ner(text) return result def getcolor(texts, labels): colors = {'LOC': '#38419D', 'PER': '#BF3131', 'ORG': '#597E52', 'MISC':'#F1C232'} return [(t,l,colors[l]) for t, l in zip(texts, labels)] def color_annotation(to_print,text) : text_ner = [] label_ner = [] for i in range(len(to_print)) : text_ner.append(to_print[i]["word"]) label_ner.append(to_print[i]["entity_group"]) anns = getcolor(text_ner, label_ner) anns = list(set(anns)) text_ner = list(set(text_ner)) text_ner = list(sorted(text_ner, key = len)) for i in range(len(anns)): for j in range(len(text_ner)): if text_ner[j] == anns[i][0]: text = text.replace(text_ner[j],str(anns[i])) for i in re.findall(r"\((.*?)\)", text) : # pour gérer les cas de mots inclus dans des n_grams if "(" in i: text = text.replace(i+")",i.split(', ')[0][2:-1]) text = text.replace(")",')","').replace(')","","',')","').replace("(",'","(').replace('","","(','","(').replace("'-","-") return text st.markdown("