from typing import Dict, Optional, Sequence, Tuple

import streamlit as st
import torch
from annotated_text import annotated_text
from dante_tokenizer import DanteTokenizer
from dante_tokenizer.data.preprocessing import expand_contractions
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Colour used for any tag that is not part of the palette below.
_DEFAULT_TAG_COLOR = "#CCCCCC"

# One colour per Universal Dependencies part-of-speech tag.
_TAG_PALETTE = {
    "ADJ": "#2E4C6D",
    "ADP": "#FBE7C6",
    "ADV": "#DADDFC",
    "AUX": "#FC997C",
    "CCONJ": "#544179",
    "DET": "#A0E7E5",
    "INTJ": "#32C1CD",
    "NOUN": "#17D7A0",
    "NUM": "#E8A0BF",
    "PART": "#C85C5C",
    "PRON": "#F9975D",
    "PROPN": "#FBD148",
    "PUNCT": "#B2EA70",
    "SCONJ": "#AA14F0",
    "SYM": "#34BE82",
    "VERB": "#FFBF86",
    "X": "#2F86A6",
}


def get_pos_tag_model(
    model_name: str = "Emanuel/autonlp-pos-tag-bosque",
) -> Tuple[AutoModelForTokenClassification, AutoTokenizer]:
    """Load the token-classification model and its tokenizer.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def get_tag_color(tag: str) -> str:
    """Return the display colour for a part-of-speech tag.

    The palette covers the Universal Dependencies tagset.
    See: https://universaldependencies.org/u/pos/

    Args:
        tag: Part-of-speech tag, e.g. ``"NOUN"``.

    Returns:
        A hex colour string. Tags missing from the palette get a neutral
        grey instead of raising ``KeyError``, so an unexpected label from
        the model cannot crash the page.
    """
    return _TAG_PALETTE.get(tag, _DEFAULT_TAG_COLOR)


def _first_subword_labels(
    label_ids: Sequence[int],
    word_ids: Sequence[Optional[int]],
) -> Dict[int, int]:
    """Map each word index to the label id predicted for its first subword.

    Positions whose word id is ``None`` (special tokens) are skipped.
    """
    labels_by_word: Dict[int, int] = {}
    for label_id, word_id in zip(label_ids, word_ids):
        if word_id is not None and word_id not in labels_by_word:
            labels_by_word[word_id] = label_id
    return labels_by_word


def main():
    """Render the page: read text, tag every token and show coloured output."""
    text = st.text_area("Digite seu texto de entrada!")
    dt = DanteTokenizer()
    model, tokenizer = get_pos_tag_model()

    if not text or not text.strip():
        return

    # NOTE(review): the previous version computed expand_contractions(text)
    # but never used the result; confirm whether the model expects
    # contraction-expanded input before wiring it in.
    tokens = dt.tokenize(text)
    if not tokens:
        return

    # Feed the already-split words so predictions can be mapped back to the
    # exact tokens that are displayed; truncate instead of failing on inputs
    # longer than the model supports.
    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index the single batch item explicitly rather than relying on squeeze().
    label_ids = logits[0].argmax(dim=-1).tolist()

    if tokenizer.is_fast:
        # A word may be split into several subwords; use the first one's label.
        labels_by_word = _first_subword_labels(label_ids, inputs.word_ids(0))
    else:
        # Slow tokenizers expose no word ids: fall back to positional
        # alignment after dropping the leading and trailing special tokens.
        labels_by_word = dict(enumerate(label_ids[1:-1]))

    answer = []
    for index, token in enumerate(tokens):
        label_id = labels_by_word.get(index)
        if label_id is None:
            # No prediction for this token (e.g. it was truncated away):
            # show it as plain, unannotated text.
            answer.append(token)
            continue
        label = model.config.id2label[int(label_id)]
        answer.append((token, label, get_tag_color(label)))

    annotated_text(*answer)


if __name__ == "__main__":
    main()