"""Streamlit app that labels Portuguese text as European (pt) or Brazilian (pt_br)."""

import json
import typing as tp

import numpy as np
import streamlit as st
import torch
import torch.nn.functional as F
import transformers
from datasets import ClassLabel
from torch import Tensor
from transformers import BertForSequenceClassification
from transformers import BertForSequenceClassification, AutoTokenizer


def _cache_resource(func):
    """Cache *func*'s return value across Streamlit reruns.

    Streamlit re-executes the whole script on every interaction, so heavy
    objects must be cached. ``st.cache_resource`` is used when the installed
    Streamlit provides it; otherwise the legacy ``st.cache`` is used with
    ``allow_output_mutation=True`` so the cached object is not hashed on
    every call.
    """
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        return cache_resource(func)
    return st.cache(allow_output_mutation=True)(func)


st.markdown("## Portuguese European and Brazilian dialect classifier")
st.markdown("[You can see the difference between dialects here](https://en.wikipedia.org/wiki/Portuguese_language#Writing_system)")

text = st.text_input('## Text:')


@_cache_resource
def _get_tokenizer():
    """Return the tokenizer, loaded once and reused across reruns."""
    return AutoTokenizer.from_pretrained(
        'adalbertojunior/distilbert-portuguese-cased',
        do_lower_case=False,
    )


tokenizer = _get_tokenizer()

# Index i of the model output is reported as classes[i].
# NOTE(review): assumes the saved model was trained with this label order.
classes = ['pt', 'pt_br']
class_label = ClassLabel(names=classes)


@_cache_resource
def get_model() -> BertForSequenceClassification:
    """Return the two-label classifier loaded from the local ``./pt_br_model`` directory."""
    return BertForSequenceClassification.from_pretrained(
        './pt_br_model',
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )


model = get_model()


@torch.inference_mode()
def print_results() -> None:
    """Classify the module-level ``text`` and render the top class and its score.

    Tokenizes ``text`` (truncated to 256 tokens), runs the model without
    gradient tracking, applies softmax over the class dimension and writes
    the most probable label with its percentage to the page.
    """
    input_tensor = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=256,
        add_special_tokens=True,
        return_tensors="pt",
    )

    logits = model(**input_tensor).logits
    # A single input string is passed, so the flattened softmax is expected
    # to hold one probability per entry of `classes`.
    probabilities = F.softmax(logits, dim=1).flatten().tolist()
    # Plain int so the value indexes Python lists without relying on NumPy scalars.
    maxidx = int(np.argmax(probabilities))

    results = f"### {classes[maxidx]} score: {probabilities[maxidx]*100}%"
    st.markdown('## Results:')
    st.markdown(results)


if text:
    print_results()