# Spaces:
# Sleeping
# Sleeping
import pickle
import re
import string

import streamlit as st
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast
# Text-cleaning helper
def clean(text: str) -> str:
    """Normalise a raw comment before it is vectorised or tokenised.

    The text is lower-cased, then links, user mentions, hashtags and digit
    runs are each replaced by a single space, in that order. Punctuation and
    whitespace are otherwise left untouched, so the result may contain runs
    of consecutive spaces.

    Args:
        text: Raw comment text.

    Returns:
        The lower-cased text with the patterns above blanked out.
    """
    text = text.lower()  # lower case
    # The substitutions are applied one after another (not as a single
    # alternation) because the order matters: e.g. "@http://x" loses the
    # link first and the leftover "@" is then no longer a mention.
    text = re.sub(r'http\S+', " ", text)  # drop links
    text = re.sub(r'@\w+', ' ', text)  # drop user mentions
    text = re.sub(r'#\w+', ' ', text)  # drop hashtags
    text = re.sub(r'\d+', ' ', text)  # drop numbers
    return text
# Load the pickled classic-ML classifier and its fitted vectorizer
model_filename = 'model_comments_weights.pkl'
vectorizer_filename = 'vectorizer_comments_weights.pkl'


@st.cache_resource
def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    Wrapped in ``st.cache_resource`` so the file is not re-read and
    re-deserialised each time Streamlit re-executes the script.

    SECURITY NOTE: ``pickle.load`` can execute arbitrary code contained in
    the file. Only point this at artifacts you produced yourself.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


# No placeholder CountVectorizer() is created first: the fitted instance from
# the pickle is the only one that is ever used.
model = _load_pickle(model_filename)
vectorizer = _load_pickle(vectorizer_filename)
# The application itself
# Pretrained ruBert model
@st.cache_resource
def _load_rubert(checkpoint='blanchefort/rubert-base-cased-sentiment-rusentiment'):
    """Return ``(tokenizer, classifier)`` for the given checkpoint.

    Wrapped in ``st.cache_resource`` so the tokenizer and the model are
    instantiated once and reused, instead of being rebuilt every time
    Streamlit re-executes the script.
    """
    tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, return_dict=True)
    return tokenizer, classifier


tokenizer_bert, model_bert = _load_rubert()
# Page header and description
st.title("SafeTalk")
st.write("Your Personal Comment Filter is an innovative application that harnesses the power of AI to distinguish toxic comments from the rest.")
st.write("Empowering users to navigate online discussions with confidence, SafeTalk ensures a more constructive and respectful online community by identifying and flagging harmful content.")

user_review = st.text_input("Enter your comment:", "")

# The text box starts out empty and the script is executed on every
# interaction; skip inference until there is actual text, so no verdict is
# shown for a comment that was never entered.
if user_review.strip():
    user_review_clean = clean(user_review)

    # Classic ML model: vectorise the cleaned text and classify it.
    user_features = vectorizer.transform([user_review_clean])
    # predict() is given a single sample; take its one label as a scalar
    # rather than relying on the truthiness of a one-element array.
    prediction = model.predict(user_features)[0]

    # RuBERT: tokenise, run a forward pass and pick the most probable class.
    inputs = tokenizer_bert(user_review_clean, max_length=512, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only, no autograd graph needed
        outputs = model_bert(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    prediction_bert = int(torch.argmax(probabilities, dim=1).item())

    st.write("Comment by ML model:", user_review)
    if prediction == 0:
        st.markdown("<p style='color: green;'>Non-toxic comment</p>", unsafe_allow_html=True)
    else:
        st.markdown("<p style='color: red;'>Toxic comment</p>", unsafe_allow_html=True)

    st.write("Comment by RuBERT:", user_review)
    # NOTE(review): the class-id -> message mapping below is kept exactly as
    # it was; confirm it against the checkpoint's label order.
    if prediction_bert == 0:
        st.markdown("<p style='color: green;'>Controversial comment</p>", unsafe_allow_html=True)
    elif prediction_bert == 1:
        # Green, matching the colour used for the same verdict in the ML
        # model section above (it was rendered in red here).
        st.markdown("<p style='color: green;'>Non-toxic comment</p>", unsafe_allow_html=True)
    else:
        st.markdown("<p style='color: red;'>Toxic comment</p>", unsafe_allow_html=True)