Upload 16 files
- .gitattributes +3 -0
- BERTmodel_weights2.pth +3 -0
- bert_file.py +21 -0
- bert_strim.py +45 -0
- cat_model4.cbm +3 -0
- common_file.py +42 -0
- final_model_bah.pth +3 -0
- healthcare_facilities_reviews.csv +3 -0
- healthcare_facilities_reviews.jsonl +3 -0
- lstm.py +39 -0
- model_file.py +176 -0
- requirements.txt +71 -0
- rnn_preprocessing.py +80 -0
- strim_nlp.py +59 -0
- tfidf_vectorizer.joblib +3 -0
- toxic1.py +40 -0
- vocab_to_int.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
+healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
+healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
BERTmodel_weights2.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
+size 116986906
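Note: each binary artifact in this commit (the model weights, the CatBoost model, the TF-IDF vectorizer, and the two datasets) is tracked with Git LFS, so the diff shows only the three-line pointer file (LFS spec version, the object's sha256 hash, and its size in bytes); the binary itself lives in LFS storage.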
bert_file.py
ADDED
@@ -0,0 +1,21 @@
+from transformers import AutoModel
+from torch import nn
+
+class BERTClassifier(nn.Module):
+    def __init__(self, bert_path="cointegrated/rubert-tiny2"):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(bert_path)
+        for param in self.bert.parameters():  # freeze the BERT backbone
+            param.requires_grad = False
+        self.linear = nn.Sequential(
+            nn.Linear(312, 150),  # 312 = rubert-tiny2 hidden size
+            nn.Dropout(0.1),
+            nn.ReLU(),
+            nn.Linear(150, 1),
+            nn.Sigmoid()
+        )
+
+    def forward(self, x, masks):
+        bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]  # [CLS] embedding
+        out = self.linear(bert_out)
+        return out
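For reference, a minimal sketch of driving this classifier outside Streamlit (the tokenizer checkpoint, the trained weights file, and the 0.5 threshold mirror bert_strim.py below; this is an illustration, not part of the upload):

import torch
from transformers import AutoTokenizer
from bert_file import BERTClassifier

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = BERTClassifier()
model.load_state_dict(torch.load('BERTmodel_weights2.pth', map_location='cpu'))
model.eval()

# Tokenize one review and threshold the sigmoid output at 0.5
enc = tokenizer("пример отзыва", max_length=100, padding="max_length",
                truncation=True, return_tensors="pt")
with torch.no_grad():
    prob = model(enc["input_ids"], enc["attention_mask"])  # shape (1, 1), values in [0, 1]
print("positive" if prob.item() > 0.5 else "negative")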
bert_strim.py
ADDED
@@ -0,0 +1,45 @@
+import streamlit as st
+from transformers import AutoTokenizer, AutoModel
+import torch
+from bert_file import BERTClassifier
+import numpy as np
+
+tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+model = BERTClassifier()
+device = 'cpu'
+
+model.load_state_dict(torch.load('BERTmodel_weights2.pth', map_location=torch.device('cpu')))
+model.eval()
+
+@st.cache_data
+def predict_sentiment(text):
+    MAX_LEN = 100
+    encoded_review = tokenizer.encode_plus(
+        text,
+        max_length=MAX_LEN,
+        add_special_tokens=True,
+        return_token_type_ids=False,
+        padding='max_length',  # replaces the deprecated pad_to_max_length=True
+        return_attention_mask=True,
+        return_tensors='pt',
+    )
+    input_ids = encoded_review['input_ids'].to(device)
+    attention_mask = encoded_review['attention_mask'].to(device)
+
+    with torch.no_grad():
+        output = model(input_ids, attention_mask)
+        prediction = torch.round(output).cpu().numpy()[0][0]
+    if prediction == 1:
+        return "Позитивный отзыв 😀"
+    else:
+        return "Негативный отзыв 😟"
+
+def bert_model_page():
+    st.title("Классификатор отзывов")
+    user_input = st.text_area("Введите отзыв:")
+    if st.button("Классифицировать"):
+        if user_input:
+            prediction = predict_sentiment(user_input)
+            st.write(prediction)
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")
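A note on the design: @st.cache_data memoizes predict_sentiment per input string, so re-classifying the same review skips the forward pass entirely; the model itself is constructed once at import time, which serves the role st.cache_resource usually plays for heavyweight objects.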
cat_model4.cbm
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
+size 1135408
common_file.py
ADDED
@@ -0,0 +1,42 @@
+import streamlit as st
+import torch
+import requests
+import time
+import numpy as np
+import os
+from toxic1 import toxicity_page
+from strim_nlp import classic_ml_page
+from lstm import lstm_model_page
+from bert_strim import bert_model_page
+
+
+def app_description_page():
+    st.title("Welcome to My App!")
+    st.write("This is a Streamlit application where you can explore two different models.")
+
+def model_selection_page():
+    st.sidebar.title("Model Selection")
+    selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))
+
+    if selected_model == "Classic ML":
+        classic_ml_page()
+        st.write("You selected Classic ML.")
+    elif selected_model == "LSTM":
+        lstm_model_page()
+        st.write("You selected LSTM.")
+    elif selected_model == "BERT":
+        bert_model_page()
+        st.write("You selected BERT.")
+
+def main():
+    page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))
+
+    if page == "App Description":
+        app_description_page()
+    elif page == "Model Selection":
+        model_selection_page()
+    elif page == "Toxicity Model":
+        toxicity_page()
+
+if __name__ == "__main__":
+    main()
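Assuming common_file.py is the intended entry point (it is the only file that wires all the pages together), the app would be launched with: streamlit run common_file.py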
final_model_bah.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
+size 1506113
healthcare_facilities_reviews.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
+size 79002044
healthcare_facilities_reviews.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
+size 95300708
lstm.py
ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+import torch
+import re
+import json
+from nltk.corpus import stopwords
+from model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
+
+stop_words = set(stopwords.words('russian'))
+
+# Load vocabulary mapping
+with open('vocab_to_int.json', 'r') as file:
+    vocab_to_int = json.load(file)
+
+# Load the pre-trained model
+SEQ_LEN = 96
+model_bah = LSTMBahdanauAttention()
+# Weights of the trained Bahdanau-attention LSTM defined in model_file.py
+model_bah.load_state_dict(torch.load('final_model_bah.pth', map_location=torch.device('cpu')))
+model_bah.eval()
+
+# Function to analyze sentiment
+def analyze_sentiment(text):
+    preprocessed_text = data_preprocessing(text)
+    sample = preprocess_single_string(preprocessed_text, SEQ_LEN, vocab_to_int)
+
+    with torch.no_grad():
+        probability = model_bah(sample.unsqueeze(0))[0].sigmoid().item()
+    return probability
+
+# Streamlit UI
+def lstm_model_page():
+    st.title("Классификация отзывов лечебных учреждений")
+    user_input = st.text_area("Введите ваш отзыв:")
+    if st.button("Классифицировать"):
+        probability = analyze_sentiment(user_input)
+        if probability > 0.5:
+            st.write("Отзыв положительный 🌟")
+        else:
+            st.write("Отзыв отрицательный 😞")
model_file.py
ADDED
@@ -0,0 +1,176 @@
+import re
+import string
+import numpy as np
+import torch
+import torch.nn as nn
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+from collections import Counter
+from gensim.models import Word2Vec
+import pandas as pd
+import torch.nn.functional as F
+
+
+HIDDEN_SIZE = 32
+SEQ_LEN = 32
+df = pd.read_json('healthcare_facilities_reviews.jsonl', lines=True)
+
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # remove html tags
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+
+contents = df['content'].tolist()
+preprocessed = [data_preprocessing(content) for content in contents]
+
+corpus = [word for text in preprocessed for word in text.split()]
+sorted_words = Counter(corpus).most_common()
+
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+sorted_words = get_words_by_freq(sorted_words, 100)
+
+vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
+
+reviews_int = []
+for text in preprocessed:
+    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
+    reviews_int.append(r)
+
+w2v_input = []
+for review in preprocessed:
+    cur_review = []
+    for word in review.split():
+        if vocab_to_int.get(word):
+            cur_review.append(word)
+    w2v_input.append(cur_review)
+
+VOCAB_SIZE = len(vocab_to_int) + 1
+
+EMBEDDING_DIM = 64
+
+wv = Word2Vec(
+    min_count=1,               # minimum corpus frequency for a word
+    vector_size=EMBEDDING_DIM  # dimensionality of the word vectors
+)
+wv.build_vocab(w2v_input)
+
+wv.train(
+    corpus_iterable=w2v_input,
+    total_examples=wv.corpus_count,
+    epochs=10
+)
+embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+
+# Walk over the vocabulary: if a word is present, copy its vector;
+# if it is missing, print it and skip it
+for word, i in vocab_to_int.items():
+    try:
+        embedding_matrix[i] = wv.wv[word]
+    except KeyError as e:
+        print(f'{e}: word: {word}')
+
+# Build the pretrained embedding; this layer stays frozen during training
+embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+
+    return features
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+
+    return torch.tensor(result_padded)
+
+class BahdanauAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int = HIDDEN_SIZE
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.W = nn.Linear(hidden_size, hidden_size)
+        self.U = nn.Linear(hidden_size, hidden_size)
+        self.V = nn.Linear(hidden_size, 1)
+        self.tanh = nn.Tanh()
+
+    def forward(
+        self,
+        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
+    ):
+        query = query.unsqueeze(1)  # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_query = self.W(query)     # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_keys = self.U(keys)       # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        scores = self.V(torch.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
+        scores = scores.squeeze(-1)                    # BATCH_SIZE x SEQ_LEN
+        att_weights = F.softmax(scores, dim=1)         # BATCH_SIZE x SEQ_LEN
+        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
+        return context, att_weights
+
+class LSTMBahdanauAttention(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.embedding = embedding_layer
+        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+        self.attn = BahdanauAttention(HIDDEN_SIZE)
+        self.clf = nn.Sequential(
+            nn.Linear(HIDDEN_SIZE, 128),
+            nn.Dropout(),
+            nn.Tanh(),
+            nn.Linear(128, 1)
+        )
+
+    def forward(self, x):
+        embeddings = self.embedding(x)
+        outputs, (h_n, _) = self.lstm(embeddings)
+        context, att_weights = self.attn(outputs, h_n.squeeze(0))
+        out = self.clf(context)
+        return out, att_weights
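model_file.py builds vocab_to_int at import time, while lstm.py reads it back from vocab_to_int.json. The export step itself is not part of this upload; a plausible snippet that would produce the JSON artifact (an assumption for illustration, not the author's code):

import json
from model_file import vocab_to_int

# Hypothetical export step; ensure_ascii=False keeps the Cyrillic keys readable
with open('vocab_to_int.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_to_int, f, ensure_ascii=False)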
requirements.txt
ADDED
@@ -0,0 +1,71 @@
+altair==5.3.0
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.3
+catboost==1.2.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+filelock==3.13.4
+fonttools==4.51.0
+fsspec==2024.3.1
+gensim==4.3.2
+gitdb==4.0.11
+GitPython==3.1.43
+graphviz==0.20.3
+huggingface-hub==0.22.2
+idna==3.7
+Jinja2==3.1.3
+joblib==1.4.0
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.4
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+packaging==24.0
+pandas==2.2.2
+pillow==10.3.0
+plotly==5.20.0
+protobuf==4.25.3
+pyarrow==15.0.2
+pydeck==0.8.1b0
+Pygments==2.17.2
+pymystem3==0.2.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.34.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+safetensors==0.4.2
+scikit-learn==1.4.2
+scipy==1.13.0
+six==1.16.0
+smart-open==7.0.4
+smmap==5.0.1
+streamlit==1.33.0
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.4.0
+tokenizers==0.15.2
+toml==0.10.2
+toolz==0.12.1
+torch==2.2.2
+tornado==6.4
+tqdm==4.66.2
+transformers==4.39.3
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+wrapt==1.16.0
rnn_preprocessing.py
ADDED
@@ -0,0 +1,80 @@
+import re
+import string
+import numpy as np
+import torch
+
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+
+def data_preprocessing(text: str) -> str:
+    """Preprocess a string: lowercase, remove html tags, punctuation,
+    stopwords and digits
+
+    Args:
+        text (str): input string for preprocessing
+
+    Returns:
+        str: preprocessed string
+    """
+
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # remove html tags
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+    """Make left-sided padding for an input list of tokens
+
+    Args:
+        review_int (list): input list of tokens
+        seq_len (int): max sequence length; if len(review_int[i]) > seq_len it is trimmed, else it is padded with zeros
+
+    Returns:
+        np.array: padded sequences
+    """
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+
+    return features
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.tensor:
+    """Run all preprocessing steps on a single string
+
+    Args:
+        input_string (str): input single string for preprocessing
+        seq_len (int): max sequence length; if len(review_int[i]) > seq_len it is trimmed, else it is padded with zeros
+        vocab_to_int (dict): word corpus {'word': int index}
+
+    Returns:
+        torch.tensor: padded tensor of token indices
+    """
+
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+
+    return torch.tensor(result_padded)
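A quick worked example of the left-sided padding contract (the expected outputs follow from the implementation above):

from rnn_preprocessing import padding

print(padding([[5, 6, 7]], seq_len=5))           # [[0 0 5 6 7]]  (left-padded)
print(padding([[1, 2, 3, 4, 5, 6]], seq_len=5))  # [[1 2 3 4 5]]  (trimmed)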
strim_nlp.py
ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+import pandas as pd
+from catboost import CatBoostClassifier
+import re
+import string
+from nltk.corpus import stopwords
+from pymystem3 import Mystem
+from joblib import load
+import nltk
+
+def data_preprocessing(text):
+    stop_words = set(stopwords.words('russian'))
+    text = text.lower()
+    text = re.sub("<.*?>", "", text)      # html tags
+    text = re.sub(r'http\S+', " ", text)  # urls
+    text = re.sub(r'@\w+', ' ', text)     # mentions
+    text = re.sub(r'#\w+', ' ', text)     # hashtags
+    text = re.sub(r'\d+', ' ', text)      # digits
+    text = "".join([c for c in text if c not in string.punctuation])
+    return " ".join([word for word in text.split() if word not in stop_words])
+
+def lemmatize_text(text):
+    mystem = Mystem()
+    lemmas = mystem.lemmatize(text)
+    return ' '.join(lemmas)
+
+model = CatBoostClassifier()
+model.load_model('cat_model4.cbm')
+
+tfidf_vectorizer = load('tfidf_vectorizer.joblib')
+
+def classic_ml_page():
+    st.title("Классификация отзывов о медицинских учреждениях")
+    user_review = st.text_area("Введите ваш отзыв здесь:")
+
+    if st.button("Классифицировать"):
+        if user_review:
+            preprocessed_review = data_preprocessing(user_review)
+            lemmatized_review = lemmatize_text(preprocessed_review)
+            vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
+            prediction = model.predict(vectorized_review)
+
+            if prediction[0] == 1:
+                st.write("Позитивный отзыв 😀")
+            else:
+                st.write("Негативный отзыв 😟")
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")
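The CatBoost model and the TF-IDF vectorizer are loaded from artifacts whose training code is not part of this upload. A minimal sketch of how such a pair could be produced (the toy corpus and parameters are hypothetical, not the author's pipeline):

from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from joblib import dump

train_texts = ["отличный врач", "ужасная клиника"]  # toy corpus standing in for the real reviews
train_labels = [1, 0]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(train_texts)  # sparse TF-IDF features

clf = CatBoostClassifier(verbose=False)
clf.fit(X, train_labels)

clf.save_model('cat_model4.cbm')        # consumed by model.load_model above
dump(tfidf, 'tfidf_vectorizer.joblib')  # consumed by load(...) above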
tfidf_vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
+size 1750676
toxic1.py
ADDED
@@ -0,0 +1,40 @@
+# toxic1.py
+import streamlit as st
+import numpy as np
+import pandas as pd
+import time
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+
+def text2toxicity(text, aggregate=True):
+    with torch.no_grad():
+        inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
+        proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+    if isinstance(text, str):
+        proba = proba[0]
+    if aggregate:
+        return 1 - proba.T[0] * (1 - proba.T[-1])
+    return proba
+
+def toxicity_page():
+    st.title("""
+    Определим токсичный комментарий или нет
+    """)
+
+    user_text_input = st.text_area('Введите ваш отзыв здесь:')
+
+    if st.button('Предсказать'):
+        start_time = time.time()
+        proba = text2toxicity(user_text_input, True)
+        end_time = time.time()
+        prediction_time = end_time - start_time
+
+        if proba >= 0.5:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
+        else:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
+        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
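text2toxicity accepts a single string or a batch; with aggregate=True it collapses the per-label probabilities of rubert-tiny-toxicity into one toxicity score, 1 - P(non-toxic) * (1 - P(dangerous)), following the model card's recipe. A usage sketch (illustrative, not part of the upload):

from toxic1 import text2toxicity

print(text2toxicity('привет', aggregate=True))               # single float in [0, 1]
print(text2toxicity(['привет', 'пример'], aggregate=False))  # matrix of per-label probabilities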
vocab_to_int.json
ADDED
The diff for this file is too large to render. See raw diff.