Upload 16 files
- .gitattributes +3 -0
- BERTmodel_weights2.pth +3 -0
- bert_file.py +21 -0
- bert_strim.py +45 -0
- cat_model4.cbm +3 -0
- common_file.py +42 -0
- final_model_bah.pth +3 -0
- healthcare_facilities_reviews.csv +3 -0
- healthcare_facilities_reviews.jsonl +3 -0
- lstm.py +39 -0
- model_file.py +176 -0
- requirements.txt +71 -0
- rnn_preprocessing.py +80 -0
- strim_nlp.py +59 -0
- tfidf_vectorizer.joblib +3 -0
- toxic1.py +40 -0
- vocab_to_int.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
+healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
+healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
BERTmodel_weights2.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
+size 116986906
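Note: each binary artifact in this commit (the model weights, the CatBoost model, the TF-IDF vectorizer, and the two datasets) is tracked with Git LFS, so the diff shows only the three-line pointer file (LFS spec version, the object's sha256 hash, and its size in bytes); the binary itself lives in LFS storage.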
bert_file.py
ADDED
@@ -0,0 +1,21 @@
+from transformers import AutoModel
+from torch import nn
+
+class BERTClassifier(nn.Module):
+    def __init__(self, bert_path="cointegrated/rubert-tiny2"):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(bert_path)
+        for param in self.bert.parameters():  # freeze the BERT backbone
+            param.requires_grad = False
+        self.linear = nn.Sequential(
+            nn.Linear(312, 150),  # 312 = rubert-tiny2 hidden size
+            nn.Dropout(0.1),
+            nn.ReLU(),
+            nn.Linear(150, 1),
+            nn.Sigmoid()
+        )
+
+    def forward(self, x, masks):
+        bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]  # [CLS] embedding
+        out = self.linear(bert_out)
+        return out
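For reference, a minimal sketch of driving this classifier outside Streamlit (the tokenizer checkpoint, the trained weights file, and the 0.5 threshold mirror bert_strim.py below; this is an illustration, not part of the upload):

import torch
from transformers import AutoTokenizer
from bert_file import BERTClassifier

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = BERTClassifier()
model.load_state_dict(torch.load('BERTmodel_weights2.pth', map_location='cpu'))
model.eval()

# Tokenize one review and threshold the sigmoid output at 0.5
enc = tokenizer("пример отзыва", max_length=100, padding="max_length",
                truncation=True, return_tensors="pt")
with torch.no_grad():
    prob = model(enc["input_ids"], enc["attention_mask"])  # shape (1, 1), values in [0, 1]
print("positive" if prob.item() > 0.5 else "negative")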
bert_strim.py
ADDED
@@ -0,0 +1,45 @@
+import streamlit as st
+from transformers import AutoTokenizer, AutoModel
+import torch
+from bert_file import BERTClassifier
+import numpy as np
+
+tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+model = BERTClassifier()
+device = 'cpu'
+
+model.load_state_dict(torch.load('BERTmodel_weights2.pth', map_location=torch.device('cpu')))
+model.eval()
+
+@st.cache_data
+def predict_sentiment(text):
+    MAX_LEN = 100
+    encoded_review = tokenizer.encode_plus(
+        text,
+        max_length=MAX_LEN,
+        add_special_tokens=True,
+        return_token_type_ids=False,
+        padding='max_length',  # replaces the deprecated pad_to_max_length=True
+        return_attention_mask=True,
+        return_tensors='pt',
+    )
+    input_ids = encoded_review['input_ids'].to(device)
+    attention_mask = encoded_review['attention_mask'].to(device)
+
+    with torch.no_grad():
+        output = model(input_ids, attention_mask)
+        prediction = torch.round(output).cpu().numpy()[0][0]
+    if prediction == 1:
+        return "Позитивный отзыв 😀"
+    else:
+        return "Негативный отзыв 😟"
+
+def bert_model_page():
+    st.title("Классификатор отзывов")
+    user_input = st.text_area("Введите отзыв:")
+    if st.button("Классифицировать"):
+        if user_input:
+            prediction = predict_sentiment(user_input)
+            st.write(prediction)
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")
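A note on the design: @st.cache_data memoizes predict_sentiment per input string, so re-classifying the same review skips the forward pass entirely; the model itself is constructed once at import time, which serves the role st.cache_resource usually plays for heavyweight objects.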
cat_model4.cbm
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
+size 1135408
common_file.py
ADDED
@@ -0,0 +1,42 @@
+import streamlit as st
+import torch
+import requests
+import time
+import numpy as np
+import os
+from toxic1 import toxicity_page
+from strim_nlp import classic_ml_page
+from lstm import lstm_model_page
+from bert_strim import bert_model_page
+
+
+def app_description_page():
+    st.title("Welcome to My App!")
+    st.write("This is a Streamlit application where you can explore two different models.")
+
+def model_selection_page():
+    st.sidebar.title("Model Selection")
+    selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))
+
+    if selected_model == "Classic ML":
+        classic_ml_page()
+        st.write("You selected Classic ML.")
+    elif selected_model == "LSTM":
+        lstm_model_page()
+        st.write("You selected LSTM.")
+    elif selected_model == "BERT":
+        bert_model_page()
+        st.write("You selected BERT.")
+
+def main():
+    page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))
+
+    if page == "App Description":
+        app_description_page()
+    elif page == "Model Selection":
+        model_selection_page()
+    elif page == "Toxicity Model":
+        toxicity_page()
+
+if __name__ == "__main__":
+    main()
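Assuming common_file.py is the intended entry point (it is the only file that wires all the pages together), the app would be launched with: streamlit run common_file.py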
final_model_bah.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
+size 1506113
healthcare_facilities_reviews.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
+size 79002044
healthcare_facilities_reviews.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
+size 95300708
lstm.py
ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+import torch
+import re
+import json
+from nltk.corpus import stopwords
+from model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
+
+stop_words = set(stopwords.words('russian'))
+
+# Load vocabulary mapping
+with open('vocab_to_int.json', 'r') as file:
+    vocab_to_int = json.load(file)
+
+# Load the pre-trained model
+SEQ_LEN = 96
+model_bah = LSTMBahdanauAttention()
+# Weights of the trained Bahdanau-attention LSTM defined in model_file.py
+model_bah.load_state_dict(torch.load('final_model_bah.pth', map_location=torch.device('cpu')))
+model_bah.eval()
+
+# Function to analyze sentiment
+def analyze_sentiment(text):
+    preprocessed_text = data_preprocessing(text)
+    sample = preprocess_single_string(preprocessed_text, SEQ_LEN, vocab_to_int)
+
+    with torch.no_grad():
+        probability = model_bah(sample.unsqueeze(0))[0].sigmoid().item()
+    return probability
+
+# Streamlit UI
+def lstm_model_page():
+    st.title("Классификация отзывов лечебных учреждений")
+    user_input = st.text_area("Введите ваш отзыв:")
+    if st.button("Классифицировать"):
+        probability = analyze_sentiment(user_input)
+        if probability > 0.5:
+            st.write("Отзыв положительный 🌟")
+        else:
+            st.write("Отзыв отрицательный 😞")
model_file.py
ADDED
@@ -0,0 +1,176 @@
+import re
+import string
+import numpy as np
+import torch
+import torch.nn as nn
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+from collections import Counter
+from gensim.models import Word2Vec
+import pandas as pd
+import torch.nn.functional as F
+
+
+HIDDEN_SIZE = 32
+SEQ_LEN = 32
+df = pd.read_json('healthcare_facilities_reviews.jsonl', lines=True)
+
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # remove html tags
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+
+contents = df['content'].tolist()
+preprocessed = [data_preprocessing(content) for content in contents]
+
+corpus = [word for text in preprocessed for word in text.split()]
+sorted_words = Counter(corpus).most_common()
+
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+sorted_words = get_words_by_freq(sorted_words, 100)
+
+vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
+
+reviews_int = []
+for text in preprocessed:
+    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
+    reviews_int.append(r)
+
+w2v_input = []
+for review in preprocessed:
+    cur_review = []
+    for word in review.split():
+        if vocab_to_int.get(word):
+            cur_review.append(word)
+    w2v_input.append(cur_review)
+
+VOCAB_SIZE = len(vocab_to_int) + 1
+
+EMBEDDING_DIM = 64
+
+wv = Word2Vec(
+    min_count=1,               # minimum corpus frequency for a word
+    vector_size=EMBEDDING_DIM  # dimensionality of the word vectors
+)
+wv.build_vocab(w2v_input)
+
+wv.train(
+    corpus_iterable=w2v_input,
+    total_examples=wv.corpus_count,
+    epochs=10
+)
+embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+
+# Walk over the vocabulary: if a word is present, copy its vector;
+# if it is missing, print it and skip it
+for word, i in vocab_to_int.items():
+    try:
+        embedding_matrix[i] = wv.wv[word]
+    except KeyError as e:
+        print(f'{e}: word: {word}')
+
+# Build the pretrained embedding; this layer stays frozen during training
+embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
+def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+
+    return features
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+
+    return torch.tensor(result_padded)
+
+class BahdanauAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int = HIDDEN_SIZE
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.W = nn.Linear(hidden_size, hidden_size)
+        self.U = nn.Linear(hidden_size, hidden_size)
+        self.V = nn.Linear(hidden_size, 1)
+        self.tanh = nn.Tanh()
+
+    def forward(
+        self,
+        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
+    ):
+        query = query.unsqueeze(1)  # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_query = self.W(query)     # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_keys = self.U(keys)       # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        scores = self.V(torch.tanh(r_query + r_keys))  # BATCH_SIZE x SEQ_LEN x 1
+        scores = scores.squeeze(-1)                    # BATCH_SIZE x SEQ_LEN
+        att_weights = F.softmax(scores, dim=1)         # BATCH_SIZE x SEQ_LEN
+        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1)  # BATCH_SIZE x HIDDEN_SIZE
+        return context, att_weights
+
+class LSTMBahdanauAttention(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.embedding = embedding_layer
+        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+        self.attn = BahdanauAttention(HIDDEN_SIZE)
+        self.clf = nn.Sequential(
+            nn.Linear(HIDDEN_SIZE, 128),
+            nn.Dropout(),
+            nn.Tanh(),
+            nn.Linear(128, 1)
+        )
+
+    def forward(self, x):
+        embeddings = self.embedding(x)
+        outputs, (h_n, _) = self.lstm(embeddings)
+        context, att_weights = self.attn(outputs, h_n.squeeze(0))
+        out = self.clf(context)
+        return out, att_weights
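model_file.py builds vocab_to_int at import time, while lstm.py reads it back from vocab_to_int.json. The export step itself is not part of this upload; a plausible snippet that would produce the JSON artifact (an assumption for illustration, not the author's code):

import json
from model_file import vocab_to_int

# Hypothetical export step; ensure_ascii=False keeps the Cyrillic keys readable
with open('vocab_to_int.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_to_int, f, ensure_ascii=False)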
requirements.txt
ADDED
@@ -0,0 +1,71 @@
+altair==5.3.0
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.3
+catboost==1.2.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.2.1
+cycler==0.12.1
+filelock==3.13.4
+fonttools==4.51.0
+fsspec==2024.3.1
+gensim==4.3.2
+gitdb==4.0.11
+GitPython==3.1.43
+graphviz==0.20.3
+huggingface-hub==0.22.2
+idna==3.7
+Jinja2==3.1.3
+joblib==1.4.0
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.4
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+packaging==24.0
+pandas==2.2.2
+pillow==10.3.0
+plotly==5.20.0
+protobuf==4.25.3
+pyarrow==15.0.2
+pydeck==0.8.1b0
+Pygments==2.17.2
+pymystem3==0.2.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.34.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+safetensors==0.4.2
+scikit-learn==1.4.2
+scipy==1.13.0
+six==1.16.0
+smart-open==7.0.4
+smmap==5.0.1
+streamlit==1.33.0
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.4.0
+tokenizers==0.15.2
+toml==0.10.2
+toolz==0.12.1
+torch==2.2.2
+tornado==6.4
+tqdm==4.66.2
+transformers==4.39.3
+typing_extensions==4.11.0
+tzdata==2024.1
+urllib3==2.2.1
+wrapt==1.16.0
rnn_preprocessing.py
ADDED
@@ -0,0 +1,80 @@
+import re
+import string
+import numpy as np
+import torch
+
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+
+def data_preprocessing(text: str) -> str:
+    """Preprocess a string: lowercase, remove html tags, punctuation,
+    stopwords and digits
+
+    Args:
+        text (str): input string for preprocessing
+
+    Returns:
+        str: preprocessed string
+    """
+
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)  # remove html tags
+    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+
+def padding(review_int: list, seq_len: int) -> np.array:  # type: ignore
+    """Make left-sided padding for an input list of tokens
+
+    Args:
+        review_int (list): input list of tokens
+        seq_len (int): max sequence length; if len(review_int[i]) > seq_len it is trimmed, else it is padded with zeros
+
+    Returns:
+        np.array: padded sequences
+    """
+    features = np.zeros((len(review_int), seq_len), dtype=int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+
+    return features
+
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> torch.tensor:
+    """Run all preprocessing steps on a single string
+
+    Args:
+        input_string (str): input single string for preprocessing
+        seq_len (int): max sequence length; if len(review_int[i]) > seq_len it is trimmed, else it is padded with zeros
+        vocab_to_int (dict): word corpus {'word': int index}
+
+    Returns:
+        torch.tensor: padded tensor of token indices
+    """
+
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+    result_padded = padding([result_list], seq_len)[0]
+
+    return torch.tensor(result_padded)
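A quick worked example of the left-sided padding contract (the expected outputs follow from the implementation above):

from rnn_preprocessing import padding

print(padding([[5, 6, 7]], seq_len=5))           # [[0 0 5 6 7]]  (left-padded)
print(padding([[1, 2, 3, 4, 5, 6]], seq_len=5))  # [[1 2 3 4 5]]  (trimmed)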
strim_nlp.py
ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+import pandas as pd
+from catboost import CatBoostClassifier
+import re
+import string
+from nltk.corpus import stopwords
+from pymystem3 import Mystem
+from joblib import load
+import nltk
+
+def data_preprocessing(text):
+    stop_words = set(stopwords.words('russian'))
+    text = text.lower()
+    text = re.sub("<.*?>", "", text)      # html tags
+    text = re.sub(r'http\S+', " ", text)  # urls
+    text = re.sub(r'@\w+', ' ', text)     # mentions
+    text = re.sub(r'#\w+', ' ', text)     # hashtags
+    text = re.sub(r'\d+', ' ', text)      # digits
+    text = "".join([c for c in text if c not in string.punctuation])
+    return " ".join([word for word in text.split() if word not in stop_words])
+
+def lemmatize_text(text):
+    mystem = Mystem()
+    lemmas = mystem.lemmatize(text)
+    return ' '.join(lemmas)
+
+model = CatBoostClassifier()
+model.load_model('cat_model4.cbm')
+
+tfidf_vectorizer = load('tfidf_vectorizer.joblib')
+
+def classic_ml_page():
+    st.title("Классификация отзывов о медицинских учреждениях")
+    user_review = st.text_area("Введите ваш отзыв здесь:")
+
+    if st.button("Классифицировать"):
+        if user_review:
+            preprocessed_review = data_preprocessing(user_review)
+            lemmatized_review = lemmatize_text(preprocessed_review)
+            vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
+            prediction = model.predict(vectorized_review)
+
+            if prediction[0] == 1:
+                st.write("Позитивный отзыв 😀")
+            else:
+                st.write("Негативный отзыв 😟")
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")
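The CatBoost model and the TF-IDF vectorizer are loaded from artifacts whose training code is not part of this upload. A minimal sketch of how such a pair could be produced (the toy corpus and parameters are hypothetical, not the author's pipeline):

from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from joblib import dump

train_texts = ["отличный врач", "ужасная клиника"]  # toy corpus standing in for the real reviews
train_labels = [1, 0]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(train_texts)  # sparse TF-IDF features

clf = CatBoostClassifier(verbose=False)
clf.fit(X, train_labels)

clf.save_model('cat_model4.cbm')        # consumed by model.load_model above
dump(tfidf, 'tfidf_vectorizer.joblib')  # consumed by load(...) above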
tfidf_vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
+size 1750676
toxic1.py
ADDED
@@ -0,0 +1,40 @@
+# toxic1.py
+import streamlit as st
+import numpy as np
+import pandas as pd
+import time
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+
+def text2toxicity(text, aggregate=True):
+    with torch.no_grad():
+        inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
+        proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+    if isinstance(text, str):
+        proba = proba[0]
+    if aggregate:
+        return 1 - proba.T[0] * (1 - proba.T[-1])
+    return proba
+
+def toxicity_page():
+    st.title("""
+    Определим токсичный комментарий или нет
+    """)
+
+    user_text_input = st.text_area('Введите ваш отзыв здесь:')
+
+    if st.button('Предсказать'):
+        start_time = time.time()
+        proba = text2toxicity(user_text_input, True)
+        end_time = time.time()
+        prediction_time = end_time - start_time
+
+        if proba >= 0.5:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
+        else:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
+        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
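text2toxicity accepts a single string or a batch; with aggregate=True it collapses the per-label probabilities of rubert-tiny-toxicity into one toxicity score, 1 - P(non-toxic) * (1 - P(dangerous)), following the model card's recipe. A usage sketch (illustrative, not part of the upload):

from toxic1 import text2toxicity

print(text2toxicity('привет', aggregate=True))               # single float in [0, 1]
print(text2toxicity(['привет', 'пример'], aggregate=False))  # matrix of per-label probabilities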
vocab_to_int.json
ADDED
The diff for this file is too large to render. See raw diff.