Spaces:

romnatall
/

team_nlp_project

Sleeping

App Files Files Community

romnatall committed on Apr 12, 2024

Commit

d3d0074

1 Parent(s): 75139a0

final

Browse files

Files changed (14) hide show

app.py +41 -15
images/{olya.jpg → baur.jpg} +0 -0
images/film_bert.jpg +0 -0
images/film_lstm.png +0 -0
images/film_tfidf.jpg +0 -0
images/roma.png +0 -0
images/ss.png +0 -0
images/tf_idf_cm.jpg +0 -0
images/toxic.png +0 -0
pages/0film_reviev.py +31 -10
pages/film_review/model/log_reg_bert.pkl +3 -0
pages/film_review/model/model_bert.pth +3 -0
pages/film_review/model/model_bert.py +39 -0
pages/film_review/model/model_lstm.py +12 -6

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import streamlit as st
 from PIL import Image
 st.title("NLP project")
-description_show_options = ['main','film_review','toxic_messages','GPT','над проектом работали']
 description_show = st.sidebar.radio("Description", description_show_options)
 if description_show == 'над проектом работали':
@@ -11,14 +12,14 @@ if description_show == 'над проектом работали':
     col1, col2, col3 = st.columns(3)
     with col1:
-        romaimage = Image.open("images/roma.jpg")
-        st.image(romaimage, caption="Рома | cosplayNet enjoyer | DevOps", use_column_width=True)
     with col2:
         leraimage = Image.open("images/Lera.png")
-        st.image(leraimage, caption="Лера | UNet bender | Data Scientist", use_column_width=True)
     with col3:
-        olyaimage = Image.open("images/olya.jpg")
-        st.image(olyaimage, caption="Бауржан | streamlit master | Frontender", use_column_width=True)
 elif description_show == 'GPT':
     st.title("GPT")
@@ -28,19 +29,44 @@ elif description_show == 'main':
 elif description_show == 'film_review':
     st.title("film_review")
-#     Weighted F1-score: 0.7069352925929284
-# Classification Report:
-#               precision    recall  f1-score   support
-#          Bad       0.67      0.81      0.74       960
-#      Neutral       0.65      0.50      0.56       922
-#         Good       0.82      0.82      0.82       896
-#     accuracy                           0.71      2778
-#    macro avg       0.71      0.71      0.71      2778
-# weighted avg       0.71      0.71      0.71      2778
 elif description_show == 'toxic_messages':
     st.title("toxic_messages")

+from math import e
 import streamlit as st
 from PIL import Image
 st.title("NLP project")
+# Sidebar radio: the selected entry decides which description section is rendered below.
+description_show_options = ['main','film_review','toxic_messages','над проектом работали']
 description_show = st.sidebar.radio("Description", description_show_options)
 if description_show == 'над проектом работали':
+    # Team section: three columns, one captioned image per column.
     col1, col2, col3 = st.columns(3)
     with col1:
+        romaimage = Image.open("images/roma.png")
+        st.image(romaimage, caption="Рома | custom attention enjoyer | DevOps", use_column_width=True, )
     with col2:
         leraimage = Image.open("images/Lera.png")
+        st.image(leraimage, caption="Лера | GPT bender | Data Scientist", use_column_width=True)
     with col3:
+        olyaimage = Image.open("images/baur.jpg")
+        st.image(olyaimage, caption="Бауржан | TF/IDF master | Frontender", use_column_width=True)
+# NOTE(review): 'GPT' is not in description_show_options above, so the radio can never select this branch.
 elif description_show == 'GPT':
     st.title("GPT")
 elif description_show == 'film_review':
     st.title("film_review")
+    # Model 1: classification report as text, followed by an image.
+    st.write("------------")
+    st.write("BERT embedding + LSTM + roman attention")
+    text = """Weighted F1-score: 0.70\n
+    Classification Report:
+    precision    recall  f1-score   support
+    Bad        0.67      0.81      0.74       960
+    Neutral    0.65      0.50      0.56       922
+    Good       0.82      0.82      0.82       896
+    -----
+    accuracy                           0.71      2778
+    macro avg      0.71      0.71      0.71      2778
+    weighted avg   0.71      0.71      0.71      2778"""
+    st.markdown(text)
+    png = Image.open("images/film_lstm.png")
+    st.image(png,  use_column_width=True)
+    # Model 2: two images.
+    st.write("------------")
+    st.write("tf-idf + Logreg")
+    png = Image.open("images/film_tfidf.jpg")
+    st.image(png,  use_column_width=True)
+    png = Image.open("images/tf_idf_cm.jpg")
+    st.image(png,  use_column_width=True)
+    # Model 3: one image.
+    st.write("------------")
+    st.write("Bert embedding + LogReg")
+    png = Image.open("images/film_bert.jpg")
+    st.image(png,  use_column_width=True)
+# Single 'toxic_messages' branch: a second, identical elif that followed this one was unreachable and has been dropped.
+elif description_show == 'toxic_messages':
+    st.title("toxic_messages")
+    png = Image.open("images/toxic.png")
+    st.image(png,  use_column_width=True)

images/{olya.jpg → baur.jpg} RENAMED Viewed

File without changes

images/film_bert.jpg ADDED Viewed

images/film_lstm.png ADDED Viewed

images/film_tfidf.jpg ADDED Viewed

images/roma.png ADDED Viewed

images/ss.png ADDED Viewed

images/tf_idf_cm.jpg ADDED Viewed

images/toxic.png ADDED Viewed

pages/0film_reviev.py CHANGED Viewed

@@ -7,6 +7,20 @@ st.title("film_review")
 input_text = st.text_area("Enter your text")
 from pages.film_review.model.model_lstm import *
 from pages.film_review.model.model_logreg import *
 @st.cache_resource
 def get_model():
@@ -16,15 +30,22 @@ model.eval()
 dec = {0:'отрицательный',1:'нейтральный',2:'положительный'}
 if input_text:
-    with torch.no_grad():
-        ans = torch.nn.functional.softmax(model(input_text), dim=1)
-    idx = torch.argmax(ans, dim=1).item()
-    st.write(f'LSTM - отзыв: {dec[idx]}, уверенность: { round(ans[0][idx].item(),2)}')
-    st.write(f'Logreg - отзыв: {dec[ predict_tfidf(input_text)[0]]}')
-else:
-    st.write("No text entered")

 input_text = st.text_area("Enter your text")
 from pages.film_review.model.model_lstm import *
 from pages.film_review.model.model_logreg import *
+from pages.film_review.model.model_bert import *
+import time
+class Timer:
+    """Context manager that measures how long its ``with`` body takes.
+
+    After the block exits, ``execution_time`` holds the elapsed time in
+    seconds (``end_time - start_time``). ``start_time`` and ``end_time`` are
+    ``time.perf_counter()`` readings, meaningful only relative to each other.
+    """
+    def __enter__(self):
+        # perf_counter is monotonic and high-resolution; time.time() can jump
+        # when the system clock is adjusted, which would corrupt the interval.
+        self.start_time = time.perf_counter()
+        return self
+    def __exit__(self, *args):
+        self.end_time = time.perf_counter()
+        self.execution_time = self.end_time - self.start_time
+        # Returns None, so an exception raised inside the body still propagates.
 @st.cache_resource
 def get_model():
 dec = {0:'отрицательный',1:'нейтральный',2:'положительный'}
 if input_text:
+    # Run each classifier on the entered text and report its label plus elapsed time.
+    # Every st.write of a prediction sits inside its Timer block, so rendering is part of the measured time.
+    with Timer() as t:
+        with torch.no_grad():
+            # softmax over dim=1 turns the model output into per-class scores; argmax picks the class index.
+            ans = torch.nn.functional.softmax(model(input_text), dim=1)
+            idx = torch.argmax(ans, dim=1).item()
+        # dec maps the class index to its label; the second value is the softmax score of that class, rounded to 2 places.
+        st.write(f'LSTM - отзыв: {dec[idx]}, уверенность: { round(ans[0][idx].item(),2)}')
+    # execution_time is in seconds; *1000 converts it to milliseconds for display.
+    st.write("Время выполнения:", round(t.execution_time*1000, 2), "миллисекунд")
+    st.write("------------")
+    with Timer() as t:
+        # predict_tfidf is indexed with [0] to get the class for this single text.
+        st.write(f'Logreg - отзыв: {dec[ predict_tfidf(input_text)[0]]}')
+    st.write("Время выполнения:", round(t.execution_time*1000, 2), "миллисекунд")
+    st.write("------------")
+    with Timer() as t:
+        # predict_bert's return value is used directly as the key into dec.
+        st.write(f'Bert - отзыв: {dec[ predict_bert(input_text)]}')
+    st.write("Время выполнения:", round(t.execution_time*1000, 2), "миллисекунд")

pages/film_review/model/log_reg_bert.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a6dc8a96c93ed97b248f73955cfe28998ab5bc360d2635dcc7129aa92425361
+size 8225

pages/film_review/model/model_bert.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d74ff4026ce64a4c33dda7730aa03c771b097cc1f0ea3d79d69935482559209
+size 13420

pages/film_review/model/model_bert.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModel
+from sklearn.linear_model import LogisticRegression
+import streamlit as st
+import pickle
+import streamlit as st
+@st.cache_resource
+def get_model():
+    """Load the ``cointegrated/rubert-tiny2`` encoder and its tokenizer.
+
+    Decorated with ``st.cache_resource``. Returns a ``(model, tokenizer)`` tuple.
+    """
+    checkpoint = "cointegrated/rubert-tiny2"
+    encoder = AutoModel.from_pretrained(checkpoint)
+    tok = AutoTokenizer.from_pretrained(checkpoint)
+    return encoder, tok
+@st.cache_resource
+def _get_bert_classifier():
+    """Unpickle the classifier that consumes the BERT features.
+
+    Cached with ``st.cache_resource`` so the file is not re-read and
+    re-unpickled on every prediction.
+    """
+    # NOTE(review): pickle.load runs arbitrary code from the file; this path is
+    # a model file shipped with the app and must never point at untrusted data.
+    with open('pages/film_review/model/log_reg_bert.pkl', 'rb') as f:
+        return pickle.load(f)
+def predict_bert(input_text):
+    """Classify ``input_text`` from its BERT features.
+
+    The text is tokenized (truncated to ``MAX_LEN`` tokens), right-padded to
+    ``MAX_LEN``, and passed through the encoder from ``get_model()``. The hidden
+    state at position 0 is fed to the pickled classifier.
+
+    Returns:
+        The first element of the classifier's ``predict`` output.
+    """
+    MAX_LEN = 300
+    model, tokenizer = get_model()
+    tokenized_input = tokenizer.encode(input_text, add_special_tokens=True, truncation=True, max_length=MAX_LEN)
+    # Pad with id 0 up to MAX_LEN; the mask treats every 0 as padding
+    # (assumes the tokenizer's pad token id is 0 — TODO confirm).
+    padded_input = np.array(tokenized_input + [0]*(MAX_LEN-len(tokenized_input)))
+    attention_mask = np.where(padded_input != 0, 1, 0)
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    model.to(device)
+    with torch.no_grad():
+        # unsqueeze(0) adds a leading batch dimension of size 1.
+        input_tensor = torch.tensor(padded_input).unsqueeze(0).to(device)
+        attention_mask_tensor = torch.tensor(attention_mask).unsqueeze(0).to(device)
+        last_hidden_states = model(input_tensor, attention_mask=attention_mask_tensor)[0]
+    # Keep only the hidden state at sequence position 0 as the feature vector.
+    features = last_hidden_states[:,0,:].cpu().numpy()
+    prediction = _get_bert_classifier().predict(features)
+    return prediction[0]

pages/film_review/model/model_lstm.py CHANGED Viewed

@@ -1,9 +1,11 @@
 ATTENTION_SIZE=10
 HIDDEN_SIZE=300
 INPUT_SIZE=312
 import torch
 from transformers import AutoTokenizer, AutoModel
 import torch.nn as nn
 class RomanAttention(nn.Module):
     def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
@@ -31,14 +33,18 @@ class RomanAttention(nn.Module):
 import pytorch_lightning as  lg
-m = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
-emb=m.embeddings
-#emb.dropout=nn.Dropout(0)
-for param in emb.parameters():
-    param.requires_grad = False
-tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
 def tokenize(text):
     t=tokenizer(text, padding=True, truncation=True,pad_to_multiple_of=300,max_length=300)['input_ids']
     if len(t) <30:

 ATTENTION_SIZE=10
 HIDDEN_SIZE=300
 INPUT_SIZE=312
+from math import e
 import torch
 from transformers import AutoTokenizer, AutoModel
 import torch.nn as nn
+import streamlit as st
 class RomanAttention(nn.Module):
     def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
 import pytorch_lightning as  lg
+@st.cache_resource
+def load_model():
+    """Return the rubert-tiny2 embedding module and the matching tokenizer.
+
+    Only the ``embeddings`` submodule of the pretrained model is kept, and
+    all of its parameters have ``requires_grad`` set to ``False``.
+    """
+    checkpoint = "cointegrated/rubert-tiny2"
+    embeddings = AutoModel.from_pretrained(checkpoint).embeddings
+    #emb.dropout=nn.Dropout(0)
+    # Freeze the embedding weights.
+    for weight in embeddings.parameters():
+        weight.requires_grad = False
+    tok = AutoTokenizer.from_pretrained(checkpoint)
+    return embeddings, tok
+# Module-level handles: tokenize() below reads `tokenizer`.
+emb, tokenizer = load_model()
 def tokenize(text):
     t=tokenizer(text, padding=True, truncation=True,pad_to_multiple_of=300,max_length=300)['input_ids']
     if len(t) <30: