Ruslan-DS committed on Dec 16, 2023

Commit

60a2954

1 Parent(s): 0b9c14e

Add weights files

Browse files

Files changed (22) hide show

images/attention_words.jpg +0 -0
images/bert_tunnig.jpg +0 -0
images/distribution_classes.jpg +0 -0
images/funny_dataframe.jpg +0 -0
images/lstm_attention.jpg +0 -0
images/roc_auc_catboost.jpg +0 -0
images/roc_auc_logreg.jpg +0 -0
images/umap.jpg +0 -0
main.py +0 -0
models/BertTunning.py +53 -0
models/LSTM.py +0 -0
models/LogReg.py +0 -0
models/datasets/embedding_matrix.npy +3 -0
models/datasets/vocab_to_int.json +0 -0
models/preprocess_stage/bert_model.py +22 -0
models/preprocess_stage/preprocess_lstm.py +0 -0
models/weights/BertTunnigWeights.pt +3 -0
models/weights/LSTMBestWeights.pt +3 -0
models/weights/LogRegBestWeights.sav +0 -0
pages/classification_reviews.py +0 -0
pages/results.py +0 -0
requirements.txt +77 -0

images/attention_words.jpg ADDED Viewed

images/bert_tunnig.jpg ADDED Viewed

images/distribution_classes.jpg ADDED Viewed

images/funny_dataframe.jpg ADDED Viewed

images/lstm_attention.jpg ADDED Viewed

images/roc_auc_catboost.jpg ADDED Viewed

images/roc_auc_logreg.jpg ADDED Viewed

images/umap.jpg ADDED Viewed

main.py ADDED Viewed

File without changes

models/BertTunning.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import pandas as pd
+import numpy as np
+import torch
+from torch import nn
+import torch.nn.functional as F
+from logreg_model import bert_for_logreg, tokenizer_bert
+from preprocess_bert import preprocess_bert
+MAX_LEN = 100  # sequence length that predict_2() passes to preprocess_bert()
+class BertTunnig(nn.Module):
+    def __init__(self, bert_model):
+        super().__init__()
+        self.bert = bert_model
+        for weights in self.bert.parameters():
+            weights.requires_grad = False
+        self.fc1 = nn.Linear(768, 256)
+        self.drop1 = nn.Dropout(p=0.5)
+        self.fc2 = nn.Linear(256, 32)
+        self.fc_out = nn.Linear(32, 1)
+    def forward(self, x, attention_mask):
+        output = self.bert(x, attention_mask=attention_mask)[0][:, 0, :]
+        output = self.fc1(output)
+        output_drop = self.drop1(output)
+        output = self.fc2(output_drop)
+        output = self.fc_out(output)
+        return torch.sigmoid(output)
+model_tunning = BertTunnig(bert_model=bert_for_logreg)
+model_tunning.load_state_dict(torch.load('best_weights_berttinnug(2).pt'))
+def predict_2(text):
+    preprocessed_text, attention_mask = preprocess_bert(text, MAX_LEN=MAX_LEN)
+    preprocessed_text, attention_mask = torch.tensor(preprocessed_text).unsqueeze(0), torch.tensor([attention_mask])
+    with torch.inference_mode():
+        predict = model_tunning(preprocessed_text, attention_mask=attention_mask).item()
+    return round(predict)

models/LSTM.py ADDED Viewed

File without changes

models/LogReg.py ADDED Viewed

File without changes

models/datasets/embedding_matrix.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbcae4631c684cea4bef1df946822bdfc66cadddc240ffd39f917f200bb5894a
+size 6643840

models/datasets/vocab_to_int.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/preprocess_stage/bert_model.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import numpy as np
+import torch
+from transformers import BertModel, BertTokenizer
+weights = 'DeepPavlov/rubert-base-cased'  # checkpoint identifier shared by the tokenizer and the model below
+tokenizer_bert = BertTokenizer.from_pretrained(weights)  # tokenizer used by preprocess_bert()
+bert_for_logreg = BertModel.from_pretrained(weights)  # BertModel instance loaded from the same identifier
+def preprocess_bert(text, MAX_LEN):
+    tokenized_text = tokenizer_bert.encode(
+        text=text,
+        add_special_tokens=True,
+        truncation=True,
+        max_length=MAX_LEN
+    )
+    padded_text = np.array(tokenized_text + [0] * (MAX_LEN - len(tokenized_text)))
+    attention_mask = np.where(padded_text != 0, 1, 0)
+    return padded_text, attention_mask

models/preprocess_stage/preprocess_lstm.py ADDED Viewed

File without changes

models/weights/BertTunnigWeights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21a77e82f2fafc9e5cec46b6494f45dc5edb397c13fea55238eaabaf7832cffd
+size 712320552

models/weights/LSTMBestWeights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:317df198794e02b19bbb9f6cc3201950d773f9719df4391228c125ac101cd323
+size 3375698

models/weights/LogRegBestWeights.sav ADDED Viewed

Binary file (6.94 kB). View file

pages/classification_reviews.py ADDED Viewed

File without changes

pages/results.py ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,77 @@

+altair==5.2.0
+attrs==23.1.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+filelock==3.13.1
+fsspec==2023.12.2
+gitdb==4.0.11
+GitPython==3.1.40
+huggingface-hub==0.19.4
+idna==3.6
+importlib-metadata==6.11.0
+Jinja2==3.1.2
+joblib==1.3.2
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.2
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.18.1
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+packaging==23.2
+pandas==2.1.4
+Pillow==10.1.0
+protobuf==4.25.1
+pyarrow==14.0.1
+pydeck==0.8.1b0
+Pygments==2.17.2
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.10.3
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.13.2
+safetensors==0.4.1
+scikit-learn==1.3.2
+scipy==1.11.4
+six==1.16.0
+smmap==5.0.1
+st-pages==0.4.5
+streamlit==1.29.0
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tokenizers==0.15.0
+toml==0.10.2
+toolz==0.12.0
+torch==2.1.2
+tornado==6.4
+tqdm==4.66.1
+transformers==4.36.1
+triton==2.1.0
+typing_extensions==4.9.0
+tzdata==2023.3
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+watchdog==3.0.0
+zipp==3.17.0