Spaces:
Sleeping
Sleeping
ruslanruslanruslan
committed on
Commit
•
60cb352
1
Parent(s):
2f9b6cb
files added
Browse files- .gitattributes +1 -0
- app.py +3 -0
- basic_bert_weights.pt +3 -0
- bert_weights.pt +3 -0
- borges.jpg +0 -0
- borgesian_weights.pt +3 -0
- lstm_embedding_matrix.npy +3 -0
- lstm_model_weights.pt +3 -0
- lstm_vocab_to_int.json +3 -0
- pages/Borgesian.py +32 -0
- pages/Film reviews classifier.py +164 -0
- pages/Summarizer.py +22 -0
- requirements.txt +84 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
"""Landing page for the multipage Streamlit NLP demo application."""
import streamlit as st

st.title('A multipage application featuring various Natural Language Processing instruments and functions.')
basic_bert_weights.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4deb15105a799cd64d8058d552657b397cda2a9d6b2e34b3b9b63ac897936cf3
|
3 |
+
size 265489387
|
bert_weights.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d0c9d5352b8a32df74d754421946fdee6d2d4d8a23598b734dfc950c03067019
|
3 |
+
size 265495165
|
borges.jpg
ADDED
borgesian_weights.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:852eff7ff8cb373033d5e4f8e71454a079dd08496dfe4e3db148e65b6d88e6f8
|
3 |
+
size 500981765
|
lstm_embedding_matrix.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40e89492f39ddd85531f55a30b36650b4cdbe86cb624588e568e825211f3c3a5
|
3 |
+
size 108256384
|
lstm_model_weights.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04f6ffb4ac2e1897631488a12d707e848c97a616674185a0eb875aab82cceeac
|
3 |
+
size 65423143
|
lstm_vocab_to_int.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4a58991e3ae061ed499b316bd5b2c805cd9628b5fc1e1244169fa13ce268547
|
3 |
+
size 4414229
|
pages/Borgesian.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Borgesian: ruGPT-3 small fine-tuned to continue prompts in the style of Borges.
import streamlit as st
import transformers
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
# Load the base architecture first, then apply the fine-tuned weights (CPU only).
borgesian = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2', output_attentions = False, output_hidden_states = False)
borgesian.load_state_dict(torch.load('borgesian_weights.pt', map_location=torch.device('cpu')))
borgesian.to('cpu').eval()
11 |
+
|
def generate_response(text, temperature, length, top_p):
    """Generate a Borgesian continuation of *text* and render it via Streamlit."""
    encoded = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        samples = borgesian.generate(
            encoded,
            do_sample=True,
            num_beams=2,
            temperature=float(temperature),
            top_p=float(top_p),
            max_length=length,
        )
    # Only the first generated sequence is shown to the user.
    st.write(tokenizer.decode(samples[0]))
18 |
+
|
st.title('Borgesian')
st.image('borges.jpg')
st.write('Write a prompt in Russian, and the GPT-based model will follow up with a Borgesian text.')
st.write('Define the parameters of generation:')

# Sampling controls for the generator.
temperature = st.slider('Temperature', min_value=1.0, max_value=5.0, value=1.5, step=0.1)
length = st.slider('Length', min_value=20, max_value=150, value=50, step=1)
top_p = st.slider('Top-p value', min_value=0.5, max_value=1.0, value=0.9, step=0.05)

user_input = st.text_area("Enter your text:")
if st.button("Send"):
    if not user_input:
        st.warning("Please enter some text.")
    else:
        generate_response(user_input, temperature, length, top_p)
pages/Film reviews classifier.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import time
|
3 |
+
import os
|
4 |
+
import logging
|
5 |
+
import torch
|
6 |
+
import json
|
7 |
+
import string
|
8 |
+
import re
|
9 |
+
import string
|
10 |
+
import nltk
|
11 |
+
import numpy as np
|
12 |
+
import torch.nn as nn
|
13 |
+
import transformers
|
14 |
+
from collections import Counter
|
15 |
+
from nltk.corpus import stopwords
|
16 |
+
from nltk.stem import WordNetLemmatizer
|
17 |
+
# stop_words = set(stopwords.words('english'))
|
18 |
+
|
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict):
    """Clean a raw review, map known words to ids, pad to seq_len, return a tensor.

    Words absent from the vocabulary are silently dropped.
    """
    cleaned = data_preprocessing(input_string)
    encoded = [vocab_to_int[word] for word in cleaned.split() if word in vocab_to_int]
    padded = padding([encoded], seq_len)[0]
    return torch.tensor(padded)
29 |
+
|
30 |
+
|
31 |
+
|
def padding(reviews_int: list, seq_len: int):
    """Left-pad (with zeros) or truncate each id sequence to exactly seq_len.

    Returns an int matrix of shape (len(reviews_int), seq_len).
    """
    features = np.zeros((len(reviews_int), seq_len), dtype=int)
    for row, seq in enumerate(reviews_int):
        if len(seq) > seq_len:
            adjusted = seq[:seq_len]
        else:
            adjusted = list(np.zeros(seq_len - len(seq))) + seq
        features[row, :] = np.array(adjusted)
    return features
42 |
+
|
43 |
+
|
def data_preprocessing(text: str):
    """Lowercase, strip HTML tags and punctuation, and lemmatize every word."""
    lemmatizer = WordNetLemmatizer()
    lowered = re.sub('<.*?>', '', text.lower())
    no_punct = ''.join(ch for ch in lowered if ch not in string.punctuation)
    # NOTE: stop-word filtering is intentionally disabled.
    return ' '.join(lemmatizer.lemmatize(word) for word in no_punct.split())
52 |
+
|
# Load the LSTM's vocabulary and pre-trained embedding matrix from disk.
with open('lstm_vocab_to_int.json') as vocab_file:
    vocab_to_int = json.load(vocab_file)

with open('lstm_embedding_matrix.npy', 'rb') as matrix_file:
    embedding_matrix = np.load(matrix_file)

# Frozen embedding layer shared by the LSTM classifier defined below.
embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+
|
61 |
+
class LSTMClassifier(nn.Module):
|
62 |
+
def __init__(self, embedding_dim: int, seq_len:int, hidden_size:int = 32, dropout:int = 0, num_layers:int = 1) -> None:
|
63 |
+
super().__init__()
|
64 |
+
|
65 |
+
self.embedding_dim = embedding_dim
|
66 |
+
self.hidden_size = hidden_size
|
67 |
+
self.embedding = embedding_layer
|
68 |
+
self.dropout = dropout
|
69 |
+
self.num_layers = num_layers
|
70 |
+
self.seq_len = seq_len
|
71 |
+
self.lstm = nn.LSTM(
|
72 |
+
input_size=self.embedding_dim,
|
73 |
+
hidden_size=self.hidden_size,
|
74 |
+
batch_first=True,
|
75 |
+
bidirectional=True,
|
76 |
+
dropout=self.dropout,
|
77 |
+
num_layers=self.num_layers
|
78 |
+
)
|
79 |
+
self.linear = nn.Sequential(
|
80 |
+
nn.Linear(self.hidden_size * self.seq_len * 2, 128),
|
81 |
+
nn.Linear(128, 1)
|
82 |
+
)
|
83 |
+
|
84 |
+
def forward(self, x):
|
85 |
+
embeddings = self.embedding(x)
|
86 |
+
output, _ = self.lstm(embeddings)
|
87 |
+
output = output.contiguous().view(output.size(0), -1)
|
88 |
+
out = self.linear(output.squeeze(0))
|
89 |
+
return out
|
90 |
+
|
# DistilBERT building blocks: tokenizer plus the pre-trained base model.
bert_model_class = transformers.DistilBertModel
bert_tokenizer_class = transformers.DistilBertTokenizer
bert_tokenizer = bert_tokenizer_class.from_pretrained('distilbert-base-uncased')
# NOTE(review): these weights are loaded but never applied to any model below — confirm intent.
bert_pretrained_weights = torch.load('basic_bert_weights.pt', map_location=torch.device('cpu'))
bert_basic_model = bert_model_class.from_pretrained('distilbert-base-uncased')
96 |
+
|
class BertReviews(nn.Module):
    """DistilBERT with a single-logit head; only LayerNorm params are trainable.

    The whole backbone is frozen except each transformer block's output
    LayerNorm weight/bias and the final linear classifier.
    """

    def __init__(self, model):
        # Modernized from super(BertReviews, self).__init__() for consistency
        # with the zero-argument form used elsewhere in this file.
        super().__init__()
        self.bert = model
        # Freeze the entire backbone ...
        for param in self.bert.parameters():
            param.requires_grad = False
        # ... then unfreeze the output LayerNorm of each of the 6 blocks.
        for i in range(6):
            self.bert.transformer.layer[i].output_layer_norm.weight.requires_grad = True
            self.bert.transformer.layer[i].output_layer_norm.bias.requires_grad = True
        self.fc = nn.Linear(768, 1)  # 768-dim hidden state -> one logit

    def forward(self, samples, att_masks):
        """Return (backbone outputs, logit) for batches of token ids and masks."""
        embeddings = self.bert(samples, attention_mask=att_masks)
        # Classify from position 0 (the [CLS]-like token) of the last hidden state.
        model_out = self.fc(embeddings[0][:, 0, :])

        return embeddings, model_out
114 |
+
|
# Instantiate both classifiers on CPU and restore their fine-tuned weights.
bert_model = BertReviews(bert_basic_model)
bert_model.load_state_dict(torch.load('bert_weights.pt', map_location=torch.device('cpu')))
bert_model.to('cpu').eval()

model_lstm = LSTMClassifier(embedding_dim=64, seq_len=150, hidden_size=64, dropout=0.5, num_layers=4)
model_lstm.load_state_dict(torch.load('lstm_model_weights.pt', map_location=torch.device('cpu')))
model_lstm.to('cpu').eval()
122 |
+
|
123 |
+
|
124 |
+
|
def predict_sentence_lstm(text: str):
    """Classify one review with the LSTM model.

    Returns (label, seconds) where label is 0 (negative) or 1 (positive).
    """
    # perf_counter is monotonic and intended for elapsed-time measurement,
    # unlike time.time() which can jump with wall-clock adjustments.
    start = time.perf_counter()
    tokens = preprocess_single_string(text, 150, vocab_to_int)
    logit = model_lstm(tokens.unsqueeze(0))
    label = int(torch.sigmoid(logit).cpu().detach().numpy().round())
    elapsed = time.perf_counter() - start
    return label, elapsed
132 |
+
|
def predict_sentence_bert(text: str):
    """Classify one review with the fine-tuned DistilBERT model.

    Returns (label, seconds) where label is 0 (negative) or 1 (positive).
    """
    # perf_counter is monotonic and intended for elapsed-time measurement.
    start = time.perf_counter()
    # Tokenize, then right-pad with 0 to a fixed 200 tokens
    # (assumes 0 is the tokenizer's pad id — TODO confirm).
    ids = bert_tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=200)
    ids = np.array([ids + [0] * (200 - len(ids))])
    attention_mask = torch.Tensor(np.where(ids != 0, 1, 0)).to(torch.int64)
    ids = torch.Tensor(ids).to(torch.int64)
    # forward() returns (embeddings, logit); only the logit matters here.
    # (Dead commented-out code from the original removed.)
    label = int(torch.sigmoid(bert_model(ids, attention_mask)[1]).cpu().detach().numpy().round())
    elapsed = time.perf_counter() - start
    return label, elapsed
146 |
+
|
# Human-readable labels for the binary predictions.
reses = {0: 'negative', 1: 'positive'}

def process_text(input_text):
    """Run both classifiers on one review and print their verdicts and timings."""
    lstm_label, lstm_seconds = predict_sentence_lstm(input_text)
    bert_label, bert_seconds = predict_sentence_bert(input_text)
    st.write('Results:')
    st.write(f'LSTM: {reses[lstm_label]}, execution time: {lstm_seconds:.2f} seconds.')
    st.write(f'Upgraded Bert: {reses[bert_label]}, execution time: {bert_seconds:.2f} seconds.')
155 |
+
|
st.title('Film reviews classifier')
st.write('Write a film review in a box below, and the application, powered by two NLP models (LSTM and upgraded Bert), will tell if it is a positive or a negative review.')

user_input = st.text_area("Enter your text:")
if st.button("Send a review for processing"):
    if user_input:
        # process_text renders its own output and returns None,
        # so the original `processed_text = ...` binding was useless.
        process_text(user_input)
    else:
        st.warning("Please enter some text before processing.")
pages/Summarizer.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# News summarizer built on a distilled BART checkpoint.
import streamlit as st
from transformers import AutoTokenizer, BartForConditionalGeneration

tokenizer_sum = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
summarizer = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
6 |
+
|
def generate_summary(text, length):
    """Summarize *text* (truncated to 1024 tokens) and render it via Streamlit."""
    batch = tokenizer_sum([text], max_length=1024, return_tensors="pt")
    ids = summarizer.generate(batch["input_ids"], num_beams=2, min_length=1, max_length=length)
    decoded = tokenizer_sum.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    st.write(decoded[0])
12 |
+
|
st.title('Summarizer')
# Typo fixed in user-facing text: "with provide" -> "will provide".
st.write('Submit a news article in the field below, and the Bart-based model will provide a summary.')

length = st.slider('Maximum length of summary', value = 50, min_value = 15, max_value = 150, step = 1)
user_input = st.text_area("Enter your text:")
# Button label fixed: this page summarizes articles; the old label
# ("Send a review for processing") was copy-pasted from the reviews page.
if st.button("Send an article for processing"):
    if user_input:
        generate_summary(user_input, length)
    else:
        st.warning("Please enter some text before processing.")
requirements.txt
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==5.0.1
|
2 |
+
attrs==23.1.0
|
3 |
+
blinker==1.6.2
|
4 |
+
cachetools==5.3.1
|
5 |
+
certifi==2023.5.7
|
6 |
+
charset-normalizer==3.2.0
|
7 |
+
click==8.1.6
|
8 |
+
cmake==3.27.0
|
9 |
+
contourpy==1.1.0
|
10 |
+
cycler==0.11.0
|
11 |
+
decorator==5.1.1
|
12 |
+
filelock==3.12.2
|
13 |
+
fonttools==4.41.0
|
14 |
+
fsspec==2023.6.0
|
15 |
+
gitdb==4.0.10
|
16 |
+
GitPython==3.1.32
|
17 |
+
huggingface-hub==0.16.4
|
18 |
+
idna==3.4
|
19 |
+
importlib-metadata==6.8.0
|
20 |
+
Jinja2==3.1.2
|
21 |
+
joblib==1.3.1
|
22 |
+
jsonschema==4.18.4
|
23 |
+
jsonschema-specifications==2023.7.1
|
24 |
+
kiwisolver==1.4.4
|
25 |
+
lit==16.0.6
|
26 |
+
markdown-it-py==3.0.0
|
27 |
+
MarkupSafe==2.1.3
|
28 |
+
matplotlib==3.7.2
|
29 |
+
mdurl==0.1.2
|
30 |
+
mpmath==1.3.0
|
31 |
+
networkx==3.1
|
32 |
+
nltk==3.8.1
|
33 |
+
numpy==1.25.1
|
34 |
+
nvidia-cublas-cu11==11.10.3.66
|
35 |
+
nvidia-cuda-cupti-cu11==11.7.101
|
36 |
+
nvidia-cuda-nvrtc-cu11==11.7.99
|
37 |
+
nvidia-cuda-runtime-cu11==11.7.99
|
38 |
+
nvidia-cudnn-cu11==8.5.0.96
|
39 |
+
nvidia-cufft-cu11==10.9.0.58
|
40 |
+
nvidia-curand-cu11==10.2.10.91
|
41 |
+
nvidia-cusolver-cu11==11.4.0.1
|
42 |
+
nvidia-cusparse-cu11==11.7.4.91
|
43 |
+
nvidia-nccl-cu11==2.14.3
|
44 |
+
nvidia-nvtx-cu11==11.7.91
|
45 |
+
packaging==23.1
|
46 |
+
pandas==2.0.3
|
47 |
+
Pillow==9.5.0
|
48 |
+
protobuf==4.23.4
|
49 |
+
pyarrow==12.0.1
|
50 |
+
pydeck==0.8.1b0
|
51 |
+
Pygments==2.15.1
|
52 |
+
Pympler==1.0.1
|
53 |
+
pyparsing==3.0.9
|
54 |
+
python-dateutil==2.8.2
|
55 |
+
pytz==2023.3
|
56 |
+
pytz-deprecation-shim==0.1.0.post0
|
57 |
+
PyYAML==6.0.1
|
58 |
+
referencing==0.30.0
|
59 |
+
regex==2023.6.3
|
60 |
+
requests==2.31.0
|
61 |
+
rich==13.4.2
|
62 |
+
rpds-py==0.9.2
|
63 |
+
safetensors==0.3.1
|
64 |
+
six==1.16.0
|
65 |
+
smmap==5.0.0
|
66 |
+
streamlit==1.24.1
|
67 |
+
sympy==1.12
|
68 |
+
tenacity==8.2.2
|
69 |
+
tokenizers==0.13.3
|
70 |
+
toml==0.10.2
|
71 |
+
toolz==0.12.0
|
72 |
+
torch==2.0.1
|
73 |
+
torchutils==0.0.4
|
74 |
+
tornado==6.3.2
|
75 |
+
tqdm==4.65.0
|
76 |
+
transformers==4.31.0
|
77 |
+
triton==2.0.0
|
78 |
+
typing_extensions==4.7.1
|
79 |
+
tzdata==2023.3
|
80 |
+
tzlocal==4.3.1
|
81 |
+
urllib3==2.0.4
|
82 |
+
validators==0.20.0
|
83 |
+
watchdog==3.0.0
|
84 |
+
zipp==3.16.2
|