Spaces:
Sleeping
Sleeping
Commit
·
66e9d7c
1
Parent(s):
0f494d0
models added
Browse files- logreg.pkl +3 -0
- pages/Film reviews classifier.py +37 -1
- requirements.txt +1 -0
- tf.pkl +3 -0
logreg.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7bd6d0e129d3a2e6bdc40393eaa602ea599e032c0686a973ef137dc138be805b
|
3 |
+
size 44433
|
pages/Film reviews classifier.py
CHANGED
@@ -11,14 +11,40 @@ import nltk
|
|
11 |
import numpy as np
|
12 |
import torch.nn as nn
|
13 |
import transformers
|
|
|
|
|
14 |
nltk.download('wordnet')
|
15 |
nltk.download('stopwords')
|
16 |
from collections import Counter
|
17 |
from nltk.corpus import stopwords
|
18 |
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
|
|
19 |
|
20 |
stop_words = set(stopwords.words('english'))
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict):
|
23 |
preprocessed_string = data_preprocessing(input_string)
|
24 |
result_list = []
|
@@ -124,6 +150,14 @@ model_lstm.load_state_dict(torch.load('lstm_model_weights.pt', map_location=torc
|
|
124 |
model_lstm.to('cpu').eval()
|
125 |
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
def predict_sentence_lstm(text: str):
|
129 |
start_time = time.time()
|
@@ -150,14 +184,16 @@ def predict_sentence_bert(text: str):
|
|
150 |
reses = {0: 'negative', 1: 'positive'}
|
151 |
|
152 |
def process_text(input_text):
|
|
|
153 |
res_lstm, time_lstm = predict_sentence_lstm(input_text)
|
154 |
res_bert, time_bert = predict_sentence_bert(input_text)
|
155 |
st.write('Results:')
|
|
|
156 |
st.write(f'LSTM: {reses[res_lstm]}, execution time: {time_lstm:.2f} seconds.')
|
157 |
st.write(f'Upgraded Bert: {reses[res_bert]}, execution time: {time_bert:.2f} seconds.')
|
158 |
|
159 |
st.title('Film reviews classifier')
|
160 |
-
st.write('Write a film review in a box below, and the application, powered by
|
161 |
|
162 |
user_input = st.text_area("Enter your text:")
|
163 |
if st.button("Send a review for processing"):
|
|
|
11 |
import numpy as np
|
12 |
import torch.nn as nn
|
13 |
import transformers
|
14 |
+
import lightgbm as lgb
|
15 |
+
import pickle
|
16 |
nltk.download('wordnet')
|
17 |
nltk.download('stopwords')
|
18 |
from collections import Counter
|
19 |
from nltk.corpus import stopwords
|
20 |
from nltk.stem import WordNetLemmatizer
|
21 |
+
from nltk.tokenize import RegexpTokenizer
|
22 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
23 |
+
from sklearn.linear_model import LogisticRegression
|
24 |
|
25 |
stop_words = set(stopwords.words('english'))
|
26 |
|
27 |
+
|
28 |
+
# Deserialize the pre-fitted classical model artifacts that ship with the app.
# Both files are opened relative to the current working directory.
# NOTE(review): pickle executes arbitrary code on load; only ever load
# artifacts that come from this repository.
with open('logreg.pkl', 'rb') as model_file:
    logreg = pickle.load(model_file)

with open('tf.pkl', 'rb') as vectorizer_file:
    tf = pickle.load(vectorizer_file)
|
33 |
+
|
34 |
+
def classical_pipeline(text):
    """Clean a raw review and convert it into features for the classical model.

    The text is lowercased, digit runs are replaced with a space, ASCII
    punctuation is stripped, newline characters are deleted, every
    whitespace-separated word is lemmatized with WordNet, the result is
    re-tokenized into word-character runs, English stop words are dropped,
    and the cleaned string is passed to the fitted ``tf`` vectorizer.

    Args:
        text: Raw review text.

    Returns:
        Whatever ``tf.transform`` returns for the single cleaned document.
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Newlines are deleted rather than replaced with a space, so words that
    # are separated only by a line break get merged.
    # NOTE(review): presumably this mirrors the preprocessing used when the
    # vectorizer was fitted - confirm before changing it.
    text = re.sub(r'\n', '', text)

    wn_lemmatizer = WordNetLemmatizer()
    lemmatized = ' '.join(wn_lemmatizer.lemmatize(word) for word in text.split())

    # Raw string: '\w' inside a plain literal is an invalid escape sequence
    # and triggers a warning on modern Python versions.
    reg_tokenizer = RegexpTokenizer(r'\w+')
    tokens = reg_tokenizer.tokenize(lemmatized)

    # Reuse the module-level ``stop_words`` set instead of re-reading the
    # corpus and scanning a list for every token on every call.
    cleaned = ' '.join(word for word in tokens if word not in stop_words)
    return tf.transform([cleaned])
|
47 |
+
|
48 |
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict):
|
49 |
preprocessed_string = data_preprocessing(input_string)
|
50 |
result_list = []
|
|
|
150 |
model_lstm.to('cpu').eval()
|
151 |
|
152 |
|
153 |
+
def predict_sentence_classical(text: str):
    """Classify a review with the logistic regression model and time the call.

    Args:
        text: Raw review text.

    Returns:
        A ``(label, seconds)`` tuple: the first prediction returned by
        ``logreg.predict`` and the wall-clock time spent on preprocessing
        plus prediction.
    """
    started = time.time()
    features = classical_pipeline(text)
    label = logreg.predict(features)[0]
    finished = time.time()
    return label, finished - started
|
160 |
+
|
161 |
|
162 |
def predict_sentence_lstm(text: str):
|
163 |
start_time = time.time()
|
|
|
184 |
reses = {0: 'negative', 1: 'positive'}
|
185 |
|
186 |
def process_text(input_text):
    """Run every model on the review and write the results to the page.

    Each predictor returns a ``(label, seconds)`` pair; the label is looked
    up in ``reses`` to obtain the text shown to the user.

    Args:
        input_text: Review text entered by the user.
    """
    res_classical, time_classical = predict_sentence_classical(input_text)
    res_lstm, time_lstm = predict_sentence_lstm(input_text)
    res_bert, time_bert = predict_sentence_bert(input_text)
    st.write('Results:')
    # Report the logistic regression's own prediction and timing; this line
    # previously repeated the LSTM values.
    st.write(f'Logistic regression: {reses[res_classical]}, execution time: {time_classical:.2f} seconds.')
    st.write(f'LSTM: {reses[res_lstm]}, execution time: {time_lstm:.2f} seconds.')
    st.write(f'Upgraded Bert: {reses[res_bert]}, execution time: {time_bert:.2f} seconds.')
|
194 |
|
195 |
st.title('Film reviews classifier')
|
196 |
+
st.write('Write a film review in a box below, and the application, powered by three NLP models (logistic regression, LSTM and upgraded Bert), will tell if it is a positive or a negative review.')
|
197 |
|
198 |
user_input = st.text_area("Enter your text:")
|
199 |
if st.button("Send a review for processing"):
|
requirements.txt
CHANGED
@@ -62,6 +62,7 @@ rich==13.4.2
|
|
62 |
rpds-py==0.9.2
|
63 |
safetensors==0.3.1
|
64 |
six==1.16.0
|
|
|
65 |
smmap==5.0.0
|
66 |
streamlit==1.24.1
|
67 |
sympy==1.12
|
|
|
62 |
rpds-py==0.9.2
|
63 |
safetensors==0.3.1
|
64 |
six==1.16.0
|
65 |
+
scikit-learn
|
66 |
smmap==5.0.0
|
67 |
streamlit==1.24.1
|
68 |
sympy==1.12
|
tf.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:734f5cfcd6c4033bc5cdb18e6750660b207cdf0abd4ff6e8cc0c7d25d90b14e9
|
3 |
+
size 2072875
|