IvaElen committed on
Commit d64b41c · 1 parent: 2722240

Upload app_connected.py

Files changed (1)
app_connected.py +184 -0
app_connected.py ADDED
@@ -0,0 +1,184 @@
import streamlit as st
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
import string
from nltk.stem import WordNetLemmatizer
import time
import transformers
import json

from biLSTM1 import biLSTM
from lstm_preprocessing import (
    data_preprocessing,
    get_words_by_freq,
    padding,
    preprocess_single_string
)

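# The app compares three sentiment classifiers on the same review:
#   1) TF-IDF features + LogisticRegression
#   2) fine-tuned ERNIE 2.0 embeddings + LogisticRegression
#   3) a bidirectional LSTM
# Each Streamlit column below shows the predicted label and the processing time.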
# 1 - Lesha, 2 - Lena, 3 - Gala
# +++++++++++
# 1 - Lesha

# Load the saved logistic regression model and TF-IDF vectorizer
with open('logistic_regression_model.pkl', 'rb') as file:
    loaded_model_1 = pickle.load(file)

with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer_1 = pickle.load(file)

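# NOTE: stopword removal and lemmatization below need the NLTK 'stopwords' and 'wordnet'
# corpora. If they are not already present in the environment, they can be downloaded once:
#   import nltk
#   nltk.download('stopwords')
#   nltk.download('wordnet')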
# Load the stop words
stop_words = stopwords.words('english')
# Create a tokenizer
tokenizer = RegexpTokenizer(r'\w+')

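# Note: the data_preprocessing defined here replaces the one imported from lstm_preprocessing.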
def data_preprocessing(text: str) -> str:
    """Preprocess a string: lowercase, remove HTML tags, punctuation and stopwords.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: preprocessed string
    """
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # remove HTML tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    lemmatizer = WordNetLemmatizer()
    tokens = tokenizer.tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if not word.isdigit() and word not in stop_words]
    return ' '.join(tokens)

# ++++
# Lena

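# Model 2: a fine-tuned ERNIE 2.0 encoder produces sentence embeddings (pooler output),
# and a pickled LogisticRegression classifier maps them to positive/negative.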
def load_model_l():
    model_finetuned = transformers.AutoModel.from_pretrained(
        "nghuyong/ernie-2.0-base-en",
        output_attentions=False,
        output_hidden_states=False
    )
    model_finetuned.load_state_dict(torch.load('ErnieModel_imdb.pt', map_location=torch.device('cpu')))
    tokenizer = transformers.AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")
    return model_finetuned, tokenizer

def preprocess_text(text_input, max_len, tokenizer):
    input_tokens = tokenizer(
        text_input,
        return_tensors='pt',
        padding=True,
        max_length=max_len,
        truncation=True
    )
    return input_tokens

def predict_sentiment(model, input_tokens):
    id2label = {0: "negative", 1: "positive"}
    output = model(**input_tokens).pooler_output.detach().numpy()
    with open('LogReg_imdb_Ernie.pkl', 'rb') as file:
        cls = pickle.load(file)
    result = id2label[int(cls.predict(output))]
    return result

# ++++
# Gala
with open('/home/galkalin/nlp_project/vocab_to_int.json', 'r') as fp:
    vocab_to_int = json.load(fp)

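# Model 3: a bidirectional LSTM (biLSTM1.biLSTM). The hyperparameters below must match
# the ones used to train the saved checkpoint so that load_state_dict succeeds.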
VOCAB_SIZE = len(vocab_to_int) + 1
EMBEDDING_DIM = 32
HIDDEN_DIM = 64
N_LAYERS = 3
SEQ_LEN = 128

def load_model_g():
    model = biLSTM(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        n_layers=N_LAYERS
    )
    model.load_state_dict(torch.load('biLSTM_model_do_05_lr001_best.pt', map_location=torch.device('cpu')))
    return model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def predict_sentence(text: str, model: nn.Module) -> str:
    id2label = {0: "negative", 1: "positive"}
    output = model.to(device)(preprocess_single_string(text, SEQ_LEN, vocab_to_int).unsqueeze(0).to(device))
    pred = int(output.round().item())
    result = id2label[pred]
    return result


# ++++++
# Lesha

# Create the Streamlit app
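# (a single review is sent to all three models; each column shows the label and the timing)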
def main():
    st.title('Sentiment Analysis App')
    st.header('Classic ML, ErnieModel, bidirectional LSTM')
    user_input = st.text_area('Please enter your review:')
    st.write(user_input)
    submit = st.button("Predict!")
    col1, col2, col3 = st.columns(3)
    if user_input is not None and submit:
        with col1:
            # Preprocess the user input
            preprocessed_input_1 = data_preprocessing(user_input)
            # Vectorize the preprocessed input
            input_vector = vectorizer_1.transform([preprocessed_input_1])
            start_time = time.time()
            proba_1 = loaded_model_1.predict_proba(input_vector)[:, 1]
            # Predict the sentiment using the loaded model
            # prediction = loaded_model.predict(input_vector)[0]
            prediction_1 = round(proba_1[0])
            end_time = time.time()
            # Display the predicted sentiment
            if prediction_1 == 0:
                st.write('The sentiment of your review is negative.')
                st.write('Predicted probability:', (1 - round(proba_1[0], 2)) * 100, '%')
            else:
                st.write('The sentiment of your review is positive.')
                st.write('Predicted probability:', round(proba_1[0], 2) * 100, '%')
            st.write('Processing time:', round(end_time - start_time, 4), 'seconds')
    # Lena
    if user_input is not None and submit:
        with col2:
            model2, tokenizer = load_model_l()
            start_time = time.time()
            input_tokens = preprocess_text(user_input, 500, tokenizer)
            output = predict_sentiment(model2, input_tokens)
            end_time = time.time()
            st.write('The sentiment of your review is', output)
            st.write('Processing time:', round(end_time - start_time, 4), 'seconds')
    # Gala
    if user_input is not None and submit:
        with col3:
            model3 = load_model_g()
            start_time = time.time()
            output = predict_sentence(user_input, model3)
            end_time = time.time()
            st.write('The sentiment of your review is', output)
            st.write('Processing time:', round(end_time - start_time, 4), 'seconds')


if __name__ == '__main__':
    main()