|
import streamlit as st |
|
import time |
|
import os |
|
import logging |
|
import torch |
|
import json |
|
import string |
|
import re |
|
import string |
|
import nltk |
|
import numpy as np |
|
import torch.nn as nn |
|
import transformers |
|
import lightgbm as lgb |
|
import pickle |
|
# Fetch the NLTK corpora needed below: 'wordnet' for lemmatization and
# 'stopwords' for stopword removal (no-op if already present locally).
nltk.download('wordnet')

nltk.download('stopwords')
|
from collections import Counter |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
from nltk.tokenize import RegexpTokenizer |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.linear_model import LogisticRegression |
|
|
|
# English stopword set used by data_preprocessing (requires the NLTK
# 'stopwords' corpus downloaded above).
stop_words = set(stopwords.words('english'))




# Pre-fitted logistic-regression classifier for the "classical" pipeline.
# NOTE(review): pickle.load can execute arbitrary code from the file —
# only load artifacts from a trusted source.
with open('logreg.pkl', 'rb') as f:

    logreg = pickle.load(f)



# Pre-fitted TF-IDF vectorizer matching the classifier above.
with open('tf.pkl', 'rb') as f:

    tf = pickle.load(f)
|
|
|
def classical_pipeline(text):
    """Preprocess raw review text and vectorize it for the classical model.

    Lowercases, replaces digits with spaces, strips punctuation and newlines,
    lemmatizes, removes English stopwords, then applies the pre-fitted TF-IDF
    vectorizer ``tf``.

    Args:
        text: Raw review string.

    Returns:
        A one-row sparse TF-IDF feature matrix (output of ``tf.transform``).
    """
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)  # digits -> spaces
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\n', '', text)
    wn_lemmatizer = WordNetLemmatizer()
    text = ' '.join(wn_lemmatizer.lemmatize(word) for word in text.split())
    # Raw string: '\w' in a plain literal is a deprecated invalid escape
    # sequence in modern Python.
    reg_tokenizer = RegexpTokenizer(r'\w+')
    # tokenize() on the single string replaces the original
    # tokenize_sents([text])[0] round-trip; the tokens are identical.
    tokens = reg_tokenizer.tokenize(text)
    sw = set(stopwords.words('english'))  # set membership is O(1) per word
    text = ' '.join(word for word in tokens if word not in sw)
    return tf.transform([text])
|
|
|
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict) -> torch.Tensor:
    """Encode a raw string as a fixed-length tensor of vocabulary ids.

    Args:
        input_string: Raw review text.
        seq_len: Target sequence length (left-padded with zeros / truncated).
        vocab_to_int: Token -> integer-id mapping; out-of-vocabulary words
            are silently dropped.

    Returns:
        A 1-D ``torch.Tensor`` of length ``seq_len``.
    """
    preprocessed_string = data_preprocessing(input_string)
    # Membership filter instead of a per-word try/except KeyError: same
    # behavior (unknown words dropped) without raising an exception per miss.
    result_list = [
        vocab_to_int[word]
        for word in preprocessed_string.split()
        if word in vocab_to_int
    ]
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
|
|
|
|
|
|
|
def padding(reviews_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or truncate) integer-encoded reviews to a fixed length.

    Reviews shorter than ``seq_len`` are left-padded with zeros; longer ones
    keep only their first ``seq_len`` tokens.

    Args:
        reviews_int: List of reviews, each a list of integer token ids.
        seq_len: Target sequence length.

    Returns:
        An int ndarray of shape ``(len(reviews_int), seq_len)``.
    """
    features = np.zeros((len(reviews_int), seq_len), dtype=int)
    for i, review in enumerate(reviews_int):
        trimmed = review[:seq_len]
        # Assign directly into the tail of the pre-zeroed row instead of
        # building an intermediate Python list of zeros per review.
        if trimmed:
            features[i, seq_len - len(trimmed):] = trimmed
    return features
|
|
|
|
|
def data_preprocessing(text: str):
    """Clean review text for the LSTM pipeline.

    Lowercases, strips HTML-like tags and punctuation, drops English
    stopwords (module-level ``stop_words``), and lemmatizes what remains.
    Returns the cleaned text as a single space-joined string.
    """
    lemmatizer = WordNetLemmatizer()
    lowered = text.lower()
    no_tags = re.sub('<.*?>', '', lowered)
    no_punct = ''.join(ch for ch in no_tags if ch not in string.punctuation)
    kept = (
        lemmatizer.lemmatize(token)
        for token in no_punct.split()
        if token not in stop_words
    )
    return ' '.join(kept)
|
|
|
# Token -> integer-id vocabulary used to encode input for the LSTM model.
with open('lstm_vocab_to_int.json') as json_file:

    vocab_to_int = json.load(json_file)



# Pre-trained word-embedding matrix, rows indexed by the vocabulary above.
with open('lstm_embedding_matrix.npy', 'rb') as f:

    embedding_matrix = np.load(f)



# Frozen embedding layer shared by LSTMClassifier instances.
embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
|
|
|
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM sentiment classifier over pre-trained embeddings.

    Uses the module-level frozen ``embedding_layer``; the flattened LSTM
    output across all time steps feeds a two-layer linear head that emits
    a single raw logit (apply sigmoid outside).
    """

    def __init__(self, embedding_dim: int, seq_len:int, hidden_size:int = 32, dropout:float = 0, num_layers:int = 1) -> None:
        """Args:
            embedding_dim: Size of each word-embedding vector.
            seq_len: Fixed input sequence length (the linear head's input
                size depends on it).
            hidden_size: LSTM hidden-state size per direction.
            dropout: Inter-layer LSTM dropout probability — only effective
                when num_layers > 1 (PyTorch warns otherwise).
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()



        self.embedding_dim = embedding_dim

        self.hidden_size = hidden_size

        # Shared pre-trained (frozen) embedding defined at module level.
        self.embedding = embedding_layer

        self.dropout = dropout

        self.num_layers = num_layers

        self.seq_len = seq_len

        self.lstm = nn.LSTM(

            input_size=self.embedding_dim,

            hidden_size=self.hidden_size,

            batch_first=True,

            bidirectional=True,

            dropout=self.dropout,

            num_layers=self.num_layers

        )

        # Head input: hidden_size per direction * 2 directions * seq_len steps.
        self.linear = nn.Sequential(

            nn.Linear(self.hidden_size * self.seq_len * 2, 128),

            nn.Linear(128, 1)

        )



    def forward(self, x):
        """Forward pass.

        Args:
            x: LongTensor of token ids, shape (batch, seq_len).

        Returns:
            Raw logits. NOTE(review): squeeze(0) drops the batch dimension
            when batch size is 1, so the output is then 1-D — the caller's
            sigmoid + round tolerates either shape, but confirm before
            feeding larger batches.
        """
        embeddings = self.embedding(x)

        output, _ = self.lstm(embeddings)

        # Flatten all time steps into one feature vector per sample.
        output = output.contiguous().view(output.size(0), -1)

        out = self.linear(output.squeeze(0))

        return out
|
|
|
# DistilBERT tokenizer + base model used as the backbone for BertReviews.
bert_model_class = transformers.DistilBertModel

bert_tokenizer_class = transformers.DistilBertTokenizer

# NOTE(review): these weights are loaded but never referenced anywhere
# below — apparently leftover; confirm before removing the file dependency.
bert_pretrained_weights = torch.load('basic_bert_weights.pt', map_location=torch.device('cpu'))

bert_tokenizer = bert_tokenizer_class.from_pretrained('distilbert-base-uncased')

bert_basic_model = bert_model_class.from_pretrained('distilbert-base-uncased')
|
|
|
class BertReviews(nn.Module):
    """DistilBERT backbone with a single-logit classification head.

    The backbone is frozen except for each transformer layer's output
    LayerNorm parameters, which remain trainable.
    """

    def __init__(self, model):
        super().__init__()
        self.bert = model
        # Freeze the entire backbone first...
        for param in self.bert.parameters():
            param.requires_grad = False
        # ...then re-enable gradients on the output LayerNorm of each of
        # the six DistilBERT transformer layers.
        for idx in range(6):
            norm = self.bert.transformer.layer[idx].output_layer_norm
            norm.weight.requires_grad = True
            norm.bias.requires_grad = True
        self.fc = nn.Linear(768, 1)

    def forward(self, samples, att_masks):
        """Return (backbone outputs, logits from the first-token state)."""
        embeddings = self.bert(samples, attention_mask=att_masks)
        # Classify from the hidden state of the first ([CLS]) token.
        model_out = self.fc(embeddings[0][:, 0, :])
        return embeddings, model_out
|
|
|
# Instantiate the fine-tuned BERT classifier and load its trained weights
# onto CPU; eval() disables dropout for inference.
bert_model = BertReviews(bert_basic_model)

bert_model.load_state_dict(torch.load('bert_weights.pt', map_location=torch.device('cpu')))

bert_model.to('cpu').eval()



# LSTM model; hyperparameters must match those used when the checkpoint
# 'lstm_model_weights.pt' was trained.
model_lstm = LSTMClassifier(embedding_dim=64, hidden_size=64, seq_len = 150, dropout=0.5, num_layers=4)

model_lstm.load_state_dict(torch.load('lstm_model_weights.pt', map_location=torch.device('cpu')))

model_lstm.to('cpu').eval()
|
|
|
|
|
def predict_sentence_classical(text: str):
    """Classify a review with the TF-IDF + logistic-regression pipeline.

    Args:
        text: Raw review string.

    Returns:
        Tuple of (predicted label 0/1, elapsed seconds).
    """
    # perf_counter is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()
    features = classical_pipeline(text)
    res = logreg.predict(features)[0]
    execution_time = time.perf_counter() - start_time
    return res, execution_time
|
|
|
|
|
def predict_sentence_lstm(text: str):
    """Classify a review with the LSTM model.

    Args:
        text: Raw review string.

    Returns:
        Tuple of (predicted label 0/1, elapsed seconds).
    """
    start_time = time.perf_counter()  # monotonic clock for durations
    encoded = preprocess_single_string(text, 150, vocab_to_int)
    # no_grad: pure inference, skip building the autograd graph.
    with torch.no_grad():
        logit = model_lstm(encoded.unsqueeze(0))
    res = int(torch.sigmoid(logit).cpu().numpy().round())
    execution_time = time.perf_counter() - start_time
    return res, execution_time
|
|
|
def predict_sentence_bert(text: str):
    """Classify a review with the fine-tuned DistilBERT model.

    Args:
        text: Raw review string.

    Returns:
        Tuple of (predicted label 0/1, elapsed seconds).
    """
    start_time = time.perf_counter()  # monotonic clock for durations
    token_ids = bert_tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=200)
    # Right-pad the token ids with zeros up to the fixed length of 200.
    batch = np.array([token_ids + [0] * (200 - len(token_ids))])
    # Attention mask: 1 for real tokens, 0 for padding.
    attention_mask = torch.Tensor(np.where(batch != 0, 1, 0)).to(torch.int64)
    inputs = torch.Tensor(batch).to(torch.int64)
    # no_grad: pure inference, skip building the autograd graph.
    with torch.no_grad():
        _, logit = bert_model(inputs, attention_mask)
    res = int(torch.sigmoid(logit).cpu().numpy().round())
    execution_time = time.perf_counter() - start_time
    return res, execution_time
|
|
|
# Map model output label -> human-readable sentiment.
reses = {0: 'negative', 1: 'positive'}
|
|
|
def process_text(input_text):
    """Run all three models on the input and write their results to the page.

    Args:
        input_text: Review text entered in the UI.
    """
    res_classical, time_classical = predict_sentence_classical(input_text)
    res_lstm, time_lstm = predict_sentence_lstm(input_text)
    res_bert, time_bert = predict_sentence_bert(input_text)
    st.write('Results:')
    # BUG FIX: this line previously reported the LSTM result/time under the
    # "Logistic regression" label instead of the classical model's own.
    st.write(f'Logistic regression: {reses[res_classical]}, execution time: {time_classical:.2f} seconds.')
    st.write(f'LSTM: {reses[res_lstm]}, execution time: {time_lstm:.2f} seconds.')
    st.write(f'Upgraded Bert: {reses[res_bert]}, execution time: {time_bert:.2f} seconds.')
|
|
|
# --- Streamlit UI ---
st.title('Film reviews classifier')

st.write('Write a film review in a box below, and the application, powered by three NLP models (logistic regression, LSTM and upgraded Bert), will tell if it is a positive or a negative review.')



user_input = st.text_area("Enter your text:")

if st.button("Send a review for processing"):

    if user_input:

        # process_text renders results directly and returns None, so the
        # previous dead assignment (processed_text = ...) was removed.
        process_text(user_input)

    else:

        st.warning("Please enter some text before processing.")
|
|