#!/usr/bin/env python
# coding: utf-8
"""Streamlit app: rating prediction and sentiment analysis for product reviews.

Two modes are selectable from the sidebar:

* "Ratings Prediction" cleans the entered text, vectorises it with a pickled
  TF-IDF vectorizer and feeds it to a pickled CatBoost model.
* "Sentiment Analysis" splits the entered text on commas, scores every piece
  with the ``cardiffnlp/twitter-roberta-base-sentiment`` model and plots how
  many reviews fall into each sentiment class.
"""

import ast
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
import streamlit as st
import transformers
from catboost import CatBoostClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.special import softmax
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

nltk.download('stopwords')
nltk.download('wordnet')

# Hugging Face model id used for sentiment scoring.
SENTIMENT_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"


def load_model():
    """Return the model unpickled from ``catboost_model.pkl``.

    NOTE: pickle executes arbitrary code on load; the file must come from a
    trusted source.
    """
    with open('catboost_model.pkl', 'rb') as file:
        return pickle.load(file)


def load_vectorizer():
    """Return the vectorizer unpickled from ``tfidf_vectorizer.pkl``.

    NOTE: pickle executes arbitrary code on load; the file must come from a
    trusted source.
    """
    with open('tfidf_vectorizer.pkl', 'rb') as file:
        return pickle.load(file)


def ratings(list_of_reviews):
    """Predict a rating for the given review text.

    Args:
        list_of_reviews: The raw review text as a single string (despite the
            name, it is processed as one document).

    Returns:
        Whatever ``model.predict`` returns for the single transformed
        document (an array-like holding one prediction).
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once instead of once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Keep ASCII letters only, lower-case, drop stop words, lemmatize.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', list_of_reviews).lower()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]
    cleaned_review = ' '.join(tokens)

    # Transform the new review using the loaded vectorizer.
    tf_review = load_vectorizer().transform([cleaned_review])
    return load_model().predict(tf_review)


def sentiment_analysis(texts):
    """Score each text with the RoBERTa sentiment model.

    Args:
        texts: Iterable of strings to score.

    Returns:
        A list with one entry per text; each entry is the list of softmax
        probabilities over the model's output classes. The caller in this
        file reads them as [negative, neutral, positive].
    """
    tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL)

    results = []
    for text in texts:
        # Truncate to 512 tokens so long reviews do not overflow the model.
        encoded_input = tokenizer(
            text, return_tensors='pt', max_length=512, truncation=True
        )
        output = model(**encoded_input)
        logits = output[0][0].detach().numpy()
        results.append(softmax(logits).tolist())
    return results


def get_sentiment_label(row):
    """Return the label of the strictly largest score in ``row``.

    ``row`` must provide ``positive_score``, ``neutral_score`` and
    ``negative_score``. Anything that is not a strict win for positive or
    negative (including ties) is labelled ``'neutral'``.
    """
    positive = row['positive_score']
    neutral = row['neutral_score']
    negative = row['negative_score']
    if positive > neutral and positive > negative:
        return 'positive'
    if negative > neutral and negative > positive:
        return 'negative'
    return 'neutral'


def get_rating_category(rating):
    """Map a numeric rating to the descriptive phrase shown to the user."""
    if rating < 2.0:
        return "between 1 and 2 which is Very Low"
    elif rating < 3.0:
        return "between 2 and 3 which is Low"
    elif rating < 4.0:
        return "between 3 and 4 which is Medium"
    elif rating < 5.0:
        return "between 4 and 5 which is High"
    else:
        return "5 which is Very High"


def plot_sentiment_distribution(df):
    """Return a bar-chart figure counting the rows of ``df`` per sentiment.

    ``df`` must contain a ``sentiment`` column. An explicit figure is built
    and returned so the caller can render it and then close it.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x='sentiment', color='blue', ax=ax)

    # Display values on top of the bars.
    for bar in ax.patches:
        ax.annotate(
            f'{bar.get_height()}',
            (bar.get_x() + bar.get_width() / 2, bar.get_height()),
            ha='center',
            va='bottom',
        )

    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    ax.set_title('Sentiment Distribution')
    return fig


# The former st.set_option('deprecation.showPyplotGlobalUse', False) call was
# dropped: it only silenced the warning for st.pyplot() calls that rely on the
# global figure, and the chart below now passes an explicit figure.

# Create two columns
col1, col2 = st.columns([0.5, 1.2])  # Adjust the ratio as needed

# Column 1: Image
with col1:
    st.image("img2.png", width=200)  # Adjust the path and width as needed

# Column 2: Text
with col2:
    st.write("# Ratings Prediction & Reviews Sentiment Analysis App")
    st.write(" This app predicts **the average rating of a product, given a list of reviews and also displays the sentiment of these reviews**!")

st.write('---')

sidebar_selection = st.sidebar.radio("Select an option:", ("Ratings Prediction", "Sentiment Analysis"))
list_reviews = st.text_input("Enter the list of reviews: ")
# Treat whitespace-only input the same as no input.
sentiment_review = list_reviews.strip()
ratings_review = list_reviews.strip()
submit_button = st.button("Submit")

if sidebar_selection == "Ratings Prediction":
    if submit_button and ratings_review:
        rating_pred = ratings(ratings_review)
        # predict() returns an array-like; reduce it to a plain scalar before
        # comparing against the category thresholds.
        rating_value = float(np.ravel(rating_pred)[0])
        rating_category = get_rating_category(rating_value)
        st.write(f"Based on the list of reviews provided, your average rating falls {rating_category}.")
    elif submit_button:
        # Submit was clicked but no review was provided.
        st.write("Please enter a review to get a prediction.")

elif sidebar_selection == "Sentiment Analysis":
    if submit_button:
        # Split the input on commas; skip blank fragments (e.g. from a
        # trailing comma) so they are not scored as reviews.
        review_list = [piece.strip() for piece in sentiment_review.split(',')]
        review_list = [piece for piece in review_list if piece]

        if review_list:
            df = pd.DataFrame(review_list, columns=['Review'])
            scores = sentiment_analysis(df['Review'])
            df['negative_score'] = [score[0] for score in scores]
            df['neutral_score'] = [score[1] for score in scores]
            df['positive_score'] = [score[2] for score in scores]
            df['sentiment'] = df.apply(get_sentiment_label, axis=1)

            # Display the sentiment distribution chart using Streamlit.
            st.write("**Sentiment Distribution:**")
            fig = plot_sentiment_distribution(df)
            st.pyplot(fig)
            # Streamlit reruns the script on every interaction; close the
            # figure so matplotlib does not accumulate one per rerun.
            plt.close(fig)
        else:
            st.write("Please enter at least one review to analyze.")