"""Streamlit app: star-rating sentiment and zero-shot classes for Excel text.

The user uploads an ``.xlsx`` file, picks the column holding free text and
enters a comma-separated list of class names.  Every non-empty text row is
scored with a 1-5 star sentiment model and with a zero-shot classifier over
the supplied class names; the results are shown and offered as a CSV download.
"""

import base64
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
import torch
import torch.nn.functional as F
from stqdm import stqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Star-rating sentiment model (five output classes: 1 to 5 stars).
# NOTE(review): Streamlit re-executes this script on every interaction, so
# these loads are repeated on each rerun unless they are cached (for example
# with st.cache_resource) - confirm the deployed Streamlit version first.
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

st.set_page_config(layout="wide")

# Zero-shot classifier used to score the user-supplied class names.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


def classify_reviews(reviews):
    """Return per-star probabilities for a batch of texts.

    Args:
        reviews: List of strings to score.

    Returns:
        A list with one entry per review; each entry is a list of softmax
        probabilities over the model's output classes.
    """
    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: skip autograd bookkeeping so no graph is kept in memory.
    with torch.no_grad():
        outputs = model(**inputs)
    return F.softmax(outputs.logits, dim=1).tolist()


def top_rating(scores):
    """Return the 1-based position of the highest score (the star rating)."""
    return scores.index(max(scores)) + 1


def top_prob(scores):
    """Return the highest score in *scores*."""
    return max(scores)


def get_table_download_link(df):
    """Return an HTML anchor that downloads *df* as a CSV file.

    The CSV is embedded in the link as a base64 data URI.  The result is raw
    HTML, so it has to be rendered by something that allows HTML output.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # The encoded payload was previously computed but never placed in the
    # returned string, so the function yielded plain text instead of a link.
    return f'<a href="data:file/csv;base64,{b64}" download="data.csv">Download csv file</a>'


def classify_with_new_classes(reviews, class_names):
    """Score every review against the user-defined class names.

    Args:
        reviews: Iterable of strings to classify.
        class_names: Candidate labels passed to the zero-shot classifier.

    Returns:
        A list with one entry per review; each entry lists the scores in the
        same order as *class_names*.
    """
    class_scores = []
    for review in reviews:
        result = classifier(review, class_names)
        # The result labels are re-keyed by name so that the scores can be
        # emitted in the caller's class_names order.
        score_by_label = dict(zip(result['labels'], result['scores']))
        class_scores.append([score_by_label[name] for name in class_names])
    return class_scores


def main():
    """Render the page, read the upload and run the analysis on demand."""
    st.title('Sentiment Analysis')
    st.markdown('Upload an Excel file to get sentiment analytics')

    file = st.file_uploader("Upload an excel file", type=['xlsx'])
    review_column = None
    df = None
    class_names = None

    if file is not None:
        # Only reading and cleaning are guarded; widget errors should surface.
        try:
            df = pd.read_excel(file)
            # Drop fully empty rows, turn whitespace-only cells into NaN, then
            # drop the rows that became fully empty as a result.
            df = df.dropna(how='all')
            df = df.replace(r'^\s*$', np.nan, regex=True)
            df = df.dropna(how='all')
        except Exception:
            st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
            return

        if len(df.columns) == 0:
            st.write('The uploaded file does not contain any columns.')
            return

        review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
        class_names = st.text_input('Enter the possible class names separated by comma')

    start_button = st.button('Start Analysis')
    if not start_button or df is None:
        return

    if review_column not in df.columns:
        st.write(f'No column named "{review_column}" found in the uploaded file.')
        return

    # Filter missing cells BEFORE converting to str: astype(str) turns NaN
    # into the literal text "nan", which would otherwise be classified.
    text = df[review_column]
    has_text = text.notna() & (text.astype(str).str.strip() != '')
    # Copy so that later column assignments do not write into a slice.
    df = df[has_text].copy()
    df[review_column] = df[review_column].astype(str)

    if df.empty:
        st.write('The selected column does not contain any text to analyse.')
        return

    # Split the class names, dropping blanks and duplicates (order preserved).
    stripped_names = (name.strip() for name in class_names.split(','))
    class_names = list(dict.fromkeys(name for name in stripped_names if name))
    if not class_names:
        st.write('Please enter at least one class name.')
        return

    with st.spinner('Performing sentiment analysis...'):
        df, df_display = process_reviews(df, review_column, class_names)
    display_ratings(df, review_column)
    display_dataframe(df, df_display)


def process_reviews(df, review_column, class_names, batch_size=50):
    """Score all reviews and build the result and display dataframes.

    The class-score columns and ``'Highest Class'`` are written into *df*
    itself before it is copied.

    Args:
        df: Dataframe holding the texts.
        review_column: Name of the column in *df* that contains the texts.
        class_names: Class names for the zero-shot classifier; one score
            column is created per name.
        batch_size: Number of reviews sent to the sentiment model at once.

    Returns:
        A ``(df_new, df_display)`` tuple.  Both have the same column order;
        *df_display* shows the star columns and ``'Probability'`` as
        percentage strings.
    """
    reviews = df[review_column].tolist()
    total_reviews = len(reviews)

    with st.spinner('Classifying reviews...'):
        progress_bar = st.progress(0)
        raw_scores = []
        for start in range(0, total_reviews, batch_size):
            batch_reviews = reviews[start:start + batch_size]
            raw_scores.extend(classify_reviews(batch_reviews))
            progress_bar.progress(len(raw_scores) / total_reviews)

    with st.spinner('Generating classes...'):
        class_scores = classify_with_new_classes(reviews, class_names)
        for position, name in enumerate(class_names):
            df[name] = [scores[position] for scores in class_scores]

    # Name of the class column with the highest score in each row.
    df['Highest Class'] = df[class_names].idxmax(axis=1)

    df_new = df.copy()
    df_new['raw_scores'] = raw_scores
    scores_to_df(df_new)
    df_display = scores_to_percent(df_new.copy())

    rating_columns = ['Weighted Rating', 'Rating', 'Probability',
                      '1 Star', '2 Star', '3 Star', '4 Star', '5 Star']
    generated = {review_column, 'raw_scores', 'Highest Class', *rating_columns, *class_names}
    # Columns that came from the uploaded file and are not the text column.
    remaining_columns = [col for col in df.columns if col not in generated]

    # Text first, generated columns next, the file's other columns last.
    ordered = [review_column, *rating_columns, *class_names, 'Highest Class', *remaining_columns]
    return df_new[ordered], df_display[ordered]


def scores_to_df(df):
    """Expand the ``'raw_scores'`` column of *df* into rating columns.

    Modifies *df* in place: adds ``'1 Star'`` to ``'5 Star'``, ``'Rating'``,
    ``'Probability'`` and ``'Weighted Rating'``, then drops ``'raw_scores'``.
    """
    for star in range(1, 6):
        df[f'{star} Star'] = df['raw_scores'].apply(
            lambda scores, idx=star - 1: scores[idx]
        ).round(2)

    df['Rating'] = df['raw_scores'].apply(top_rating)
    df['Probability'] = df['raw_scores'].apply(top_prob).round(2)
    # Weighted Rating: the sum of the (rounded) star columns, each multiplied
    # by its star number.
    df['Weighted Rating'] = sum(df[f'{star} Star'] * star for star in range(1, 6))
    df.drop(columns=['raw_scores'], inplace=True)


def scores_to_percent(df):
    """Format the star columns and ``'Probability'`` as percentage strings.

    Modifies *df* in place and returns it.
    """
    for column in [f'{star} Star' for star in range(1, 6)] + ['Probability']:
        df[column] = df[column].apply(lambda x: f'{x*100:.0f}%')
    return df


def convert_df_to_csv(df):
    """Return *df* serialised as UTF-8 encoded CSV bytes without the index."""
    return df.to_csv(index=False).encode('utf-8')


def display_dataframe(df, df_display):
    """Show a CSV download button for *df* and render *df_display*."""
    csv = convert_df_to_csv(df)
    # Nine columns are created and only the first is used, which keeps the
    # button in a narrow column at the left.
    first_column = st.columns(9)[0]
    with first_column:
        st.download_button(
            "Download CSV",
            csv,
            "data.csv",
            "text/csv",
            key='download-csv'
        )
    st.dataframe(df_display)


def display_ratings(df, review_column):
    """Show how many rows received each star rating, one column per rating.

    *review_column* is accepted for call compatibility and is not used.
    """
    cols = st.columns(5)
    for star in range(1, 6):
        rating_count = int((df['Rating'] == star).sum())
        cols[star - 1].markdown(f"### {rating_count}")
        cols[star - 1].markdown(f"{'⭐' * star}")


if __name__ == "__main__":
    main()