import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
# transformers/torch back the optional BERT section below; SVC and
# GradientBoostingClassifier are likewise only referenced in disabled entries.
from transformers import BertTokenizer, BertForSequenceClassification
import torch

@st.cache_data
def load_data():
    # Cached: the CSV is parsed once and reused across reruns.
    return pd.read_csv('IMDB Dataset.csv')

# Trained models and their metrics persist in session_state across reruns.
if 'models' not in st.session_state:
    st.session_state.models = {}
if 'reports' not in st.session_state:
    st.session_state.reports = {}
if 'accuracy' not in st.session_state:
    st.session_state.accuracy = {}

df = load_data()

# Encode labels: positive -> 1, negative -> 0.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
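# Optional cleanup sketch (an assumption about this CSV, not something the
# app does): the widely circulated IMDB review dump embeds HTML line breaks
# like "<br />", which TF-IDF would otherwise keep as noisy tokens.
# df['review'] = df['review'].str.replace(r'<br\s*/?>', ' ', regex=True)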

X = df['review']
y = df['sentiment']
# 80/20 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
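# The IMDB dump is typically balanced, but a stratified split (an optional
# tweak, not in the call above) would guarantee the same class ratio in both
# halves:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)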

if not st.session_state.models:
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    # Keep the fitted vectorizer: the Predict button below needs it on later
    # reruns, when this training block is skipped.
    st.session_state.vectorizer = vectorizer

    # Models to compare (SVM and Gradient Boosting left disabled).
    models = {
        # "SVM": SVC(kernel='linear'),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=10),
        # "Gradient Boosting": GradientBoostingClassifier(),
    }

    # Fit each model, then record test accuracy and a per-class report.
    for name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        st.session_state.models[name] = model
        y_pred = model.predict(X_test_tfidf)
        st.session_state.accuracy[name] = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        st.session_state.reports[name] = pd.DataFrame(report).transpose()
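    # Optional persistence sketch (assumes joblib is available; file names are
    # illustrative): session_state survives reruns but not a process restart,
    # so dumping the fitted artifacts would avoid retraining after a restart.
    # import joblib
    # joblib.dump(st.session_state.vectorizer, 'tfidf_vectorizer.joblib')
    # for name, model in st.session_state.models.items():
    #     joblib.dump(model, f'{name}.joblib')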

    # Optional BERT fine-tuning (disabled). This draft fixes two bugs in the
    # earlier version: batches now come from a DataLoader (iterating a
    # TensorDataset directly yields single, unbatched samples the model
    # cannot consume), and the AdamW optimizer is named `optimizer` rather
    # than the misleading `training_args`.
    # st.session_state.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # st.session_state.bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # train_encodings = st.session_state.bert_tokenizer(list(X_train), truncation=True, padding=True, return_tensors='pt')
    # train_labels = torch.tensor(y_train.values)
    # train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

    # optimizer = torch.optim.AdamW(st.session_state.bert_model.parameters(), lr=1e-5)
    # st.session_state.bert_model.train()
    # for epoch in range(1):
    #     for input_ids, attention_mask, labels in train_loader:
    #         outputs = st.session_state.bert_model(input_ids, attention_mask, labels=labels)
    #         outputs.loss.backward()
    #         optimizer.step()
    #         optimizer.zero_grad()

    # st.session_state.bert_model.eval()
    # # Single forward pass over the whole test set: fine as a sketch, but
    # # batch this too for real use.
    # test_encodings = st.session_state.bert_tokenizer(list(X_test), truncation=True, padding=True, return_tensors='pt')
    # with torch.no_grad():
    #     outputs = st.session_state.bert_model(test_encodings['input_ids'], test_encodings['attention_mask'])
    # predictions = torch.argmax(outputs.logits, dim=1).numpy()
    # st.session_state.accuracy["BERT"] = accuracy_score(y_test, predictions)
    # report = classification_report(y_test, predictions, output_dict=True)
    # st.session_state.reports["BERT"] = pd.DataFrame(report).transpose()
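    # A much simpler pretrained route (a sketch; uses the transformers
    # pipeline API with its default English sentiment checkpoint instead of
    # fine-tuning BERT here):
    # from transformers import pipeline
    # sentiment = pipeline('sentiment-analysis')
    # sentiment('A wonderful film!')  # -> [{'label': 'POSITIVE', 'score': ...}]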

if st.session_state.accuracy:
    # Bar chart of test accuracy for every trained model; slice the palette
    # so the color list always matches the number of bars.
    colors = ['blue', 'orange', 'green', 'red', 'purple'][:len(st.session_state.accuracy)]
    fig = plt.figure(figsize=(10, 5))
    plt.bar(st.session_state.accuracy.keys(), st.session_state.accuracy.values(), color=colors)
    plt.ylabel('Accuracy')
    plt.title('Model Accuracy Comparison')
    st.pyplot(fig)

    # One classification report (precision/recall/F1 per class) per model.
    for name, report_df in st.session_state.reports.items():
        st.header(name, divider='orange')
        st.dataframe(report_df)
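    # Native-chart alternative (a sketch; same numbers, no matplotlib):
    # st.bar_chart(pd.Series(st.session_state.accuracy, name='Accuracy'))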

st.header("Manual Tryouts")
user_input = st.text_area("Review", "")  

if st.button("Predict"):  
    if user_input:  
        user_input_tfidf = vectorizer.transform([user_input])  

        predictions = {}  
        for name, model in st.session_state.models.items():  
            prediction = model.predict(user_input_tfidf)  
            predictions[name] = "Positive" if prediction[0] == 1 else "Negative"  

        # inputs = st.session_state.bert_tokenizer(user_input, return_tensors='pt', truncation=True, padding=True)  
        # with torch.no_grad():  
        #     output = st.session_state.bert_model(inputs['input_ids'], inputs['attention_mask'])  
        # bert_prediction = torch.argmax(output.logits, dim=1).item()  
        # predictions["BERT"] = "Positive" if bert_prediction == 1 else "Negative"  

        st.write("Predicted Sentiment:")  
        for name in predictions:  
            st.write(f"{name}: **{predictions[name]}**")  
    else:  
        st.write("Please enter a review.")