DevBM committed on
Commit
571bf3f
1 Parent(s): da29473

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. IMDB Dataset.csv +3 -0
  3. a.py +131 -0
  4. linear_regression.py +33 -0
  5. linear_regression_model.pkl +3 -0
  6. main.py +118 -0
  7. requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ IMDB[[:space:]]Dataset.csv filter=lfs diff=lfs merge=lfs -text
IMDB Dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc447764f82be365fa9c2beef4e8df89d3919e3da95f5088004797d79695aa2
3
+ size 66212309
a.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

st.title(":blue[IMDB Dataset of 50k reviews]")


@st.cache_data
def load_data():
    """Load the IMDB reviews CSV; cached so Streamlit reruns don't re-read ~66 MB."""
    return pd.read_csv('IMDB Dataset.csv')


# Streamlit re-executes this script top-to-bottom on every widget interaction,
# so every fitted artifact must live in session_state to survive reruns.
if 'models' not in st.session_state:
    st.session_state.models = {}
if 'vectorizer' not in st.session_state:
    st.session_state.vectorizer = None
if 'accuracy' not in st.session_state:
    st.session_state.accuracy = {}
if 'report' not in st.session_state:
    st.session_state.report = {}

# Dataset
st.header("Dataset")
df = load_data()
with st.expander("Show Data"):
    st.write(df)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41)

# Fit vectorizer + models once per session.  (Previously a second
# TfidfVectorizer was also fitted unconditionally at module level on every
# rerun — redundant and expensive on 40k documents.)
if not st.session_state.models:
    st.session_state.vectorizer = TfidfVectorizer()
    X_train_tfidf = st.session_state.vectorizer.fit_transform(X_train)
    X_test_tfidf = st.session_state.vectorizer.transform(X_test)

    # models
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(),
    }

    for name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        st.session_state.models[name] = model
        y_pred = model.predict(X_test_tfidf)
        st.session_state.accuracy[name] = accuracy_score(y_test, y_pred)
        # output_dict=True so st.dataframe renders a real table; feeding the
        # plain-string report to st.dataframe displayed one unreadable string.
        report = classification_report(y_test, y_pred, output_dict=True)
        st.session_state.report[name] = pd.DataFrame(report).transpose()

if st.session_state.accuracy:

    plt.figure(figsize=(10, 5))
    plt.bar(st.session_state.accuracy.keys(),
            st.session_state.accuracy.values(),
            color=['blue', 'orange', 'green'])
    plt.ylabel('Accuracy')
    plt.title('Model Accuracy Comparison')
    st.pyplot(plt)

    for name in st.session_state.report:
        st.write(f"### Classification Report for {name}:")
        st.dataframe(st.session_state.report[name])

st.header("Manual Tryouts", divider='orange')
# Input text from the user
user_input = st.text_area("Enter your Review", "")

if st.button("Predict"):
    if user_input:
        # Vectorize with the session-persisted vectorizer so the feature
        # space matches what the models were trained on.
        user_input_tfidf = st.session_state.vectorizer.transform([user_input])

        # Predict using all models
        predictions = {}
        for name, model in st.session_state.models.items():
            prediction = model.predict(user_input_tfidf)
            predictions[name] = "Positive" if prediction[0] == 1 else "Negative"

        # Display predictions for each model
        st.write("Predicted Sentiment:")
        for name in predictions:
            st.write(f"{name}: **{predictions[name]}**")
    else:
        st.write("Please enter a review.")
linear_regression.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Train a TF-IDF + logistic-regression sentiment classifier on the IMDB
dataset and pickle the fitted model.

NOTE(review): despite the file and pickle names saying "linear_regression",
the model trained here is LogisticRegression (binary classification).  The
names are kept for backward compatibility with code that loads the pickle.
"""
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('IMDB Dataset.csv')

print(df.head())

# Encode labels: positive -> 1, negative -> 0.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
# Per-column missing-value counts; printing the raw df.isnull() frame dumped
# 50 000 rows of booleans and showed nothing useful.
print(df.isnull().sum())

X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit TF-IDF on the training split only; transform (not fit) the test split
# to avoid leaking test vocabulary into the features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

filename = 'linear_regression_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
linear_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c747bcc7e2457c878887c5f076f1a86ea6a542db11ad49c993bde00056e1f85
3
+ size 744676
main.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification


@st.cache_data
def load_data():
    """Load the IMDB reviews CSV; cached across Streamlit reruns."""
    return pd.read_csv('IMDB Dataset.csv')


# Streamlit re-executes this script on every interaction, so every fitted
# artifact must be kept in session_state to survive reruns.
if 'models' not in st.session_state:
    st.session_state.models = {}
if 'reports' not in st.session_state:
    st.session_state.reports = {}
if 'accuracy' not in st.session_state:
    st.session_state.accuracy = {}
if 'vectorizer' not in st.session_state:
    st.session_state.vectorizer = None

df = load_data()

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

if not st.session_state.models:
    # Persist the vectorizer: the "Predict" branch below runs on later
    # reruns, when this training branch is skipped.  With a plain local
    # (as before) the button raised NameError after the first rerun.
    st.session_state.vectorizer = TfidfVectorizer()
    X_train_tfidf = st.session_state.vectorizer.fit_transform(X_train)
    X_test_tfidf = st.session_state.vectorizer.transform(X_test)

    # models
    models = {
        "SVM": SVC(kernel='linear'),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=10),
        "Gradient Boosting": GradientBoostingClassifier()
    }

    for name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        st.session_state.models[name] = model
        y_pred = model.predict(X_test_tfidf)
        st.session_state.accuracy[name] = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        st.session_state.reports[name] = pd.DataFrame(report).transpose()

    # --- BERT fine-tuning (1 epoch) ---
    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    st.session_state.bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    train_encodings = st.session_state.bert_tokenizer(list(X_train), truncation=True, padding=True, return_tensors='pt')
    train_labels = torch.tensor(y_train.values)

    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    # A DataLoader is required here: iterating the TensorDataset directly
    # yields single un-batched 1-D tensors, which the BERT forward pass
    # rejects (it expects a (batch, seq_len) shape).
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    optimizer = torch.optim.AdamW(st.session_state.bert_model.parameters(), lr=1e-5)
    st.session_state.bert_model.train()

    for epoch in range(1):
        for input_ids, attention_mask, labels in train_loader:
            outputs = st.session_state.bert_model(input_ids, attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    # Evaluate in batches: a single forward pass over the whole 10k-review
    # test split (as before) exhausts memory.
    st.session_state.bert_model.eval()
    test_encodings = st.session_state.bert_tokenizer(list(X_test), truncation=True, padding=True, return_tensors='pt')
    test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
    batch_preds = []
    with torch.no_grad():
        for input_ids, attention_mask in DataLoader(test_dataset, batch_size=32):
            logits = st.session_state.bert_model(input_ids, attention_mask).logits
            batch_preds.append(torch.argmax(logits, dim=1))
    predictions = torch.cat(batch_preds).numpy()
    st.session_state.accuracy["BERT"] = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    st.session_state.reports["BERT"] = pd.DataFrame(report).transpose()

if st.session_state.accuracy:

    plt.figure(figsize=(10, 5))
    plt.bar(st.session_state.accuracy.keys(),
            st.session_state.accuracy.values(),
            color=['blue', 'orange', 'green', 'red', 'purple'])
    plt.ylabel('Accuracy')
    plt.title('Model Accuracy Comparison')
    st.pyplot(plt)

    for name, report_df in st.session_state.reports.items():
        st.header(f"{name}", divider='orange')
        st.dataframe(report_df)

st.header("Manual Tryouts")
user_input = st.text_area("Review", "")

if st.button("Predict"):
    if user_input:
        # Use the session-persisted vectorizer (survives reruns).
        user_input_tfidf = st.session_state.vectorizer.transform([user_input])

        predictions = {}
        for name, model in st.session_state.models.items():
            prediction = model.predict(user_input_tfidf)
            predictions[name] = "Positive" if prediction[0] == 1 else "Negative"

        inputs = st.session_state.bert_tokenizer(user_input, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            output = st.session_state.bert_model(inputs['input_ids'], inputs['attention_mask'])
        bert_prediction = torch.argmax(output.logits, dim=1).item()
        predictions["BERT"] = "Positive" if bert_prediction == 1 else "Negative"

        st.write("Predicted Sentiment:")
        for name in predictions:
            st.write(f"{name}: **{predictions[name]}**")
    else:
        st.write("Please enter a review.")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ matplotlib
2
+ pandas
3
+ scikit_learn
4
+ streamlit
5
+ torch
6
+ transformers