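"""Streamlit dashboard for exploring the UCI Car Evaluation dataset.

Pages: data overview, exploratory analysis, single-model training, and a
side-by-side comparison of SVM, Random Forest, and Logistic Regression.
"""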
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ucimlrepo import fetch_ucirepo
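# Assumed setup (package list inferred from the imports above; the file name
# is illustrative):
#   pip install streamlit pandas numpy matplotlib seaborn scikit-learn ucimlrepo
#   streamlit run car_evaluation_app.py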
st.set_page_config(
    page_title="Car Evaluation Analysis",
    page_icon="🚗",
    layout="wide"
)

st.title("🚗 Car Evaluation Analysis Dashboard")
st.markdown("""
This dashboard analyzes car evaluation data using different machine learning models.
The dataset includes various car attributes and their evaluation classifications.
""")
@st.cache_data
def load_data():
    """Download the Car Evaluation dataset (UCI repository id=19) and
    return (full dataframe, features, targets)."""
    car_evaluation = fetch_ucirepo(id=19)
    X, y = car_evaluation.data.features, car_evaluation.data.targets
    df = pd.concat([X, y], axis=1)
    return df, X, y


df, X, y = load_data()
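# df holds 1,728 rows: six categorical features (buying, maint, doors,
# persons, lug_boot, safety) plus the target column "class"
# (unacc / acc / good / vgood).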
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", ["Data Overview", "Exploratory Analysis", "Model Training", "Model Comparison"])
if page == "Data Overview":
    st.header("Dataset Overview")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric(
            label="Total Records",
            value=f"{len(df):,}"
        )

    with col2:
        st.metric(
            label="Features",
            value=len(df.columns) - 1
        )

    with col3:
        st.metric(
            label="Target Classes",
            value=len(df['class'].unique())
        )

    with col4:
        # Cast to a plain int: st.metric rejects numpy integer types in
        # some Streamlit versions.
        st.metric(
            label="Missing Values",
            value=int(df.isnull().sum().sum())
        )
    st.write("")

    st.subheader("Sample Data")
    st.dataframe(
        df.head(),
        use_container_width=True,
        height=230
    )
    st.subheader("Target Class Distribution")

    col1, col2 = st.columns([2, 1])

    with col1:
        # Draw explicitly on the created axes so st.pyplot(fig) always
        # receives the populated figure.
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.countplot(data=df, x='class', palette='viridis', ax=ax)
        ax.set_title('Distribution of Car Evaluations')
        st.pyplot(fig)

    with col2:
        st.write("")
        st.write("")
        class_distribution = df['class'].value_counts()
        for class_name, count in class_distribution.items():
            st.metric(label=class_name, value=int(count))
elif page == "Exploratory Analysis":
    st.header("Exploratory Data Analysis")

    st.subheader("Feature Distributions")
    feature_to_plot = st.selectbox("Select Feature", df.columns[:-1])

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature_to_plot, palette='coolwarm', ax=ax)
    ax.set_title(f'Distribution of {feature_to_plot}')
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    st.subheader("Feature vs Target Class")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, x=feature_to_plot, hue='class', palette='Set2', ax=ax)
    ax.set_title(f'{feature_to_plot} Distribution by Class')
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    st.subheader("Correlation Heatmap")
    # Correlations are only defined for numeric data, so one-hot encode the
    # categorical columns first (drop_first avoids redundant dummy columns).
    encoded_df = pd.get_dummies(df, drop_first=True)
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(encoded_df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Heatmap of Encoded Features')
    st.pyplot(fig)
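    # Note: after drop_first each cell compares 0/1 dummy columns, so the
    # heatmap shows associations between individual category levels rather
    # than between the original six features.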
elif page == "Model Training":
    st.header("Model Training and Evaluation")

    # One-hot encode the six categorical features into a numeric matrix.
    # The category set is fixed for this dataset, so fitting the encoder
    # before the train/test split does not leak information.
    encoder = OneHotEncoder(sparse_output=False)
    X_encoded = encoder.fit_transform(X)
    y_encoded = y.values.ravel()

    test_size = st.slider("Select Test Size", 0.1, 0.4, 0.2, 0.05)
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y_encoded, test_size=test_size, random_state=42
    )
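    # Class frequencies are skewed (most cars are rated "unacc"), so passing
    # stratify=y_encoded to train_test_split would keep the class ratios
    # consistent across both splits.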
    model_choice = st.selectbox(
        "Select Model",
        ["Support Vector Machine", "Random Forest", "Logistic Regression"]
    )

    if st.button("Train Model"):
        with st.spinner("Training model..."):
            if model_choice == "Support Vector Machine":
                model = SVC(kernel='linear', random_state=42)
            elif model_choice == "Random Forest":
                model = RandomForestClassifier(n_estimators=100, random_state=42)
            else:
                model = LogisticRegression(max_iter=500, random_state=42)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Model Performance")
            accuracy = accuracy_score(y_test, y_pred)
            st.metric(label="Accuracy", value=f"{accuracy:.4f}")
            st.text("Classification Report:")
            st.text(classification_report(y_test, y_pred))

        with col2:
            st.subheader("Confusion Matrix")
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(
                confusion_matrix(y_test, y_pred),
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=np.unique(y_test),
                yticklabels=np.unique(y_test),
                ax=ax
            )
            ax.set_title(f'{model_choice} Confusion Matrix')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            st.pyplot(fig)
        # Random Forest exposes per-feature importances; show the top 10.
        if model_choice == "Random Forest":
            st.subheader("Feature Importance")
            feature_importance = pd.DataFrame({
                'feature': encoder.get_feature_names_out(),
                'importance': model.feature_importances_
            })
            feature_importance = feature_importance.sort_values(
                'importance', ascending=False
            ).head(10)

            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(
                data=feature_importance,
                x='importance',
                y='feature',
                ax=ax
            )
            ax.set_title('Top 10 Most Important Features')
            st.pyplot(fig)
else:
    st.header("Model Comparison")

    if st.button("Compare All Models"):
        with st.spinner("Training all models..."):
            # Same preprocessing as the Model Training page, with a fixed
            # 80/20 split so all three models see identical data.
            encoder = OneHotEncoder(sparse_output=False)
            X_encoded = encoder.fit_transform(X)
            y_encoded = y.values.ravel()

            X_train, X_test, y_train, y_test = train_test_split(
                X_encoded, y_encoded, test_size=0.2, random_state=42
            )
            models = {
                "SVM": SVC(kernel='linear', random_state=42),
                "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
                "Logistic Regression": LogisticRegression(max_iter=500, random_state=42)
            }

            # Train each model on the shared split and record its test accuracy.
            results = {}
            for name, model in models.items():
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                results[name] = {
                    'accuracy': accuracy_score(y_test, y_pred),
                    'predictions': y_pred
                }

        st.subheader("Accuracy Comparison")
        accuracy_df = pd.DataFrame({
            'Model': list(results.keys()),
            'Accuracy': [results[model]['accuracy'] for model in results.keys()]
        })
        col1, col2 = st.columns(2)

        with col1:
            st.dataframe(accuracy_df)

        with col2:
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(
                data=accuracy_df,
                x='Model',
                y='Accuracy',
                palette='viridis',
                ax=ax
            )
            ax.set_title('Model Accuracy Comparison')
            ax.set_ylim(0, 1)
            st.pyplot(fig)
        st.subheader("Detailed Model Performance")
        for name in results.keys():
            # A literal "\n" does nothing in st.write's markdown; use a bold
            # heading instead.
            st.markdown(f"**{name}:**")
            st.text(classification_report(y_test, results[name]['predictions']))

            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(
                confusion_matrix(y_test, results[name]['predictions']),
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=np.unique(y_test),
                yticklabels=np.unique(y_test),
                ax=ax
            )
            ax.set_title(f'{name} Confusion Matrix')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            st.pyplot(fig)
st.markdown("""
---
Created with ❤️ using Streamlit
""")