|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.tree import DecisionTreeClassifier |
|
from sklearn.ensemble import RandomForestClassifier |
|
from sklearn.svm import SVC |
|
from sklearn.metrics import accuracy_score, classification_report |
|
from sklearn.cluster import KMeans |
|
from sklearn.decomposition import PCA |
|
|
|
class MachineLearning:
    """Streamlit UI helpers that run basic scikit-learn workflows on a DataFrame.

    Each ``perform_*`` method renders its own widgets (selectboxes,
    multiselects, sliders) and writes results back to the Streamlit page.
    """

    def perform_ml_tasks(self, df):
        """Dispatch to the ML workflow the user picks in a selectbox.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataset whose columns are offered to the user as targets/features.
        """
        task_type = st.selectbox("Select ML task", ["Classification", "Clustering", "Dimensionality Reduction"])

        if task_type == "Classification":
            self.perform_classification(df)
        elif task_type == "Clustering":
            self.perform_clustering(df)
        elif task_type == "Dimensionality Reduction":
            self.perform_dimensionality_reduction(df)

    def perform_classification(self, df):
        """Train a user-selected classifier and report accuracy metrics.

        Splits the data 80/20, standardizes features (scaler fit on the
        training split only, to avoid leakage into the test set), fits the
        chosen model, and renders accuracy plus a per-class report.
        """
        target_column = st.selectbox("Select target column", df.columns)
        feature_columns = st.multiselect("Select feature columns", df.columns.drop(target_column))

        if not feature_columns:
            # Nothing selected yet; Streamlit will rerun once the user picks columns.
            return

        # NOTE(review): assumes the selected feature columns are numeric —
        # StandardScaler raises on string data; confirm upstream filtering.
        X = df[feature_columns]
        y = df[target_column]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model_type = st.selectbox(
            "Select model type",
            ["Logistic Regression", "Decision Tree", "Random Forest", "SVM"],
        )

        # Dispatch table instead of an if/elif chain: guarantees `model`
        # is bound for every selectable option.
        model_factories = {
            "Logistic Regression": LogisticRegression,
            "Decision Tree": DecisionTreeClassifier,
            "Random Forest": RandomForestClassifier,
            "SVM": SVC,
        }
        model = model_factories[model_type]()

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        st.subheader("Classification Results")
        st.write(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
        st.write("Classification Report:")
        st.code(classification_report(y_test, y_pred))

    def perform_clustering(self, df):
        """Run KMeans on user-selected (standardized) columns and plot clusters.

        Cluster centers are inverse-transformed back to the original feature
        units before display.
        """
        feature_columns = st.multiselect("Select feature columns for clustering", df.columns)

        if not feature_columns:
            return

        X = df[feature_columns]

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        n_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=3)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(X_scaled)

        # Work on a copy: mutating the caller's DataFrame in place leaks the
        # 'Cluster' column into subsequent Streamlit reruns / cached data.
        plot_df = df.copy()
        plot_df['Cluster'] = cluster_labels

        st.subheader("Clustering Results")
        if len(feature_columns) >= 2:
            fig = px.scatter(plot_df, x=feature_columns[0], y=feature_columns[1], color='Cluster')
            st.plotly_chart(fig)

        st.write("Cluster Centers:")
        # Undo the standardization so centers are readable in original units.
        cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
        st.write(pd.DataFrame(cluster_centers, columns=feature_columns))

    def perform_dimensionality_reduction(self, df):
        """Run PCA on user-selected (standardized) columns and visualize it.

        Requires at least two feature columns; with one column the component
        slider would have max < min and Streamlit would raise.
        """
        feature_columns = st.multiselect("Select feature columns for dimensionality reduction", df.columns)

        if not feature_columns:
            return

        if len(feature_columns) < 2:
            # Guard: slider below uses min_value=2, so a single column would
            # produce max_value=1 < min_value and crash the app.
            st.warning("Select at least two feature columns for PCA.")
            return

        X = df[feature_columns]

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        n_components = st.slider(
            "Select number of components",
            min_value=2,
            max_value=min(len(feature_columns), 10),
            value=2,
        )

        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(X_scaled)

        st.subheader("PCA Results")
        explained_variance_ratio = pca.explained_variance_ratio_
        st.write(f"Explained Variance Ratio: {explained_variance_ratio}")

        if n_components >= 2:
            fig = px.scatter(x=X_pca[:, 0], y=X_pca[:, 1], title="PCA Visualization")
            st.plotly_chart(fig)

        st.write("PCA Components:")
        # Rows are principal components; columns are the original features.
        st.write(pd.DataFrame(pca.components_, columns=feature_columns))