import os
import warnings

import gradio as gr
import pandas as pd
import rdata
import wget
from pyod.models.mcd import MCD
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ignore all warnings
warnings.filterwarnings("ignore")
# Download the dataset (skipped on re-runs so wget does not create
# duplicate "creditcard (1).Rdata" copies)
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
if not os.path.exists(dst_path):
    wget.download(url, dst_path)
# Load the dataset; rdata returns a dict keyed by the R object name
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)

# Prepare the data: 'Class' (1 = fraud, 0 = normal) is the label
y = dataset['Class'].astype(int)  # Convert to integers
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)
print("Data subsets created")
# Split the data: 60% train / 40% test, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
# Keep only 20% of the training split for fitting
X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)

# Reset indices
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
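# Note: the net effect is training on roughly 12% of the data (0.6 * 0.2).
# MCD fits a robust covariance estimate, which gets slow on large samples,
# so the subsample presumably keeps this demo responsive; raise train_size
# above if accuracy matters more than startup time.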
# Define the numerical features and the pipeline for numerical features
numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
numerical_pipeline = make_pipeline(
    StandardScaler()  # Standardize numerical features to zero mean, unit variance
)

# Create a column transformer to apply the numerical pipeline to the listed columns
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)
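# make_column_transformer drops any column not listed above by default
# (remainder='drop'), so only V1-V28 and Amount ever reach the model.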
# Create the model
clf = MCD()

# Combine preprocessing (scaling) with the MCD model in a single pipeline
model_pipeline = make_pipeline(
    preprocessor,  # Applying preprocessing steps
    clf            # Minimum Covariance Determinant anomaly detector
)
print("Preprocessing Data") | |
# Fit the model and train model to predict anomalies | |
model_pipeline.fit(X_train) | |
y_test_pred = model_pipeline.predict(X_test) | |
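# Quick sanity check on the held-out test set. PyOD's predict() returns
# 0 (inlier) / 1 (outlier), which lines up with the 0/1 fraud labels.
print("Test F1:", f1_score(y_test, y_test_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))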
# Define the predict function
def predict(csv_filename):
    # Read a single-row CSV of 29 values: V1-V28 followed by Amount
    df = pd.read_csv(csv_filename, header=None)
    client_data = df.iloc[0].tolist()
    if len(client_data) != 29:
        raise ValueError("The CSV file must contain exactly 29 values.")
    # Build a one-row DataFrame with the column names the pipeline expects
    columns = [f'V{i}' for i in range(1, 29)] + ['Amount']
    input_df = pd.DataFrame([client_data], columns=columns)
    # Score the transaction with the trained pipeline
    prediction = model_pipeline.predict(input_df)
    return prediction[0], client_data[-1]  # Return both the prediction and Amount
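# Example usage (assumes a one-row Ted.csv like the per-name files below):
# label, amount = predict('Ted.csv')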
# Define a function to map the names to their respective CSV filenames
def get_csv_filename(name):
    name_to_filename = {
        'Ted': 'Ted.csv',
        'Bill': 'Bill.csv',
        'Jill': 'Jill.csv',
        'Juan': 'Juan.csv'
    }
    return name_to_filename.get(name, 'Ted.csv')  # Default to 'Ted.csv' if name not found
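# Note: Ted.csv, Bill.csv, Jill.csv, and Juan.csv are assumed to sit next to
# this script (e.g. committed to the Space repo); predict() raises if one is
# missing or malformed.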
# Define the Gradio interface function for single prediction
def gradio_predict(name):
    csv_filename = get_csv_filename(name)
    prediction, amount = predict(csv_filename)
    return f"The flagged transaction amount is {amount} and the prediction is {prediction} (1 = anomaly, 0 = normal)"
# Define the function for bulk analysis
def bulk_analysis(file):
    # Read the uploaded CSV file
    df = pd.read_csv(file.name)
    # Assuming the last column is the 0/1 'Class' label and the rest are features
    X_test = df.iloc[:, :-1]
    y_test = df.iloc[:, -1]
    # Make predictions using the trained pipeline
    y_test_pred = model_pipeline.predict(X_test)
    # Debugging: print counts of anomalies in actual and predicted
    actual_anomalies = sum(y_test == 1)
    predicted_anomalies = sum(y_test_pred == 1)
    print(f"Actual anomalies: {actual_anomalies}, Predicted anomalies: {predicted_anomalies}")
    # Find rows where actual and predicted are both 1
    correctly_predicted_anomalies = X_test[(y_test == 1) & (y_test_pred == 1)]
    print(f"Correctly predicted anomalies: {len(correctly_predicted_anomalies)}")
    # Save the results to a CSV file and return its path
    result_filename = "correct_anomalies.csv"
    correctly_predicted_anomalies.to_csv(result_filename, index=False)
    return result_filename
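# Local smoke test (hypothetical file name; Gradio passes an object with a
# .name attribute, which SimpleNamespace can mimic outside the UI):
# from types import SimpleNamespace
# bulk_analysis(SimpleNamespace(name="transactions_with_class.csv"))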
# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Dropdown(choices=['Ted', 'Bill', 'Jill', 'Juan'], label="Select a name"),
    outputs="text"
)
# Add the bulk analysis upload interface; the output is the CSV of correctly
# flagged anomalies written by bulk_analysis
bulk_iface = gr.Interface(
    fn=bulk_analysis,
    inputs=gr.File(label="Bulk Analysis"),
    outputs=gr.File(label="Correctly predicted anomalies")
)
# Combine the interfaces
combined_iface = gr.TabbedInterface(
    [iface, bulk_iface],
    tab_names=["Single Prediction", "Bulk Analysis"]
)
# Launch the interface
combined_iface.launch(share=True)