dnirfana's picture
Update eda.py
f4c23cf verified
raw
history blame
7.43 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Path of the transactions CSV; relative, so it is resolved against the
# process's current working directory.
_DATA_PATH = '../Transactions Data.csv'

# Data dictionary shown at the top of the page (column name -> description).
_COLUMN_DESCRIPTIONS = {
    "Column Names": [
        "step",
        "type",
        "amount",
        "nameOrig",
        "oldbalanceOrg",
        "newbalanceOrig",
        "nameDest",
        "oldbalanceDest",
        "newbalanceDest",
        "isFraud",
        "isFlaggedFraud",
    ],
    "Description": [
        "Represents a unit of time in the transaction process, though the specific time unit is not specified in the dataset. It could denote hours, days, or another unit, depending on the context.",
        "Describes the type of transaction, such as transfer, payment, etc. This categorical variable allows for the classification of different transaction behaviors.",
        "Indicates the monetary value of the transaction, providing insight into the financial magnitude of each transaction.",
        "Serves as the identifier for the origin account or entity initiating the transaction. This helps trace the source of funds in each transaction.",
        "Represents the balance in the origin account before the transaction occurred, offering a reference point for understanding changes in account balances.",
        "Reflects the balance in the origin account after the transaction has been processed, providing insight into how the transaction affects the account balance.",
        "Functions as the identifier for the destination account or entity receiving the funds in each transaction. It helps track where the money is being transferred to.",
        "Indicates the balance in the destination account before the transaction, offering a baseline for assessing changes in account balances due to incoming funds.",
        "Represents the balance in the destination account after the transaction has been completed, providing insight into the impact of incoming funds on the account balance.",
        "A binary indicator (0 or 1) denoting whether the transaction is fraudulent (1) or legitimate (0). This is the target variable for fraud detection modeling.",
        "Another binary indicator (0 or 1) which may signal whether a transaction has been flagged as potentially fraudulent. This could serve as an additional feature for fraud detection algorithms.",
    ],
}

# One entry per univariate histogram:
# (dataframe column, section subheader, plot title, x-axis label).
_HISTOGRAMS = (
    ('type', 'Distribution of Transactions Types',
     'Distribution of Transaction Types', 'Transaction Types'),
    ('amount', 'Distribution of Balance Amount',
     'Distribution of Balance Amount', 'Amount'),
    ('oldbalanceOrg', 'Distribution of Old Balance Origin',
     'Distribution of Old Balance Origin', 'Old Balance Origin'),
    ('newbalanceOrig', 'Distribution of New Balance Origin',
     'Distribution of New Balance Origin', 'New Balance Origin'),
    # The two destination plots were previously labelled "... Origin" on the
    # x-axis (copy-paste slip); the labels now name the plotted column.
    ('oldbalanceDest', 'Distribution of Old Balance Destination',
     'Distribution of Old Balance Destination', 'Old Balance Destination'),
    ('newbalanceDest', 'Distribution of New Balance Destination',
     'Distribution of New Balance Destination', 'New Balance Destination'),
    ('isFlaggedFraud', 'Distribution of Flagged Fraud',
     'Distribution of Flagged Fraud', 'Is Flagged Fraud'),
    ('isFraud', 'Distribution of Fraud',
     'Distribution of Fraud', 'Is Fraud'),
)

# One entry per bivariate boxplot (numeric column on x, 'type' on y):
# (dataframe column, section subheader, plot title, x-axis label).
_BOXPLOTS = (
    # Subheader typo "Amout" corrected to "Amount".
    ('amount', 'Distribution of Amount Balance per Transaction Types',
     'Transaction Types vs Amount Balance', 'Amount'),
    ('oldbalanceOrg', 'Distribution of Old Balance Origin per Transaction Types',
     'Transaction Types vs Old Balance Origin', 'Old Balance Origin'),
    # Title previously read "... vs Old Balance Origin" although the plot
    # shows newbalanceOrig.
    ('newbalanceOrig', 'Distribution of New Balance Origin per Transaction Types',
     'Transaction Types vs New Balance Origin', 'New Balance Origin'),
    ('oldbalanceDest', 'Distribution of Old Balance Destination per Transaction Types',
     'Transaction Types vs Old Balance Destination', 'Old Balance Destination'),
    ('newbalanceDest', 'Distribution of New Balance Destination per Transaction Types',
     'Transaction Types vs New Balance Destination', 'New Balance Destination'),
)


def _show_figure(fig):
    """Render *fig* in the page, release it, then emit the commentary slot."""
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; close each
    # one after rendering so repeated page reruns do not accumulate them.
    plt.close(fig)
    # TODO: replace the placeholder text with the actual findings per plot.
    st.write('bla bla bla')
    st.write('')


def _histogram_section(df, column, subheader, title, xlabel):
    """Render one subheader plus a 20-bin histogram of ``df[column]``."""
    st.subheader(subheader)
    fig, ax = plt.subplots()
    sns.histplot(df[column], bins=20, ax=ax)
    # Label the axes object directly instead of relying on pyplot's
    # implicit "current axes" state.
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    _show_figure(fig)


def _boxplot_section(df, column, subheader, title, xlabel):
    """Render one subheader plus a boxplot of ``df[column]`` per ``df['type']``."""
    st.subheader(subheader)
    fig, ax = plt.subplots()
    sns.boxplot(x=df[column], y=df['type'], ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Transaction Types')
    ax.set_title(title)
    _show_figure(fig)


def app():
    """Render the Exploratory Data Analysis page.

    Loads the transactions CSV, then writes to the Streamlit page, in order:
    the column-description table, ``df.describe().T``, one histogram per
    entry in ``_HISTOGRAMS`` and one boxplot per entry in ``_BOXPLOTS``.
    Takes no arguments and returns nothing.
    """
    st.title('Exploratory Data Analysis')

    # Load Data
    df = pd.read_csv(_DATA_PATH)

    # Column descriptions table
    st.subheader('Transaction Dataset Column Descriptions')
    st.table(_COLUMN_DESCRIPTIONS)
    st.divider()

    # Data Summary
    st.header('Data Summary')
    st.write(df.describe().T)
    st.divider()

    # Univariate Exploration
    st.header('Univariate Analysis')
    for column, subheader, title, xlabel in _HISTOGRAMS:
        _histogram_section(df, column, subheader, title, xlabel)
    st.divider()

    # Bivariate analysis
    st.header('Bivariate Analysis')
    for column, subheader, title, xlabel in _BOXPLOTS:
        _boxplot_section(df, column, subheader, title, xlabel)