import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st

# Location of the transactions dataset, relative to the working directory
# the Streamlit app is launched from.
DATA_PATH = '../Transactions Data.csv'

# Column-name / description pairs rendered as the data dictionary table.
COLUMN_DESCRIPTIONS = {
    "Column Names": [
        "step",
        "type",
        "amount",
        "nameOrig",
        "oldbalanceOrg",
        "newbalanceOrig",
        "nameDest",
        "oldbalanceDest",
        "newbalanceDest",
        "isFraud",
        "isFlaggedFraud",
    ],
    "Description": [
        "Represents a unit of time in the transaction process, though the specific time unit is not specified in the dataset. It could denote hours, days, or another unit, depending on the context.",
        "Describes the type of transaction, such as transfer, payment, etc. This categorical variable allows for the classification of different transaction behaviors.",
        "Indicates the monetary value of the transaction, providing insight into the financial magnitude of each transaction.",
        "Serves as the identifier for the origin account or entity initiating the transaction. This helps trace the source of funds in each transaction.",
        "Represents the balance in the origin account before the transaction occurred, offering a reference point for understanding changes in account balances.",
        "Reflects the balance in the origin account after the transaction has been processed, providing insight into how the transaction affects the account balance.",
        "Functions as the identifier for the destination account or entity receiving the funds in each transaction. It helps track where the money is being transferred to.",
        "Indicates the balance in the destination account before the transaction, offering a baseline for assessing changes in account balances due to incoming funds.",
        "Represents the balance in the destination account after the transaction has been completed, providing insight into the impact of incoming funds on the account balance.",
        "A binary indicator (0 or 1) denoting whether the transaction is fraudulent (1) or legitimate (0). This is the target variable for fraud detection modeling.",
        "Another binary indicator (0 or 1) which may signal whether a transaction has been flagged as potentially fraudulent. This could serve as an additional feature for fraud detection algorithms.",
    ],
}

# Each entry: (section subheader, dataframe column, x-axis label, plot title).
UNIVARIATE_PLOTS = [
    ('Distribution of Transaction Types', 'type',
     'Transaction Types', 'Distribution of Transaction Types'),
    ('Distribution of Balance Amount', 'amount',
     'Amount', 'Distribution of Balance Amount'),
    ('Distribution of Old Balance Origin', 'oldbalanceOrg',
     'Old Balance Origin', 'Distribution of Old Balance Origin'),
    ('Distribution of New Balance Origin', 'newbalanceOrig',
     'New Balance Origin', 'Distribution of New Balance Origin'),
    ('Distribution of Old Balance Destination', 'oldbalanceDest',
     'Old Balance Destination', 'Distribution of Old Balance Destination'),
    ('Distribution of New Balance Destination', 'newbalanceDest',
     'New Balance Destination', 'Distribution of New Balance Destination'),
    ('Distribution of Flagged Fraud', 'isFlaggedFraud',
     'Is Flagged Fraud', 'Distribution of Flagged Fraud'),
    ('Distribution of Fraud', 'isFraud',
     'Is Fraud', 'Distribution of Fraud'),
]

# Each entry: (section subheader, dataframe column, x-axis label, plot title).
# Every boxplot is broken down by the 'type' column on the y-axis.
BIVARIATE_PLOTS = [
    ('Distribution of Amount Balance per Transaction Types', 'amount',
     'Amount', 'Transaction Types vs Amount Balance'),
    ('Distribution of Old Balance Origin per Transaction Types',
     'oldbalanceOrg', 'Old Balance Origin',
     'Transaction Types vs Old Balance Origin'),
    ('Distribution of New Balance Origin per Transaction Types',
     'newbalanceOrig', 'New Balance Origin',
     'Transaction Types vs New Balance Origin'),
    ('Distribution of Old Balance Destination per Transaction Types',
     'oldbalanceDest', 'Old Balance Destination',
     'Transaction Types vs Old Balance Destination'),
    ('Distribution of New Balance Destination per Transaction Types',
     'newbalanceDest', 'New Balance Destination',
     'Transaction Types vs New Balance Destination'),
]


@st.cache_data
def _load_transactions(path: str) -> pd.DataFrame:
    """Read the transactions CSV at *path*, cached across Streamlit reruns.

    Streamlit re-executes the page on every interaction; caching avoids
    re-parsing the CSV each time. Clear the Streamlit cache if the file
    changes on disk.
    """
    return pd.read_csv(path)


def _render_figure(fig) -> None:
    """Show *fig* in the page, release it, and emit the commentary slot."""
    st.pyplot(fig)
    # Figures created via plt.subplots() stay registered with pyplot until
    # closed; without this every rerun would leak one figure per plot.
    plt.close(fig)
    # TODO: replace the placeholder text with the actual insight per plot.
    st.write('bla bla bla')
    st.write('')


def _plot_histogram(df: pd.DataFrame, subheader: str, column: str,
                    xlabel: str, title: str) -> None:
    """Render one univariate section: a 20-bin histogram of ``df[column]``."""
    st.subheader(subheader)
    fig, ax = plt.subplots()
    sns.histplot(df[column], bins=20, ax=ax)
    # Label the explicit Axes rather than relying on pyplot's current axes.
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    _render_figure(fig)


def _plot_box_by_type(df: pd.DataFrame, subheader: str, column: str,
                      xlabel: str, title: str) -> None:
    """Render one bivariate section: boxplot of ``df[column]`` per ``type``."""
    st.subheader(subheader)
    fig, ax = plt.subplots()
    sns.boxplot(x=df[column], y=df['type'], ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Transaction Types')
    ax.set_title(title)
    _render_figure(fig)


def app():
    """Render the 'Exploratory Data Analysis' Streamlit page.

    The page shows, in order: the column description table, the transposed
    ``describe()`` summary of the dataset, one histogram per entry in
    ``UNIVARIATE_PLOTS`` and one boxplot per entry in ``BIVARIATE_PLOTS``.
    The dataset is read from ``DATA_PATH``.
    """
    st.title('Exploratory Data Analysis')

    # Load Data
    df = _load_transactions(DATA_PATH)

    # Displaying the column description table
    st.subheader('Transaction Dataset Column Descriptions')
    st.table(COLUMN_DESCRIPTIONS)

    st.divider()

    # Data Summary
    st.header('Data Summary')
    st.write(df.describe().T)

    st.divider()

    # Univariate Exploration
    st.header('Univariate Analysis')
    for subheader, column, xlabel, title in UNIVARIATE_PLOTS:
        _plot_histogram(df, subheader, column, xlabel, title)

    st.divider()

    # Bivariate analysis
    st.header('Bivariate Analysis')
    for subheader, column, xlabel, title in BIVARIATE_PLOTS:
        _plot_box_by_type(df, subheader, column, xlabel, title)