# eda.py
# Source: dnirfana, commit 5d64667 (verified) - "Update eda.py"
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
def loading():
    """Show a simulated progress bar that counts from 1% to 100%.

    A progress widget and a text placeholder are created, then both are
    updated one hundred times with a short pause between updates. Nothing
    is returned; the function only drives the two Streamlit widgets.
    """
    bar = st.progress(0)
    label = st.empty()
    for i in range(100):
        done = i + 1
        bar.progress(done)
        label.text(f"Loading... {i + 1}%")
        # Brief pause so the animation is visible to the user.
        time.sleep(0.05)
@st.cache_data
def load_data():
    """Read the transactions CSV into a DataFrame.

    The result is wrapped in ``st.cache_data`` so the file is not re-read
    on every call. The path is relative, so it is resolved against the
    process's current working directory.
    """
    transactions = pd.read_csv('Transactions Data.csv')
    return transactions
def _show_figure(fig):
    """Render ``fig`` on the page and then close it.

    Figures created with ``plt.subplots()`` stay registered with pyplot
    until they are explicitly closed, so without the ``plt.close`` call
    every rerun of the page would leave another batch of figures open.
    """
    st.pyplot(fig)
    plt.close(fig)


def _plot_histogram(df, column, xlabel, title):
    """Draw a 20-bin histogram of ``df[column]`` under a loading spinner.

    Args:
        df: DataFrame holding the column to plot.
        column: Name of the column to plot.
        xlabel: Label for the x axis.
        title: Title shown above the plot.
    """
    with st.spinner('Loading...'):
        fig, ax = plt.subplots()
        sns.histplot(df[column], bins=20, ax=ax)
        # Label through the Axes object instead of pyplot's implicit
        # "current figure" so the labels always land on this figure.
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title)
        _show_figure(fig)


def _plot_box_by_type(df, column, xlabel, title):
    """Draw a box plot of ``df[column]`` per transaction type and its stats.

    The plot is followed by the transposed per-type ``describe()`` table
    for the same column.

    Args:
        df: DataFrame holding ``column`` and the ``type`` column.
        column: Name of the numeric column to plot on the x axis.
        xlabel: Label for the x axis.
        title: Title shown above the plot.
    """
    with st.spinner('Loading...'):
        fig, ax = plt.subplots()
        sns.boxplot(x=df[column], y=df['type'], ax=ax)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Transaction Types')
        ax.set_title(title)
        _show_figure(fig)
    st.write(df.groupby('type')[column].describe().T)


def _plot_amount_scatter(df, y_column, ylabel, title):
    """Draw a scatter plot of ``amount`` (x) against ``df[y_column]`` (y).

    Args:
        df: DataFrame holding ``amount`` and ``y_column``.
        y_column: Name of the column plotted on the y axis.
        ylabel: Label for the y axis.
        title: Title shown above the plot.
    """
    with st.spinner('Loading...'):
        fig, ax = plt.subplots()
        sns.scatterplot(x=df['amount'], y=df[y_column], ax=ax)
        ax.set_xlabel('Amount Balance')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        _show_figure(fig)


def _render_column_descriptions():
    """Show the static table that describes each dataset column."""
    # Creating the table with column names and descriptions
    data = {
        "Column Names": [
            "step",
            "type",
            "amount",
            "nameOrig",
            "oldbalanceOrg",
            "newbalanceOrig",
            "nameDest",
            "oldbalanceDest",
            "newbalanceDest",
            "isFraud",
            "isFlaggedFraud",
        ],
        "Description": [
            "Represents a unit of time in the transaction process, though the specific time unit is not specified in the dataset. It could denote hours, days, or another unit, depending on the context.",
            "Describes the type of transaction, such as transfer, payment, etc. This categorical variable allows for the classification of different transaction behaviors.",
            "Indicates the monetary value of the transaction, providing insight into the financial magnitude of each transaction.",
            "Serves as the identifier for the origin account or entity initiating the transaction. This helps trace the source of funds in each transaction.",
            "Represents the balance in the origin account before the transaction occurred, offering a reference point for understanding changes in account balances.",
            "Reflects the balance in the origin account after the transaction has been processed, providing insight into how the transaction affects the account balance.",
            "Functions as the identifier for the destination account or entity receiving the funds in each transaction. It helps track where the money is being transferred to.",
            "Indicates the balance in the destination account before the transaction, offering a baseline for assessing changes in account balances due to incoming funds.",
            "Represents the balance in the destination account after the transaction has been completed, providing insight into the impact of incoming funds on the account balance.",
            "A binary indicator (0 or 1) denoting whether the transaction is fraudulent (1) or legitimate (0). This is the target variable for fraud detection modeling.",
            "Another binary indicator (0 or 1) which may signal whether a transaction has been flagged as potentially fraudulent. This could serve as an additional feature for fraud detection algorithms.",
        ],
    }

    # Displaying the table using Streamlit
    st.subheader('Transaction Dataset Column Descriptions')
    st.table(data)


def _render_univariate(df):
    """Render the per-column distribution plots and their commentary."""
    st.header('Univariate Analysis')

    # Distribution of Transaction Types
    st.subheader('Distribution of Transactions Types')
    _plot_histogram(df, 'type', 'Transaction Types', 'Distribution of Transaction Types')
    st.write(df['type'].value_counts())
    st.write('')
    st.write('The bar plot provided shows the frequency of different transaction types in the dataset.')
    st.write('`PAYMENT` and `CASH_OUT` are the most frequent transaction types, followed by `CASH_IN`, with `TRANSFER` and `DEBIT` being less common. This distribution suggests that the dataset predominantly consists of `PAYMENT` and `CASH_OUT` transactions.')

    # Distribution of Balance Amount
    st.subheader('Distribution of Balance Amount')
    _plot_histogram(df, 'amount', 'Amount', 'Distribution of Balance Amount')
    st.write(df['amount'].describe().reset_index())
    st.write('The provided data describes a distribution of amounts with a large number of entries. The histogram indicates that most of the data points are concentrated at lower amounts, with a significant right skew.')
    st.write('Key statistics include a count of `approximately 6.36 million entries`. The `mean amount is about 179,862`, with a `standard deviation of 603,858`, indicating a wide spread in the data. The `minimum value is 0`, and the `maximum value is 92,445,520`. The `first quartile (25th percentile) is 13,390`, the `median (50th percentile) is 74,872`, and the `third quartile (75th percentile) is 208,721`.')

    # Distribution of Old Balance Origin
    st.subheader('Distribution of Old Balance Origin')
    _plot_histogram(df, 'oldbalanceOrg', 'Old Balance Origin', 'Distribution of Old Balance Origin')
    st.write(df['oldbalanceOrg'].describe().reset_index())
    st.write('The provided data describes the distribution of Old Balance Origin amounts. The histogram shows that the majority of the data points are concentrated at lower values, with a significant right skew.')
    st.write('Key statistics include a count of approximately 6.36 million entries. The `mean balance is about 833,883`, with a `standard deviation of 2,888,243`, indicating a wide variation in the data. The `minimum value is 0`, and the `maximum value is 59,585,040`. The `first quartile (25th percentile) is 0`, the `median (50th percentile) is 14,208`, and the `third quartile (75th percentile) is 107,315`.')

    # Distribution of New Balance Origin
    st.subheader('Distribution of New Balance Origin')
    _plot_histogram(df, 'newbalanceOrig', 'New Balance Origin', 'Distribution of New Balance Origin')
    st.write(df['newbalanceOrig'].describe().reset_index())
    st.write('The provided data describes the distribution of New Balance Origin amounts. The histogram shows a strong concentration of data points at lower values, indicating a significant right skew. ')
    st.write('Key statistics include a count of approximately 6.36 million entries. The `mean balance is about 855,114`, with a `standard deviation of 2,924,049`, reflecting a wide range of values. The `minimum value is 0`, and the `maximum value is 49,585,040`. The `first quartile (25th percentile) is 0`, the `median (50th percentile) is also 0`, and the `third quartile (75th percentile) is 144,258`. This indicates that a large portion of the data points have a new balance origin of 0.')

    # Distribution of Old Balance Destination
    st.subheader('Distribution of Old Balance Destination')
    _plot_histogram(df, 'oldbalanceDest', 'Old Balance Destination', 'Distribution of Old Balance Destination')
    st.write(df['oldbalanceDest'].describe().reset_index())
    st.write('The histogram and accompanying statistics summarize the distribution of the Old Balance Destination variable.')
    st.write('The data consists of 6,362,620 observations. The `mean value of the old balance destination is approximately 1,100,702`, with a `standard deviation of about 3,399,180`, indicating significant variability. Most of the data is clustered around lower values, as evidenced by the `25th and 50th percentiles being 0 and 132,705.7 respectively`. The `maximum value observed is 356,015,900`, which is much higher than the mean, suggesting the presence of some extremely high values that skew the distribution. This skewness is also visually evident in the histogram, where the majority of frequencies are concentrated near zero.')

    # Distribution of New Balance Destination
    st.subheader('Distribution of New Balance Destination')
    _plot_histogram(df, 'newbalanceDest', 'New Balance Destination', 'Distribution of New Balance Destination')
    st.write(df['newbalanceDest'].describe().reset_index())
    st.write('The histogram and accompanying statistics provide an overview of the New Balance Destination variable. The dataset comprises 6,362,620 entries. ')
    st.write('The `mean value of the new balance destination is approximately 1,224,996`, with a `standard deviation of about 3,674,129`, indicating substantial variability. Similar to the old balance destination, most of the data is concentrated around lower values, as shown by the `25th and 50th percentiles being 0 and 214,661.4 respectively`. The `maximum observed value is 356,179,300`, which significantly exceeds the mean, indicating the presence of extremely high values that create a skewed distribution. This skewness is evident in the histogram, where the majority of frequencies are heavily concentrated near zero.')

    # Distribution of Flagged Fraud
    st.subheader('Distribution of Flagged Fraud')
    _plot_histogram(df, 'isFlaggedFraud', 'Is Flagged Fraud', 'Distribution of Flagged Fraud')
    st.write(df['isFlaggedFraud'].value_counts())
    st.write('The histogram and accompanying data provide an overview of the Is Flagged Fraud variable. The dataset includes 6,362,620 observations. ')
    st.write('Among these, `only 16 transactions are flagged as fraudulent`, while the `remaining 6,362,604 transactions are not flagged`. This indicates that fraudulent transactions are extremely rare in this dataset. The histogram visually reflects this extreme imbalance, with the vast majority of frequencies concentrated at zero, indicating non-fraudulent transactions.')

    # Distribution of Fraud
    st.subheader('Distribution of Fraud')
    _plot_histogram(df, 'isFraud', 'Is Fraud', 'Distribution of Fraud')
    st.write(df['isFraud'].value_counts())
    st.write('The histogram and accompanying data provide insights into the "Is Fraud" variable. The dataset consists of 6,362,620 transactions.')
    st.write('Out of these, `8,213 transactions are identified as fraudulent`, while the `remaining 6,354,407 transactions are not fraudulent`. This indicates that fraudulent transactions are relatively rare compared to non-fraudulent ones. The histogram clearly shows this imbalance, with the majority of frequencies clustered at zero, representing non-fraudulent transactions. The relatively small number of fraudulent transactions is visually represented by a minor spike at one.')


def _render_bivariate(df):
    """Render the per-transaction-type box plots and the scatter plots."""
    st.header('Bivariate Analysis')

    # Distribution of Amount Balance per Transaction Types
    # (commentary below describes the `amount` column, matching the plot)
    st.subheader('Distribution of Amount Balance per Transaction Types')
    _plot_box_by_type(df, 'amount', 'Amount', 'Transaction Types vs Amount Balance')
    st.write('The box plot and summary statistics provide an overview of the Amount Balance across different transaction types: `CASH_IN`, `CASH_OUT`, `DEBIT`, `PAYMENT`, and `TRANSFER`. The dataset includes `1,399,284 CASH_IN transactions`, `2,237,500 CASH_OUT transactions`, `41,432 DEBIT transactions`, `2,151,495 PAYMENT transactions`, and `532,909 TRANSFER transactions`.')
    st.markdown('- `CASH_IN`: The mean amount is approximately 168,920, with a standard deviation of about 126,508. The maximum value is 1,915,268.')
    st.markdown('- `CASH_OUT`: The mean amount is approximately 176,274, with a standard deviation of about 175,330. The maximum value is 10,000,000.')
    st.markdown('- `DEBIT`: The mean amount is approximately 5,484, with a standard deviation of about 13,319. The maximum value is 569,077.51.')
    st.markdown('- `PAYMENT`: The mean amount is approximately 13,058, with a standard deviation of about 12,556. The maximum value is 238,638.')
    st.markdown('- `TRANSFER`: The mean amount is approximately 910,647, with a standard deviation of about 1,879,574. The maximum value is 92,445,520.')
    st.write('The box plot shows that most transaction types have balances clustered towards lower values, with `TRANSFER` transactions showing a wider range of values and significant outliers.')

    # Distribution of Old Balance Origin per Transaction Types
    st.subheader('Distribution of Old Balance Origin per Transaction Types')
    _plot_box_by_type(df, 'oldbalanceOrg', 'Old Balance Origin', 'Transaction Types vs Old Balance Origin')
    st.write('The box plot and summary statistics provide insights into the Old Balance Origin across various transaction types: `CASH_IN`, `CASH_OUT`, `DEBIT`, `PAYMENT`, and `TRANSFER`. The dataset includes the following number of transactions: `1,399,284 for CASH_IN`, `2,237,500 for CASH_OUT`, `41,432 for DEBIT`, `2,151,495 for PAYMENT`, and `532,909 for TRANSFER`.')
    st.markdown('- `CASH_IN`: The mean old balance origin is approximately 3,590,464, with a standard deviation of about 5,291,825. The maximum value is 38,939,420.')
    st.markdown('- `CASH_OUT`: The mean old balance origin is approximately 46,024, with a standard deviation of about 179,132.6. The maximum value is 43,818,860.')
    st.markdown('- `DEBIT`: The mean old balance origin is approximately 68,647.34, with a standard deviation of about 138,449.3. The maximum value is 4,362,014.')
    st.markdown('- `PAYMENT`: The mean old balance origin is approximately 68,216.83, with a standard deviation of about 198,991.1. The maximum value is 43,686,620.')
    st.markdown('- `TRANSFER`: The mean old balance origin is approximately 54,441.85, with a standard deviation of about 439,981.8. The maximum value is 59,585,040.')
    st.write('The box plot shows that most transaction types have balances clustered toward lower values, with `CASH_IN` transactions showing a broader range and a significant number of high-value outliers. `TRANSFER` and `CASH_OUT` transactions also exhibit notable outliers, while `PAYMENT` and `DEBIT` transactions have a narrower range of values.')

    # Distribution of New Balance Origin per Transaction Types
    st.subheader('Distribution of New Balance Origin per Transaction Types')
    _plot_box_by_type(df, 'newbalanceOrig', 'New Balance Origin', 'Transaction Types vs New Balance Origin')
    st.write('The box plot and summary statistics provide an overview of the New Balance Origin across different transaction types: `CASH_IN`, `CASH_OUT`, `DEBIT`, `PAYMENT`, and `TRANSFER`. The dataset includes the following number of transactions: `1,399,284 for CASH_IN`, `2,237,500 for CASH_OUT`, `41,432 for DEBIT`, `2,151,495 for PAYMENT`, and `532,909 for TRANSFER`.')
    st.markdown('- `CASH_IN`: The mean new balance origin is approximately 3,759,379, with a standard deviation of about 5,287,487. The maximum value is 38,946,230.')
    st.markdown('- `CASH_OUT`: The mean new balance origin is approximately 17,474.19, with a standard deviation of about 109,194. The maximum value is 43,686,620.')
    st.markdown('- `DEBIT`: The mean new balance origin is approximately 65,161.65, with a standard deviation of about 137,656.6. The maximum value is 4,359,375.')
    st.markdown('- `PAYMENT`: The mean new balance origin is approximately 61,837.89, with a standard deviation of about 196,991.5. The maximum value is 43,673,800.')
    st.markdown('- `TRANSFER`: The mean new balance origin is approximately 10,288.16, with a standard deviation of about 262,360.3. The maximum value is 49,585,040.')
    st.write('The box plot shows that most transaction types have balances clustered toward lower values, with `CASH_IN` transactions showing a broader range and a significant number of high-value outliers. `TRANSFER` and `CASH_OUT` transactions also exhibit notable outliers, while `PAYMENT` and `DEBIT` transactions have a narrower range of values.')

    # Distribution of Old Balance Destination per Transaction Types
    # (commentary below describes `oldbalanceDest`, matching the plot)
    st.subheader('Distribution of Old Balance Destination per Transaction Types')
    _plot_box_by_type(df, 'oldbalanceDest', 'Old Balance Destination', 'Transaction Types vs Old Balance Destination')
    st.write('The box plot and summary statistics provide an overview of the Old Balance Destination across different transaction types: `CASH_IN`, `CASH_OUT`, `DEBIT`, `PAYMENT`, and `TRANSFER`. The dataset includes the following number of transactions: `1,399,284 for CASH_IN`, `2,237,500 for CASH_OUT`, `41,432 for DEBIT`, `2,151,495 for PAYMENT`, and `532,909 for TRANSFER`.')
    st.markdown('- `CASH_IN`: The mean old balance destination is approximately 1,587,919, with a standard deviation of about 3,713,923. The maximum value is 355,553,400.')
    st.markdown('- `CASH_OUT`: The mean old balance destination is approximately 1,497,758, with a standard deviation of about 3,633,876. The maximum value is 356,015,900.')
    st.markdown('- `DEBIT`: The mean old balance destination is approximately 1,493,136, with a standard deviation of about 4,386,970. The maximum value is 327,827,800.')
    st.markdown('- `PAYMENT`: The mean old balance destination is 0, as all payments seem to have a zero old balance.')
    st.markdown('- `TRANSFER`: The mean old balance destination is approximately 2,567,606, with a standard deviation of about 6,037,846. The maximum value is 355,380,500.')
    st.write('The box plot shows that most transaction types have balances clustered toward lower values, with `TRANSFER` and `CASH_OUT` transactions showing a broader range and significant high-value outliers. `DEBIT` transactions also exhibit notable outliers, while `CASH_IN` transactions have a narrower range of values. `PAYMENTS` consistently have a zero old balance.')

    # Distribution of New Balance Destination per Transaction Types
    st.subheader('Distribution of New Balance Destination per Transaction Types')
    _plot_box_by_type(df, 'newbalanceDest', 'New Balance Destination', 'Transaction Types vs New Balance Destination')
    st.write('The box plot and summary statistics provide an overview of the New Balance Destination across different transaction types: `CASH_IN`, `CASH_OUT`, `DEBIT`, `PAYMENT`, and `TRANSFER`. The dataset includes the following number of transactions: `1,399,284 for CASH_IN`, `2,237,500 for CASH_OUT`, `41,432 for DEBIT`, `2,151,495 for PAYMENT`, and `532,909 for TRANSFER`.')
    st.markdown('- `CASH_IN`: The mean new balance destination is approximately 1,467,105, with a standard deviation of about 3,712,985. The maximum value is 355,381,400.')
    st.markdown('- `CASH_OUT`: The mean new balance destination is approximately 1,691,326, with a standard deviation of about 3,663,008. The maximum value is 356,179,300.')
    st.markdown('- `DEBIT`: The mean new balance destination is approximately 1,513,003, with a standard deviation of about 4,395,708. The maximum value is 327,852,100.')
    st.markdown('- `PAYMENT`: The mean new balance destination is 0, as all payments have a zero new balance.')
    st.markdown('- `TRANSFER`: The mean new balance destination is approximately 3,554,567, with a standard deviation of about 7,387,614. The maximum value is 356,015,900.')
    st.write('The box plot shows that most transaction types have balances clustered toward lower values, with `TRANSFER` and `CASH_OUT` transactions showing a broader range and significant high-value outliers. `DEBIT` transactions also exhibit notable outliers, while `CASH_IN` transactions have a narrower range of values. `PAYMENTS` consistently have a zero new balance.')

    # Spreading of Amount Balance and Old Balance Origin
    st.subheader('Amount Balance vs Old Balance Origin')
    _plot_amount_scatter(df, 'oldbalanceOrg', 'Old Balance Origin', 'Amount Balance vs Old Balance Origin')
    st.write('The scatter plot displays the relationship between Old Balance Origin and Amount Balance. The data points indicate that most of the transactions cluster around lower values for both balances, forming a dense concentration near the origin (0,0).')
    st.markdown('- **High Density at Lower Values**: The majority of data points are concentrated on both Old Balance Origin and Amount Balance are close to zero, indicating that most transactions involve smaller amounts.')
    st.markdown('- **Vertical Distribution**: Some points with a higher Old Balance Origin are spread vertically, but these tend to be associated with smaller Amount Balance values.')
    st.markdown('- **Horizontal Distribution**: There are also points spread horizontally with higher Amount Balance values, but they typically have low Old Balance Origin values.')

    # Spreading of Amount Balance and New Balance Origin
    st.subheader('Amount Balance vs New Balance Origin')
    _plot_amount_scatter(df, 'newbalanceOrig', 'New Balance Origin', 'Amount Balance vs New Balance Origin')
    st.write('The scatter plot shows the relationship between New Balance Origin and Amount Balance. Similar to the previous plot, the data points highlight how most transactions cluster around lower values for both balances.')
    st.markdown('- **High Density at Lower Values**: The majority of the data points are concentrated on the origin (0,0), indicating that most transactions involve smaller amounts for both New Balance Origin and Amount Balance.')
    st.markdown('- **Vertical Distribution**: There are points with higher New Balance Origin spread vertically, mostly associated with lower Amount Balance values.')
    st.markdown('- **Horizontal Distribution**: Some points with higher Amount Balance values spread horizontally but are typically associated with low New Balance Origin values.')


def _render_multivariate(df):
    """Render the correlation heatmap and the pre-rendered pair plot."""
    st.header('Multivariate Analysis')

    # heatmap to visualize relationships
    st.subheader('Heatmap of Correlation between Numeric Variables')
    with st.spinner('Loading...'):
        numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
        correlation_matrix = df[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f", ax=ax)
        _show_figure(fig)
    # Written as explicit lines so the text passed to Streamlit does not
    # depend on how this source file happens to be indented.
    st.write(
        "\n"
        "The heatmap provides a visual representation of the correlation matrix for the numeric variables: `amount`, `oldbalanceOrg`, `newbalanceOrig`, `oldbalanceDest`, and `newbalanceDest`.\n"
        "1. **Strong Correlations:**\n"
        "- There is a perfect correlation (1.00) between `oldbalanceOrg` and `newbalanceOrig`, indicating that the balance in the origin account before and after the transaction are almost always identical.\n"
        "- Similarly, `oldbalanceDest` and `newbalanceDest` have a very high correlation (0.98), showing that the balance in the destination account before and after the transaction is very closely related.\n"
        "2. **Moderate Correlations:**\n"
        "- `amount` shows moderate correlations with `oldbalanceDest` (0.29) and `newbalanceDest` (0.46). This indicates that the transaction amount has a moderate positive relationship with the balances in the destination account.\n"
        "3. **Weak or No Correlations:**\n"
        "- `amount` has very weak or no correlation with `oldbalanceOrg` (-0.00) and `newbalanceOrig` (-0.01), suggesting that the transaction amount is not significantly related to the balances in the origin account.\n"
        "- Other correlations, such as between `oldbalanceOrg` and `oldbalanceDest` (0.07), are also weak, indicating minimal linear relationships between these variables.\n"
    )

    # Pairplot to visualize relationships (shown from a pre-rendered image)
    st.subheader('Pairplot of Numeric Variables')
    with st.spinner('Loading...'):
        st.image('pairplot.png')
    st.write('The pair plot provides a detailed view of the relationships between the numeric variables: `amount`, `oldbalanceOrg`, `newbalanceOrig`, `oldbalanceDest`, and `newbalanceDest`.')
    st.markdown('- **Strong Linear Relationships**: There are clear linear relationships between `oldbalanceOrg` and `newbalanceOrig`, as well as between `oldbalanceDest` and `newbalanceDest`. This indicates that the balance before and after transactions are highly correlated.')
    st.markdown('- **Clustered Data Points**: Most data points are clustered near the lower end of the scales, especially for `amount` and `balances`, suggesting a high frequency of small-value transactions.')
    st.markdown('- **Diagonal Lines**: The diagonal subplots show histograms of each variable, reflecting the distribution of individual variables. ')
    st.markdown('- **Scattered Points**: There are noticeable outliers and scattered points in the relationships between `amount` and the balance variables, indicating some transactions involve significantly higher amounts than the majority.')


def app():
    """Render the Exploratory Data Analysis page.

    Shows the loading animation, loads the transactions data, and then
    renders, in order: the column description table, the overall data
    summary, and the univariate, bivariate and multivariate sections.
    """
    st.title('Exploratory Data Analysis')

    # Load Data with loading animation
    loading()
    df = load_data()
    st.write('Data Loaded!')

    _render_column_descriptions()
    st.divider()

    # Data Summary
    st.header('Data Summary')
    st.write(df.describe().T)
    st.divider()

    _render_univariate(df)
    st.divider()

    _render_bivariate(df)
    st.divider()

    _render_multivariate(df)
# Render the page when this file is executed as a script.
if __name__ == "__main__":
    app()