|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
|
|
# ---------------------------------------------------------------------------
# Introduction: explain what data is, list its characteristics, and let the
# reader pick a category of data. The choice is stored in `data_type`.
# ---------------------------------------------------------------------------
st.subheader("**What is Data?**")
st.write(
    "Data refers to information, facts, or statistics that are collected, "
    "stored, and analyzed to derive meaningful insights. "
    "It represents raw, unprocessed values that can be used for "
    "decision-making, analysis, and predictions. "
    "Data is the foundation of fields like data science, machine learning, "
    "and artificial intelligence."
)

st.subheader("**Characteristics of Data:**")
characteristics_md = """
- **Raw and Unprocessed:** Data in its initial form, before being cleaned or structured.
- **Forms:** Data can exist as numbers, text, images, videos, sounds, and more.
- **Source:** Data can be collected from various sources, such as surveys, sensors, transactions, or online platforms.
"""
st.write(characteristics_md)

st.subheader("**Types of Data**")
data_type_choices = ["Structured", "Unstructured", "Semi-Structured"]
data_type = st.radio("**Select a type of data:**", data_type_choices)
|
|
|
|
|
# Structured-data branch: show a one-line definition, then ask which tabular
# format the reader wants to learn about.
if data_type == "Structured":
    st.write("### Structured Data")
    st.write(
        "Structured data is organized in a predefined format, "
        "such as rows and columns."
    )

    # NOTE: `format_selected` is only assigned when this branch runs.
    structured_formats = ["Excel", "CSV", "SQL Databases"]
    format_selected = st.radio(
        "Select a data format to learn more:",
        structured_formats,
    )
|
|
|
|
|
# Excel walkthrough: what the format is, how to read it with pandas, common
# pitfalls and remedies, and a download link for the companion notebook.
#
# The condition repeats the `data_type` check so that `format_selected`
# (assigned only inside the Structured branch) is never evaluated when a
# different data type is selected.
if data_type == "Structured" and format_selected == "Excel":
    st.write("#### Excel Format")

    st.subheader("What is Excel?")
    st.write("""
    Excel is a popular file format used for storing structured data in tabular form.
    It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
    """)

    st.subheader("How to Read Excel Files?")
    st.code("""
    import pandas as pd
    # Read an Excel file
    df = pd.read_excel("file.xlsx")
    print(df.head())
    """)

    st.subheader("Common Issues Encountered When Handling Excel Files")
    st.write("""
    - **Missing Data**: Some cells may contain empty or null values.
    - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
    - **File Corruption**: The file may become unreadable if improperly saved or transferred.
    - **Large Files**: Handling very large Excel files may exceed memory limits.
    """)

    st.subheader("How to Overcome These Issues?")
    # The encoding bullet used to tell readers to pass `encoding='utf-8'`,
    # but `pd.read_excel` has no such parameter, so following that advice
    # raises a TypeError. The text now points at a workflow that works.
    st.write("""
    - **Missing Data**: Use data imputation techniques to fill in missing values.
    - **Encoding Problems**: `pd.read_excel` does not accept an `encoding` argument; re-save the workbook as `.xlsx`, or export it to CSV and pass `encoding='utf-8'` to `pd.read_csv`.
    - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
    - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
    """)

    st.markdown("### Download Coding Guide:")

    # Render the download button directly. Previously it was nested inside
    # `if st.button(...)`; `st.button` is True only for the single rerun that
    # follows its click, so the reader had to click two identically labelled
    # buttons and the real one vanished on the next rerun.
    file_path = "Excel_guide.ipynb"
    try:
        with open(file_path, "rb") as guide_file:
            guide_bytes = guide_file.read()
    except OSError:
        # A missing or unreadable notebook should not take the page down.
        st.warning("The Excel guide is currently unavailable for download.")
    else:
        st.download_button(
            label="Download Excel Guide",
            data=guide_bytes,
            file_name="Excel_guide.ipynb",
            mime="application/octet-stream",
        )
|
|
|
|
|
# CSV walkthrough: what the format is, how to read it with pandas, common
# pitfalls and remedies, and a download link for the companion notebook.
#
# The condition repeats the `data_type` check so that `format_selected`
# (assigned only inside the Structured branch) is never evaluated when a
# different data type is selected.
if data_type == "Structured" and format_selected == "CSV":
    st.write("#### CSV Format")

    st.subheader("What is CSV?")
    st.write("""
    CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
    where each row corresponds to a record, and fields are separated by commas.
    It is widely used for data exchange due to its simplicity and compatibility across systems.
    Common file extensions include `.csv`.
    """)

    st.subheader("How to Read CSV Files?")
    st.code("""
    import pandas as pd
    # Reading a CSV file
    df = pd.read_csv("file.csv")
    print(df.head())

    # Reading a CSV file with custom delimiter
    df = pd.read_csv("file.csv", sep=";")
    """)

    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write("""
    - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
    - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
    - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
    - **Header Issues**: Missing headers or extra/unexpected columns.
    - **Large File Sizes**: Memory limitations when processing large datasets.
    """)

    st.subheader("How to Overcome These Issues?")
    # Each fenced sample is indented under its bullet so Markdown renders it
    # as part of that list item.
    st.write("""
    - **Incorrect Delimiters**: Specify the correct delimiter when reading:
      ```python
      df = pd.read_csv("file.csv", sep=";")
      ```
    - **Encoding Problems**: Specify the encoding explicitly:
      ```python
      df = pd.read_csv("file.csv", encoding="utf-8")
      ```
    - **Missing or Corrupted Data**: Handle missing values using pandas:
      ```python
      df.fillna("NA", inplace=True)
      ```
    - **Header Issues**: Assign custom headers or skip problematic rows:
      ```python
      df = pd.read_csv("file.csv", header=None)
      df.columns = ["Column1", "Column2", "Column3"]
      ```
    - **Large Files**: Use chunk processing for large files:
      ```python
      chunks = pd.read_csv("file.csv", chunksize=1000)
      for chunk in chunks:
          process(chunk)
      ```
    """)

    st.markdown("### Download Coding Guide:")

    # Render the download button directly. Previously it was nested inside
    # `if st.button(...)`; `st.button` is True only for the single rerun that
    # follows its click, so the reader had to click two identically labelled
    # buttons and the real one vanished on the next rerun.
    file_path = "CSV_guide.ipynb"
    try:
        with open(file_path, "rb") as guide_file:
            guide_bytes = guide_file.read()
    except OSError:
        # A missing or unreadable notebook should not take the page down.
        st.warning("The CSV guide is currently unavailable for download.")
    else:
        st.download_button(
            label="Download CSV Guide",
            data=guide_bytes,
            file_name="CSV_guide.ipynb",
            mime="application/octet-stream",
        )
|
|