# Zero_to_Hero_Machine_Learning / pages /6.Data Collection.py
# shwetashweta05's picture
# Update pages/6.Data Collection.py
# 16e390d verified
# raw
# history blame
# 6.29 kB
import streamlit as st
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# Page introduction: defines "data", lists its characteristics, and lets the
# reader pick which type of data to explore further down the page.
# ---------------------------------------------------------------------------

# Adjacent string literals are concatenated by Python into one paragraph.
_DATA_DEFINITION = (
    "Data refers to information, facts, or statistics that are collected, "
    "stored, and analyzed to derive meaningful insights. "
    "It represents raw, unprocessed values that can be used for "
    "decision-making, analysis, and predictions. "
    "Data is the foundation of fields like data science, machine learning, "
    "and artificial intelligence."
)

_DATA_CHARACTERISTICS = """
- **Raw and Unprocessed:** Data in its initial form, before being cleaned or structured.
- **Forms:** Data can exist as numbers, text, images, videos, sounds, and more.
- **Source:** Data can be collected from various sources, such as surveys, sensors, transactions, or online platforms.
"""

_DATA_TYPE_CHOICES = ["Structured", "Unstructured", "Semi-Structured"]

st.subheader("**What is Data?**")
st.write(_DATA_DEFINITION)

st.subheader("**Characteristics of Data:**")
st.write(_DATA_CHARACTERISTICS)

st.subheader("**Types of Data**")
# The selection drives which section of the page is rendered below.
data_type = st.radio("**Select a type of data:**", _DATA_TYPE_CHOICES)
# ---------------------------------------------------------------------------
# Structured Data Section
#
# Lesson text is kept in module-level constants so the rendering code stays
# short and the markdown / code samples keep their column-0 layout.
# ---------------------------------------------------------------------------

_EXCEL_OVERVIEW = """
Excel is a popular file format used for storing structured data in tabular form.
It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
"""

_EXCEL_READ_SNIPPET = """
import pandas as pd
# Read an Excel file
df = pd.read_excel("file.xlsx")
print(df.head())
"""

_EXCEL_ISSUES = """
- **Missing Data**: Some cells may contain empty or null values.
- **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
- **File Corruption**: The file may become unreadable if improperly saved or transferred.
- **Large Files**: Handling very large Excel files may exceed memory limits.
"""

# NOTE(review): current pandas `read_excel` does not appear to accept an
# `encoding` argument -- confirm against the pandas docs and revise the
# "Encoding Problems" tip below if so.
_EXCEL_FIXES = """
- **Missing Data**: Use data imputation techniques to fill in missing values.
- **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
- **File Corruption**: Use repair tools or convert to a compatible format like CSV.
- **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
"""

_CSV_OVERVIEW = """
CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
where each row corresponds to a record, and fields are separated by commas.
It is widely used for data exchange due to its simplicity and compatibility across systems.
Common file extensions include `.csv`.
"""

_CSV_READ_SNIPPET = """
import pandas as pd
# Reading a CSV file
df = pd.read_csv("file.csv")
print(df.head())
# Reading a CSV file with custom delimiter
df = pd.read_csv("file.csv", sep=";")
"""

_CSV_ISSUES = """
- **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
- **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
- **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
- **Header Issues**: Missing headers or extra/unexpected columns.
- **Large File Sizes**: Memory limitations when processing large datasets.
"""

# The loop body in the chunking example is indented so the displayed snippet
# is valid Python.
_CSV_FIXES = """
- **Incorrect Delimiters**: Specify the correct delimiter when reading:
```python
df = pd.read_csv("file.csv", sep=";")
```
- **Encoding Problems**: Specify the encoding explicitly:
```python
df = pd.read_csv("file.csv", encoding="utf-8")
```
- **Missing or Corrupted Data**: Handle missing values using pandas:
```python
df.fillna("NA", inplace=True)
```
- **Header Issues**: Assign custom headers or skip problematic rows:
```python
df = pd.read_csv("file.csv", header=None)
df.columns = ["Column1", "Column2", "Column3"]
```
- **Large Files**: Use chunk processing for large files:
```python
chunks = pd.read_csv("file.csv", chunksize=1000)
for chunk in chunks:
    process(chunk)
```
"""


def _offer_guide_download(label, file_path):
    """Render a download button for the notebook stored at *file_path*.

    Args:
        label: Text shown on the download button.
        file_path: Path of the guide, resolved against the app's working
            directory. It is also used as the suggested download file name.

    If the file cannot be read, a warning is shown in place of the button so
    the rest of the page still renders instead of failing with a traceback.
    """
    try:
        with open(file_path, "rb") as guide:
            payload = guide.read()
    except OSError:
        st.warning(f"The guide `{file_path}` is not available right now.")
        return

    # The download button is rendered directly rather than inside
    # `if st.button(...)`: a plain button is only true for the single rerun
    # triggered by its click, so a widget nested under it needs a second click
    # and disappears on the next interaction.
    st.download_button(
        label=label,
        data=payload,
        file_name=file_path,
        mime="application/octet-stream",
    )


def _render_excel_section():
    """Render the Excel lesson: overview, reading, pitfalls, fixes, guide."""
    st.write("#### Excel Format")

    # Part (a) What it is
    st.subheader("What is Excel?")
    st.write(_EXCEL_OVERVIEW)

    # Part (b) How to read these files
    st.subheader("How to Read Excel Files?")
    st.code(_EXCEL_READ_SNIPPET)

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling Excel Files")
    st.write(_EXCEL_ISSUES)

    # Part (d) How to overcome these errors/issues
    st.subheader("How to Overcome These Issues?")
    st.write(_EXCEL_FIXES)

    # Downloadable guide
    st.markdown("### Download Coding Guide:")
    _offer_guide_download("Download Excel Guide", "Excel_guide.ipynb")


def _render_csv_section():
    """Render the CSV lesson: overview, reading, pitfalls, fixes, guide."""
    st.write("#### CSV Format")

    # Part (a) What it is
    st.subheader("What is CSV?")
    st.write(_CSV_OVERVIEW)

    # Part (b) How to read these files
    st.subheader("How to Read CSV Files?")
    st.code(_CSV_READ_SNIPPET)

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write(_CSV_ISSUES)

    # Part (d) How to overcome these issues
    st.subheader("How to Overcome These Issues?")
    st.write(_CSV_FIXES)

    # Downloadable guide
    st.markdown("### Download Coding Guide:")
    _offer_guide_download("Download CSV Guide", "CSV_guide.ipynb")


if data_type == "Structured":
    st.write("### Structured Data")
    st.write("Structured data is organized in a predefined format, such as rows and columns.")

    # Format picker; the chosen format decides which lesson is shown.
    format_selected = st.radio(
        "Select a data format to learn more:", ["Excel", "CSV", "SQL Databases"]
    )

    # The choices are mutually exclusive, hence if/elif. "SQL Databases" has
    # no lesson in this section, so selecting it renders nothing further.
    if format_selected == "Excel":
        _render_excel_section()
    elif format_selected == "CSV":
        _render_csv_section()