"""Streamlit page: what data is, its types, and structured file formats.

The script runs top to bottom on every Streamlit rerun. It renders an
introduction to data, lets the user pick a data type, and (for structured
data) shows a short guide for the selected file format together with a
downloadable notebook.
"""

from pathlib import Path

import numpy as np
import pandas as pd
import streamlit as st

# MIME type used for the downloadable notebook guides.
GUIDE_MIME = "application/octet-stream"


def _render_guide_download(label: str, guide_path: str) -> None:
    """Render the "Download Coding Guide" heading and its download button.

    The download button is rendered directly rather than behind a regular
    ``st.button``: a plain button is only ``True`` for the single rerun that
    follows its click, so a download button nested under it needs two clicks
    and disappears again on the next interaction.

    Args:
        label: Text shown on the download button.
        guide_path: Path of the notebook to offer, resolved against the
            current working directory (same as the original ``open`` call).
    """
    st.markdown("### Download Coding Guide:")
    guide = Path(guide_path)
    try:
        payload = guide.read_bytes()
    except FileNotFoundError:
        # A missing guide should not take the whole page down.
        st.warning(
            f"The guide `{guide_path}` was not found in the app directory, "
            "so it cannot be downloaded right now."
        )
        return
    st.download_button(
        label=label,
        data=payload,
        file_name=guide.name,
        mime=GUIDE_MIME,
    )


def _render_excel_section() -> None:
    """Render the Excel guide: definition, reading, issues, and fixes."""
    st.write("#### Excel Format")

    # Part (a) What it is
    st.subheader("What is Excel?")
    st.write("""
    Excel is a popular file format used for storing structured data in tabular form.
    It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
    """)

    # Part (b) How to read these files
    st.subheader("How to Read Excel Files?")
    st.code("""
    import pandas as pd

    # Read an Excel file
    df = pd.read_excel("file.xlsx")
    print(df.head())
    """)

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling Excel Files")
    st.write("""
    - **Missing Data**: Some cells may contain empty or null values.
    - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
    - **File Corruption**: The file may become unreadable if improperly saved or transferred.
    - **Large Files**: Handling very large Excel files may exceed memory limits.
    """)

    # Part (d) How to overcome these errors/issues
    st.subheader("How to Overcome These Issues?")
    st.write("""
    - **Missing Data**: Use data imputation techniques to fill in missing values.
    - **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
    - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
    - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
    """)

    # Downloadable guide (the notebook must exist in the app directory)
    _render_guide_download("Download Excel Guide", "Excel_guide.ipynb")


def _render_csv_section() -> None:
    """Render the CSV guide: definition, reading, issues, and fixes."""
    st.write("#### CSV Format")

    # Part (a) What it is
    st.subheader("What is CSV?")
    st.write("""
    CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
    where each row corresponds to a record, and fields are separated by commas.
    It is widely used for data exchange due to its simplicity and compatibility across systems.
    Common file extensions include `.csv`.
    """)

    # Part (b) How to read these files
    st.subheader("How to Read CSV Files?")
    st.code("""
    import pandas as pd

    # Reading a CSV file
    df = pd.read_csv("file.csv")
    print(df.head())

    # Reading a CSV file with custom delimiter
    df = pd.read_csv("file.csv", sep=";")
    """)

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write("""
    - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
    - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
    - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
    - **Header Issues**: Missing headers or extra/unexpected columns.
    - **Large File Sizes**: Memory limitations when processing large datasets.
    """)

    # Part (d) How to overcome these issues
    st.subheader("How to Overcome These Issues?")
    st.write("""
    - **Incorrect Delimiters**: Specify the correct delimiter when reading:
      ```python
      df = pd.read_csv("file.csv", sep=";")
      ```
    - **Encoding Problems**: Specify the encoding explicitly:
      ```python
      df = pd.read_csv("file.csv", encoding="utf-8")
      ```
    - **Missing or Corrupted Data**: Handle missing values using pandas:
      ```python
      df.fillna("NA", inplace=True)
      ```
    - **Header Issues**: Assign custom headers or skip problematic rows:
      ```python
      df = pd.read_csv("file.csv", header=None)
      df.columns = ["Column1", "Column2", "Column3"]
      ```
    - **Large Files**: Use chunk processing for large files:
      ```python
      chunks = pd.read_csv("file.csv", chunksize=1000)
      for chunk in chunks:
          process(chunk)
      ```
    """)

    # Downloadable guide (the notebook must exist in the app directory)
    _render_guide_download("Download CSV Guide", "CSV_guide.ipynb")


st.subheader("**What is Data?**")
st.write("Data refers to information, facts, or statistics that are collected, stored, and analyzed to derive meaningful insights. It represents raw, unprocessed values that can be used for decision-making, analysis, and predictions. Data is the foundation of fields like data science, machine learning, and artificial intelligence.")

st.subheader("**Characteristics of Data:**")
st.write("""
- **Raw and Unprocessed:** Data in its initial form, before being cleaned or structured.
- **Forms:** Data can exist as numbers, text, images, videos, sounds, and more.
- **Source:** Data can be collected from various sources, such as surveys, sensors, transactions, or online platforms.
""")

st.subheader("**Types of Data**")
data_type = st.radio(
    "**Select a type of data:**",
    ["Structured", "Unstructured", "Semi-Structured"],
)

# Structured Data Section
if data_type == "Structured":
    st.write("### Structured Data")
    st.write("Structured data is organized in a predefined format, such as rows and columns.")

    # Radio selector for the data formats
    format_selected = st.radio(
        "Select a data format to learn more:",
        ["Excel", "CSV", "SQL Databases"],
    )

    # Only one format can be selected at a time, so the branches are exclusive.
    if format_selected == "Excel":
        _render_excel_section()
    elif format_selected == "CSV":
        _render_csv_section()