import streamlit as st
import numpy as np 
import pandas as pd 

# Introductory sections: each entry is (heading, body) and is rendered as a
# subheader followed by its explanatory text, in the order listed here.
_overview_sections = (
    (
        "**What is Data?**",
        "Data refers to information, facts, or statistics that are collected, stored, and analyzed to derive meaningful insights. It represents raw, unprocessed values that can be used for decision-making, analysis, and predictions. Data is the foundation of fields like data science, machine learning, and artificial intelligence.",
    ),
    (
        "**Characteristics of Data:**",
        """
- **Raw and Unprocessed:** Data in its initial form, before being cleaned or structured.
- **Forms:** Data can exist as numbers, text, images, videos, sounds, and more.
- **Source:** Data can be collected from various sources, such as surveys, sensors, transactions, or online platforms.
""",
    ),
)
for _heading, _body in _overview_sections:
    st.subheader(_heading)
    st.write(_body)

# Top-level selector: the chosen category decides which section is rendered below.
st.subheader("**Types of Data**")
data_type = st.radio(
    "**Select a type of data:**",
    ["Structured", "Unstructured", "Semi-Structured"],
)

# Structured Data Section
if data_type == "Structured":
    st.write("### Structured Data")
    st.write("Structured data is organized in a predefined format, such as rows and columns.")

    # Radio selector for the structured file format to explain.
    # `format_selected` is only assigned when this "Structured" branch runs.
    format_selected = st.radio(
        "Select a data format to learn more:", ["Excel", "CSV", "SQL Databases"]
    )

    # Excel Format Section
    if format_selected == "Excel":
        st.write("#### Excel Format")

        # Part (a) What it is
        st.subheader("What is Excel?")
        st.write("""
        Excel is a popular file format used for storing structured data in tabular form.
        It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
        """)

        # Part (b) How to read these files
        st.subheader("How to Read Excel Files?")
        # The snippet is written without source-level indentation so it is
        # displayed flush-left regardless of whether st.code dedents its input.
        st.code(
            "import pandas as pd\n"
            "# Read an Excel file\n"
            'df = pd.read_excel("file.xlsx")\n'
            "print(df.head())"
        )

        # Part (c) Issues encountered
        st.subheader("Common Issues Encountered When Handling Excel Files")
        st.write("""
        - **Missing Data**: Some cells may contain empty or null values.
        - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
        - **File Corruption**: The file may become unreadable if improperly saved or transferred.
        - **Large Files**: Handling very large Excel files may exceed memory limits.
        """)

        # Part (d) How to overcome these errors/issues
        # NOTE(review): `pd.read_excel` does not appear to accept an `encoding`
        # or `chunksize` argument, so the advice text below may need rewording
        # -- confirm against the pandas documentation before changing it.
        st.subheader("How to Overcome These Issues?")
        st.write("""
        - **Missing Data**: Use data imputation techniques to fill in missing values.
        - **Encoding Problems**: Specify the encoding format when reading the file, e.g., `encoding='utf-8'`.
        - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
        - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
        """)

        # Downloadable Guide Button
        # The download button is rendered directly instead of being nested
        # inside `if st.button(...)`: a nested widget only exists for the single
        # rerun in which the outer button reports True, which forced two clicks
        # and made the download button vanish on the next interaction.
        st.markdown("### Download Coding Guide:")
        file_path = "Excel_guide.ipynb"  # Ensure this file exists in the app directory
        try:
            with open(file_path, "rb") as file:
                # Read eagerly so no file handle outlives the `with` block.
                guide_bytes = file.read()
        except OSError:
            # A missing or unreadable guide should not take down the whole page.
            st.warning("The Excel guide is not available right now.")
        else:
            st.download_button(
                label="Download Excel Guide",
                data=guide_bytes,
                file_name="Excel_guide.ipynb",
                mime="application/octet-stream",
            )

# CSV Format Content
# `format_selected` is only assigned when the "Structured" data type is chosen,
# so check `data_type` first; the short-circuit avoids a NameError when the
# user picks "Unstructured" or "Semi-Structured".
if data_type == "Structured" and format_selected == "CSV":
    st.write("#### CSV Format")

    # Part (a) What it is
    st.subheader("What is CSV?")
    st.write("""
    CSV (Comma-Separated Values) is a plain-text file format used to store tabular data, 
    where each row corresponds to a record, and fields are separated by commas. 
    It is widely used for data exchange due to its simplicity and compatibility across systems. 
    Common file extensions include `.csv`.
    """)

    # Part (b) How to Read These Files
    st.subheader("How to Read CSV Files?")
    # The snippet is written without source-level indentation so it is
    # displayed flush-left regardless of whether st.code dedents its input.
    st.code(
        "import pandas as pd\n"
        "# Reading a CSV file\n"
        'df = pd.read_csv("file.csv")\n'
        "print(df.head())\n"
        "\n"
        "# Reading a CSV file with custom delimiter\n"
        'df = pd.read_csv("file.csv", sep=";")'
    )

    # Part (c) Issues Encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write("""
    - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
    - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
    - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
    - **Header Issues**: Missing headers or extra/unexpected columns.
    - **Large File Sizes**: Memory limitations when processing large datasets.
    """)

    # Part (d) How to Overcome These Issues
    st.subheader("How to Overcome These Issues?")
    st.write("""
    - **Incorrect Delimiters**: Specify the correct delimiter when reading:
      ```python
      df = pd.read_csv("file.csv", sep=";")
      ```
    - **Encoding Problems**: Specify the encoding explicitly:
      ```python
      df = pd.read_csv("file.csv", encoding="utf-8")
      ```
    - **Missing or Corrupted Data**: Handle missing values using pandas:
      ```python
      df.fillna("NA", inplace=True)
      ```
    - **Header Issues**: Assign custom headers or skip problematic rows:
      ```python
      df = pd.read_csv("file.csv", header=None)
      df.columns = ["Column1", "Column2", "Column3"]
      ```
    - **Large Files**: Use chunk processing for large files:
      ```python
      chunks = pd.read_csv("file.csv", chunksize=1000)
      for chunk in chunks:
          process(chunk)
      ```
    """)

    # Downloadable Guide Button
    # The download button is rendered directly instead of being nested inside
    # `if st.button(...)`: a nested widget only exists for the single rerun in
    # which the outer button reports True, which forced two clicks and made
    # the download button vanish on the next interaction.
    st.markdown("### Download Coding Guide:")
    file_path = "CSV_guide.ipynb"  # Replace with the actual file path
    try:
        with open(file_path, "rb") as file:
            # Read eagerly so no file handle outlives the `with` block.
            guide_bytes = file.read()
    except OSError:
        # A missing or unreadable guide should not take down the whole page.
        st.warning("The CSV guide is not available right now.")
    else:
        st.download_button(
            label="Download CSV Guide",
            data=guide_bytes,
            file_name="CSV_guide.ipynb",
            mime="application/octet-stream",
        )