import streamlit as st
import pandas as pd
from io import StringIO
import PyPDF2
from docx import Document

# Function to extract data from a CSV file
def read_csv(file):
    """Parse CSV content into a pandas DataFrame.

    Args:
        file: Path or file-like object handed straight to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with its default options.
    """
    return pd.read_csv(file)

# Function to extract data from an Excel file
def read_excel(file):
    """Load a spreadsheet into a pandas DataFrame.

    Args:
        file: Path or file-like object handed straight to ``pandas.read_excel``.

    Returns:
        The DataFrame produced by ``pandas.read_excel`` with its default options.
    """
    return pd.read_excel(file)

# Function to extract text from a TXT file
def read_txt(file):
    """Return the contents of a UTF-8 text file as a string.

    Args:
        file: Binary file-like object whose ``read()`` returns ``bytes``.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        removed so it does not show up as a stray ``\\ufeff`` character.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    raw = file.read()
    # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8", but also strips
    # the signature some editors prepend when saving.
    return raw.decode("utf-8-sig")

# Function to extract text from a DOCX file
def read_docx(file):
    """Collect the text of every paragraph in a DOCX document.

    Args:
        file: Path or file-like object passed to ``Document``.

    Returns:
        The texts of ``document.paragraphs``, in order, joined with newlines.
    """
    document = Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# Function to extract text from a PDF file
def read_pdf(file):
    """Extract the text of every page of a PDF, concatenated in page order.

    Uses the ``PdfReader`` / ``pages`` API. The older ``PdfFileReader``,
    ``numPages`` and ``getPage`` names are deprecated and raise an error in
    PyPDF2 3.0 and later, whereas ``PdfReader`` is available in every release
    that also provides ``extract_text()``.

    Args:
        file: Path or binary file-like object containing the PDF.

    Returns:
        The text extracted from each page, joined without a separator.
    """
    reader = PyPDF2.PdfReader(file)
    # Join once instead of growing a string with += on every page.
    return "".join(page.extract_text() for page in reader.pages)

# Streamlit app
def main():
    """Render the upload page and show the extracted content of the chosen file.

    The reader is chosen from the file-name extension first and from the
    browser-reported MIME type only as a fallback. The MIME type alone is not
    reliable: for example, a ``.csv`` upload is commonly reported as
    ``application/vnd.ms-excel`` on Windows, which would otherwise be rejected
    as unsupported even though the uploader accepted it.
    """
    st.title("File Upload and Data Extraction App")
    st.write("Upload a file (CSV, Excel, TXT, DOCX, or PDF) to extract data.")

    # File uploader
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx", "txt", "docx", "pdf"])
    if uploaded_file is None:
        return

    # Extension -> (reader function, heading shown above the extracted content).
    handlers = {
        "csv": (read_csv, "### CSV Data"),
        "xlsx": (read_excel, "### Excel Data"),
        "txt": (read_txt, "### TXT Data"),
        "docx": (read_docx, "### DOCX Data"),
        "pdf": (read_pdf, "### PDF Data"),
    }
    # Fallback for names without a recognised extension.
    mime_to_extension = {
        "text/csv": "csv",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
        "text/plain": "txt",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/pdf": "pdf",
    }

    _, dot, suffix = (uploaded_file.name or "").rpartition(".")
    extension = suffix.lower() if dot else ""
    if extension not in handlers:
        extension = mime_to_extension.get(uploaded_file.type, "")

    handler = handlers.get(extension)
    if handler is None:
        st.error("Unsupported file type. Please upload a CSV, Excel, TXT, DOCX, or PDF file.")
        return

    reader, heading = handler
    content = reader(uploaded_file)
    st.write(heading)
    st.write(content)

# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()