File size: 4,863 Bytes
d7bf121
 
 
 
 
 
 
 
 
9cef30c
d7bf121
 
 
22ef4b9
d7bf121
 
 
 
 
 
 
 
 
22ef4b9
3f3f7da
22ef4b9
 
 
 
 
 
3f3f7da
22ef4b9
 
 
 
 
 
 
3f3f7da
22ef4b9
3f3f7da
22ef4b9
 
 
 
d7bf121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22ef4b9
d7bf121
 
 
 
 
22ef4b9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import streamlit as st
import pandas as pd
import plotly.express as px
from pandasai import Agent
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from datasets import load_dataset
import os

# Page heading rendered at the top of the app.
st.title("Dataset Analysis and Visualization")

# Read credentials from the process environment; each is None when unset.
api_key = os.environ.get("OPENAI_API_KEY")
pandasai_api_key = os.environ.get("PANDASAI_API_KEY")

# Guarantee the "df" slot exists so later code can compare it against None.
if "df" not in st.session_state:
    st.session_state["df"] = None

# Dataset loading section.
# The user picks a source; a successful load stores the resulting DataFrame in
# st.session_state.df so it is still available on later script reruns.
st.subheader("Load Dataset")
input_option = st.radio("Select Dataset Input:", ["Use Hugging Face Dataset", "Upload CSV File"])

if input_option == "Use Hugging Face Dataset":
    dataset_name = st.text_input("Enter Hugging Face Dataset Name:", value="HUPD/hupd")
    if st.button("Load Dataset"):
        # Tolerate stray whitespace from copy/paste.
        dataset_name = dataset_name.strip()
        if not dataset_name:
            st.error("Please enter a Hugging Face dataset name.")
        else:
            # NOTE(review): trust_remote_code=True lets the dataset repository
            # run its own loading script; with a user-typed name this executes
            # third-party code. Kept as in the original - confirm it is intended.
            load_kwargs = {"split": "train", "trust_remote_code": True}
            # name="sample" and uniform_split are options of the HUPD loader.
            # Passing them to any other dataset makes load_dataset fail, so
            # they are only sent for the default HUPD dataset.
            if dataset_name.lower() == "hupd/hupd":
                load_kwargs.update(name="sample", uniform_split=True)
            try:
                # Load dataset and store it in session state
                dataset = load_dataset(dataset_name, **load_kwargs)
                st.session_state.df = pd.DataFrame(dataset)
                st.success(f"Dataset '{dataset_name}' loaded successfully!")
            except Exception as e:
                # UI boundary: report any loader failure instead of crashing.
                st.error(f"Error loading dataset: {e}")
elif input_option == "Upload CSV File":
    uploaded_file = st.file_uploader("Upload CSV File:", type=["csv"])
    if uploaded_file and st.button("Load CSV"):
        try:
            # Read uploaded CSV and store it in session state
            st.session_state.df = pd.read_csv(uploaded_file)
            st.success("File uploaded successfully!")
        except Exception as e:
            st.error(f"Error loading file: {e}")

# Show the loaded dataframe preview and the three analysis tabs.
if st.session_state.df is not None:
    df = st.session_state.df

    st.subheader("Dataset Preview")
    st.dataframe(df.head(10))

    # Set up PandasAI Agent
    agent = Agent(df)

    # Set up RAG.
    # Streamlit re-runs this script on every widget interaction, and building
    # the index embeds every row through the OpenAI API. The chain is therefore
    # built once per loaded DataFrame object and kept in session state.
    qa_chain = None
    rag_error = None
    if st.session_state.get("rag_source_df") is df:
        qa_chain = st.session_state.get("qa_chain")
    if qa_chain is None:
        try:
            # One document per row, rendered as "col: value, col: value, ...".
            documents = [
                Document(
                    page_content=", ".join([f"{col}: {row[col]}" for col in df.columns]),
                    metadata={"index": index},
                )
                for index, row in df.iterrows()
            ]
            vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())
            qa_chain = RetrievalQA.from_chain_type(
                llm=ChatOpenAI(),
                chain_type="stuff",
                retriever=vectorstore.as_retriever(),
            )
            st.session_state.rag_source_df = df
            st.session_state.qa_chain = qa_chain
        except Exception as e:
            # e.g. missing OpenAI credentials or an empty DataFrame. Keep the
            # other tabs usable instead of aborting the whole script run.
            rag_error = e

    # Create tabs for different functionality
    tab1, tab2, tab3 = st.tabs(["PandasAI Analysis", "RAG Q&A", "Data Visualization"])

    with tab1:
        st.header("Data Analysis with PandasAI")
        pandas_question = st.text_input("Ask a question about your data (PandasAI):")
        if pandas_question:
            result = agent.chat(pandas_question)
            st.write("PandasAI Answer:", result)

    with tab2:
        st.header("Q&A with RAG")
        rag_question = st.text_input("Ask a question about your data (RAG):")
        if qa_chain is None:
            st.error(f"RAG is unavailable: {rag_error}")
        elif rag_question:
            result = qa_chain.run(rag_question)
            st.write("RAG Answer:", result)

    with tab3:
        st.header("Data Visualization")
        viz_question = st.text_input("What kind of graph would you like to see? (e.g., 'Show a scatter plot of salary vs experience')")
        if viz_question:
            try:
                result = agent.chat(viz_question)

                # Look for a fenced python snippet in the answer. The result is
                # coerced to text because re.search rejects non-string input.
                import re
                code_pattern = r'```python\n(.*?)\n```'
                code_match = re.search(code_pattern, str(result), re.DOTALL)

                if code_match:
                    viz_code = code_match.group(1)
                    # Neutralise plt.show() BEFORE the blanket 'plt.' rewrite;
                    # in the other order it becomes 'px.show()', which does not
                    # exist. 'pass' keeps an indented block syntactically valid.
                    viz_code = viz_code.replace('plt.show()', 'pass')
                    # Modify the code to use 'px' instead of 'plt'
                    viz_code = viz_code.replace('plt.', 'px.')

                    # SECURITY NOTE(review): this executes model-generated code
                    # in-process. The explicit namespace only supplies the names
                    # the snippet needs and keeps it from clobbering script
                    # globals; it is NOT a sandbox.
                    exec_namespace = {"df": df, "pd": pd, "px": px}
                    exec(viz_code, exec_namespace)

                    fig = exec_namespace.get("fig")
                    if fig is None and "x" in exec_namespace and "y" in exec_namespace:
                        # The snippet defined x/y but no figure: fall back to
                        # the scatter plot the original replacement intended.
                        fig = px.scatter(df, x=exec_namespace["x"], y=exec_namespace["y"])

                    if fig is not None:
                        st.plotly_chart(fig)
                    else:
                        st.write("Failed to generate a graph. Please try asking differently.")
                else:
                    st.write("Failed to generate a graph. Please try asking differently.")
            except Exception as e:
                st.write(f"An error occurred: {str(e)}")
                st.write("Please try rephrasing your question.")
else:
    st.warning("No dataset loaded. Please select a dataset input option above.")

# Report each API key that was not found in the environment.
missing_key_messages = (
    (api_key, "Missing OpenAI API Key in environment variables."),
    (pandasai_api_key, "Missing PandasAI API Key in environment variables."),
)
for key_value, message in missing_key_messages:
    if not key_value:
        st.error(message)