File size: 4,826 Bytes
0c7bd3c
1ab5fce
549dce5
 
1e6d996
 
 
 
feeb9a7
 
 
 
 
0c7bd3c
 
7a9e2a5
 
feeb9a7
 
 
1460b1f
 
424b9ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a495a2
424b9ad
 
 
 
7670068
3a495a2
1e6d996
 
 
 
 
8dd9e3d
424b9ad
 
 
feeb9a7
424b9ad
 
 
 
 
c13a858
 
9ee17c5
c13a858
 
 
 
 
 
feeb9a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c7bd3c
 
 
 
 
171566d
 
 
 
0c7bd3c
 
 
 
424b9ad
 
 
 
 
eb70b82
424b9ad
 
 
b965179
424b9ad
 
0c7bd3c
 
 
 
1ab5fce
3ae3597
 
b965179
 
1ab5fce
b965179
1ab5fce
424b9ad
40e5f2c
7a9e2a5
40e5f2c
eb70b82
 
 
 
 
 
 
 
 
 
8eec1ee
24df29b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40e5f2c
b965179
 
 
eb70b82
b965179
 
 
 
 
eb70b82
b965179
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import tempfile
import os
import streamlit as st

from llama_index.llms.gemini import Gemini
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.mistralai import MistralAI
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    VectorStoreIndex,
    Settings,
)

from llama_parse import LlamaParse

from streamlit_pdf_viewer import pdf_viewer

# Global configurations
from llama_index.core import set_global_handler
# Register "langfuse" as the global LlamaIndex handler.
# NOTE(review): presumably needs Langfuse credentials/config available at
# runtime -- confirm against the deployment environment.
set_global_handler("langfuse")
# Render the app with Streamlit's "wide" page layout.
st.set_page_config(layout="wide")

# Sidebar: gather provider/model settings and API tokens, configure the LLM,
# and parse the uploaded document.
#
# Names left in module scope for the rest of the script:
#   provider, llm_name, temperature, max_output_tokens, llm (None unless an
#   OpenAI model was configured), llm_token, parse_token, uploaded_file,
#   parsed_document (None until a file has been parsed).
with st.sidebar:
    st.title('Document Summarization and QA System')
    # st.markdown('''
    # ## About this application
    # Upload a pdf to ask questions about it. This retrieval-augmented generation (RAG) workflow uses:
    # - [Streamlit](https://streamlit.io/)
    # - [LlamaIndex](https://docs.llamaindex.ai/en/stable/)
    # - [OpenAI](https://platform.openai.com/docs/models)
    # ''')

    # st.write('Made by ***Nate Mahynski***')
    # st.write('nathan.mahynski@nist.gov')

    # Models offered for each provider; insertion order is the display order.
    models_by_provider = {
        'google': ['gemini'],
        'huggingface': [],
        'mistralai': [],
        'openai': ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini'],
    }

    # Select Provider
    provider = st.selectbox(
        label="Select LLM Provider",
        options=list(models_by_provider),
        index=0
    )

    # Select LLM
    llm_list = models_by_provider.get(provider, [])
    llm_name = st.selectbox(
        label="Select LLM Model",
        options=llm_list,
        index=0
    )

    # Temperature
    temperature = st.slider(
        "Temperature",
        min_value=0.0,
        max_value=1.0,
        value=0.0,
        step=0.05,
    )

    max_output_tokens = 4096

    # Enter LLM Token
    llm_token = st.text_input(
        "Enter your LLM token",
        value=None
    )
    # The text input yields None until the user types something, and
    # os.environ only accepts strings, so export the token only once present.
    if llm_token:
        if provider == 'openai':
            os.environ['OPENAI_API_KEY'] = llm_token
        elif provider == 'huggingface':
            os.environ['HFTOKEN'] = llm_token

    # Create LLM. This happens after the token has been read so the key can be
    # handed to the client directly rather than set after construction.
    llm = None
    if provider == 'openai' and llm_name and llm_token:
        # Imported locally: only the OpenAI path needs the tokenizer.
        import tiktoken

        llm = OpenAI(
            model=llm_name,
            temperature=temperature,
            max_tokens=max_output_tokens,
            api_key=llm_token
        )
        # Make the configured model the one used by indexes and query engines.
        Settings.llm = llm
        # Global tokenization needs to be consistent with LLM
        # https://docs.llamaindex.ai/en/stable/module_guides/models/llms/
        Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
        Settings.num_output = max_output_tokens
        # Use the model's own context window; a window equal to num_output
        # would leave no token budget for the prompt itself.
        Settings.context_window = llm.metadata.context_window

    # Enter parsing Token
    parse_token = st.text_input(
        "Enter your LlamaParse token",
        value=None
    )

    uploaded_file = st.file_uploader(
        "Choose a PDF file to upload",
        # type=['pdf'],
        accept_multiple_files=False
    )

    parsed_document = None
    if uploaded_file is not None:
        # The key may come from the text box or from LLAMA_CLOUD_API_KEY.
        parse_key = parse_token or os.environ.get('LLAMA_CLOUD_API_KEY')
        if not parse_key:
            st.warning('Enter your LlamaParse token to parse the uploaded file.')
        else:
            # Parse the file
            parser = LlamaParse(
                api_key=parse_key,
                result_type="text"  # "markdown" and "text" are available
            )

            # Save the upload into a temporary directory, then load and parse
            # it. The context manager removes the directory even if parsing
            # raises.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_filename = os.path.join(
                    temp_dir, os.path.basename(uploaded_file.name)
                )
                with open(temp_filename, "wb") as f:
                    f.write(uploaded_file.getvalue())
                parsed_document = parser.load_data(temp_filename)

# Two-column main area: instructions and Q&A on the left (col1), document
# viewers on the right (col2).
col1, col2 = st.columns(2)

with col1:
    st.markdown(
        """
        # Instructions

        1. Obtain a [token](https://cloud.llamaindex.ai/api-key) (or API Key) from LlamaParse to parse your document. 
        2. Obtain a similar token from your preferred LLM provider.
        3. Make selections at the left and upload a document to use a context.
        4. Begin asking questions below!
        """
    )

    st.divider()

    prompt_txt = 'Summarize this document in a 3-5 sentences.'
    prompt = st.text_area(
        label="Enter you query.",
        key="prompt_widget",
        value=prompt_txt
    )

    # parsed_document stays None until a file has been uploaded and parsed, so
    # only build the index and run the query once there is content to search.
    if parsed_document:
        index = VectorStoreIndex.from_documents(parsed_document)
        query_engine = index.as_query_engine()

        # Skip the LLM call when the query box has been cleared.
        if prompt:
            response = query_engine.query(prompt)
            st.write(response.response)
    else:
        st.info('Upload a document in the sidebar to start asking questions.')

# Right-hand column: one tab previews the raw upload, the other shows what the
# parser extracted from it.
with col2:
    uploaded_tab, parsed_tab = st.tabs(["Uploaded File", "Parsed File"])

    # Raw upload, rendered with the PDF viewer component.
    with uploaded_tab:
        if uploaded_file is not None:
            pdf_viewer(input=uploaded_file.getvalue(), width=700)

    # Parser output, written out as-is.
    with parsed_tab:
        if parsed_document is not None:
            st.write(parsed_document)