Spaces:

mahynski
/

RAG

Running

App Files Files Community

mahynski committed on Aug 7, 2024

Commit

a487212

1 Parent(s): 183da17

debug embeddings

Browse files

Files changed (1) hide show

app.py +151 -147

app.py CHANGED Viewed

@@ -14,158 +14,162 @@ from llama_index.core import (
     Settings,
 )
-# os.environ["OPENAI_API_KEY"] = "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"
 from llama_parse import LlamaParse
 from streamlit_pdf_viewer import pdf_viewer
-# Global configurations
-from llama_index.core import set_global_handler
-set_global_handler("langfuse")
-st.set_page_config(layout="wide")
-with st.sidebar:
-    st.title('Document Summarization and QA System')
-    # st.markdown('''
-    # ## About this application
-    # Upload a pdf to ask questions about it. This retrieval-augmented generation (RAG) workflow uses:
-    # - [Streamlit](https://streamlit.io/)
-    # - [LlamaIndex](https://docs.llamaindex.ai/en/stable/)
-    # - [OpenAI](https://platform.openai.com/docs/models)
-    # ''')
-    # st.write('Made by ***Nate Mahynski***')
-    # st.write('nathan.mahynski@nist.gov')
-    # Select Provider
-    provider = st.selectbox(
-        label="Select LLM Provider",
-        options=['google', 'huggingface', 'mistralai', 'openai'],
-        index=0
-    )
-    # Select LLM
-    if provider == 'google':
-        llm_list = ['gemini']
-    elif provider == 'huggingface':
-        llm_list = []
-    elif provider == 'mistralai':
-        llm_list =[]
-    elif provider == 'openai':
-        llm_list = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini']
-    else:
-        llm_list = []
-    llm_name = st.selectbox(
-        label="Select LLM Model",
-        options=llm_list,
-        index=0
-    )
-    # Temperature
-    temperature = st.slider(
-        "Temperature",
-        min_value=0.0,
-        max_value=1.0,
-        value=0.0,
-        step=0.05,
-    )
-    max_output_tokens = 4096
-    # Enter LLM Token
-    llm_token = st.text_input(
-        "Enter your LLM token",
-        value="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"
-    )
-    # Create LLM
-    if llm_token is not None:
-        if provider == 'openai':
-            os.environ["OPENAI_API_KEY"] = str(llm_token)
-            Settings.llm = OpenAI(
-                model=llm_name,
-                temperature=temperature,
-                max_tokens=max_tokens,
-                api_key=os.environ.get("OPENAI_API_KEY")
-            )
-            # Global tokenization needs to be consistent with LLM
-            # https://docs.llamaindex.ai/en/stable/module_guides/models/llms/
-            Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
-            Settings.num_output = max_tokens
-            Settings.context_window = 4096 # max possible
-            Settings.embed_model = OpenAIEmbedding(api_key=os.environ.get("OPENAI_API_KEY"))
         elif provider == 'huggingface':
-            os.environ['HFTOKEN'] = str(llm_token)
-    # Enter parsing Token
-    parse_token = st.text_input(
-        "Enter your LlamaParse token",
-        value="llx-uxxwLr1gZmDibaHTl99ISQJtpLSjjfhgDvnosGxu92RdRlb7"
-    )
-    uploaded_file = st.file_uploader(
-        "Choose a PDF file to upload",
-        # type=['pdf'],
-        accept_multiple_files=False
-    )
-    parsed_document = None
-    if uploaded_file is not None:
-        # Parse the file
-        parser = LlamaParse(
-            api_key=parse_token,  # can also be set in your env as LLAMA_CLOUD_API_KEY
-            result_type="text"  # "markdown" and "text" are available
         )
-        # Create a temporary directory to save the file then load and parse it
-        temp_dir = tempfile.TemporaryDirectory()
-        temp_filename = os.path.join(temp_dir.name, uploaded_file.name)
-        with open(temp_filename, "wb") as f:
-            f.write(uploaded_file.getvalue())
-        parsed_document = parser.load_data(temp_filename)
-        temp_dir.cleanup()
-col1, col2 = st.columns(2)
-with col1:
-    st.markdown(
-        """
-        # Instructions
-        1. Obtain a [token](https://cloud.llamaindex.ai/api-key) (or API Key) from LlamaParse to parse your document.
-        2. Obtain a similar token from your preferred LLM provider.
-        3. Make selections at the left and upload a document to use a context.
-        4. Begin asking questions below!
-        """
-    )
-    st.divider()
-    index = VectorStoreIndex.from_documents(parsed_document)
-    query_engine = index.as_query_engine()
-    prompt_txt = 'Summarize this document in a 3-5 sentences.'
-    prompt = st.text_area(
-        label="Enter you query.",
-        key="prompt_widget",
-        value=prompt_txt
-    )
-    response = query_engine.query(prompt)
-    st.write(response.response)
-with col2:
-    tab1, tab2 = st.tabs(["Uploaded File", "Parsed File",])
-    with tab1:
-        # st.header('This is the raw file you uploaded.')
-        if uploaded_file is not None: # Display the pdf
-            bytes_data = uploaded_file.getvalue()
-            pdf_viewer(input=bytes_data, width=700)
-    with tab2:
-        # st.header('This is the parsed version of the file.')
-        if parsed_document is not None: # Showed the raw parsing result
-            st.write(parsed_document)

     Settings,
 )
+# SECURITY: the OpenAI API key must never be hard-coded in source. Supply it through the
+# OPENAI_API_KEY environment variable (e.g. a deployment secret) or the sidebar token field.
+# The key that was previously committed on this line is exposed and should be revoked.
 from llama_parse import LlamaParse
 from streamlit_pdf_viewer import pdf_viewer
+def main():
+    """Render the document summarization / QA app.
+
+    The sidebar collects the LLM provider, model, temperature and API tokens,
+    and accepts a single uploaded file which is parsed with LlamaParse. The
+    left column shows the instructions, the query box and the answer; the
+    right column shows the uploaded file and its parsed text.
+
+    No credentials are embedded in the code: tokens come from the (masked)
+    sidebar fields, falling back to the OPENAI_API_KEY / LLAMA_CLOUD_API_KEY
+    environment variables when a field is left empty.
+    """
+    with st.sidebar:
+        st.title('Document Summarization and QA System')
+        # st.markdown('''
+        # ## About this application
+        # Upload a pdf to ask questions about it. This retrieval-augmented generation (RAG) workflow uses:
+        # - [Streamlit](https://streamlit.io/)
+        # - [LlamaIndex](https://docs.llamaindex.ai/en/stable/)
+        # - [OpenAI](https://platform.openai.com/docs/models)
+        # ''')
+        # st.write('Made by ***Nate Mahynski***')
+        # st.write('nathan.mahynski@nist.gov')
+
+        # Select Provider
+        provider = st.selectbox(
+            label="Select LLM Provider",
+            options=['google', 'huggingface', 'mistralai', 'openai'],
+            index=0
+        )
+
+        # Select LLM: models offered per provider (an empty list means none are offered yet)
+        models_by_provider = {
+            'google': ['gemini'],
+            'huggingface': [],
+            'mistralai': [],
+            'openai': ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini'],
+        }
+        llm_list = models_by_provider.get(provider, [])
+        llm_name = st.selectbox(
+            label="Select LLM Model",
+            options=llm_list,
+            index=0
+        )
+
+        # Temperature
+        temperature = st.slider(
+            "Temperature",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.0,
+            step=0.05,
+        )
+
+        max_output_tokens = 4096
+
+        # Enter LLM Token. No default value (a secret must not live in source)
+        # and the field is masked so the token is not displayed on screen.
+        llm_token = st.text_input(
+            "Enter your LLM token",
+            value="",
+            type="password"
+        )
+
+        # Create LLM
+        if provider == 'openai':
+            # Use a key already present in the environment when the field is empty.
+            openai_key = llm_token or os.environ.get("OPENAI_API_KEY")
+            if openai_key:
+                os.environ["OPENAI_API_KEY"] = str(openai_key)
+                Settings.llm = OpenAI(
+                    model=llm_name,
+                    temperature=temperature,
+                    max_tokens=max_output_tokens,
+                    api_key=openai_key
+                )
+
+                # Global tokenization needs to be consistent with LLM
+                # https://docs.llamaindex.ai/en/stable/module_guides/models/llms/
+                Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
+                Settings.num_output = max_output_tokens
+                # NOTE(review): num_output equals context_window here, which may leave
+                # no room for the prompt - confirm the intended size for the chosen model.
+                Settings.context_window = 4096 # max possible
+                Settings.embed_model = OpenAIEmbedding(api_key=openai_key)
+            else:
+                st.warning("Enter an OpenAI API key to use this provider.")
+        elif provider == 'huggingface' and llm_token:
+            os.environ['HFTOKEN'] = str(llm_token)
+
+        # Enter parsing Token (masked, no embedded default)
+        parse_token = st.text_input(
+            "Enter your LlamaParse token",
+            value="",
+            type="password"
+        )
+
+        uploaded_file = st.file_uploader(
+            "Choose a PDF file to upload",
+            # type=['pdf'],
+            accept_multiple_files=False
+        )
+
+        parsed_document = None
+        # The key can also be set in the environment as LLAMA_CLOUD_API_KEY
+        parse_key = parse_token or os.environ.get("LLAMA_CLOUD_API_KEY")
+        if uploaded_file is not None and not parse_key:
+            st.warning("Enter a LlamaParse token to parse the uploaded file.")
+        elif uploaded_file is not None:
+            # Parse the file
+            parser = LlamaParse(
+                api_key=parse_key,
+                result_type="text"  # "markdown" and "text" are available
+            )
+
+            # Save the upload into a temporary directory, then load and parse it.
+            # The context manager removes the directory even if parsing raises.
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # basename() keeps the write inside temp_dir whatever the upload is called
+                temp_filename = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
+                with open(temp_filename, "wb") as f:
+                    f.write(uploaded_file.getvalue())
+                parsed_document = parser.load_data(temp_filename)
+
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.markdown(
+            """
+            # Instructions
+            1. Obtain a [token](https://cloud.llamaindex.ai/api-key) (or API Key) from LlamaParse to parse your document.
+            2. Obtain a similar token from your preferred LLM provider.
+            3. Make selections at the left and upload a document to use a context.
+            4. Begin asking questions below!
+            """
+        )
+
+        st.divider()
+
+        prompt_txt = 'Summarize this document in a 3-5 sentences.'
+        prompt = st.text_area(
+            label="Enter you query.",
+            key="prompt_widget",
+            value=prompt_txt
+        )
+
+        # Only index and query once a document has actually been parsed;
+        # before any upload parsed_document is None and indexing would fail.
+        if parsed_document:
+            index = VectorStoreIndex.from_documents(parsed_document)
+            query_engine = index.as_query_engine()
+            response = query_engine.query(prompt)
+            st.write(response.response)
+        else:
+            st.info("Upload a document in the sidebar to start asking questions.")
+
+    with col2:
+        tab1, tab2 = st.tabs(["Uploaded File", "Parsed File",])
+
+        with tab1:
+            # st.header('This is the raw file you uploaded.')
+            if uploaded_file is not None: # Display the pdf
+                bytes_data = uploaded_file.getvalue()
+                pdf_viewer(input=bytes_data, width=700)
+
+        with tab2:
+            # st.header('This is the parsed version of the file.')
+            if parsed_document is not None: # Show the raw parsing result
+                st.write(parsed_document)
+# Script entry point: apply global configuration, then build the UI.
+if __name__ == '__main__':
+    # Global configurations
+    # Register "langfuse" as LlamaIndex's global handler
+    # (presumably for tracing/observability - TODO confirm langfuse is configured).
+    from llama_index.core import set_global_handler
+    set_global_handler("langfuse")
+    # The page layout is set here, before main() issues any other Streamlit call.
+    st.set_page_config(layout="wide")
+    main()