init push
Browse files- app.py +116 -0
- helper/utils.py +198 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import openai
import PyPDF2
import streamlit as st
from openai import OpenAI

from helper.utils import *

# Basic page chrome for the document-search chat UI.
st.set_page_config(layout="centered", page_title="Document Search🤖📖")
st.header("Document Search🤖📖")
st.write("---")


# Streamlit sidebar setup for user interface
with st.sidebar:
    # Create an expandable instruction manual section in the sidebar
    with st.expander("Instruction Manual 📖"):
        # Display the instruction manual for the Document Data Chatbot in a formatted markdown
        st.markdown(
            """
# Document Data Chatbot User Manual 🤖💊

Welcome to the Document Data Chatbot, your interactive assistant for information on the textual "Document Data". This chatbot offers quick and accurate responses to your queries. Follow these steps to interact with the chatbot:

## Getting Started 🚀
1. **Access the Chatbot**: Launch the Document Data Chatbot on your device.
2. **Start Chatting**: Type your Document Data-related questions in the chat window. Questions can range from dosage to side effects.
3. **Send Your Question**: Submit your query by clicking 'Send' or pressing 'Enter'.

## Chatting with Document Data Chatbot 🤔💬
- **Ask Anything**: Inquiries about textual composition, usage, storage, or safety are all welcome.
- **Use Simple Language**: Clear and concise questions yield the best results.
- **Wait for the Response**: The chatbot will promptly process and answer your query.
- **Follow-Up Questions**: Feel free to ask additional or new questions anytime.

## Tips for a Better Experience ✨
- **Be Specific**: Specific questions help in getting precise answers.
- **Check for Typing Errors**: Correct spelling ensures better understanding by the chatbot.
- **Emoji Use**: Emojis are welcome in your questions!
- **Patience is Key**: Responses may take a moment as the chatbot processes your query.

## Support and Feedback 🤝
- **Need Help?**: Contact our support team for any issues.
- **Share Your Feedback**: Your input is valuable and helps us improve.

## The Team Behind the App 🧑‍💻👩‍💻
- **Founders**: Learn about [Peter Yin](https://www.linkedin.com/in/peter-yin-7914ba25/) and [Yiqiao Yin](https://www.linkedin.com/in/yiqiaoyin/), the founders, on LinkedIn.

Thank you for choosing the Document Data Chatbot. We're here to provide all the information you need about Document Data efficiently. Happy chatting! 🎉💬
"""
        )

    # File uploader widget allowing users to upload text and PDF documents
    uploaded_files = st.file_uploader(
        "Upload documents", accept_multiple_files=True, type=["txt", "pdf"]
    )

    # Clear button
    clear_button = st.sidebar.button("Clear Conversation", key="clear")


# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []


# Reset everything
if clear_button:
    st.session_state.messages = []


# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


# BUG FIX: with accept_multiple_files=True the uploader returns an empty
# LIST (never None) when nothing is uploaded, so the original
# `if uploaded_files is None:` branch was dead and the hint never showed.
# `not uploaded_files` covers both the empty list and None.
if not uploaded_files:
    # Display a message prompting the user to upload files
    st.info("Upload files to analyze")

else:
    # Inform the user how many documents have been loaded
    st.sidebar.write(f"{len(uploaded_files)} document(s) loaded..")

    # Process the uploaded files to extract text and source information
    textify_output = read_and_textify(uploaded_files)

    # Separate the output into documents (text) and their corresponding sources
    documents, sources = textify_output

    # Embed every extracted page once per rerun
    query_database = list_to_nums(documents)

    # React to user input
    if prompt := st.chat_input("What is up?"):
        # Display user message in chat message container
        st.chat_message("user").markdown(prompt)
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})

        # BUG FIX: the original ranked the documents once against a
        # hard-coded debug query ("pful for understanding federal income")
        # and ignored the user's prompt entirely.  Search with the actual
        # prompt instead.
        result = query_search(prompt, documents, query_database, sources)

        # Display assistant response in chat message container
        with st.chat_message("assistant"):
            st.table(result)
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": result})
|
helper/utils.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Any, Dict, List, Tuple, Union
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import PyPDF2
|
7 |
+
from openai import OpenAI
|
8 |
+
|
9 |
+
|
10 |
+
def read_and_textify(
    files: List[str],
) -> Tuple[List[str], List[str]]:
    """
    Reads PDF files and extracts text from each page.

    This function iterates over a list of uploaded PDF files, extracts text
    from each page, and compiles a list of texts and corresponding source
    information.

    Args:
        files (List[st.uploaded_file_manager.UploadedFile]): A list of
            uploaded PDF files (any object PyPDF2.PdfReader accepts that
            also exposes a ``.name`` attribute).

    Returns:
        Tuple[List[str], List[str]]: A tuple containing two lists:
            1. The text extracted from each PDF page.
            2. The source of each text ("<file name>_page_<page index>").
    """
    # Accumulators for extracted texts and their source labels (kept parallel).
    text_list: List[str] = []
    sources_list: List[str] = []

    for file in files:
        pdf_reader = PyPDF2.PdfReader(file)  # Parse the PDF
        # enumerate yields the page index needed for the source label
        for page_index, page in enumerate(pdf_reader.pages):
            text = page.extract_text()  # Extract text from the page
            page.clear()  # Drop parsed page content (optional, for memory)
            text_list.append(text)
            sources_list.append(file.name + "_page_" + str(page_index))

    # BUG FIX: the original returned a 2-element list even though the
    # annotation and docstring promise a tuple; callers that unpack are
    # unaffected by returning a real tuple.
    return text_list, sources_list
|
46 |
+
|
47 |
+
|
48 |
+
# Module-level OpenAI client used by list_to_nums for embedding calls.
# BUG FIX: ``os.environ`` is a mapping, not a callable — the original
# ``os.environ("OPENAI_API_KEY")`` raised ``TypeError`` at import time.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|
49 |
+
|
50 |
+
|
51 |
+
def list_to_nums(sentences: List[str]) -> List[List[float]]:
    """
    Converts a list of sentences into a list of numerical embeddings using OpenAI's embedding model.

    Args:
    - sentences (List[str]): A list of sentences (strings).

    Returns:
    - List[List[float]]: A list of lists of numerical embeddings, one vector
      per input sentence, in the same order.
    """
    # One embedding request per sentence; collect the raw vectors in order.
    vectors: List[List[float]] = []
    for text in sentences:
        api_response = client.embeddings.create(
            input=text, model="text-embedding-3-small"
        )
        vectors.append(api_response.data[0].embedding)
    return vectors
|
76 |
+
|
77 |
+
|
78 |
+
def quantize_to_kbit(arr: Union[np.ndarray, Any], k: int = 16) -> np.ndarray:
    """Converts an array to a k-bit representation by normalizing and scaling its values.

    Args:
        arr (Union[np.ndarray, Any]): The input array to be quantized
            (anything ``np.array`` accepts).
        k (int): The number of levels to quantize to. Defaults to 16 for 4-bit quantization.
    Returns:
        np.ndarray: The quantized integer array with values scaled to 0 to k-1.
    """
    if not isinstance(arr, np.ndarray):  # Check if input is not a numpy array
        arr = np.array(arr)  # Convert input to a numpy array
    arr_min = arr.min()  # Minimum value in the array
    arr_max = arr.max()  # Maximum value in the array
    # BUG FIX: a constant array made (arr_max - arr_min) zero and the
    # original divided by it, producing NaNs; map constant input to level 0.
    if arr_max == arr_min:
        return np.zeros_like(arr, dtype=int)
    # Normalize to [0, 1], then scale to 0-(k-1) and round to integers
    normalized_arr = (arr - arr_min) / (arr_max - arr_min)
    return np.round(normalized_arr * (k - 1)).astype(int)
|
97 |
+
|
98 |
+
|
99 |
+
def quantized_influence(
    arr1: np.ndarray, arr2: np.ndarray, k: int = 16, use_dagger: bool = False
) -> Union[float, Tuple[float, List[float]]]:
    """
    Calculates a weighted measure of influence based on quantized versions of the input arrays.

    Args:
        arr1 (np.ndarray): First input array to be quantized and analyzed.
        arr2 (np.ndarray): Second input array to be quantized and used for influence measurement.
        k (int): The quantization level, defaults to 16 for 4-bit quantization.
        use_dagger (bool): Flag to apply a transformation based on local averages, defaults to False.
    Returns:
        Union[float, Tuple[float, List[float]]]: The quantized influence
        measure alone when ``use_dagger`` is False; otherwise a tuple of the
        measure and the list of local-estimate values mapped onto ``arr1``.
        (FIX: the original annotation promised a tuple in every case, but
        the ``use_dagger=False`` path has always returned the bare scalar.)
    """
    # Quantize both arrays to k levels
    arr1_quantized = quantize_to_kbit(arr1, k)
    arr2_quantized = quantize_to_kbit(arr2, k)

    # Find unique quantized values in arr1
    unique_values = np.unique(arr1_quantized)

    # Compute the global average of quantized arr2
    total_samples = len(arr2_quantized)
    y_bar_global = np.mean(arr2_quantized)

    # Squared deviation of each group's local mean from the global mean,
    # weighted by the squared group size
    weighted_local_averages = [
        (np.mean(arr2_quantized[arr1_quantized == val]) - y_bar_global) ** 2
        * len(arr2_quantized[arr1_quantized == val]) ** 2
        for val in unique_values
    ]
    # NOTE(review): divides by np.std(arr2_quantized); a constant arr2 gives
    # std 0 and an inf/NaN here — confirm callers never pass one.
    qim = np.sum(weighted_local_averages) / (
        total_samples * np.std(arr2_quantized)
    )  # Calculate the quantized influence measure

    if use_dagger:
        # Compute local estimates and map them back onto arr1's values
        local_estimates = [
            np.mean(arr2_quantized[arr1_quantized == val]) for val in unique_values
        ]
        daggers = {
            unique_values[i]: v for i, v in enumerate(local_estimates)
        }  # Map unique values to local estimates
        daggered_values = [daggers[v] for v in arr1_quantized]
        return qim, daggered_values

    # FIX: the original also built an unused ``daggered_values`` list on this
    # path before discarding it; that dead work is removed.
    return qim
|
154 |
+
|
155 |
+
|
156 |
+
def query_search(
    prompt: str,
    sentences: list[str],
    query_database: list[list[float]],
    sources: list[str],
) -> pd.DataFrame:
    """
    Ranks database entries against a text prompt.

    The prompt is converted to an embedding, every stored embedding is scored
    against it with the quantized influence metric, and the results are
    returned sorted by that score.

    Args:
    - prompt (str): A text prompt to search for in the database.
    - sentences (list[str]): Original sentences, parallel to ``query_database``.
    - query_database (list[list[float]]): Pre-computed embeddings for the sentences.
    - sources (list[str]): Source label for each sentence.

    Returns:
    - pd.DataFrame: Sorted by the quantized influence metric in descending
      order, with columns "sentences", "query_embeddings", "page no", "qim".
    """
    # Embed the prompt once; list_to_nums returns one vector per input.
    prompt_vector = list_to_nums([prompt])[0]

    # One row per database entry: sentence, its embedding, its source, score.
    rows = []
    for idx, embedding in enumerate(query_database):
        score = quantized_influence(prompt_vector, embedding, k=3, use_dagger=False)
        rows.append([sentences[idx], embedding, sources[idx], score])

    # Build the result table, name the columns, and sort best-first.
    refs = pd.DataFrame(rows).rename(
        columns={0: "sentences", 1: "query_embeddings", 2: "page no", 3: "qim"}
    )
    return refs.sort_values(by="qim", ascending=False)
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
PyPDF2
|
3 |
+
openai
|