eagle0504 committed on
Commit 4e18d60 · 1 Parent(s): 451d492

advanced read and textify added

Files changed (2)
  1. app.py +2 -1
  2. helper/utils.py +50 -0
app.py CHANGED
@@ -110,7 +110,8 @@ if uploaded_files is None:
 elif uploaded_files:
     with st.spinner("Wait for it... 🤔"):
         # Process the uploaded files to extract text and source information
-        textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
+        # textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
+        textify_output = read_and_textify_advanced(uploaded_files, chunk_size=chunk_size_input)
 
         # Separate the output into documents (text) and their corresponding sources
         documents, sources = textify_output
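
For context, a minimal sketch of how the new call might sit inside the Streamlit upload flow. The uploaded_files and chunk_size_input widgets shown here are assumptions, since app.py is only partially visible in this diff, and the import path simply mirrors the helper/utils.py location:

import streamlit as st

from helper.utils import read_and_textify_advanced  # sliding-window extractor added in this commit

# Hypothetical widgets; the real app.py defines uploaded_files and chunk_size_input elsewhere.
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
chunk_size_input = st.number_input("Chunk window (sentences per side)", min_value=1, value=2)

if uploaded_files:
    with st.spinner("Wait for it... 🤔"):
        # Each page is now split into overlapping sentence windows rather than single segments.
        textify_output = read_and_textify_advanced(uploaded_files, chunk_size=chunk_size_input)
        documents, sources = textify_output
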
helper/utils.py CHANGED
@@ -62,6 +62,56 @@ def read_and_textify(
     return text_list, sources_list
 
 
+def read_and_textify_advanced(
+    files: List[str], chunk_size: int = 2  # Sliding-window size (sentences on each side); default 2
+) -> Tuple[List[str], List[str]]:
+    """
+    Reads PDF files and extracts text from each page, breaking the text into overlapping segments.
+
+    This function iterates over a list of uploaded PDF files, extracts text from each page,
+    and compiles a list of texts and corresponding source information. Each page is split into
+    sentences, and each chunk covers up to 'chunk_size' sentences on either side of a sentence.
+
+    Args:
+        files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
+        chunk_size (int): The number of neighboring sentences included on each side of a chunk. Default is 2.
+
+    Returns:
+        Tuple[List[str], List[str]]: A tuple containing two lists:
+            1. A list of strings, where each string is a segment of text extracted from a PDF page.
+            2. A list of strings indicating the source of each text segment (file name, page number, and segment number).
+    """
+
+    text_list = []  # List to store extracted text segments
+    sources_list = []  # List to store source information
+
+    # Iterate over each file
+    for file in files:
+        pdfReader = PyPDF2.PdfReader(file)  # Create a PDF reader object
+        # Iterate over each page in the PDF
+        for i in range(len(pdfReader.pages)):
+            pageObj = pdfReader.pages[i]  # Get the page object
+            text = pageObj.extract_text()  # Extract text from the page
+            if text:
+                # Split the page text into sentences
+                words = text.split(". ")
+                for j in range(len(words)):
+                    # Build the chunk from sentences j - chunk_size to j + chunk_size
+                    start = max(0, j - chunk_size)
+                    end = min(len(words), j + chunk_size + 1)
+                    chunk = ". ".join(words[start:end]) + '.'
+                    text_list.append(chunk)
+                    # Create a source identifier for each chunk and add it to the list
+                    sources_list.append(f"{file.name}_page_{i}_chunk_{j}")
+            else:
+                # If no text extracted, still add a placeholder
+                text_list.append("")
+                sources_list.append(f"{file.name}_page_{i}_chunk_0")
+            pageObj.clear()  # Clear the page object (optional, for memory management)
+
+    return text_list, sources_list
+
+
 openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 
 
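
To make the chunking behavior concrete, here is a small self-contained sketch that mirrors the loop inside read_and_textify_advanced on a plain string. The function name and sample text below are illustrative only, not part of the commit:

from typing import List, Tuple


def sliding_sentence_chunks(text: str, chunk_size: int = 2) -> List[Tuple[int, str]]:
    # Same logic as the inner loop of read_and_textify_advanced, without the PDF handling:
    # split on ". ", then emit one overlapping window of sentences per position.
    sentences = text.split(". ")
    chunks = []
    for j in range(len(sentences)):
        start = max(0, j - chunk_size)
        end = min(len(sentences), j + chunk_size + 1)
        chunks.append((j, ". ".join(sentences[start:end]) + "."))
    return chunks


for j, chunk in sliding_sentence_chunks("Alpha. Beta. Gamma. Delta", chunk_size=1):
    print(f"chunk_{j}: {chunk}")
# chunk_0: Alpha. Beta.
# chunk_1: Alpha. Beta. Gamma.
# chunk_2: Beta. Gamma. Delta.
# chunk_3: Gamma. Delta.
# Each sentence lands in up to 2 * chunk_size + 1 overlapping chunks, which trades some
# duplication in the index for more context around every retrieved passage.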