eagle0504 committed on
Commit 4e18d60 · 1 Parent(s): 451d492

advanced read and textify added

Files changed (2)
  1. app.py +2 -1
  2. helper/utils.py +50 -0
app.py CHANGED
@@ -110,7 +110,8 @@ if uploaded_files is None:
 elif uploaded_files:
     with st.spinner("Wait for it... 🤔"):
         # Process the uploaded files to extract text and source information
-        textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
+        # textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
+        textify_output = read_and_textify_advanced(uploaded_files, chunk_size=chunk_size_input)
 
         # Separate the output into documents (text) and their corresponding sources
         documents, sources = textify_output
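
For context, a minimal sketch of how the new call might sit inside the Streamlit upload flow. The uploaded_files and chunk_size_input widgets shown here are assumptions, since app.py is only partially visible in this diff, and the import path simply mirrors the helper/utils.py location:

import streamlit as st

from helper.utils import read_and_textify_advanced  # sliding-window extractor added in this commit

# Hypothetical widgets; the real app.py defines uploaded_files and chunk_size_input elsewhere.
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
chunk_size_input = st.number_input("Chunk window (sentences per side)", min_value=1, value=2)

if uploaded_files:
    with st.spinner("Wait for it... 🤔"):
        # Each page is now split into overlapping sentence windows rather than single segments.
        textify_output = read_and_textify_advanced(uploaded_files, chunk_size=chunk_size_input)
        documents, sources = textify_output
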
helper/utils.py CHANGED
@@ -62,6 +62,56 @@ def read_and_textify(
     return text_list, sources_list
 
 
+def read_and_textify_advanced(
+    files: List[str], chunk_size: int = 2  # Sliding-window size (sentences on each side); default 2
+) -> Tuple[List[str], List[str]]:
+    """
+    Reads PDF files and extracts text from each page, breaking the text into overlapping segments.
+
+    This function iterates over a list of uploaded PDF files, extracts text from each page,
+    and compiles a list of texts and corresponding source information. Each page is split into
+    sentences, and each chunk covers up to 'chunk_size' sentences on either side of a sentence.
+
+    Args:
+        files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
+        chunk_size (int): The number of neighboring sentences included on each side of a chunk. Default is 2.
+
+    Returns:
+        Tuple[List[str], List[str]]: A tuple containing two lists:
+            1. A list of strings, where each string is a segment of text extracted from a PDF page.
+            2. A list of strings indicating the source of each text segment (file name, page number, and segment number).
+    """
+
+    text_list = []  # List to store extracted text segments
+    sources_list = []  # List to store source information
+
+    # Iterate over each file
+    for file in files:
+        pdfReader = PyPDF2.PdfReader(file)  # Create a PDF reader object
+        # Iterate over each page in the PDF
+        for i in range(len(pdfReader.pages)):
+            pageObj = pdfReader.pages[i]  # Get the page object
+            text = pageObj.extract_text()  # Extract text from the page
+            if text:
+                # Split the page text into sentences
+                words = text.split(". ")
+                for j in range(len(words)):
+                    # Build the chunk from sentences j - chunk_size to j + chunk_size
+                    start = max(0, j - chunk_size)
+                    end = min(len(words), j + chunk_size + 1)
+                    chunk = ". ".join(words[start:end]) + '.'
+                    text_list.append(chunk)
+                    # Create a source identifier for each chunk and add it to the list
+                    sources_list.append(f"{file.name}_page_{i}_chunk_{j}")
+            else:
+                # If no text extracted, still add a placeholder
+                text_list.append("")
+                sources_list.append(f"{file.name}_page_{i}_chunk_0")
+            pageObj.clear()  # Clear the page object (optional, for memory management)
+
+    return text_list, sources_list
+
+
 openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 
 
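
To make the chunking behavior concrete, here is a small self-contained sketch that mirrors the loop inside read_and_textify_advanced on a plain string. The function name and sample text below are illustrative only, not part of the commit:

from typing import List, Tuple


def sliding_sentence_chunks(text: str, chunk_size: int = 2) -> List[Tuple[int, str]]:
    # Same logic as the inner loop of read_and_textify_advanced, without the PDF handling:
    # split on ". ", then emit one overlapping window of sentences per position.
    sentences = text.split(". ")
    chunks = []
    for j in range(len(sentences)):
        start = max(0, j - chunk_size)
        end = min(len(sentences), j + chunk_size + 1)
        chunks.append((j, ". ".join(sentences[start:end]) + "."))
    return chunks


for j, chunk in sliding_sentence_chunks("Alpha. Beta. Gamma. Delta", chunk_size=1):
    print(f"chunk_{j}: {chunk}")
# chunk_0: Alpha. Beta.
# chunk_1: Alpha. Beta. Gamma.
# chunk_2: Beta. Gamma. Delta.
# chunk_3: Gamma. Delta.
# Each sentence lands in up to 2 * chunk_size + 1 overlapping chunks, which trades some
# duplication in the index for more context around every retrieved passage.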