eagle0504 committed on
Commit
2ecca1e
·
1 Parent(s): bae639d

chunk size added as input arg

Browse files
Files changed (2) hide show
  1. app.py +6 -1
  2. helper/utils.py +18 -9
app.py CHANGED
@@ -61,6 +61,11 @@ with st.sidebar:
61
  # Inform the user how many documents have been loaded
62
  st.success(f"{len(uploaded_files)} document(s) loaded...")
63
 
 
 
 
 
 
64
  # Input filter
65
  top_n = st.number_input(
66
  "Insert a number (top n rows to be selected):", value=5, step=1
@@ -103,7 +108,7 @@ if uploaded_files is None:
103
  elif uploaded_files:
104
  with st.spinner("Wait for it... 🤔"):
105
  # Process the uploaded files to extract text and source information
106
- textify_output = read_and_textify(uploaded_files)
107
 
108
  # Separate the output into documents (text) and their corresponding sources
109
  documents, sources = textify_output
 
61
  # Inform the user how many documents have been loaded
62
  st.success(f"{len(uploaded_files)} document(s) loaded...")
63
 
64
+ # Chunk size
65
+ chunk_size_input = st.number_input(
66
+ "Insert an integer (for size of chunks):", value=10, step=1
67
+ )
68
+
69
  # Input filter
70
  top_n = st.number_input(
71
  "Insert a number (top n rows to be selected):", value=5, step=1
 
108
  elif uploaded_files:
109
  with st.spinner("Wait for it... 🤔"):
110
  # Process the uploaded files to extract text and source information
111
+ textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
112
 
113
  # Separate the output into documents (text) and their corresponding sources
114
  documents, sources = textify_output
helper/utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
 
2
  from typing import Any, Dict, List, Tuple, Union
3
 
4
- from datetime import datetime
5
  import numpy as np
6
  import pandas as pd
7
  import PyPDF2
@@ -52,15 +52,24 @@ def current_year():
52
  # return [text_list, sources_list]
53
 
54
 
55
- def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
 
 
 
 
 
 
 
56
  """
57
- Reads PDF files and extracts text from each page, breaking the text into segments of about 50 words.
58
 
59
  This function iterates over a list of uploaded PDF files, extracts text from each page,
60
- and compiles a list of texts and corresponding source information, segmented into smaller parts.
 
61
 
62
  Args:
63
  files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
 
64
 
65
  Returns:
66
  Tuple[List[str], List[str]]: A tuple containing two lists:
@@ -79,16 +88,16 @@ def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
79
  pageObj = pdfReader.pages[i] # Get the page object
80
  text = pageObj.extract_text() # Extract text from the page
81
  if text:
82
- # Split text into approximately 50-word chunks
83
  words = text.split()
84
- for j in range(0, len(words), 50):
85
- chunk = ' '.join(words[j:j+50])
86
  text_list.append(chunk)
87
  # Create a source identifier for each chunk and add it to the list
88
- sources_list.append(f"{file.name}_page_{i}_chunk_{j//50}")
89
  else:
90
  # If no text extracted, still add a placeholder
91
- text_list.append('')
92
  sources_list.append(f"{file.name}_page_{i}_chunk_0")
93
  pageObj.clear() # Clear the page object (optional, for memory management)
94
 
 
1
  import os
2
+ from datetime import datetime
3
  from typing import Any, Dict, List, Tuple, Union
4
 
 
5
  import numpy as np
6
  import pandas as pd
7
  import PyPDF2
 
52
  # return [text_list, sources_list]
53
 
54
 
55
+ from typing import List, Tuple
56
+
57
+ import PyPDF2
58
+
59
+
60
+ def read_and_textify(
61
+ files: List[str], chunk_size: int = 50 # Default chunk size set to 50
62
+ ) -> Tuple[List[str], List[str]]:
63
  """
64
+ Reads PDF files and extracts text from each page, breaking the text into specified segments.
65
 
66
  This function iterates over a list of uploaded PDF files, extracts text from each page,
67
+ and compiles a list of texts and corresponding source information, segmented into smaller parts
68
+ of approximately 'chunk_size' words each.
69
 
70
  Args:
71
  files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
72
+ chunk_size (int): The number of words per text segment. Default is 50.
73
 
74
  Returns:
75
  Tuple[List[str], List[str]]: A tuple containing two lists:
 
88
  pageObj = pdfReader.pages[i] # Get the page object
89
  text = pageObj.extract_text() # Extract text from the page
90
  if text:
91
+ # Split text into chunks of approximately 'chunk_size' words
92
  words = text.split()
93
+ for j in range(0, len(words), chunk_size):
94
+ chunk = " ".join(words[j : j + chunk_size])
95
  text_list.append(chunk)
96
  # Create a source identifier for each chunk and add it to the list
97
+ sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
98
  else:
99
  # If no text extracted, still add a placeholder
100
+ text_list.append("")
101
  sources_list.append(f"{file.name}_page_{i}_chunk_0")
102
  pageObj.clear() # Clear the page object (optional, for memory management)
103