Chunk size added as an input argument
Browse files
- app.py +6 -1
- helper/utils.py +18 -9
app.py
CHANGED
@@ -61,6 +61,11 @@ with st.sidebar:
|
|
61 |
# Inform the user how many documents have been loaded
|
62 |
st.success(f"{len(uploaded_files)} document(s) loaded...")
|
63 |
|
|
|
|
|
|
|
|
|
|
|
64 |
# Input filter
|
65 |
top_n = st.number_input(
|
66 |
"Insert a number (top n rows to be selected):", value=5, step=1
|
@@ -103,7 +108,7 @@ if uploaded_files is None:
|
|
103 |
elif uploaded_files:
|
104 |
with st.spinner("Wait for it... 🤔"):
|
105 |
# Process the uploaded files to extract text and source information
|
106 |
-
textify_output = read_and_textify(uploaded_files)
|
107 |
|
108 |
# Separate the output into documents (text) and their corresponding sources
|
109 |
documents, sources = textify_output
|
|
|
61 |
# Inform the user how many documents have been loaded
|
62 |
st.success(f"{len(uploaded_files)} document(s) loaded...")
|
63 |
|
64 |
+
# Chunk size
|
65 |
+
chunk_size_input = st.number_input(
|
66 |
+
"Insert an integer (for size of chunks):", value=10, step=1
|
67 |
+
)
|
68 |
+
|
69 |
# Input filter
|
70 |
top_n = st.number_input(
|
71 |
"Insert a number (top n rows to be selected):", value=5, step=1
|
|
|
108 |
elif uploaded_files:
|
109 |
with st.spinner("Wait for it... 🤔"):
|
110 |
# Process the uploaded files to extract text and source information
|
111 |
+
textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
|
112 |
|
113 |
# Separate the output into documents (text) and their corresponding sources
|
114 |
documents, sources = textify_output
|
helper/utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import os
|
|
|
2 |
from typing import Any, Dict, List, Tuple, Union
|
3 |
|
4 |
-
from datetime import datetime
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
7 |
import PyPDF2
|
@@ -52,15 +52,24 @@ def current_year():
|
|
52 |
# return [text_list, sources_list]
|
53 |
|
54 |
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
"""
|
57 |
-
Reads PDF files and extracts text from each page, breaking the text into segments
|
58 |
|
59 |
This function iterates over a list of uploaded PDF files, extracts text from each page,
|
60 |
-
and compiles a list of texts and corresponding source information, segmented into smaller parts
|
|
|
61 |
|
62 |
Args:
|
63 |
files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
|
|
|
64 |
|
65 |
Returns:
|
66 |
Tuple[List[str], List[str]]: A tuple containing two lists:
|
@@ -79,16 +88,16 @@ def read_and_textify(files: List[str]) -> Tuple[List[str], List[str]]:
|
|
79 |
pageObj = pdfReader.pages[i] # Get the page object
|
80 |
text = pageObj.extract_text() # Extract text from the page
|
81 |
if text:
|
82 |
-
# Split text into approximately
|
83 |
words = text.split()
|
84 |
-
for j in range(0, len(words),
|
85 |
-
chunk =
|
86 |
text_list.append(chunk)
|
87 |
# Create a source identifier for each chunk and add it to the list
|
88 |
-
sources_list.append(f"{file.name}_page_{i}_chunk_{j//
|
89 |
else:
|
90 |
# If no text extracted, still add a placeholder
|
91 |
-
text_list.append(
|
92 |
sources_list.append(f"{file.name}_page_{i}_chunk_0")
|
93 |
pageObj.clear() # Clear the page object (optional, for memory management)
|
94 |
|
|
|
1 |
import os
|
2 |
+
from datetime import datetime
|
3 |
from typing import Any, Dict, List, Tuple, Union
|
4 |
|
|
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
7 |
import PyPDF2
|
|
|
52 |
# return [text_list, sources_list]
|
53 |
|
54 |
|
55 |
+
from typing import List, Tuple
|
56 |
+
|
57 |
+
import PyPDF2
|
58 |
+
|
59 |
+
|
60 |
+
def read_and_textify(
|
61 |
+
files: List[str], chunk_size: int = 50 # Default chunk size set to 50
|
62 |
+
) -> Tuple[List[str], List[str]]:
|
63 |
"""
|
64 |
+
Reads PDF files and extracts text from each page, breaking the text into specified segments.
|
65 |
|
66 |
This function iterates over a list of uploaded PDF files, extracts text from each page,
|
67 |
+
and compiles a list of texts and corresponding source information, segmented into smaller parts
|
68 |
+
of approximately 'chunk_size' words each.
|
69 |
|
70 |
Args:
|
71 |
files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
|
72 |
+
chunk_size (int): The number of words per text segment. Default is 50.
|
73 |
|
74 |
Returns:
|
75 |
Tuple[List[str], List[str]]: A tuple containing two lists:
|
|
|
88 |
pageObj = pdfReader.pages[i] # Get the page object
|
89 |
text = pageObj.extract_text() # Extract text from the page
|
90 |
if text:
|
91 |
+
# Split text into chunks of approximately 'chunk_size' words
|
92 |
words = text.split()
|
93 |
+
for j in range(0, len(words), chunk_size):
|
94 |
+
chunk = " ".join(words[j : j + chunk_size])
|
95 |
text_list.append(chunk)
|
96 |
# Create a source identifier for each chunk and add it to the list
|
97 |
+
sources_list.append(f"{file.name}_page_{i}_chunk_{j // chunk_size}")
|
98 |
else:
|
99 |
# If no text extracted, still add a placeholder
|
100 |
+
text_list.append("")
|
101 |
sources_list.append(f"{file.name}_page_{i}_chunk_0")
|
102 |
pageObj.clear() # Clear the page object (optional, for memory management)
|
103 |
|