Spaces:
Runtime error
Runtime error
Kushwanth Chowday Kandala
committed on
Commit
•
f06193e
1
Parent(s):
8a3a5d7
Add combine_text functionality in preparation for chunking the data to fit the model limits
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ import pandas as pd
|
|
6 |
from io import StringIO
|
7 |
import PyPDF2
|
8 |
from tqdm import tqdm
|
|
|
9 |
# import json
|
10 |
|
11 |
# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
|
@@ -142,6 +143,16 @@ def print_out(pages):
|
|
142 |
text = pages[i].extract_text().strip()
|
143 |
st.write(f"Page {i} : {text}")
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
with st.sidebar:
|
146 |
st.markdown("""
|
147 |
***Follow this steps***
|
@@ -170,4 +181,4 @@ with st.sidebar:
|
|
170 |
reader = PyPDF2.PdfReader(uploaded_file)
|
171 |
pages = reader.pages
|
172 |
print_out(pages)
|
173 |
-
|
|
|
6 |
from io import StringIO
|
7 |
import PyPDF2
|
8 |
from tqdm import tqdm
|
9 |
+
import math
|
10 |
# import json
|
11 |
|
12 |
# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
|
|
|
143 |
text = pages[i].extract_text().strip()
|
144 |
st.write(f"Page {i} : {text}")
|
145 |
|
146 |
+
def combine_text(pages):
    """Concatenate the text of every page and report its length and size.

    Each page's extracted text is stripped of leading/trailing whitespace and
    the results are joined with no separator. A summary line with the
    character count and the UTF-8 encoded size in megabytes is written to the
    Streamlit app.

    Args:
        pages: Iterable of page objects exposing ``extract_text()`` that
            returns a string (the caller passes ``PyPDF2.PdfReader.pages``).

    Returns:
        None. The summary is emitted through ``st.write``.
    """
    # Join once instead of growing a string with += on every page; tqdm still
    # reports progress as the generator is consumed.
    concatenates_text = "".join(
        page.extract_text().strip() for page in tqdm(pages)
    )

    # The size is the number of UTF-8 encoded bytes of the *combined* text.
    # len() is required here: a bytes object itself cannot be divided by a
    # number (bytes / float raises TypeError).
    bytesize = len(concatenates_text.encode("utf-8"))
    p = math.pow(1024, 2)  # bytes per megabyte
    mbsize = round(bytesize / p, 2)

    st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
|
155 |
+
|
156 |
with st.sidebar:
|
157 |
st.markdown("""
|
158 |
***Follow this steps***
|
|
|
181 |
reader = PyPDF2.PdfReader(uploaded_file)
|
182 |
pages = reader.pages
|
183 |
print_out(pages)
|
184 |
+
combine_text(pages)
|