Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,6 @@ print(f"Gradio version: {gr.__version__}")
|
|
7 |
|
8 |
from PyPDF2 import PdfReader
|
9 |
import fitz # pymupdf
|
10 |
-
from pdf2md.converter import PDF2Markdown
|
11 |
|
12 |
import logging
|
13 |
import webbrowser
|
@@ -202,17 +201,6 @@ class PDFProcessor:
|
|
202 |
logging.error(f"Error in txt conversion: {e}")
|
203 |
return f"Error: {str(e)}"
|
204 |
|
205 |
-
@staticmethod
|
206 |
-
def md_convert_with_pdf2md(pdf_path: str) -> str:
|
207 |
-
"""Convert PDF to Markdown using pdf2md"""
|
208 |
-
try:
|
209 |
-
converter = PDF2Markdown()
|
210 |
-
markdown_text = converter.convert(pdf_path)
|
211 |
-
return markdown_text
|
212 |
-
except Exception as e:
|
213 |
-
logging.error(f"Error in pdf2md conversion: {e}")
|
214 |
-
return f"Error: {str(e)}"
|
215 |
-
|
216 |
@staticmethod
|
217 |
def md_convert_with_pymupdf(pdf_path: str) -> str:
|
218 |
"""Convert PDF to Markdown using pymupdf"""
|
@@ -261,14 +249,13 @@ class PDFProcessor:
|
|
261 |
# Initialize model registry
|
262 |
model_registry = ModelRegistry()
|
263 |
|
264 |
-
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt"
|
265 |
"""
|
266 |
Extract and format text from PDF using different processors based on format.
|
267 |
|
268 |
Args:
|
269 |
pdf_path: Path to PDF file
|
270 |
format_type: Either 'txt' or 'md'
|
271 |
-
md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'
|
272 |
|
273 |
Returns:
|
274 |
Formatted text content
|
@@ -279,12 +266,7 @@ def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: st
|
|
279 |
if format_type == "txt":
|
280 |
return processor.txt_convert(pdf_path)
|
281 |
elif format_type == "md":
|
282 |
-
|
283 |
-
return processor.md_convert_with_pdf2md(pdf_path)
|
284 |
-
elif md_engine == "pymupdf":
|
285 |
-
return processor.md_convert_with_pymupdf(pdf_path)
|
286 |
-
else:
|
287 |
-
return f"Error: Unsupported markdown engine: {md_engine}"
|
288 |
else:
|
289 |
return f"Error: Unsupported format type: {format_type}"
|
290 |
except Exception as e:
|
@@ -629,7 +611,7 @@ with gr.Blocks(css="""
|
|
629 |
)
|
630 |
|
631 |
format_type = gr.Radio(
|
632 |
-
choices=["txt", "md
|
633 |
value="txt",
|
634 |
label="π Output Format"
|
635 |
)
|
@@ -871,15 +853,16 @@ with gr.Blocks(css="""
|
|
871 |
]
|
872 |
|
873 |
# PDF Processing Handlers
|
874 |
-
def handle_pdf_process(pdf, fmt,
|
875 |
if not pdf:
|
876 |
return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
|
877 |
|
878 |
try:
|
879 |
-
text = extract_text_from_pdf(pdf.name, format_type=fmt
|
880 |
if text.startswith("Error"):
|
881 |
return text, "", "", [], gr.update(choices=[], value=None), None
|
882 |
|
|
|
883 |
snippets_list = split_into_snippets(text, ctx_size)
|
884 |
snippet_choices = update_snippet_choices(snippets_list)
|
885 |
|
|
|
7 |
|
8 |
from PyPDF2 import PdfReader
|
9 |
import fitz # pymupdf
|
|
|
10 |
|
11 |
import logging
|
12 |
import webbrowser
|
|
|
201 |
logging.error(f"Error in txt conversion: {e}")
|
202 |
return f"Error: {str(e)}"
|
203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
@staticmethod
|
205 |
def md_convert_with_pymupdf(pdf_path: str) -> str:
|
206 |
"""Convert PDF to Markdown using pymupdf"""
|
|
|
249 |
# Initialize model registry
|
250 |
model_registry = ModelRegistry()
|
251 |
|
252 |
+
def extract_text_from_pdf(pdf_path: str, format_type: str = "txt") -> str:
|
253 |
"""
|
254 |
Extract and format text from PDF using different processors based on format.
|
255 |
|
256 |
Args:
|
257 |
pdf_path: Path to PDF file
|
258 |
format_type: Either 'txt' or 'md'
|
|
|
259 |
|
260 |
Returns:
|
261 |
Formatted text content
|
|
|
266 |
if format_type == "txt":
|
267 |
return processor.txt_convert(pdf_path)
|
268 |
elif format_type == "md":
|
269 |
+
return processor.md_convert_with_pymupdf(pdf_path)
|
|
|
|
|
|
|
|
|
|
|
270 |
else:
|
271 |
return f"Error: Unsupported format type: {format_type}"
|
272 |
except Exception as e:
|
|
|
611 |
)
|
612 |
|
613 |
format_type = gr.Radio(
|
614 |
+
choices=["txt", "md"],
|
615 |
value="txt",
|
616 |
label="π Output Format"
|
617 |
)
|
|
|
853 |
]
|
854 |
|
855 |
# PDF Processing Handlers
|
856 |
+
def handle_pdf_process(pdf, fmt, ctx_size): # Remove md_eng parameter
|
857 |
if not pdf:
|
858 |
return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
|
859 |
|
860 |
try:
|
861 |
+
text = extract_text_from_pdf(pdf.name, format_type=fmt) # Just use format_type
|
862 |
if text.startswith("Error"):
|
863 |
return text, "", "", [], gr.update(choices=[], value=None), None
|
864 |
|
865 |
+
# The important part: still do snippets processing
|
866 |
snippets_list = split_into_snippets(text, ctx_size)
|
867 |
snippet_choices = update_snippet_choices(snippets_list)
|
868 |
|