cstr committed on
Commit
39a451a
·
verified ·
1 Parent(s): b4d1472

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -23
app.py CHANGED
@@ -7,7 +7,6 @@ print(f"Gradio version: {gr.__version__}")
7
 
8
  from PyPDF2 import PdfReader
9
  import fitz # pymupdf
10
- from pdf2md.converter import PDF2Markdown
11
 
12
  import logging
13
  import webbrowser
@@ -202,17 +201,6 @@ class PDFProcessor:
202
  logging.error(f"Error in txt conversion: {e}")
203
  return f"Error: {str(e)}"
204
 
205
- @staticmethod
206
- def md_convert_with_pdf2md(pdf_path: str) -> str:
207
- """Convert PDF to Markdown using pdf2md"""
208
- try:
209
- converter = PDF2Markdown()
210
- markdown_text = converter.convert(pdf_path)
211
- return markdown_text
212
- except Exception as e:
213
- logging.error(f"Error in pdf2md conversion: {e}")
214
- return f"Error: {str(e)}"
215
-
216
  @staticmethod
217
  def md_convert_with_pymupdf(pdf_path: str) -> str:
218
  """Convert PDF to Markdown using pymupdf"""
@@ -261,14 +249,13 @@ class PDFProcessor:
261
  # Initialize model registry
262
  model_registry = ModelRegistry()
263
 
264
- def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: str = "pdf2md") -> str:
265
  """
266
  Extract and format text from PDF using different processors based on format.
267
 
268
  Args:
269
  pdf_path: Path to PDF file
270
  format_type: Either 'txt' or 'md'
271
- md_engine: When format_type is 'md', either 'pdf2md' or 'pymupdf'
272
 
273
  Returns:
274
  Formatted text content
@@ -279,12 +266,7 @@ def extract_text_from_pdf(pdf_path: str, format_type: str = "txt", md_engine: st
279
  if format_type == "txt":
280
  return processor.txt_convert(pdf_path)
281
  elif format_type == "md":
282
- if md_engine == "pdf2md":
283
- return processor.md_convert_with_pdf2md(pdf_path)
284
- elif md_engine == "pymupdf":
285
- return processor.md_convert_with_pymupdf(pdf_path)
286
- else:
287
- return f"Error: Unsupported markdown engine: {md_engine}"
288
  else:
289
  return f"Error: Unsupported format type: {format_type}"
290
  except Exception as e:
@@ -629,7 +611,7 @@ with gr.Blocks(css="""
629
  )
630
 
631
  format_type = gr.Radio(
632
- choices=["txt", "md (pdf2md)", "md (pymupdf)"],
633
  value="txt",
634
  label="📝 Output Format"
635
  )
@@ -871,15 +853,16 @@ with gr.Blocks(css="""
871
  ]
872
 
873
  # PDF Processing Handlers
874
- def handle_pdf_process(pdf, fmt, md_eng, ctx_size):
875
  if not pdf:
876
  return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
877
 
878
  try:
879
- text = extract_text_from_pdf(pdf.name, format_type=fmt, md_engine=md_eng)
880
  if text.startswith("Error"):
881
  return text, "", "", [], gr.update(choices=[], value=None), None
882
 
 
883
  snippets_list = split_into_snippets(text, ctx_size)
884
  snippet_choices = update_snippet_choices(snippets_list)
885
 
 
7
 
8
  from PyPDF2 import PdfReader
9
  import fitz # pymupdf
 
10
 
11
  import logging
12
  import webbrowser
 
201
  logging.error(f"Error in txt conversion: {e}")
202
  return f"Error: {str(e)}"
203
 
 
 
 
 
 
 
 
 
 
 
 
204
  @staticmethod
205
  def md_convert_with_pymupdf(pdf_path: str) -> str:
206
  """Convert PDF to Markdown using pymupdf"""
 
249
  # Initialize model registry
250
  model_registry = ModelRegistry()
251
 
252
+ def extract_text_from_pdf(pdf_path: str, format_type: str = "txt") -> str:
253
  """
254
  Extract and format text from PDF using different processors based on format.
255
 
256
  Args:
257
  pdf_path: Path to PDF file
258
  format_type: Either 'txt' or 'md'
 
259
 
260
  Returns:
261
  Formatted text content
 
266
  if format_type == "txt":
267
  return processor.txt_convert(pdf_path)
268
  elif format_type == "md":
269
+ return processor.md_convert_with_pymupdf(pdf_path)
 
 
 
 
 
270
  else:
271
  return f"Error: Unsupported format type: {format_type}"
272
  except Exception as e:
 
611
  )
612
 
613
  format_type = gr.Radio(
614
+ choices=["txt", "md"],
615
  value="txt",
616
  label="📝 Output Format"
617
  )
 
853
  ]
854
 
855
  # PDF Processing Handlers
856
+ def handle_pdf_process(pdf, fmt, ctx_size): # Remove md_eng parameter
857
  if not pdf:
858
  return "Please upload a PDF file.", "", "", [], gr.update(choices=[], value=None), None
859
 
860
  try:
861
+ text = extract_text_from_pdf(pdf.name, format_type=fmt) # Just use format_type
862
  if text.startswith("Error"):
863
  return text, "", "", [], gr.update(choices=[], value=None), None
864
 
865
+ # The important part: still do snippets processing
866
  snippets_list = split_into_snippets(text, ctx_size)
867
  snippet_choices = update_snippet_choices(snippets_list)
868