Allen Park committed on
Commit
901a87e
·
1 Parent(s): e504a30

fix(pdfplumber): replace the pdfplumber package and implementation with pymupdf

Browse files
Files changed (2) hide show
  1. app.py +7 -9
  2. requirements.txt +1 -1
app.py CHANGED
@@ -5,7 +5,7 @@ from typing import List, Tuple, Union
5
  from pathlib import Path
6
  import gradio as gr
7
  import openai
8
- import pdfplumber
9
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
  LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
@@ -136,11 +136,11 @@ def model_call(question, document, answer, client_base_url):
136
  def get_filetype(filename):
137
  return filename.split(".")[-1]
138
 
139
- def extract_text_pdfplumber(file):
140
- with pdfplumber.open(io.BytesIO(file.read())) as pdf:
141
  text = ""
142
- for page in pdf.pages:
143
- text += page.extract_text()
144
  return text
145
 
146
  def upload_file(filepath):
@@ -151,10 +151,8 @@ def upload_file(filepath):
151
  print("FILEPATH type & file name type", type(filepath), type(name))
152
  filetype = get_filetype(name)
153
  # conditionals for filetype and function call
154
- if filetype == "pdf":
155
- extracted_file_text = extract_text_pdfplumber(filepath)
156
- elif filetype == "txt":
157
- extracted_file_text = filepath.read().decode("utf-8")
158
  elif filetype == "docx" or filetype == "doc":
159
  extracted_file_text = filepath.read().decode("utf-8")
160
  return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
 
5
  from pathlib import Path
6
  import gradio as gr
7
  import openai
8
+ import pymupdf
9
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
  LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
 
136
  def get_filetype(filename):
137
  return filename.split(".")[-1]
138
 
139
+ def extract_text_pymupdf(file):
140
+ with pymupdf.open(file) as pdf_or_txt:
141
  text = ""
142
+ for page in pdf_or_txt:
143
+ text += page.get_text()
144
  return text
145
 
146
  def upload_file(filepath):
 
151
  print("FILEPATH type & file name type", type(filepath), type(name))
152
  filetype = get_filetype(name)
153
  # conditionals for filetype and function call
154
+ if filetype == "pdf" or filetype == "txt":
155
+ extracted_file_text = extract_text_pymupdf(filepath)
 
 
156
  elif filetype == "docx" or filetype == "doc":
157
  extracted_file_text = filepath.read().decode("utf-8")
158
  return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
  openai
2
- pdfplumber
 
1
  openai
2
+ PyMuPDF