Spaces:
Sleeping
Sleeping
Allen Park
committed on
Commit
·
901a87e
1
Parent(s):
e504a30
fix(pdfplumber): replace the pdfplumber package and implementation with pymupdf
Browse files
- app.py +7 -9
- requirements.txt +1 -1
app.py
CHANGED
@@ -5,7 +5,7 @@ from typing import List, Tuple, Union
|
|
5 |
from pathlib import Path
|
6 |
import gradio as gr
|
7 |
import openai
|
8 |
-
import pdfplumber
|
9 |
|
10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
@@ -136,11 +136,11 @@ def model_call(question, document, answer, client_base_url):
|
|
136 |
def get_filetype(filename):
|
137 |
return filename.split(".")[-1]
|
138 |
|
139 |
-
def
|
140 |
-
with
|
141 |
text = ""
|
142 |
-
for page in
|
143 |
-
text += page.
|
144 |
return text
|
145 |
|
146 |
def upload_file(filepath):
|
@@ -151,10 +151,8 @@ def upload_file(filepath):
|
|
151 |
print("FILEPATH type & file name type", type(filepath), type(name))
|
152 |
filetype = get_filetype(name)
|
153 |
# conditionals for filetype and function call
|
154 |
-
if filetype == "pdf":
|
155 |
-
extracted_file_text =
|
156 |
-
elif filetype == "txt":
|
157 |
-
extracted_file_text = filepath.read().decode("utf-8")
|
158 |
elif filetype == "docx" or filetype == "doc":
|
159 |
extracted_file_text = filepath.read().decode("utf-8")
|
160 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
|
|
5 |
from pathlib import Path
|
6 |
import gradio as gr
|
7 |
import openai
|
8 |
+
import pymupdf
|
9 |
|
10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
|
136 |
def get_filetype(filename):
|
137 |
return filename.split(".")[-1]
|
138 |
|
139 |
+
def extract_text_pymupdf(file):
|
140 |
+
with pymupdf.open(file) as pdf_or_txt:
|
141 |
text = ""
|
142 |
+
for page in pdf_or_txt:
|
143 |
+
text += page.get_text()
|
144 |
return text
|
145 |
|
146 |
def upload_file(filepath):
|
|
|
151 |
print("FILEPATH type & file name type", type(filepath), type(name))
|
152 |
filetype = get_filetype(name)
|
153 |
# conditionals for filetype and function call
|
154 |
+
if filetype == "pdf" or filetype == "txt":
|
155 |
+
extracted_file_text = extract_text_pymupdf(filepath)
|
|
|
|
|
156 |
elif filetype == "docx" or filetype == "doc":
|
157 |
extracted_file_text = filepath.read().decode("utf-8")
|
158 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
openai
|
2 |
-
|
|
|
1 |
openai
|
2 |
+
PyMuPDF
|