Allen Park committed on
Commit
e504a30
·
1 Parent(s): 1230f78

feat(pdf text extraction): extract all the text from the uploaded pdf file

Browse files

* feat: add simple filetype extraction helper function
* feat: add pdfplumber text extraction from pdf helper function
* chore: add conditionals to ensure filetypes are only pdf, txt, doc, or docx
---------
Co-authored-by: Allen Park <parknella19@gmail.com>

Files changed (2) hide show
  1. app.py +26 -7
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import os
2
  import re
 
3
  from typing import List, Tuple, Union
4
  from pathlib import Path
5
  import gradio as gr
6
  import openai
7
-
8
-
9
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
  LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
@@ -133,17 +133,36 @@ def model_call(question, document, answer, client_base_url):
133
  combined_reasoning = " ".join(reasoning)[1:-1]
134
  return combined_reasoning, score
135
 
 
 
 
 
 
 
 
 
 
 
136
  def upload_file(filepath):
 
137
  if filepath is not None:
138
  name = Path(filepath).name
139
  print("FILEPATH & file name", filepath, name)
140
  print("FILEPATH type & file name type", type(filepath), type(name))
141
- return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name))]
 
 
 
 
 
 
 
 
142
  else:
143
- return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown("")]
144
  # return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
145
  def reset_buttons():
146
- return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown("")]
147
 
148
  # def download_file():
149
  # return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
@@ -185,8 +204,8 @@ with gr.Blocks(css=css) as demo:
185
  score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
186
 
187
  model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
188
- u.upload(upload_file, u, [u, file_group, file_name])
189
- c.click(reset_buttons, None, [u, file_group, file_name])
190
  # d.click(download_file, None, [u, d])
191
 
192
  submit_button.click(fn=model_call, inputs=[question, document, answer, base_url_state], outputs=[reasoning, score])
 
1
  import os
2
  import re
3
+ import io
4
  from typing import List, Tuple, Union
5
  from pathlib import Path
6
  import gradio as gr
7
  import openai
8
+ import pdfplumber
 
9
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
  LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
 
133
  combined_reasoning = " ".join(reasoning)[1:-1]
134
  return combined_reasoning, score
135
 
136
def get_filetype(filename):
    """Return the lower-cased extension of *filename* without the leading dot.

    Args:
        filename: A file name or path, e.g. ``"report.PDF"``.

    Returns:
        The final extension in lower case (``"pdf"``), or ``""`` when the
        name has no extension.
    """
    # splitext only looks at the last path component and yields "" when there
    # is no extension, so "README" no longer comes back as its own "type".
    # Lower-casing lets "REPORT.PDF" match the same branch as "report.pdf".
    return os.path.splitext(filename)[1].lstrip(".").lower()
138
+
139
def extract_text_pdfplumber(file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        file: Either a binary file-like object (anything with ``read()``) or
            a filesystem path (``str`` / ``os.PathLike``) to the PDF.

    Returns:
        The text of all pages joined with newlines. Pages for which
        pdfplumber reports no text contribute an empty string.
    """
    # File-like objects are buffered into memory as before; paths are handed
    # to pdfplumber directly, so callers holding only a path no longer fail
    # on a missing ``read`` attribute.
    source = io.BytesIO(file.read()) if hasattr(file, "read") else file
    with pdfplumber.open(source) as pdf:
        # ``extract_text()`` may yield None for a page without text; the
        # newline separator keeps the last word of one page from fusing with
        # the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
145
+
146
  def upload_file(filepath):
147
+ extracted_file_text = ""
148
  if filepath is not None:
149
  name = Path(filepath).name
150
  print("FILEPATH & file name", filepath, name)
151
  print("FILEPATH type & file name type", type(filepath), type(name))
152
+ filetype = get_filetype(name)
153
+ # conditionals for filetype and function call
154
+ if filetype == "pdf":
155
+ extracted_file_text = extract_text_pdfplumber(filepath)
156
+ elif filetype == "txt":
157
+ extracted_file_text = filepath.read().decode("utf-8")
158
+ elif filetype == "docx" or filetype == "doc":
159
+ extracted_file_text = filepath.read().decode("utf-8")
160
+ return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
161
  else:
162
+ return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
163
  # return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
164
  def reset_buttons():
165
+ return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), gr.Textbox(value="")]
166
 
167
  # def download_file():
168
  # return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
 
204
  score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
205
 
206
  model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
207
+ u.upload(upload_file, u, [u, file_group, file_name, document])
208
+ c.click(reset_buttons, None, [u, file_group, file_name, document])
209
  # d.click(download_file, None, [u, d])
210
 
211
  submit_button.click(fn=model_call, inputs=[question, document, answer, base_url_state], outputs=[reasoning, score])
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  openai
 
 
1
  openai
2
+ pdfplumber