pszemraj committed on
Commit
5f2c216
1 Parent(s): 9350787

✨ add ability to load PDF

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. app.py +54 -14
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import logging
2
  import time
3
  from pathlib import Path
@@ -5,6 +6,9 @@ from pathlib import Path
5
  import gradio as gr
6
  import nltk
7
  from cleantext import clean
 
 
 
8
 
9
  from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
10
  from utils import load_example_filenames, truncate_word_count
@@ -101,6 +105,7 @@ def proc_submission(
101
 
102
  def load_single_example_text(
103
  example_path: str or Path,
 
104
  ):
105
  """
106
  load_single_example - a helper function for the gradio module to load examples
@@ -110,14 +115,26 @@ def load_single_example_text(
110
  global name_to_path
111
  full_ex_path = name_to_path[example_path]
112
  full_ex_path = Path(full_ex_path)
113
- # load the examples into a list
114
- with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
115
- raw_text = f.read()
116
  text = clean(raw_text, lower=False)
 
 
 
 
 
 
 
 
 
 
 
 
117
  return text
118
 
119
 
120
- def load_uploaded_file(file_obj):
121
  """
122
  load_uploaded_file - process an uploaded file
123
 
@@ -135,29 +152,52 @@ def load_uploaded_file(file_obj):
135
  file_obj = file_obj[0]
136
  file_path = Path(file_obj.name)
137
  try:
138
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
139
- raw_text = f.read()
140
- text = clean(raw_text, lower=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  return text
142
  except Exception as e:
143
  logging.info(f"Trying to load file with path {file_path}, error: {e}")
144
- return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8."
145
 
146
 
147
  if __name__ == "__main__":
148
-
149
- model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
150
- model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
151
-
 
 
 
 
 
 
 
 
 
 
152
  name_to_path = load_example_filenames(_here / "examples")
153
  logging.info(f"Loaded {len(name_to_path)} examples")
154
  demo = gr.Blocks()
155
 
156
  with demo:
157
 
158
- gr.Markdown("# Long-Form Summarization: LED & BookSum")
159
  gr.Markdown(
160
- "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
161
  )
162
  with gr.Column():
163
 
 
1
+ import contextlib
2
  import logging
3
  import time
4
  from pathlib import Path
 
6
  import gradio as gr
7
  import nltk
8
  from cleantext import clean
9
+ from doctr.io import DocumentFile
10
+ from doctr.models import ocr_predictor
11
+ from pdf2text import convert_PDF_to_Text
12
 
13
  from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
14
  from utils import load_example_filenames, truncate_word_count
 
105
 
106
  def load_single_example_text(
107
  example_path: str or Path,
108
+ max_pages=20,
109
  ):
110
  """
111
  load_single_example - a helper function for the gradio module to load examples
 
115
  global name_to_path
116
  full_ex_path = name_to_path[example_path]
117
  full_ex_path = Path(full_ex_path)
118
+ if full_ex_path.suffix == ".txt":
119
+ with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
120
+ raw_text = f.read()
121
  text = clean(raw_text, lower=False)
122
+ elif full_ex_path.suffix == ".pdf":
123
+ logging.info(f"Loading PDF file {full_ex_path}")
124
+ conversion_stats = convert_PDF_to_Text(
125
+ full_ex_path,
126
+ ocr_model=ocr_model,
127
+ max_pages=max_pages,
128
+ )
129
+ text = conversion_stats["converted_text"]
130
+ else:
131
+ logging.error(f"Unknown file type {full_ex_path.suffix}")
132
+ text = "ERROR - check example path"
133
+
134
  return text
135
 
136
 
137
+ def load_uploaded_file(file_obj, max_pages=20):
138
  """
139
  load_uploaded_file - process an uploaded file
140
 
 
152
  file_obj = file_obj[0]
153
  file_path = Path(file_obj.name)
154
  try:
155
+ if file_path.suffix == ".txt":
156
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
157
+ raw_text = f.read()
158
+ text = clean(raw_text, lower=False)
159
+ elif file_path.suffix == ".pdf":
160
+ logging.info(f"Loading PDF file {file_path}")
161
+ conversion_stats = convert_PDF_to_Text(
162
+ file_path,
163
+ ocr_model=ocr_model,
164
+ max_pages=max_pages,
165
+ )
166
+ text = conversion_stats["converted_text"]
167
+ else:
168
+ logging.error(f"Unknown file type {file_path.suffix}")
169
+ text = "ERROR - check example path"
170
+
171
  return text
172
  except Exception as e:
173
  logging.info(f"Trying to load file with path {file_path}, error: {e}")
174
+ return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
175
 
176
 
177
  if __name__ == "__main__":
178
+ logging.info("Starting app instance")
179
+
180
+ logging.info("Loading summ models")
181
+ model, tokenizer = load_model_and_tokenizer("pszemraj/pegasus-x-large-book-summary")
182
+ model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/long-t5-tglobal-base-16384-book-summary")
183
+
184
+ logging.info("Loading OCR model")
185
+ with contextlib.redirect_stdout(None):
186
+ ocr_model = ocr_predictor(
187
+ "db_resnet50",
188
+ "crnn_mobilenet_v3_large",
189
+ pretrained=True,
190
+ assume_straight_pages=True,
191
+ )
192
  name_to_path = load_example_filenames(_here / "examples")
193
  logging.info(f"Loaded {len(name_to_path)} examples")
194
  demo = gr.Blocks()
195
 
196
  with demo:
197
 
198
+ gr.Markdown("# Document Summarization with Long-Document Transformers")
199
  gr.Markdown(
200
+ "TODO: Add a description of the model and how it works, and a link to the paper"
201
  )
202
  with gr.Column():
203