pszemraj committed on
Commit
bd3ba15
•
1 Parent(s): e9ed1f2

🔊 update logs

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. app.py +14 -10
app.py CHANGED
@@ -19,7 +19,8 @@ os.environ[
19
  ] = "false" # parallelism on tokenizers is buggy with gradio
20
 
21
  logging.basicConfig(
22
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 
23
  )
24
 
25
  import gradio as gr
@@ -232,18 +233,20 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
232
  :param bool lower: whether to lowercase the text
233
  :return str: the text of the file
234
  """
 
 
235
  # check if mysterious file object is a list
236
  if isinstance(file_obj, list):
237
  file_obj = file_obj[0]
238
  file_path = Path(file_obj.name)
239
  try:
240
- logging.info(f"Loading file:\t{file_path}")
241
  if file_path.suffix == ".txt":
242
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
243
  raw_text = f.read()
244
  text = clean(raw_text, lower=lower)
245
  elif file_path.suffix == ".pdf":
246
- logging.info(f"loading as PDF file {file_path}")
247
  conversion_stats = convert_PDF_to_Text(
248
  file_path,
249
  ocr_model=ocr_model,
@@ -251,18 +254,19 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
251
  )
252
  text = conversion_stats["converted_text"]
253
  else:
254
- logging.error(f"Unknown file type {file_path.suffix}")
255
  text = "ERROR - check file - unknown file type"
256
 
257
  return text
258
  except Exception as e:
259
- logging.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
260
  return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
261
 
262
 
263
  if __name__ == "__main__":
264
- logging.info("Starting app instance")
265
- logging.info("Loading OCR model")
 
266
  with contextlib.redirect_stdout(None):
267
  ocr_model = ocr_predictor(
268
  "db_resnet50",
@@ -271,7 +275,7 @@ if __name__ == "__main__":
271
  assume_straight_pages=True,
272
  )
273
  name_to_path = load_example_filenames(_here / "examples")
274
- logging.info(f"Loaded {len(name_to_path)} examples")
275
  demo = gr.Blocks()
276
  _examples = list(name_to_path.keys())
277
  with demo:
@@ -355,7 +359,7 @@ if __name__ == "__main__":
355
  minimum=0.5,
356
  maximum=1.0,
357
  label="length penalty",
358
- default=0.7,
359
  step=0.05,
360
  )
361
  token_batch_length = gr.Radio(
@@ -369,7 +373,7 @@ if __name__ == "__main__":
369
  minimum=1.0,
370
  maximum=5.0,
371
  label="repetition penalty",
372
- default=3.5,
373
  step=0.1,
374
  )
375
  no_repeat_ngram_size = gr.Radio(
 
19
  ] = "false" # parallelism on tokenizers is buggy with gradio
20
 
21
  logging.basicConfig(
22
+ level=logging.INFO,
23
+ format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
24
  )
25
 
26
  import gradio as gr
 
233
  :param bool lower: whether to lowercase the text
234
  :return str: the text of the file
235
  """
236
+
237
+ logger = logging.getLogger(__name__)
238
  # check if mysterious file object is a list
239
  if isinstance(file_obj, list):
240
  file_obj = file_obj[0]
241
  file_path = Path(file_obj.name)
242
  try:
243
+ logger.info(f"Loading file:\t{file_path}")
244
  if file_path.suffix == ".txt":
245
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
246
  raw_text = f.read()
247
  text = clean(raw_text, lower=lower)
248
  elif file_path.suffix == ".pdf":
249
+ logger.info(f"loading as PDF file {file_path}")
250
  conversion_stats = convert_PDF_to_Text(
251
  file_path,
252
  ocr_model=ocr_model,
 
254
  )
255
  text = conversion_stats["converted_text"]
256
  else:
257
+ logger.error(f"Unknown file type {file_path.suffix}")
258
  text = "ERROR - check file - unknown file type"
259
 
260
  return text
261
  except Exception as e:
262
+ logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
263
  return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
264
 
265
 
266
  if __name__ == "__main__":
267
+ logger = logging.getLogger(__name__)
268
+ logger.info("Starting app instance")
269
+ logger.info("Loading OCR model")
270
  with contextlib.redirect_stdout(None):
271
  ocr_model = ocr_predictor(
272
  "db_resnet50",
 
275
  assume_straight_pages=True,
276
  )
277
  name_to_path = load_example_filenames(_here / "examples")
278
+ logger.info(f"Loaded {len(name_to_path)} examples")
279
  demo = gr.Blocks()
280
  _examples = list(name_to_path.keys())
281
  with demo:
 
359
  minimum=0.5,
360
  maximum=1.0,
361
  label="length penalty",
362
+ value=0.7,
363
  step=0.05,
364
  )
365
  token_batch_length = gr.Radio(
 
373
  minimum=1.0,
374
  maximum=5.0,
375
  label="repetition penalty",
376
+ value=1.5,
377
  step=0.1,
378
  )
379
  no_repeat_ngram_size = gr.Radio(