# Source: Hugging Face Spaces file view (Space status: "Runtime error").
# File size: 7,418 Bytes
# Revisions: b00bf21 dcb13ea b00bf21
# (The file viewer's line-number gutter, 1-189, was page residue and is omitted.)
import os
import subprocess
import sys


def _pip_install(*pip_args):
    """Run ``pip install`` with *pip_args* for the interpreter running this app.

    Using ``sys.executable -m pip`` with an argument list (no shell) makes sure
    the packages land in the environment that will import them.

    Returns:
        The pip exit status. A non-zero status is reported on stderr but not
        raised, so start-up stays best-effort as it was with ``os.system``.
    """
    command = [sys.executable, "-m", "pip", "install", *pip_args]
    status = subprocess.run(command, check=False).returncode
    if status != 0:
        print(f"WARNING: {' '.join(command)} exited with status {status}", file=sys.stderr)
    return status


# Workaround: pin an older PyTorch because detectron2 did not ship pre-built
# packages for newer releases
# (issue: https://github.com/facebookresearch/detectron2/issues/3158).
# Currently pinned: torch 1.10.0 / torchvision 0.11 built for CUDA 11.1.
# Previous pin, kept for reference:
#   torch==1.8.0+cu101 torchvision==0.9.0+cu101
_pip_install(
    "-q",
    "torch==1.10.0+cu111",
    "torchvision==0.11+cu111",
    "-f",
    "https://download.pytorch.org/whl/torch_stable.html",
)

# Install detectron2 from source so it is built against the torch installed above.
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions.
# Previous approach (pre-built wheel index for torch 1.8 / cu101), kept for reference:
#   https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
_pip_install("git+https://github.com/facebookresearch/detectron2.git")

import detectron2
from detectron2.utils.logger import setup_logger

setup_logger()
import gradio as gr
import re
import string
from operator import itemgetter
import collections
import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError
import pdf2image
from pdf2image import convert_from_path
import langdetect
from langdetect import detect_langs
import pandas as pd
import numpy as np
import random
import tempfile
import itertools
from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2
## files
import sys
sys.path.insert(0, 'files/')
import functions
from functions import *
# update pip
os.system('python -m pip install --upgrade pip')

## model / feature extractor / tokenizer
import torch
from transformers import (
    AutoTokenizer,
    LayoutLMv2FeatureExtractor,
    LayoutLMv2ForTokenClassification,
)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Token-classification model, moved to the selected device.
model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
model = LayoutLMv2ForTokenClassification.from_pretrained(model_id)
model.to(device)

# Feature extractor with its built-in OCR switched off.
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

# Tokenizer.
tokenizer_id = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Label mappings come from the model configuration.
model_config = model.config
id2label, label2id = model_config.id2label, model_config.label2id
num_labels = len(id2label)
# APP outputs
def app_outputs(uploaded_pdf):
    """Label the pages of a PDF at line level and build the Gradio outputs.

    The PDF is converted to page images, the model predictions are computed,
    and for each displayed page a labeled image file, the image itself, a CSV
    file and a DataFrame are produced. Exactly ``max_imgboxes`` pages are
    returned: missing pages are padded with the blank image and empty
    DataFrames, extra pages are dropped.

    Args:
        uploaded_pdf: value of the "PDF" file input, passed unchanged to
            ``pdf_to_images``.

    Returns:
        A flat tuple in the order expected by ``outputboxes``: the status
        message, then ``max_imgboxes`` image file paths, ``max_imgboxes``
        images, ``max_imgboxes`` CSV file updates and ``max_imgboxes``
        DataFrames.
    """
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)

    if not msg.startswith("Error with the PDF"):
        # Extraction of image data (text and bounding boxes)
        dataset, _lines, _row_indexes, _par_boxes, _line_boxes = extraction_data_from_image(images)

        # Prepare our data in the format of the model
        encoded_dataset = dataset.map(
            prepare_inference_features,
            batched=True,
            batch_size=64,
            remove_columns=dataset.column_names,
        )
        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)

        # Get predictions (token level)
        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(
            images, custom_encoded_dataset
        )

        # Get predictions (line level); df is indexed by page number below
        _probs_bbox, bboxes_list_dict, _input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(
            dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes
        )

        # Get labeled images with lines bounding boxes
        images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

        # Save every labeled page image to disk
        img_files = []
        for i in range(num_images):
            if filename != "files/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            images[i].save(img_file)
            img_files.append(img_file)

        if num_images < max_imgboxes:
            # Fewer pages than display slots: pad with blank placeholders
            missing = max_imgboxes - num_images
            img_files += [image_blank] * missing
            images += [Image.open(image_blank)] * missing
            for count in range(missing):
                df[num_images + count] = pd.DataFrame()
        else:
            # At least as many pages as display slots: keep the first ones only
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        # Save one CSV per displayed page
        csv_files = []
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            csv_files.append(gr.File.update(value=csv_file, visible=True))
            df[i].to_csv(csv_file, encoding="utf-8", index=False)

    else:
        # The PDF could not be processed: fill every slot with blank content.
        csv_file = "csv_wo_content.csv"
        # Write the empty CSV once. to_csv() returns None when given a path,
        # so its result must not be used as the DataFrame output.
        pd.DataFrame().to_csv(csv_file, encoding="utf-8", index=False)

        img_files = [image_blank] * max_imgboxes
        images = [Image.open(image_blank) for _ in range(max_imgboxes)]
        csv_files = [gr.File.update(value=csv_file, visible=True) for _ in range(max_imgboxes)]
        # Empty DataFrames, the same placeholder the success branch uses for padding
        df = {page: pd.DataFrame() for page in range(max_imgboxes)}

    # One value per output component, grouped like outputboxes
    page_slots = range(max_imgboxes)
    return (
        msg,
        *(img_files[i] for i in page_slots),
        *(images[i] for i in page_slots),
        *(csv_files[i] for i in page_slots),
        *(df[i] for i in page_slots),
    )
# gradio APP
with gr.Blocks(title="", css=".gradio-container") as demo:
    # Input row
    with gr.Row():
        pdf_file = gr.File(label="PDF")

    # Action buttons
    with gr.Row():
        submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
        reset_btn = gr.Button(value="Clear")

    # Status message
    with gr.Row():
        output_msg = gr.Textbox(label="Output message")

    # One component per displayed page in each of the four output rows
    with gr.Row():
        fileboxes = [
            gr.File(visible=True, label=f"Image file of the PDF page n°{page}")
            for page in range(max_imgboxes)
        ]
    with gr.Row():
        imgboxes = [
            gr.Image(type="pil", label=f"Image of the PDF page n°{page}")
            for page in range(max_imgboxes)
        ]
    with gr.Row():
        csvboxes = [
            gr.File(visible=True, label=f"CSV file at line level (page {page})")
            for page in range(max_imgboxes)
        ]
    with gr.Row():
        dfboxes = [
            gr.Dataframe(
                headers=["bounding boxes", "texts", "labels"],
                datatype=["str", "str", "str"],
                col_count=(3, "fixed"),
                visible=True,
                label=f"Data of page {page}",
                type="pandas",
                wrap=True,
            )
            for page in range(max_imgboxes)
        ]

    # Outputs of app_outputs, in the order it returns them
    outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
    submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)

    # "Clear" resets the PDF input and every output component to no value
    resettable = [pdf_file] + outputboxes
    reset_btn.click(
        lambda: [component.update(value=None) for component in resettable],
        inputs=[],
        outputs=resettable,
    )

    # Bundled example; its outputs are cached
    gr.Examples(
        [["files/example.pdf"]],
        [pdf_file],
        outputboxes,
        fn=app_outputs,
        cache_examples=True,
    )

demo.launch()