# -*- coding: utf-8 -*-
"""LiLT For Deployment
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ol6RWyff15SF6ZJPf47X5380hBTEDiUH
"""
# ## Installing the dependencies (might take some time)
# !pip install -q pytesseract
# !sudo apt install -q tesseract-ocr
# !pip install -q transformers
# !pip install -q pytorch-lightning
# !pip install -q einops
# !pip install -q tqdm
# !pip install -q gradio
# !pip install -q Pillow==7.1.2
# !pip install -q wandb
# !pip install -q gdown
# !pip install -q torchmetrics
## Requirements.txt
import os
os.system('pip install pyyaml==5.1')
## Install PyTesseract
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pandas as pd
from PIL import Image
from transformers import RobertaTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pytorch_lightning as pl
from dataset import create_features
from modeling import LiLT
from utils import LiLTPL
import gdown
import gradio as gr
seed = 42
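## The original cell defines `seed` without using it; seeding all RNGs here
## (a small addition using pytorch_lightning's seed_everything) makes the
## preprocessing reproducible.
pl.seed_everything(seed)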
## One can change this configuration and try out new combinations.
config = {
    "hidden_dropout_prob": 0.1,
    "hidden_size_t": 768,
    "hidden_size": 768,
    "hidden_size_l": 768 // 6,
    "intermediate_ff_size_factor": 4,
    "max_2d_position_embeddings": 1001,
    "max_seq_len_l": 512,
    "max_seq_len_t": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "dim_head": 64,
    "shape_size": 96,
    "vocab_size": 50265,
    "eps": 1e-12,
    "fine_tune": True,
}
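## A minimal sanity check on the configuration (an added sketch; the key names
## follow the dict above). Failing fast here beats a shape error deep inside
## the attention layers.
assert config["hidden_size_t"] % config["num_attention_heads"] == 0, \
    "text hidden size must be divisible by the number of attention heads"
assert config["vocab_size"] == 50265, \
    "vocab_size should match the roberta-base tokenizer loaded below"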
id2label = [
    'scientific_report',
    'resume',
    'memo',
    'file_folder',
    'specification',
    'news_article',
    'letter',
    'form',
    'budget',
    'handwritten',
    'email',
    'invoice',
    'presentation',
    'scientific_publication',
    'questionnaire',
    'advertisement',
]
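## These are the 16 RVL-CDIP document classes; the list index is the class id
## the model predicts. An inverse mapping (a small added convenience) makes it
## easy to look up an id by name.
label2id = {label: i for i, label in enumerate(id2label)}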
## Defining the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
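## The checkpoint below was trained against this vocabulary, so the tokenizer
## must stay roberta-base. A quick round-trip check (illustrative only):
# print(tokenizer.decode(tokenizer.encode("invoice total: $42.00")))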
url = 'https://drive.google.com/uc?id=1eRV4fS_LFwI5MHqcRwLUNQZgewxI6Se_'
output = 'lilt_ckpt.ckpt'
gdown.download(url, output, quiet=False)
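## gdown can fail quietly on rate-limited Drive links; an explicit existence
## check (a defensive addition, not in the original notebook) surfaces that
## immediately.
if not os.path.exists(output):
    raise FileNotFoundError(f"Checkpoint download failed: {output}")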
class RVLCDIPData(Dataset):
    def __init__(self, image_list, label_list, tokenizer, max_len=512, size=1000):
        self.image_list = image_list
        self.label_list = label_list
        self.tokenizer = tokenizer
        self.max_seq_length = max_len
        self.size = size

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        img_path = self.image_list[idx]
        label = self.label_list[idx]
        boxes, words, normal_box = create_features(
            img_path=img_path,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            size=self.size,
            use_ocr=True,
        )
        final_encoding = {'input_boxes': boxes, 'input_words': words}
        final_encoding['label'] = torch.as_tensor(label).long()
        return final_encoding
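## Example usage of the dataset (a sketch: `image_paths` and `labels` are
## hypothetical lists of file paths and integer class ids in `id2label` order):
# ds = RVLCDIPData(image_paths, labels, tokenizer)
# loader = DataLoader(ds, batch_size=8, shuffle=True)
# batch = next(iter(loader))
# print(batch['input_words'].shape, batch['input_boxes'].shape, batch['label'])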
lilt = LiLTPL(config)
# path_to_weights = 'drive/MyDrive/docformer_rvl_checkpoint/docformer_v1.ckpt'
## `load_from_checkpoint` is a classmethod returning a new module, so the
## original instance call discarded the loaded weights; load the state dict instead.
lilt.load_state_dict(torch.load('lilt_ckpt.ckpt', map_location='cpu')['state_dict'])
lilt.eval()  # inference-only demo: disable dropout
## Taken from LayoutLMV2 space
image = gr.inputs.Image(type="pil")
label = gr.outputs.Label(num_top_classes=5)
examples = [['00093726.png'], ['00866042.png']]
title = "Interactive demo: LiLT for Image Classification"
description = "Demo for classifying document images with the LiLT model. To use it, \
simply upload an image or use the example images below and click 'submit' to let the model predict the 5 most probable document classes. \
Results will show up in a few seconds."
def classify_image(image):
    image.save('sample_img.png')
    boxes, words, normal_box = create_features(
        img_path='sample_img.png',
        tokenizer=tokenizer,
        max_seq_length=512,
        size=1000,
        use_ocr=True,
    )
    final_encoding = {'input_boxes': boxes.unsqueeze(0), 'input_words': words.unsqueeze(0)}
    with torch.no_grad():  # inference only; no gradients needed
        output = lilt.forward(final_encoding)
    probs = output[0].softmax(dim=-1)
    final_pred = {}
    for i, score in enumerate(probs):
        final_pred[id2label[i]] = score.cpu().item()
    return final_pred
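## Optional local smoke test before launching the UI (assumes one of the
## example images below is present on disk):
# print(classify_image(Image.open('00093726.png').convert('RGB')))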
gr.Interface(
    fn=classify_image,
    inputs=image,
    outputs=label,
    title=title,
    description=description,
    examples=examples,
    enable_queue=True,
).launch(debug=True)