Spaces:
Running
Running
Upload 13 files
Browse files- README.md +5 -5
- app.py +108 -0
- assets/.DS_Store +0 -0
- assets/example/academic.jpg +0 -0
- assets/example/exam_paper.jpg +0 -0
- assets/example/financial.jpg +0 -0
- assets/example/fuzzy_scan.jpg +0 -0
- assets/example/poster.jpg +0 -0
- assets/example/ppt.jpg +0 -0
- assets/example/textbook.jpg +0 -0
- header.html +109 -0
- requirements.txt +4 -0
- visualization.py +90 -0
README.md
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
---
|
2 |
-
title: DocLayout YOLO
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.1.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
-
short_description:
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: DocLayout YOLO Demo
|
3 |
+
emoji: 🐢
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.1.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
+
short_description: Demo for DocLayout-YOLO
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
os.environ["GRADIO_TEMP_DIR"] = "./tmp"
|
3 |
+
|
4 |
+
import sys
|
5 |
+
import torch
|
6 |
+
import torchvision
|
7 |
+
import gradio as gr
|
8 |
+
import numpy as np
|
9 |
+
from PIL import Image
|
10 |
+
from huggingface_hub import snapshot_download
|
11 |
+
from visualization import visualize_bbox
|
12 |
+
|
13 |
+
# == download weights ==
|
14 |
+
model_dir = snapshot_download('juliozhao/DocLayout-YOLO-DocStructBench', local_dir='./models/DocLayout-YOLO-DocStructBench')
|
15 |
+
# == select device ==
|
16 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
17 |
+
|
18 |
+
id_to_names = {
|
19 |
+
0: 'title',
|
20 |
+
1: 'plain text',
|
21 |
+
2: 'abandon',
|
22 |
+
3: 'figure',
|
23 |
+
4: 'figure_caption',
|
24 |
+
5: 'table',
|
25 |
+
6: 'table_caption',
|
26 |
+
7: 'table_footnote',
|
27 |
+
8: 'isolate_formula',
|
28 |
+
9: 'formula_caption'
|
29 |
+
}
|
30 |
+
|
31 |
+
def recognize_image(input_img, conf_threshold, iou_threshold):
|
32 |
+
det_res = model.predict(
|
33 |
+
input_img,
|
34 |
+
imgsz=1024,
|
35 |
+
conf=conf_threshold,
|
36 |
+
device=device,
|
37 |
+
)[0]
|
38 |
+
boxes = det_res.__dict__['boxes'].xyxy
|
39 |
+
classes = det_res.__dict__['boxes'].cls
|
40 |
+
scores = det_res.__dict__['boxes'].conf
|
41 |
+
|
42 |
+
indices = torchvision.ops.nms(boxes=torch.Tensor(boxes), scores=torch.Tensor(scores),iou_threshold=iou_threshold)
|
43 |
+
boxes, scores, classes = boxes[indices], scores[indices], classes[indices]
|
44 |
+
if len(boxes.shape) == 1:
|
45 |
+
boxes = np.expand_dims(boxes, 0)
|
46 |
+
scores = np.expand_dims(scores, 0)
|
47 |
+
classes = np.expand_dims(classes, 0)
|
48 |
+
|
49 |
+
vis_result = visualize_bbox(input_img, boxes, classes, scores, id_to_names)
|
50 |
+
return vis_result
|
51 |
+
|
52 |
+
def gradio_reset():
|
53 |
+
return gr.update(value=None), gr.update(value=None)
|
54 |
+
|
55 |
+
|
56 |
+
if __name__ == "__main__":
|
57 |
+
root_path = os.path.abspath(os.getcwd())
|
58 |
+
# == load model ==
|
59 |
+
from doclayout_yolo import YOLOv10
|
60 |
+
print(f"Using device: {device}")
|
61 |
+
model = YOLOv10(os.path.join(os.path.dirname(__file__), "models", "DocLayout-YOLO-DocStructBench", "doclayout_yolo_docstructbench_imgsz1024.pt")) # load an official model
|
62 |
+
|
63 |
+
with open("header.html", "r") as file:
|
64 |
+
header = file.read()
|
65 |
+
with gr.Blocks() as demo:
|
66 |
+
gr.HTML(header)
|
67 |
+
|
68 |
+
with gr.Row():
|
69 |
+
with gr.Column():
|
70 |
+
|
71 |
+
input_img = gr.Image(label=" ", interactive=True)
|
72 |
+
with gr.Row():
|
73 |
+
clear = gr.Button(value="Clear")
|
74 |
+
predict = gr.Button(value="Detect", interactive=True, variant="primary")
|
75 |
+
|
76 |
+
with gr.Row():
|
77 |
+
conf_threshold = gr.Slider(
|
78 |
+
label="Confidence Threshold",
|
79 |
+
minimum=0.0,
|
80 |
+
maximum=1.0,
|
81 |
+
step=0.05,
|
82 |
+
value=0.25,
|
83 |
+
)
|
84 |
+
|
85 |
+
with gr.Row():
|
86 |
+
iou_threshold = gr.Slider(
|
87 |
+
label="NMS IOU Threshold",
|
88 |
+
minimum=0.0,
|
89 |
+
maximum=1.0,
|
90 |
+
step=0.05,
|
91 |
+
value=0.45,
|
92 |
+
)
|
93 |
+
|
94 |
+
with gr.Accordion("Examples:"):
|
95 |
+
example_root = os.path.join(os.path.dirname(__file__), "assets", "example")
|
96 |
+
gr.Examples(
|
97 |
+
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
98 |
+
_.endswith("jpg")],
|
99 |
+
inputs=[input_img],
|
100 |
+
)
|
101 |
+
with gr.Column():
|
102 |
+
gr.Button(value="Predict Result:", interactive=False)
|
103 |
+
output_img = gr.Image(label=" ", interactive=False)
|
104 |
+
|
105 |
+
clear.click(gradio_reset, inputs=None, outputs=[input_img, output_img])
|
106 |
+
predict.click(recognize_image, inputs=[input_img,conf_threshold,iou_threshold], outputs=[output_img])
|
107 |
+
|
108 |
+
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
assets/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
assets/example/academic.jpg
ADDED
assets/example/exam_paper.jpg
ADDED
assets/example/financial.jpg
ADDED
assets/example/fuzzy_scan.jpg
ADDED
assets/example/poster.jpg
ADDED
assets/example/ppt.jpg
ADDED
assets/example/textbook.jpg
ADDED
header.html
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<html><head>
|
2 |
+
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
|
3 |
+
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
|
4 |
+
<style>
|
5 |
+
.link-block {
|
6 |
+
border: 1px solid transparent;
|
7 |
+
border-radius: 24px;
|
8 |
+
background-color: rgba(54, 54, 54, 1);
|
9 |
+
cursor: pointer !important;
|
10 |
+
}
|
11 |
+
.link-block:hover {
|
12 |
+
background-color: rgba(54, 54, 54, 0.75) !important;
|
13 |
+
cursor: pointer !important;
|
14 |
+
}
|
15 |
+
.external-link {
|
16 |
+
display: inline-flex;
|
17 |
+
align-items: center;
|
18 |
+
height: 36px;
|
19 |
+
line-height: 36px;
|
20 |
+
padding: 0 16px;
|
21 |
+
cursor: pointer !important;
|
22 |
+
}
|
23 |
+
.external-link,
|
24 |
+
.external-link:hover {
|
25 |
+
cursor: pointer !important;
|
26 |
+
}
|
27 |
+
a {
|
28 |
+
text-decoration: none;
|
29 |
+
}
|
30 |
+
</style></head>
|
31 |
+
|
32 |
+
<body>
|
33 |
+
<div style="
|
34 |
+
display: flex;
|
35 |
+
flex-direction: column;
|
36 |
+
justify-content: center;
|
37 |
+
align-items: center;
|
38 |
+
text-align: center;
|
39 |
+
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
40 |
+
padding: 24px;
|
41 |
+
gap: 24px;
|
42 |
+
border-radius: 8px;
|
43 |
+
">
|
44 |
+
<div style="
|
45 |
+
display: flex;
|
46 |
+
flex-direction: column;
|
47 |
+
align-items: center;
|
48 |
+
gap: 16px;
|
49 |
+
">
|
50 |
+
<div style="display: flex; flex-direction: column; gap: 8px">
|
51 |
+
<h1 style="
|
52 |
+
font-size: 48px;
|
53 |
+
color: #fafafa;
|
54 |
+
margin: 0;
|
55 |
+
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
56 |
+
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
57 |
+
">
|
58 |
+
DocLayout-YOLO
|
59 |
+
</h1>
|
60 |
+
</div>
|
61 |
+
</div>
|
62 |
+
|
63 |
+
<p style="
|
64 |
+
margin: 0;
|
65 |
+
line-height: 1.6rem;
|
66 |
+
font-size: 16px;
|
67 |
+
color: #fafafa;
|
68 |
+
opacity: 0.8;
|
69 |
+
">
|
70 |
+
An efficient and robust Model for Real-World Document Layout Analysis.<br>
|
71 |
+
</p>
|
72 |
+
<style>
|
73 |
+
.link-block {
|
74 |
+
display: inline-block;
|
75 |
+
}
|
76 |
+
.link-block + .link-block {
|
77 |
+
margin-left: 20px;
|
78 |
+
}
|
79 |
+
</style>
|
80 |
+
|
81 |
+
<div class="column has-text-centered">
|
82 |
+
<div class="publication-links">
|
83 |
+
<!-- Code Link. -->
|
84 |
+
<span class="link-block">
|
85 |
+
<a href="https://github.com/opendatalab/DocLayout-YOLO" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
86 |
+
<span class="icon" style="margin-right: 4px">
|
87 |
+
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
|
88 |
+
</span>
|
89 |
+
<span style="color: white">Code</span>
|
90 |
+
</a>
|
91 |
+
</span>
|
92 |
+
|
93 |
+
<!-- Paper Link. -->
|
94 |
+
<span class="link-block">
|
95 |
+
<a href="https://arxiv.org/abs/2410.12628" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
96 |
+
<span class="icon" style="margin-right: 8px">
|
97 |
+
<i class="fas fa-globe" style="color: white"></i>
|
98 |
+
</span>
|
99 |
+
<span style="color: white">Paper</span>
|
100 |
+
</a>
|
101 |
+
</span>
|
102 |
+
</div>
|
103 |
+
</div>
|
104 |
+
|
105 |
+
<!-- New Demo Links -->
|
106 |
+
</div>
|
107 |
+
|
108 |
+
|
109 |
+
</body></html>
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
doclayout-yolo==0.0.2
|
2 |
+
gradio==5.1.0
|
3 |
+
gradio-client==1.4.0
|
4 |
+
huggingface_hub
|
visualization.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import cv2
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
def colormap(N=256, normalized=False):
|
6 |
+
"""
|
7 |
+
Generate the color map.
|
8 |
+
|
9 |
+
Args:
|
10 |
+
N (int): Number of labels (default is 256).
|
11 |
+
normalized (bool): If True, return colors normalized to [0, 1]. Otherwise, return [0, 255].
|
12 |
+
|
13 |
+
Returns:
|
14 |
+
np.ndarray: Color map array of shape (N, 3).
|
15 |
+
"""
|
16 |
+
def bitget(byteval, idx):
|
17 |
+
"""
|
18 |
+
Get the bit value at the specified index.
|
19 |
+
|
20 |
+
Args:
|
21 |
+
byteval (int): The byte value.
|
22 |
+
idx (int): The index of the bit.
|
23 |
+
|
24 |
+
Returns:
|
25 |
+
int: The bit value (0 or 1).
|
26 |
+
"""
|
27 |
+
return ((byteval & (1 << idx)) != 0)
|
28 |
+
|
29 |
+
cmap = np.zeros((N, 3), dtype=np.uint8)
|
30 |
+
for i in range(N):
|
31 |
+
r = g = b = 0
|
32 |
+
c = i
|
33 |
+
for j in range(8):
|
34 |
+
r = r | (bitget(c, 0) << (7 - j))
|
35 |
+
g = g | (bitget(c, 1) << (7 - j))
|
36 |
+
b = b | (bitget(c, 2) << (7 - j))
|
37 |
+
c = c >> 3
|
38 |
+
cmap[i] = np.array([r, g, b])
|
39 |
+
|
40 |
+
if normalized:
|
41 |
+
cmap = cmap.astype(np.float32) / 255.0
|
42 |
+
|
43 |
+
return cmap
|
44 |
+
|
45 |
+
def visualize_bbox(image_path, bboxes, classes, scores, id_to_names, alpha=0.3):
|
46 |
+
"""
|
47 |
+
Visualize layout detection results on an image.
|
48 |
+
|
49 |
+
Args:
|
50 |
+
image_path (str): Path to the input image.
|
51 |
+
bboxes (list): List of bounding boxes, each represented as [x_min, y_min, x_max, y_max].
|
52 |
+
classes (list): List of class IDs corresponding to the bounding boxes.
|
53 |
+
id_to_names (dict): Dictionary mapping class IDs to class names.
|
54 |
+
alpha (float): Transparency factor for the filled color (default is 0.3).
|
55 |
+
|
56 |
+
Returns:
|
57 |
+
np.ndarray: Image with visualized layout detection results.
|
58 |
+
"""
|
59 |
+
# Check if image_path is a PIL.Image.Image object
|
60 |
+
if isinstance(image_path, Image.Image) or isinstance(image_path, np.ndarray):
|
61 |
+
image = np.array(image_path)
|
62 |
+
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # Convert RGB to BGR for OpenCV
|
63 |
+
else:
|
64 |
+
image = cv2.imread(image_path)
|
65 |
+
|
66 |
+
overlay = image.copy()
|
67 |
+
|
68 |
+
cmap = colormap(N=len(id_to_names), normalized=False)
|
69 |
+
|
70 |
+
# Iterate over each bounding box
|
71 |
+
for i, bbox in enumerate(bboxes):
|
72 |
+
x_min, y_min, x_max, y_max = map(int, bbox)
|
73 |
+
class_id = int(classes[i])
|
74 |
+
class_name = id_to_names[class_id]
|
75 |
+
|
76 |
+
text = class_name + f":{scores[i]:.3f}"
|
77 |
+
|
78 |
+
color = tuple(int(c) for c in cmap[class_id])
|
79 |
+
cv2.rectangle(overlay, (x_min, y_min), (x_max, y_max), color, -1)
|
80 |
+
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)
|
81 |
+
|
82 |
+
# Add the class name with a background rectangle
|
83 |
+
(text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.9, 2)
|
84 |
+
cv2.rectangle(image, (x_min, y_min - text_height - baseline), (x_min + text_width, y_min), color, -1)
|
85 |
+
cv2.putText(image, text, (x_min, y_min - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)
|
86 |
+
|
87 |
+
# Blend the overlay with the original image
|
88 |
+
cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)
|
89 |
+
|
90 |
+
return image
|