uartimcs committed
Commit
1cc5dab
1 Parent(s): 952bff9

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +18 -0
  2. .gitignore +139 -0
  3. .gradio/certificate.pem +31 -0
  4. LICENSE +21 -0
  5. NOTICE +213 -0
  6. README.md +243 -7
  7. app.py +26 -0
  8. config/train_booking.yaml +22 -0
  9. config/train_cord.yaml +22 -0
  10. config/train_docvqa.yaml +23 -0
  11. config/train_invoices.yaml +22 -0
  12. config/train_rvlcdip.yaml +23 -0
  13. config/train_zhtrainticket.yaml +22 -0
  14. dataset/.gitkeep +1 -0
  15. donut/__init__.py +16 -0
  16. donut/_version.py +6 -0
  17. donut/model.py +613 -0
  18. donut/util.py +340 -0
  19. lightning_module.py +198 -0
  20. misc/overview.png +0 -0
  21. misc/sample_image_cord_test_receipt_00004.png +3 -0
  22. misc/sample_image_donut_document.png +0 -0
  23. misc/sample_synthdog.png +3 -0
  24. misc/screenshot_gradio_demos.png +3 -0
  25. result/.gitkeep +1 -0
  26. setup.py +77 -0
  27. synthdog/README.md +63 -0
  28. synthdog/config_en.yaml +119 -0
  29. synthdog/config_ja.yaml +119 -0
  30. synthdog/config_ko.yaml +119 -0
  31. synthdog/config_zh.yaml +119 -0
  32. synthdog/elements/__init__.py +12 -0
  33. synthdog/elements/background.py +24 -0
  34. synthdog/elements/content.py +118 -0
  35. synthdog/elements/document.py +65 -0
  36. synthdog/elements/paper.py +17 -0
  37. synthdog/elements/textbox.py +43 -0
  38. synthdog/layouts/__init__.py +9 -0
  39. synthdog/layouts/grid.py +68 -0
  40. synthdog/layouts/grid_stack.py +74 -0
  41. synthdog/resources/background/bedroom_83.jpg +0 -0
  42. synthdog/resources/background/bob+dylan_83.jpg +0 -0
  43. synthdog/resources/background/coffee_122.jpg +0 -0
  44. synthdog/resources/background/coffee_18.jpeg +3 -0
  45. synthdog/resources/background/crater_141.jpg +3 -0
  46. synthdog/resources/background/cream_124.jpg +3 -0
  47. synthdog/resources/background/eagle_110.jpg +0 -0
  48. synthdog/resources/background/farm_25.jpg +0 -0
  49. synthdog/resources/background/hiking_18.jpg +0 -0
  50. synthdog/resources/corpus/enwiki.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,21 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ misc/sample_image_cord_test_receipt_00004.png filter=lfs diff=lfs merge=lfs -text
37
+ misc/sample_synthdog.png filter=lfs diff=lfs merge=lfs -text
38
+ misc/screenshot_gradio_demos.png filter=lfs diff=lfs merge=lfs -text
39
+ synthdog/resources/background/coffee_18.jpeg filter=lfs diff=lfs merge=lfs -text
40
+ synthdog/resources/background/crater_141.jpg filter=lfs diff=lfs merge=lfs -text
41
+ synthdog/resources/background/cream_124.jpg filter=lfs diff=lfs merge=lfs -text
42
+ synthdog/resources/font/ja/NotoSansJP-Regular.otf filter=lfs diff=lfs merge=lfs -text
43
+ synthdog/resources/font/ja/NotoSerifJP-Regular.otf filter=lfs diff=lfs merge=lfs -text
44
+ synthdog/resources/font/ko/NotoSansKR-Regular.otf filter=lfs diff=lfs merge=lfs -text
45
+ synthdog/resources/font/ko/NotoSerifKR-Regular.otf filter=lfs diff=lfs merge=lfs -text
46
+ synthdog/resources/font/zh/NotoSansSC-Regular.otf filter=lfs diff=lfs merge=lfs -text
47
+ synthdog/resources/font/zh/NotoSerifSC-Regular.otf filter=lfs diff=lfs merge=lfs -text
48
+ synthdog/resources/paper/paper_1.jpg filter=lfs diff=lfs merge=lfs -text
49
+ synthdog/resources/paper/paper_2.jpg filter=lfs diff=lfs merge=lfs -text
50
+ synthdog/resources/paper/paper_3.jpg filter=lfs diff=lfs merge=lfs -text
51
+ synthdog/resources/paper/paper_4.jpg filter=lfs diff=lfs merge=lfs -text
52
+ synthdog/resources/paper/paper_5.jpg filter=lfs diff=lfs merge=lfs -text
53
+ synthdog/resources/paper/paper_6.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,139 @@
1
+ core.*
2
+ *.bin
3
+ .nfs*
4
+ .vscode/*
5
+ dataset/*
6
+ result/*
7
+ misc/*
8
+ !misc/*.png
9
+ !dataset/.gitkeep
10
+ !result/.gitkeep
11
+ # Byte-compiled / optimized / DLL files
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # C extensions
17
+ *.so
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ pip-wheel-metadata/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py,cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105
+ __pypackages__/
106
+
107
+ # Celery stuff
108
+ celerybeat-schedule
109
+ celerybeat.pid
110
+
111
+ # SageMath parsed files
112
+ *.sage.py
113
+
114
+ # Environments
115
+ .env
116
+ .venv
117
+ env/
118
+ venv/
119
+ ENV/
120
+ env.bak/
121
+ venv.bak/
122
+
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+
133
+ # mypy
134
+ .mypy_cache/
135
+ .dmypy.json
136
+ dmypy.json
137
+
138
+ # Pyre type checker
139
+ .pyre/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT license
2
+
3
+ Copyright (c) 2022-present NAVER Corp.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
NOTICE ADDED
@@ -0,0 +1,213 @@
1
+ Donut
2
+ Copyright (c) 2022-present NAVER Corp.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in
12
+ all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ THE SOFTWARE.
21
+
22
+ --------------------------------------------------------------------------------------
23
+
24
+ This project contains subcomponents with separate copyright notices and license terms.
25
+ Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.
26
+
27
+ =====
28
+
29
+ googlefonts/noto-fonts
30
+ https://fonts.google.com/specimen/Noto+Sans
31
+
32
+
33
+ Copyright 2018 The Noto Project Authors (github.com/googlei18n/noto-fonts)
34
+
35
+ This Font Software is licensed under the SIL Open Font License,
36
+ Version 1.1.
37
+
38
+ This license is copied below, and is also available with a FAQ at:
39
+ http://scripts.sil.org/OFL
40
+
41
+ -----------------------------------------------------------
42
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
43
+ -----------------------------------------------------------
44
+
45
+ PREAMBLE
46
+ The goals of the Open Font License (OFL) are to stimulate worldwide
47
+ development of collaborative font projects, to support the font
48
+ creation efforts of academic and linguistic communities, and to
49
+ provide a free and open framework in which fonts may be shared and
50
+ improved in partnership with others.
51
+
52
+ The OFL allows the licensed fonts to be used, studied, modified and
53
+ redistributed freely as long as they are not sold by themselves. The
54
+ fonts, including any derivative works, can be bundled, embedded,
55
+ redistributed and/or sold with any software provided that any reserved
56
+ names are not used by derivative works. The fonts and derivatives,
57
+ however, cannot be released under any other type of license. The
58
+ requirement for fonts to remain under this license does not apply to
59
+ any document created using the fonts or their derivatives.
60
+
61
+ DEFINITIONS
62
+ "Font Software" refers to the set of files released by the Copyright
63
+ Holder(s) under this license and clearly marked as such. This may
64
+ include source files, build scripts and documentation.
65
+
66
+ "Reserved Font Name" refers to any names specified as such after the
67
+ copyright statement(s).
68
+
69
+ "Original Version" refers to the collection of Font Software
70
+ components as distributed by the Copyright Holder(s).
71
+
72
+ "Modified Version" refers to any derivative made by adding to,
73
+ deleting, or substituting -- in part or in whole -- any of the
74
+ components of the Original Version, by changing formats or by porting
75
+ the Font Software to a new environment.
76
+
77
+ "Author" refers to any designer, engineer, programmer, technical
78
+ writer or other person who contributed to the Font Software.
79
+
80
+ PERMISSION & CONDITIONS
81
+ Permission is hereby granted, free of charge, to any person obtaining
82
+ a copy of the Font Software, to use, study, copy, merge, embed,
83
+ modify, redistribute, and sell modified and unmodified copies of the
84
+ Font Software, subject to the following conditions:
85
+
86
+ 1) Neither the Font Software nor any of its individual components, in
87
+ Original or Modified Versions, may be sold by itself.
88
+
89
+ 2) Original or Modified Versions of the Font Software may be bundled,
90
+ redistributed and/or sold with any software, provided that each copy
91
+ contains the above copyright notice and this license. These can be
92
+ included either as stand-alone text files, human-readable headers or
93
+ in the appropriate machine-readable metadata fields within text or
94
+ binary files as long as those fields can be easily viewed by the user.
95
+
96
+ 3) No Modified Version of the Font Software may use the Reserved Font
97
+ Name(s) unless explicit written permission is granted by the
98
+ corresponding Copyright Holder. This restriction only applies to the
99
+ primary font name as presented to the users.
100
+
101
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
102
+ Software shall not be used to promote, endorse or advertise any
103
+ Modified Version, except to acknowledge the contribution(s) of the
104
+ Copyright Holder(s) and the Author(s) or with their explicit written
105
+ permission.
106
+
107
+ 5) The Font Software, modified or unmodified, in part or in whole,
108
+ must be distributed entirely under this license, and must not be
109
+ distributed under any other license. The requirement for fonts to
110
+ remain under this license does not apply to any document created using
111
+ the Font Software.
112
+
113
+ TERMINATION
114
+ This license becomes null and void if any of the above conditions are
115
+ not met.
116
+
117
+ DISCLAIMER
118
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
119
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
120
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
121
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
122
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
123
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
124
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
125
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
126
+ OTHER DEALINGS IN THE FONT SOFTWARE.
127
+
128
+ =====
129
+
130
+ huggingface/transformers
131
+ https://github.com/huggingface/transformers
132
+
133
+
134
+ Copyright [yyyy] [name of copyright owner]
135
+
136
+ Licensed under the Apache License, Version 2.0 (the "License");
137
+ you may not use this file except in compliance with the License.
138
+ You may obtain a copy of the License at
139
+
140
+ http://www.apache.org/licenses/LICENSE-2.0
141
+
142
+ Unless required by applicable law or agreed to in writing, software
143
+ distributed under the License is distributed on an "AS IS" BASIS,
144
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
145
+ See the License for the specific language governing permissions and limitations under the License.
146
+
147
+ =====
148
+
149
+ clovaai/synthtiger
150
+ https://github.com/clovaai/synthtiger
151
+
152
+
153
+ Copyright (c) 2021-present NAVER Corp.
154
+
155
+ Permission is hereby granted, free of charge, to any person obtaining a copy
156
+ of this software and associated documentation files (the "Software"), to deal
157
+ in the Software without restriction, including without limitation the rights
158
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
159
+ copies of the Software, and to permit persons to whom the Software is
160
+ furnished to do so, subject to the following conditions:
161
+
162
+ The above copyright notice and this permission notice shall be included in
163
+ all copies or substantial portions of the Software.
164
+
165
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
166
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
167
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
168
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
169
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
170
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
171
+ THE SOFTWARE.
172
+
173
+ =====
174
+
175
+ rwightman/pytorch-image-models
176
+ https://github.com/rwightman/pytorch-image-models
177
+
178
+
179
+ Copyright 2019 Ross Wightman
180
+
181
+ Licensed under the Apache License, Version 2.0 (the "License");
182
+ you may not use this file except in compliance with the License.
183
+ You may obtain a copy of the License at
184
+
185
+ http://www.apache.org/licenses/LICENSE-2.0
186
+
187
+ Unless required by applicable law or agreed to in writing, software
188
+ distributed under the License is distributed on an "AS IS" BASIS,
189
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190
+ See the License for the specific language governing permissions and
191
+ limitations under the License.
192
+
193
+ =====
194
+
195
+ ankush-me/SynthText
196
+ https://github.com/ankush-me/SynthText
197
+
198
+
199
+ Copyright 2017, Ankush Gupta.
200
+
201
+ Licensed under the Apache License, Version 2.0 (the "License");
202
+ you may not use this file except in compliance with the License.
203
+ You may obtain a copy of the License at
204
+
205
+ http://www.apache.org/licenses/LICENSE-2.0
206
+
207
+ Unless required by applicable law or agreed to in writing, software
208
+ distributed under the License is distributed on an "AS IS" BASIS,
209
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
210
+ See the License for the specific language governing permissions and
211
+ limitations under the License.
212
+
213
+ =====
README.md CHANGED
@@ -1,12 +1,248 @@
1
  ---
2
- title: Donut Booking Gradio
3
- emoji: 🚀
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.5.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: donut-booking-gradio
3
+ app_file: app.py
4
  sdk: gradio
5
  sdk_version: 5.5.0
6
  ---
7
+ <div align="center">
8
+
9
+ # Donut 🍩 : Document Understanding Transformer
10
+
11
+ [![Paper](https://img.shields.io/badge/Paper-arxiv.2111.15664-red)](https://arxiv.org/abs/2111.15664)
12
+ [![Conference](https://img.shields.io/badge/ECCV-2022-blue)](#how-to-cite)
13
+ [![Demo](https://img.shields.io/badge/Demo-Gradio-brightgreen)](#demo)
14
+ [![Demo](https://img.shields.io/badge/Demo-Colab-orange)](#demo)
15
+ [![PyPI](https://img.shields.io/pypi/v/donut-python?color=green&label=pip%20install%20donut-python)](https://pypi.org/project/donut-python)
16
+ [![Downloads](https://static.pepy.tech/personalized-badge/donut-python?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/donut-python)
17
+
18
+ Official Implementation of Donut and SynthDoG | [Paper](https://arxiv.org/abs/2111.15664) | [Slide](https://docs.google.com/presentation/d/1gv3A7t4xpwwNdpxV_yeHzEOMy-exJCAz6AlAI9O5fS8/edit?usp=sharing) | [Poster](https://docs.google.com/presentation/d/1m1f8BbAm5vxPcqynn_MbFfmQAlHQIR5G72-hQUFS2sk/edit?usp=sharing)
19
+
20
+ </div>
21
+
22
+ ## Introduction
23
+
24
+ **Donut** 🍩, **Do**cume**n**t **u**nderstanding **t**ransformer, is a new method of document understanding that uses an OCR-free end-to-end Transformer model. Donut does not require off-the-shelf OCR engines/APIs, yet it achieves state-of-the-art performance on various visual document understanding tasks, such as visual document classification and information extraction (a.k.a. document parsing).
25
+ In addition, we present **SynthDoG** 🐶, a **Synth**etic **Do**cument **G**enerator, which helps make model pre-training flexible across various languages and domains.
26
+
27
+ Our academic paper, which describes our method in detail and provides full experimental results and analyses, can be found here:<br>
28
+ > [**OCR-free Document Understanding Transformer**](https://arxiv.org/abs/2111.15664).<br>
29
+ > [Geewook Kim](https://geewook.kim), [Teakgyu Hong](https://dblp.org/pid/183/0952.html), [Moonbin Yim](https://github.com/moonbings), [JeongYeon Nam](https://github.com/long8v), [Jinyoung Park](https://github.com/jyp1111), [Jinyeong Yim](https://jinyeong.github.io), [Wonseok Hwang](https://scholar.google.com/citations?user=M13_WdcAAAAJ), [Sangdoo Yun](https://sangdooyun.github.io), [Dongyoon Han](https://dongyoonhan.github.io), [Seunghyun Park](https://scholar.google.com/citations?user=iowjmTwAAAAJ). In ECCV 2022.
30
+
31
+ <img width="946" alt="image" src="misc/overview.png">
32
+
33
+ ## Pre-trained Models and Web Demos
34
+
35
+ Gradio web demos are available! [![Demo](https://img.shields.io/badge/Demo-Gradio-brightgreen)](#demo) [![Demo](https://img.shields.io/badge/Demo-Colab-orange)](#demo)
36
+ |:--:|
37
+ |![image](misc/screenshot_gradio_demos.png)|
38
+ - You can run the demo with `./app.py` file.
39
+ - Sample images are available at `./misc` and more receipt images are available at [CORD dataset link](https://huggingface.co/datasets/naver-clova-ix/cord-v2).
40
+ - Web demos are available from the links in the following table.
41
+ - Note: We have updated the Google Colab demos (as of June 15, 2023) to ensure they work properly.
42
+
43
+ |Task|Sec/Img|Score|Trained Model|<div id="demo">Demo</div>|
44
+ |---|---|---|---|---|
45
+ | [CORD](https://github.com/clovaai/cord) (Document Parsing) | 0.7 /<br> 0.7 /<br> 1.2 | 91.3 /<br> 91.1 /<br> 90.9 | [donut-base-finetuned-cord-v2](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v2/tree/official) (1280) /<br> [donut-base-finetuned-cord-v1](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v1/tree/official) (1280) /<br> [donut-base-finetuned-cord-v1-2560](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v1-2560/tree/official) | [gradio space web demo](https://huggingface.co/spaces/naver-clova-ix/donut-base-finetuned-cord-v2),<br>[google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1NMSqoIZ_l39wyRD7yVjw2FIuU2aglzJi?usp=sharing) |
46
+ | [Train Ticket](https://github.com/beacandler/EATEN) (Document Parsing) | 0.6 | 98.7 | [donut-base-finetuned-zhtrainticket](https://huggingface.co/naver-clova-ix/donut-base-finetuned-zhtrainticket/tree/official) | [google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1YJBjllahdqNktXaBlq5ugPh1BCm8OsxI?usp=sharing) |
47
+ | [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip) (Document Classification) | 0.75 | 95.3 | [donut-base-finetuned-rvlcdip](https://huggingface.co/naver-clova-ix/donut-base-finetuned-rvlcdip/tree/official) | [gradio space web demo](https://huggingface.co/spaces/nielsr/donut-rvlcdip),<br>[google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1iWOZHvao1W5xva53upcri5V6oaWT-P0O?usp=sharing) |
48
+ | [DocVQA Task1](https://rrc.cvc.uab.es/?ch=17) (Document VQA) | 0.78 | 67.5 | [donut-base-finetuned-docvqa](https://huggingface.co/naver-clova-ix/donut-base-finetuned-docvqa/tree/official) | [gradio space web demo](https://huggingface.co/spaces/nielsr/donut-docvqa),<br>[google colab demo (updated at 23.06.15)](https://colab.research.google.com/drive/1oKieslZCulFiquequ62eMGc-ZWgay4X3?usp=sharing) |
49
+
50
+ The links to the pre-trained backbones are here:
51
+ - [`donut-base`](https://huggingface.co/naver-clova-ix/donut-base/tree/official): trained with 64 A100 GPUs (~2.5 days), number of layers (encoder: {2,2,14,2}, decoder: 4), input size 2560x1920, swin window size 10, IIT-CDIP (11M) and SynthDoG (English, Chinese, Japanese, Korean, 0.5M x 4).
52
+ - [`donut-proto`](https://huggingface.co/naver-clova-ix/donut-proto/tree/official): (preliminary model) trained with 8 V100 GPUs (~5 days), number of layers (encoder: {2,2,18,2}, decoder: 4), input size 2048x1536, swin window size 8, and SynthDoG (English, Japanese, Korean, 0.4M x 3).
53
+
54
+ Please see [our paper](#how-to-cite) for more details.
55
+
56
+ ## SynthDoG datasets
57
+
58
+ ![image](misc/sample_synthdog.png)
59
+
60
+ The links to the SynthDoG-generated datasets are here:
61
+
62
+ - [`synthdog-en`](https://huggingface.co/datasets/naver-clova-ix/synthdog-en): English, 0.5M.
63
+ - [`synthdog-zh`](https://huggingface.co/datasets/naver-clova-ix/synthdog-zh): Chinese, 0.5M.
64
+ - [`synthdog-ja`](https://huggingface.co/datasets/naver-clova-ix/synthdog-ja): Japanese, 0.5M.
65
+ - [`synthdog-ko`](https://huggingface.co/datasets/naver-clova-ix/synthdog-ko): Korean, 0.5M.
66
+
67
+ To generate synthetic datasets with our SynthDoG, please see `./synthdog/README.md` and [our paper](#how-to-cite) for details.
68
+
69
+ ## Updates
70
+
71
+ **_2023-06-15_** We have updated all Google Colab demos to ensure they work properly.<br>
72
+ **_2022-11-14_** New version 1.0.9 is released (`pip install donut-python --upgrade`). See [1.0.9 Release Notes](https://github.com/clovaai/donut/releases/tag/1.0.9).<br>
73
+ **_2022-08-12_** Donut 🍩 is also available at [huggingface/transformers 🤗](https://huggingface.co/docs/transformers/main/en/model_doc/donut) (contributed by [@NielsRogge](https://github.com/NielsRogge)). `donut-python` loads the pre-trained weights from the `official` branch of the model repositories. See [1.0.5 Release Notes](https://github.com/clovaai/donut/releases/tag/1.0.5).<br>
74
+ **_2022-08-05_** A well-executed hands-on tutorial on donut 🍩 is published at [Towards Data Science](https://towardsdatascience.com/ocr-free-document-understanding-with-donut-1acfbdf099be) (written by [@estaudere](https://github.com/estaudere)).<br>
75
+ **_2022-07-20_** First commit: we release our code, model weights, synthetic data, and generator.
76
+
77
+ ## Software installation
78
+
79
+ [![PyPI](https://img.shields.io/pypi/v/donut-python?color=green&label=pip%20install%20donut-python)](https://pypi.org/project/donut-python)
80
+ [![Downloads](https://static.pepy.tech/personalized-badge/donut-python?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/donut-python)
81
+
82
+ ```bash
83
+ pip install donut-python
84
+ ```
85
+
86
+ or clone this repository and install the dependencies:
87
+ ```bash
88
+ git clone https://github.com/clovaai/donut.git
89
+ cd donut/
90
+ conda create -n donut_official python=3.7
91
+ conda activate donut_official
92
+ pip install .
93
+ ```
94
+
95
+ We tested [donut-python](https://pypi.org/project/donut-python/1.0.1) == 1.0.1 with:
96
+ - [torch](https://github.com/pytorch/pytorch) == 1.11.0+cu113
97
+ - [torchvision](https://github.com/pytorch/vision) == 0.12.0+cu113
98
+ - [pytorch-lightning](https://github.com/Lightning-AI/lightning) == 1.6.4
99
+ - [transformers](https://github.com/huggingface/transformers) == 4.11.3
100
+ - [timm](https://github.com/rwightman/pytorch-image-models) == 0.5.4
101
+
102
+ **Note**: Based on several reported issues, we have noticed increased difficulty in configuring the testing environment for `donut-python` due to recent updates in key dependency libraries. While we are actively working on a solution, we have updated the Google Colab demos (as of June 15, 2023) to ensure they work properly. For assistance, please refer to the following demo links: [CORD Colab Demo](https://colab.research.google.com/drive/1NMSqoIZ_l39wyRD7yVjw2FIuU2aglzJi?usp=sharing), [Train Ticket Colab Demo](https://colab.research.google.com/drive/1YJBjllahdqNktXaBlq5ugPh1BCm8OsxI?usp=sharing), [RVL-CDIP Colab Demo](https://colab.research.google.com/drive/1iWOZHvao1W5xva53upcri5V6oaWT-P0O?usp=sharing), [DocVQA Colab Demo](https://colab.research.google.com/drive/1oKieslZCulFiquequ62eMGc-ZWgay4X3?usp=sharing).
103
+
104
+ ## Getting Started
105
+
106
+ ### Data
107
+
108
+ This repository assumes the following dataset structure:
109
+ ```bash
110
+ > tree dataset_name
111
+ dataset_name
112
+ ├── test
113
+ │ ├── metadata.jsonl
114
+ │ ├── {image_path0}
115
+ │ ├── {image_path1}
116
+ │ .
117
+ │ .
118
+ ├── train
119
+ │ ├── metadata.jsonl
120
+ │ ├── {image_path0}
121
+ │ ├── {image_path1}
122
+ │ .
123
+ │ .
124
+ └── validation
125
+ ├── metadata.jsonl
126
+ ├── {image_path0}
127
+ ├── {image_path1}
128
+ .
129
+ .
130
+
131
+ > cat dataset_name/test/metadata.jsonl
132
+ {"file_name": {image_path0}, "ground_truth": "{\"gt_parse\": {ground_truth_parse}, ... {other_metadata_not_used} ... }"}
133
+ {"file_name": {image_path1}, "ground_truth": "{\"gt_parse\": {ground_truth_parse}, ... {other_metadata_not_used} ... }"}
134
+ .
135
+ .
136
+ ```
137
+
138
+ - The `metadata.jsonl` file is in [JSON Lines text format](https://jsonlines.org), i.e., `.jsonl`. Each line consists of
139
+ - `file_name` : relative path to the image file.
140
+ - `ground_truth` : string format (json dumped), the dictionary contains either `gt_parse` or `gt_parses`. Other fields (metadata) can be added to the dictionary but will not be used.
141
+ - `donut` interprets all tasks as a JSON prediction problem. As a result, all `donut` model training shares the same pipeline. For training and inference, the only thing to do is to prepare `gt_parse` or `gt_parses` for the task in the format described below (a short sketch of writing such a `metadata.jsonl` file follows this list).
142
+
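As a quick editorial illustration of the format above (not part of this commit; the image file names and `gt_parse` contents below are made up), a `metadata.jsonl` file could be written like this:

```python
# Illustrative sketch: write a metadata.jsonl in the layout described above.
# The directory, file names, and gt_parse fields are hypothetical placeholders.
import json
import os

samples = [
    ("receipt_000.jpg", {"menu": [{"nm": "ICE BLACKCOFFEE", "cnt": "2"}]}),
    ("receipt_001.jpg", {"menu": [{"nm": "Lemon Tea (L)", "cnt": "1"}]}),
]

os.makedirs("dataset_name/train", exist_ok=True)
with open("dataset_name/train/metadata.jsonl", "w", encoding="utf-8") as f:
    for file_name, gt_parse in samples:
        # "ground_truth" is itself a JSON-dumped string wrapping gt_parse,
        # matching the cat output shown above.
        record = {"file_name": file_name, "ground_truth": json.dumps({"gt_parse": gt_parse})}
        f.write(json.dumps(record) + "\n")
```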
143
+ #### For Document Classification
144
+ The `gt_parse` follows the format of `{"class" : {class_name}}`, for example, `{"class" : "scientific_report"}` or `{"class" : "presentation"}`.
145
+ - Google colab demo is available [here](https://colab.research.google.com/drive/1xUDmLqlthx8A8rWKLMSLThZ7oeRJkDuU?usp=sharing).
146
+ - Gradio web demo is available [here](https://huggingface.co/spaces/nielsr/donut-rvlcdip).
147
+
148
+ #### For Document Information Extraction
149
+ The `gt_parse` is a JSON object that contains full information of the document image, for example, the JSON object for a receipt may look like `{"menu" : [{"nm": "ICE BLACKCOFFEE", "cnt": "2", ...}, ...], ...}`.
150
+ - More examples are available at [CORD dataset](https://huggingface.co/datasets/naver-clova-ix/cord-v2).
151
+ - Google colab demo is available [here](https://colab.research.google.com/drive/1o07hty-3OQTvGnc_7lgQFLvvKQuLjqiw?usp=sharing).
152
+ - Gradio web demo is available [here](https://huggingface.co/spaces/naver-clova-ix/donut-base-finetuned-cord-v2).
153
+
154
+ #### For Document Visual Question Answering
155
+ The `gt_parses` follows the format of `[{"question" : {question_sentence}, "answer" : {answer_candidate_1}}, {"question" : {question_sentence}, "answer" : {answer_candidate_2}}, ...]`, for example, `[{"question" : "what is the model name?", "answer" : "donut"}, {"question" : "what is the model name?", "answer" : "document understanding transformer"}]`.
156
+ - DocVQA Task 1 has multiple answers, hence `gt_parses` should be a list of dictionaries, each containing a question-answer pair.
157
+ - Google colab demo is available [here](https://colab.research.google.com/drive/1Z4WG8Wunj3HE0CERjt608ALSgSzRC9ig?usp=sharing).
158
+ - Gradio web demo is available [here](https://huggingface.co/spaces/nielsr/donut-docvqa).
159
+
160
+ #### For (Pseudo) Text Reading Task
161
+ The `gt_parse` looks like `{"text_sequence" : "word1 word2 word3 ... "}`
162
+ - This task is also a pre-training task of the Donut model.
163
+ - You can use our **SynthDoG** 🐶 to generate synthetic images for the text reading task with proper `gt_parse`. See `./synthdog/README.md` for details.
164
+
165
+ ### Training
166
+
167
+ This is the configuration of Donut model training on [CORD](https://github.com/clovaai/cord) dataset used in our experiment.
168
+ We ran this with a single NVIDIA A100 GPU.
169
+
170
+ ```bash
171
+ python train.py --config config/train_cord.yaml \
172
+ --pretrained_model_name_or_path "naver-clova-ix/donut-base" \
173
+ --dataset_name_or_paths '["naver-clova-ix/cord-v2"]' \
174
+ --exp_version "test_experiment"
175
+ .
176
+ .
177
+ Prediction: <s_menu><s_nm>Lemon Tea (L)</s_nm><s_cnt>1</s_cnt><s_price>25.000</s_price></s_menu><s_total><s_total_price>25.000</s_total_price><s_cashprice>30.000</s_cashprice><s_changeprice>5.000</s_changeprice></s_total>
178
+ Answer: <s_menu><s_nm>Lemon Tea (L)</s_nm><s_cnt>1</s_cnt><s_price>25.000</s_price></s_menu><s_total><s_total_price>25.000</s_total_price><s_cashprice>30.000</s_cashprice><s_changeprice>5.000</s_changeprice></s_total>
179
+ Normed ED: 0.0
180
+ Prediction: <s_menu><s_nm>Hulk Topper Package</s_nm><s_cnt>1</s_cnt><s_price>100.000</s_price></s_menu><s_total><s_total_price>100.000</s_total_price><s_cashprice>100.000</s_cashprice><s_changeprice>0</s_changeprice></s_total>
181
+ Answer: <s_menu><s_nm>Hulk Topper Package</s_nm><s_cnt>1</s_cnt><s_price>100.000</s_price></s_menu><s_total><s_total_price>100.000</s_total_price><s_cashprice>100.000</s_cashprice><s_changeprice>0</s_changeprice></s_total>
182
+ Normed ED: 0.0
183
+ Prediction: <s_menu><s_nm>Giant Squid</s_nm><s_cnt>x 1</s_cnt><s_price>Rp. 39.000</s_price><s_sub><s_nm>C.Finishing - Cut</s_nm><s_price>Rp. 0</s_price><sep/><s_nm>B.Spicy Level - Extreme Hot Rp. 0</s_price></s_sub><sep/><s_nm>A.Flavour - Salt & Pepper</s_nm><s_price>Rp. 0</s_price></s_sub></s_menu><s_sub_total><s_subtotal_price>Rp. 39.000</s_subtotal_price></s_sub_total><s_total><s_total_price>Rp. 39.000</s_total_price><s_cashprice>Rp. 50.000</s_cashprice><s_changeprice>Rp. 11.000</s_changeprice></s_total>
184
+ Answer: <s_menu><s_nm>Giant Squid</s_nm><s_cnt>x1</s_cnt><s_price>Rp. 39.000</s_price><s_sub><s_nm>C.Finishing - Cut</s_nm><s_price>Rp. 0</s_price><sep/><s_nm>B.Spicy Level - Extreme Hot</s_nm><s_price>Rp. 0</s_price><sep/><s_nm>A.Flavour- Salt & Pepper</s_nm><s_price>Rp. 0</s_price></s_sub></s_menu><s_sub_total><s_subtotal_price>Rp. 39.000</s_subtotal_price></s_sub_total><s_total><s_total_price>Rp. 39.000</s_total_price><s_cashprice>Rp. 50.000</s_cashprice><s_changeprice>Rp. 11.000</s_changeprice></s_total>
185
+ Normed ED: 0.039603960396039604
186
+ Epoch 29: 100%|█████████████| 200/200 [01:49<00:00, 1.82it/s, loss=0.00327, exp_name=train_cord, exp_version=test_experiment]
187
+ ```
188
+
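Editorial note: the Prediction/Answer strings in the log above are the linearized form of `gt_parse`: each JSON key becomes an `<s_key>`/`</s_key>` token pair, and list items are joined with the `<sep/>` special token added in `donut/model.py`. The actual conversion used for training lives in `donut/util.py` (not shown in this 50-file view); the snippet below is only a rough sketch of the idea for a plain dict/list input:

```python
# Rough sketch of JSON-to-token-sequence linearization. This is an illustration,
# not the repository's implementation (see donut/util.py for the real one).
def json2tokens(obj) -> str:
    if isinstance(obj, dict):
        # every key becomes an opening/closing special-token pair
        return "".join(f"<s_{k}>{json2tokens(v)}</s_{k}>" for k, v in obj.items())
    if isinstance(obj, list):
        # list items are joined with <sep/> (see BARTDecoder.add_special_tokens)
        return "<sep/>".join(json2tokens(v) for v in obj)
    return str(obj)

gt_parse = {
    "menu": [{"nm": "Lemon Tea (L)", "cnt": "1", "price": "25.000"}],
    "total": {"total_price": "25.000"},
}
print(json2tokens(gt_parse))
# <s_menu><s_nm>Lemon Tea (L)</s_nm><s_cnt>1</s_cnt><s_price>25.000</s_price></s_menu><s_total><s_total_price>25.000</s_total_price></s_total>
```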
189
+ Some important arguments:
190
+
191
+ - `--config` : config file path for model training.
192
+ - `--pretrained_model_name_or_path` : string format, model name in Hugging Face modelhub or local path.
193
+ - `--dataset_name_or_paths` : string format (json dumped), list of dataset names in Hugging Face datasets or local paths.
194
+ - `--result_path` : file path to save model outputs/artifacts.
195
+ - `--exp_version` : used for experiment versioning. The output files are saved at `{result_path}/{exp_version}/*`
196
+
197
+ ### Test
198
+
199
+ With the trained model, test images and ground truth parses, you can get inference results and accuracy scores.
200
+
201
+ ```bash
202
+ python test.py --dataset_name_or_path naver-clova-ix/cord-v2 --pretrained_model_name_or_path ./result/train_cord/test_experiment --save_path ./result/output.json
203
+ 100%|█████████████| 100/100 [00:35<00:00, 2.80it/s]
204
+ Total number of samples: 100, Tree Edit Distance (TED) based accuracy score: 0.9129639764131697, F1 accuracy score: 0.8406020841373987
205
+ ```
206
+
207
+ Some important arguments:
208
+
209
+ - `--dataset_name_or_path` : string format, the target dataset name in Hugging Face datasets or local path.
210
+ - `--pretrained_model_name_or_path` : string format, the model name in Hugging Face modelhub or local path.
211
+ - `--save_path`: file path to save predictions and scores.
212
+
213
+ ## How to Cite
214
+ If you find this work useful to you, please cite:
215
+ ```bibtex
216
+ @inproceedings{kim2022donut,
217
+ title = {OCR-Free Document Understanding Transformer},
218
+ author = {Kim, Geewook and Hong, Teakgyu and Yim, Moonbin and Nam, JeongYeon and Park, Jinyoung and Yim, Jinyeong and Hwang, Wonseok and Yun, Sangdoo and Han, Dongyoon and Park, Seunghyun},
219
+ booktitle = {European Conference on Computer Vision (ECCV)},
220
+ year = {2022}
221
+ }
222
+ ```
223
+
224
+ ## License
225
+
226
+ ```
227
+ MIT license
228
+
229
+ Copyright (c) 2022-present NAVER Corp.
230
+
231
+ Permission is hereby granted, free of charge, to any person obtaining a copy
232
+ of this software and associated documentation files (the "Software"), to deal
233
+ in the Software without restriction, including without limitation the rights
234
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
235
+ copies of the Software, and to permit persons to whom the Software is
236
+ furnished to do so, subject to the following conditions:
237
+
238
+ The above copyright notice and this permission notice shall be included in
239
+ all copies or substantial portions of the Software.
240
 
241
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
242
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
243
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
244
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
245
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
246
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
247
+ THE SOFTWARE.
248
+ ```
app.py ADDED
@@ -0,0 +1,26 @@
1
+ import gradio as gr
2
+ import argparse
3
+ import torch
4
+ from PIL import Image
5
+ from donut import DonutModel
6
+ def demo_process(input_img):
7
+ global model, task_prompt, task_name
8
+ input_img = Image.fromarray(input_img)
9
+ output = model.inference(image=input_img, prompt=task_prompt)["predictions"][0]
10
+ return output
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument("--task", type=str, default="Booking")
13
+ parser.add_argument("--pretrained_path", type=str, default="result/train_booking/20241112_150925")
14
+ args, left_argv = parser.parse_known_args()
15
+ task_name = args.task
16
+ task_prompt = f"<s_{task_name}>"
17
+ model = DonutModel.from_pretrained("./result/train_booking/20241112_150925")
18
+ if torch.cuda.is_available():
19
+ model.half()
20
+ device = torch.device("cuda")
21
+ model.to(device)
22
+ else:
23
+ model.encoder.to(torch.bfloat16)
24
+ model.eval()
25
+ demo = gr.Interface(fn=demo_process,inputs="image",outputs="json", title=f"Donut 🍩 demonstration for `{task_name}` task",)
26
+ demo.launch(debug=True)
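Editorial note (not part of the commit): given the arguments defined above, the demo can be launched locally with `python app.py --task Booking --pretrained_path <checkpoint_dir>`, and the same prediction can be obtained without Gradio roughly as follows. The checkpoint directory and input image are placeholders.

```python
# Minimal sketch of programmatic inference, mirroring the calls made in app.py above.
# The checkpoint path and image file are placeholders, not files shipped in this repo.
import torch
from PIL import Image
from donut import DonutModel

model = DonutModel.from_pretrained("./result/train_booking/<run_id>")  # placeholder run id
if torch.cuda.is_available():
    model.half()
    model.to(torch.device("cuda"))
model.eval()

image = Image.open("booking_sample.jpg")  # placeholder input image
prediction = model.inference(image=image, prompt="<s_Booking>")["predictions"][0]
print(prediction)
```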
config/train_booking.yaml ADDED
@@ -0,0 +1,22 @@
1
+ resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from model hub or path)
4
+ dataset_name_or_paths: ["./dataset/Booking"] # loading datasets (from model hub or path)
5
+ sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [1]
8
+ input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
9
+ max_length: 768
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 400 # 800/2*10/10, 10%
15
+ num_training_samples_per_epoch: 800
16
+ max_epochs: 10
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 3
21
+ gradient_clip_val: 1.0
22
+ verbose: True
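Editorial note on the `warmup_steps` comment above: with `num_training_samples_per_epoch: 800` and `train_batch_sizes: [2]`, one epoch is 800 / 2 = 400 optimizer steps, so 10 epochs give 4,000 steps in total, and 10% of that is 400 warmup steps, which is what `800/2*10/10` encodes. The `train_cord.yaml` below uses the same rule (800 / 8 * 30 / 10 = 300).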
config/train_cord.yaml ADDED
@@ -0,0 +1,22 @@
1
+ resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from model hub or path)
4
+ dataset_name_or_paths: ["naver-clova-ix/cord-v2"] # loading datasets (from model hub or path)
5
+ sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
6
+ train_batch_sizes: [8]
7
+ val_batch_sizes: [1]
8
+ input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
9
+ max_length: 768
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 300 # 800/8*30/10, 10%
15
+ num_training_samples_per_epoch: 800
16
+ max_epochs: 30
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 3
21
+ gradient_clip_val: 1.0
22
+ verbose: True
config/train_docvqa.yaml ADDED
@@ -0,0 +1,23 @@
1
+ resume_from_checkpoint_path: null
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base"
4
+ dataset_name_or_paths: ["./dataset/docvqa"] # should be prepared from https://rrc.cvc.uab.es/?ch=17
5
+ sort_json_key: True
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [4]
8
+ input_size: [2560, 1920]
9
+ max_length: 128
10
+ align_long_axis: False
11
+ # num_nodes: 8 # memo: donut-base-finetuned-docvqa was trained with 8 nodes
12
+ num_nodes: 1
13
+ seed: 2022
14
+ lr: 3e-5
15
+ warmup_steps: 10000
16
+ num_training_samples_per_epoch: 39463
17
+ max_epochs: 300
18
+ max_steps: -1
19
+ num_workers: 8
20
+ val_check_interval: 1.0
21
+ check_val_every_n_epoch: 1
22
+ gradient_clip_val: 0.25
23
+ verbose: True
config/train_invoices.yaml ADDED
@@ -0,0 +1,22 @@
1
+ resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from model hub or path)
4
+ dataset_name_or_paths: ["naver-clova-ix/cord-v2"] # loading datasets (from model hub or path)
5
+ sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
6
+ train_batch_sizes: [8]
7
+ val_batch_sizes: [1]
8
+ input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
9
+ max_length: 768
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 300 # 800/8*30/10, 10%
15
+ num_training_samples_per_epoch: 800
16
+ max_epochs: 30
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 3
21
+ gradient_clip_val: 1.0
22
+ verbose: True
config/train_rvlcdip.yaml ADDED
@@ -0,0 +1,23 @@
1
+ resume_from_checkpoint_path: null
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base"
4
+ dataset_name_or_paths: ["./dataset/rvlcdip"] # should be prepared from https://www.cs.cmu.edu/~aharley/rvl-cdip/
5
+ sort_json_key: True
6
+ train_batch_sizes: [2]
7
+ val_batch_sizes: [4]
8
+ input_size: [2560, 1920]
9
+ max_length: 8
10
+ align_long_axis: False
11
+ # num_nodes: 8 # memo: donut-base-finetuned-rvlcdip was trained with 8 nodes
12
+ num_nodes: 1
13
+ seed: 2022
14
+ lr: 2e-5
15
+ warmup_steps: 10000
16
+ num_training_samples_per_epoch: 320000
17
+ max_epochs: 100
18
+ max_steps: -1
19
+ num_workers: 8
20
+ val_check_interval: 1.0
21
+ check_val_every_n_epoch: 1
22
+ gradient_clip_val: 1.0
23
+ verbose: True
config/train_zhtrainticket.yaml ADDED
@@ -0,0 +1,22 @@
1
+ resume_from_checkpoint_path: null
2
+ result_path: "./result"
3
+ pretrained_model_name_or_path: "naver-clova-ix/donut-base"
4
+ dataset_name_or_paths: ["./dataset/zhtrainticket"] # should be prepared from https://github.com/beacandler/EATEN
5
+ sort_json_key: True
6
+ train_batch_sizes: [8]
7
+ val_batch_sizes: [1]
8
+ input_size: [960, 1280]
9
+ max_length: 256
10
+ align_long_axis: False
11
+ num_nodes: 1
12
+ seed: 2022
13
+ lr: 3e-5
14
+ warmup_steps: 300
15
+ num_training_samples_per_epoch: 1368
16
+ max_epochs: 10
17
+ max_steps: -1
18
+ num_workers: 8
19
+ val_check_interval: 1.0
20
+ check_val_every_n_epoch: 1
21
+ gradient_clip_val: 1.0
22
+ verbose: True
dataset/.gitkeep ADDED
@@ -0,0 +1 @@
1
+
donut/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ from .model import DonutConfig, DonutModel
7
+ from .util import DonutDataset, JSONParseEvaluator, load_json, save_json
8
+
9
+ __all__ = [
10
+ "DonutConfig",
11
+ "DonutModel",
12
+ "DonutDataset",
13
+ "JSONParseEvaluator",
14
+ "load_json",
15
+ "save_json",
16
+ ]
donut/_version.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ __version__ = "1.0.9"
donut/model.py ADDED
@@ -0,0 +1,613 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import math
7
+ import os
8
+ import re
9
+ from typing import Any, List, Optional, Union
10
+
11
+ import numpy as np
12
+ import PIL
13
+ import timm
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from PIL import ImageOps
18
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
19
+ from timm.models.swin_transformer import SwinTransformer
20
+ from torchvision import transforms
21
+ from torchvision.transforms.functional import resize, rotate
22
+ from transformers import MBartConfig, MBartForCausalLM, XLMRobertaTokenizer
23
+ from transformers.file_utils import ModelOutput
24
+ from transformers.modeling_utils import PretrainedConfig, PreTrainedModel
25
+
26
+
27
+ class SwinEncoder(nn.Module):
28
+ r"""
29
+ Donut encoder based on SwinTransformer
30
+ Set the initial weights and configuration with a pretrained SwinTransformer and then
31
+ modify the detailed configurations as a Donut Encoder
32
+
33
+ Args:
34
+ input_size: Input image size (width, height)
35
+ align_long_axis: Whether to rotate image if height is greater than width
36
+ window_size: Window size(=patch size) of SwinTransformer
37
+ encoder_layer: Number of layers of SwinTransformer encoder
38
+ name_or_path: Name of a pretrained model either registered on huggingface.co or saved locally;
39
+ otherwise, `swin_base_patch4_window12_384` will be set (using `timm`).
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ input_size: List[int],
45
+ align_long_axis: bool,
46
+ window_size: int,
47
+ encoder_layer: List[int],
48
+ name_or_path: Union[str, bytes, os.PathLike] = None,
49
+ ):
50
+ super().__init__()
51
+ self.input_size = input_size
52
+ self.align_long_axis = align_long_axis
53
+ self.window_size = window_size
54
+ self.encoder_layer = encoder_layer
55
+
56
+ self.to_tensor = transforms.Compose(
57
+ [
58
+ transforms.ToTensor(),
59
+ transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
60
+ ]
61
+ )
62
+
63
+ self.model = SwinTransformer(
64
+ img_size=self.input_size,
65
+ depths=self.encoder_layer,
66
+ window_size=self.window_size,
67
+ patch_size=4,
68
+ embed_dim=128,
69
+ num_heads=[4, 8, 16, 32],
70
+ num_classes=0,
71
+ )
72
+ self.model.norm = None
73
+
74
+ # weight init with swin
75
+ if not name_or_path:
76
+ swin_state_dict = timm.create_model("swin_base_patch4_window12_384", pretrained=True).state_dict()
77
+ new_swin_state_dict = self.model.state_dict()
78
+ for x in new_swin_state_dict:
79
+ if x.endswith("relative_position_index") or x.endswith("attn_mask"):
80
+ pass
81
+ elif (
82
+ x.endswith("relative_position_bias_table")
83
+ and self.model.layers[0].blocks[0].attn.window_size[0] != 12
84
+ ):
85
+ pos_bias = swin_state_dict[x].unsqueeze(0)[0]
86
+ old_len = int(math.sqrt(len(pos_bias)))
87
+ new_len = int(2 * window_size - 1)
88
+ pos_bias = pos_bias.reshape(1, old_len, old_len, -1).permute(0, 3, 1, 2)
89
+ pos_bias = F.interpolate(pos_bias, size=(new_len, new_len), mode="bicubic", align_corners=False)
90
+ new_swin_state_dict[x] = pos_bias.permute(0, 2, 3, 1).reshape(1, new_len ** 2, -1).squeeze(0)
91
+ else:
92
+ new_swin_state_dict[x] = swin_state_dict[x]
93
+ self.model.load_state_dict(new_swin_state_dict)
94
+
95
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
96
+ """
97
+ Args:
98
+ x: (batch_size, num_channels, height, width)
99
+ """
100
+ x = self.model.patch_embed(x)
101
+ x = self.model.pos_drop(x)
102
+ x = self.model.layers(x)
103
+ return x
104
+
105
+ def prepare_input(self, img: PIL.Image.Image, random_padding: bool = False) -> torch.Tensor:
106
+ """
107
+ Convert a PIL Image to a tensor according to the specified input_size by applying the steps below:
108
+ - resize
109
+ - rotate (if align_long_axis is True and image is not aligned longer axis with canvas)
110
+ - pad
111
+ """
112
+ img = img.convert("RGB")
113
+ if self.align_long_axis and (
114
+ (self.input_size[0] > self.input_size[1] and img.width > img.height)
115
+ or (self.input_size[0] < self.input_size[1] and img.width < img.height)
116
+ ):
117
+ img = rotate(img, angle=-90, expand=True)
118
+ img = resize(img, min(self.input_size))
119
+ img.thumbnail((self.input_size[1], self.input_size[0]))
120
+ delta_width = self.input_size[1] - img.width
121
+ delta_height = self.input_size[0] - img.height
122
+ if random_padding:
123
+ pad_width = np.random.randint(low=0, high=delta_width + 1)
124
+ pad_height = np.random.randint(low=0, high=delta_height + 1)
125
+ else:
126
+ pad_width = delta_width // 2
127
+ pad_height = delta_height // 2
128
+ padding = (
129
+ pad_width,
130
+ pad_height,
131
+ delta_width - pad_width,
132
+ delta_height - pad_height,
133
+ )
134
+ return self.to_tensor(ImageOps.expand(img, padding))
135
+
136
+
137
+ class BARTDecoder(nn.Module):
138
+ """
139
+ Donut Decoder based on Multilingual BART
140
+ Set the initial weights and configuration with a pretrained multilingual BART model,
141
+ and modify the detailed configurations as a Donut decoder
142
+
143
+ Args:
144
+ decoder_layer:
145
+ Number of layers of BARTDecoder
146
+ max_position_embeddings:
147
+ The maximum sequence length to be trained
148
+ name_or_path:
149
+ Name of a pretrained model either registered on huggingface.co or saved locally;
150
+ otherwise, `hyunwoongko/asian-bart-ecjk` will be set (using `transformers`)
151
+ """
152
+
153
+ def __init__(
154
+ self, decoder_layer: int, max_position_embeddings: int, name_or_path: Union[str, bytes, os.PathLike] = None
155
+ ):
156
+ super().__init__()
157
+ self.decoder_layer = decoder_layer
158
+ self.max_position_embeddings = max_position_embeddings
159
+
160
+ self.tokenizer = XLMRobertaTokenizer.from_pretrained(
161
+ "hyunwoongko/asian-bart-ecjk" if not name_or_path else name_or_path
162
+ )
163
+
164
+ self.model = MBartForCausalLM(
165
+ config=MBartConfig(
166
+ is_decoder=True,
167
+ is_encoder_decoder=False,
168
+ add_cross_attention=True,
169
+ decoder_layers=self.decoder_layer,
170
+ max_position_embeddings=self.max_position_embeddings,
171
+ vocab_size=len(self.tokenizer),
172
+ scale_embedding=True,
173
+ add_final_layer_norm=True,
174
+ )
175
+ )
176
+ self.model.forward = self.forward # to get cross attentions and utilize `generate` function
177
+
178
+ self.model.config.is_encoder_decoder = True # to get cross-attention
179
+ self.add_special_tokens(["<sep/>"]) # <sep/> is used for representing a list in a JSON
180
+ self.model.model.decoder.embed_tokens.padding_idx = self.tokenizer.pad_token_id
181
+ self.model.prepare_inputs_for_generation = self.prepare_inputs_for_inference
182
+
183
+ # weight init with asian-bart
184
+ if not name_or_path:
185
+ bart_state_dict = MBartForCausalLM.from_pretrained("hyunwoongko/asian-bart-ecjk").state_dict()
186
+ new_bart_state_dict = self.model.state_dict()
187
+ for x in new_bart_state_dict:
188
+ if x.endswith("embed_positions.weight") and self.max_position_embeddings != 1024:
189
+ new_bart_state_dict[x] = torch.nn.Parameter(
190
+ self.resize_bart_abs_pos_emb(
191
+ bart_state_dict[x],
192
+ self.max_position_embeddings
193
+ + 2, # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
194
+ )
195
+ )
196
+ elif x.endswith("embed_tokens.weight") or x.endswith("lm_head.weight"):
197
+ new_bart_state_dict[x] = bart_state_dict[x][: len(self.tokenizer), :]
198
+ else:
199
+ new_bart_state_dict[x] = bart_state_dict[x]
200
+ self.model.load_state_dict(new_bart_state_dict)
201
+
202
+ def add_special_tokens(self, list_of_tokens: List[str]):
203
+ """
204
+ Add special tokens to tokenizer and resize the token embeddings
205
+ """
206
+ newly_added_num = self.tokenizer.add_special_tokens({"additional_special_tokens": sorted(set(list_of_tokens))})
207
+ if newly_added_num > 0:
208
+ self.model.resize_token_embeddings(len(self.tokenizer))
209
+
210
+ def prepare_inputs_for_inference(self, input_ids: torch.Tensor, encoder_outputs: torch.Tensor, past_key_values=None, past=None, use_cache: bool = None, attention_mask: torch.Tensor = None):
211
+ """
212
+ Args:
213
+ input_ids: (batch_size, sequence_length)
214
+ Returns:
215
+ input_ids: (batch_size, sequence_length)
216
+ attention_mask: (batch_size, sequence_length)
217
+ encoder_hidden_states: (batch_size, sequence_length, embedding_dim)
218
+ """
219
+ # for compatibility with transformers==4.11.x
220
+ if past is not None:
221
+ past_key_values = past
222
+ attention_mask = input_ids.ne(self.tokenizer.pad_token_id).long()
223
+ if past_key_values is not None:
224
+ input_ids = input_ids[:, -1:]
225
+ output = {
226
+ "input_ids": input_ids,
227
+ "attention_mask": attention_mask,
228
+ "past_key_values": past_key_values,
229
+ "use_cache": use_cache,
230
+ "encoder_hidden_states": encoder_outputs.last_hidden_state,
231
+ }
232
+ return output
233
+
234
+ def forward(
235
+ self,
236
+ input_ids,
237
+ attention_mask: Optional[torch.Tensor] = None,
238
+ encoder_hidden_states: Optional[torch.Tensor] = None,
239
+ past_key_values: Optional[torch.Tensor] = None,
240
+ labels: Optional[torch.Tensor] = None,
241
+ use_cache: bool = None,
242
+ output_attentions: Optional[torch.Tensor] = None,
243
+ output_hidden_states: Optional[torch.Tensor] = None,
244
+ return_dict: bool = None,
245
+ ):
246
+ """
247
+ A forward function to get cross attentions and utilize the `generate` function
248
+
249
+ Source:
250
+ https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L1669-L1810
251
+
252
+ Args:
253
+ input_ids: (batch_size, sequence_length)
254
+ attention_mask: (batch_size, sequence_length)
255
+ encoder_hidden_states: (batch_size, sequence_length, hidden_size)
256
+
257
+ Returns:
258
+ loss: (1, )
259
+ logits: (batch_size, sequence_length, vocab_size)
260
+ hidden_states: (batch_size, sequence_length, hidden_size)
261
+ decoder_attentions: (batch_size, num_heads, sequence_length, sequence_length)
262
+ cross_attentions: (batch_size, num_heads, sequence_length, sequence_length)
263
+ """
264
+ output_attentions = output_attentions if output_attentions is not None else self.model.config.output_attentions
265
+ output_hidden_states = (
266
+ output_hidden_states if output_hidden_states is not None else self.model.config.output_hidden_states
267
+ )
268
+ return_dict = return_dict if return_dict is not None else self.model.config.use_return_dict
269
+ outputs = self.model.model.decoder(
270
+ input_ids=input_ids,
271
+ attention_mask=attention_mask,
272
+ encoder_hidden_states=encoder_hidden_states,
273
+ past_key_values=past_key_values,
274
+ use_cache=use_cache,
275
+ output_attentions=output_attentions,
276
+ output_hidden_states=output_hidden_states,
277
+ return_dict=return_dict,
278
+ )
279
+
280
+ logits = self.model.lm_head(outputs[0])
281
+
282
+ loss = None
283
+ if labels is not None:
284
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
285
+ loss = loss_fct(logits.view(-1, self.model.config.vocab_size), labels.view(-1))
286
+
287
+ if not return_dict:
288
+ output = (logits,) + outputs[1:]
289
+ return (loss,) + output if loss is not None else output
290
+
291
+ return ModelOutput(
292
+ loss=loss,
293
+ logits=logits,
294
+ past_key_values=outputs.past_key_values,
295
+ hidden_states=outputs.hidden_states,
296
+ decoder_attentions=outputs.attentions,
297
+ cross_attentions=outputs.cross_attentions,
298
+ )
299
+
300
+ @staticmethod
301
+ def resize_bart_abs_pos_emb(weight: torch.Tensor, max_length: int) -> torch.Tensor:
302
+ """
303
+ Resize position embeddings
304
+ Truncate if sequence length of Bart backbone is greater than given max_length,
305
+ else interpolate to max_length
306
+ """
307
+ if weight.shape[0] > max_length:
308
+ weight = weight[:max_length, ...]
309
+ else:
310
+ weight = (
311
+ F.interpolate(
312
+ weight.permute(1, 0).unsqueeze(0),
313
+ size=max_length,
314
+ mode="linear",
315
+ align_corners=False,
316
+ )
317
+ .squeeze(0)
318
+ .permute(1, 0)
319
+ )
320
+ return weight
321
+
322
+
323
+ class DonutConfig(PretrainedConfig):
324
+ r"""
325
+ This is the configuration class to store the configuration of a [`DonutModel`]. It is used to
326
+ instantiate a Donut model according to the specified arguments, defining the model architecture
327
+
328
+ Args:
329
+ input_size:
330
+ Input image size (canvas size) of Donut.encoder, SwinTransformer in this codebase
331
+ align_long_axis:
332
+ Whether to rotate image if height is greater than width
333
+ window_size:
334
+ Window size of Donut.encoder, SwinTransformer in this codebase
335
+ encoder_layer:
336
+ Depth of each Donut.encoder Encoder layer, SwinTransformer in this codebase
337
+ decoder_layer:
338
+ Number of hidden layers in the Donut.decoder, such as BART
339
+ max_position_embeddings:
340
+ Trained max position embeddings in the Donut decoder,
341
+ if not specified, it will have the same value as max_length
342
+ max_length:
343
+ Max position embeddings (i.e., maximum sequence length) you want to train
344
+ name_or_path:
345
+ Name of a pretrained model registered on huggingface.co or saved locally
346
+ """
347
+
348
+ model_type = "donut"
349
+
350
+ def __init__(
351
+ self,
352
+ input_size: List[int] = [2560, 1920],
353
+ align_long_axis: bool = False,
354
+ window_size: int = 10,
355
+ encoder_layer: List[int] = [2, 2, 14, 2],
356
+ decoder_layer: int = 4,
357
+ max_position_embeddings: int = None,
358
+ max_length: int = 1536,
359
+ name_or_path: Union[str, bytes, os.PathLike] = "",
360
+ **kwargs,
361
+ ):
362
+ super().__init__()
363
+ self.input_size = input_size
364
+ self.align_long_axis = align_long_axis
365
+ self.window_size = window_size
366
+ self.encoder_layer = encoder_layer
367
+ self.decoder_layer = decoder_layer
368
+ self.max_position_embeddings = max_length if max_position_embeddings is None else max_position_embeddings
369
+ self.max_length = max_length
370
+ self.name_or_path = name_or_path
371
+
372
+
373
+ class DonutModel(PreTrainedModel):
374
+ r"""
375
+ Donut: an E2E OCR-free Document Understanding Transformer.
376
+ The encoder maps an input document image into a set of embeddings,
377
+ and the decoder predicts a desired token sequence that can be converted to a structured format,
378
+ given a prompt and the encoder output embeddings
379
+ """
380
+ config_class = DonutConfig
381
+ base_model_prefix = "donut"
382
+
383
+ def __init__(self, config: DonutConfig):
384
+ super().__init__(config)
385
+ self.config = config
386
+ self.encoder = SwinEncoder(
387
+ input_size=self.config.input_size,
388
+ align_long_axis=self.config.align_long_axis,
389
+ window_size=self.config.window_size,
390
+ encoder_layer=self.config.encoder_layer,
391
+ name_or_path=self.config.name_or_path,
392
+ )
393
+ self.decoder = BARTDecoder(
394
+ max_position_embeddings=self.config.max_position_embeddings,
395
+ decoder_layer=self.config.decoder_layer,
396
+ name_or_path=self.config.name_or_path,
397
+ )
398
+
399
+ def forward(self, image_tensors: torch.Tensor, decoder_input_ids: torch.Tensor, decoder_labels: torch.Tensor):
400
+ """
401
+ Calculate a loss given an input image and a desired token sequence;
402
+ the model will be trained in a teacher-forcing manner
403
+
404
+ Args:
405
+ image_tensors: (batch_size, num_channels, height, width)
406
+ decoder_input_ids: (batch_size, sequence_length)
407
+ decoder_labels: (batch_size, sequence_length)
408
+ """
409
+ encoder_outputs = self.encoder(image_tensors)
410
+ decoder_outputs = self.decoder(
411
+ input_ids=decoder_input_ids,
412
+ encoder_hidden_states=encoder_outputs,
413
+ labels=decoder_labels,
414
+ )
415
+ return decoder_outputs
416
+
417
+ def inference(
418
+ self,
419
+ image: PIL.Image = None,
420
+ prompt: str = None,
421
+ image_tensors: Optional[torch.Tensor] = None,
422
+ prompt_tensors: Optional[torch.Tensor] = None,
423
+ return_json: bool = True,
424
+ return_attentions: bool = False,
425
+ ):
426
+ """
427
+ Generate a token sequence in an auto-regressive manner,
428
+ and convert the generated token sequence into an ordered JSON format
429
+
430
+ Args:
431
+ image: input document image (PIL.Image)
432
+ prompt: task prompt (string) to guide Donut Decoder generation
433
+ image_tensors: (1, num_channels, height, width)
434
+ computed from `image` if not provided
435
+ prompt_tensors: (1, sequence_length)
436
+ computed from `prompt` if not provided
437
+ """
438
+ # prepare backbone inputs (image and prompt)
439
+ if image is None and image_tensors is None:
440
+ raise ValueError("Expected either image or image_tensors")
441
+ if all(v is None for v in {prompt, prompt_tensors}):
442
+ raise ValueError("Expected either prompt or prompt_tensors")
443
+
444
+ if image_tensors is None:
445
+ image_tensors = self.encoder.prepare_input(image).unsqueeze(0)
446
+
447
+ if self.device.type == "cuda":  # half precision is not supported on CPU
448
+ image_tensors = image_tensors.half()
449
+ image_tensors = image_tensors.to(self.device)
450
+
451
+ if prompt_tensors is None:
452
+ prompt_tensors = self.decoder.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
453
+
454
+ prompt_tensors = prompt_tensors.to(self.device)
455
+
456
+ last_hidden_state = self.encoder(image_tensors)
457
+ if self.device.type != "cuda":
458
+ last_hidden_state = last_hidden_state.to(torch.float32)
459
+
460
+ encoder_outputs = ModelOutput(last_hidden_state=last_hidden_state, attentions=None)
461
+
462
+ if len(encoder_outputs.last_hidden_state.size()) == 1:
463
+ encoder_outputs.last_hidden_state = encoder_outputs.last_hidden_state.unsqueeze(0)
464
+ if len(prompt_tensors.size()) == 1:
465
+ prompt_tensors = prompt_tensors.unsqueeze(0)
466
+
467
+ # get decoder output
468
+ decoder_output = self.decoder.model.generate(
469
+ decoder_input_ids=prompt_tensors,
470
+ encoder_outputs=encoder_outputs,
471
+ max_length=self.config.max_length,
472
+ early_stopping=True,
473
+ pad_token_id=self.decoder.tokenizer.pad_token_id,
474
+ eos_token_id=self.decoder.tokenizer.eos_token_id,
475
+ use_cache=True,
476
+ num_beams=1,
477
+ bad_words_ids=[[self.decoder.tokenizer.unk_token_id]],
478
+ return_dict_in_generate=True,
479
+ output_attentions=return_attentions,
480
+ )
481
+
482
+ output = {"predictions": list()}
483
+ for seq in self.decoder.tokenizer.batch_decode(decoder_output.sequences):
484
+ seq = seq.replace(self.decoder.tokenizer.eos_token, "").replace(self.decoder.tokenizer.pad_token, "")
485
+ seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
486
+ if return_json:
487
+ output["predictions"].append(self.token2json(seq))
488
+ else:
489
+ output["predictions"].append(seq)
490
+
491
+ if return_attentions:
492
+ output["attentions"] = {
493
+ "self_attentions": decoder_output.decoder_attentions,
494
+ "cross_attentions": decoder_output.cross_attentions,
495
+ }
496
+
497
+ return output
498
+
499
+ def json2token(self, obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
500
+ """
501
+ Convert an ordered JSON object into a token sequence
502
+ """
503
+ if type(obj) == dict:
504
+ if len(obj) == 1 and "text_sequence" in obj:
505
+ return obj["text_sequence"]
506
+ else:
507
+ output = ""
508
+ if sort_json_key:
509
+ keys = sorted(obj.keys(), reverse=True)
510
+ else:
511
+ keys = obj.keys()
512
+ for k in keys:
513
+ if update_special_tokens_for_json_key:
514
+ self.decoder.add_special_tokens([fr"<s_{k}>", fr"</s_{k}>"])
515
+ output += (
516
+ fr"<s_{k}>"
517
+ + self.json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
518
+ + fr"</s_{k}>"
519
+ )
520
+ return output
521
+ elif type(obj) == list:
522
+ return r"<sep/>".join(
523
+ [self.json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
524
+ )
525
+ else:
526
+ obj = str(obj)
527
+ if f"<{obj}/>" in self.decoder.tokenizer.all_special_tokens:
528
+ obj = f"<{obj}/>" # for categorical special tokens
529
+ return obj
530
+
531
+ def token2json(self, tokens, is_inner_value=False):
532
+ """
533
+ Convert a (generated) token sequence into an ordered JSON format
534
+ """
535
+ output = dict()
536
+
537
+ while tokens:
538
+ start_token = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
539
+ if start_token is None:
540
+ break
541
+ key = start_token.group(1)
542
+ end_token = re.search(fr"</s_{key}>", tokens, re.IGNORECASE)
543
+ start_token = start_token.group()
544
+ if end_token is None:
545
+ tokens = tokens.replace(start_token, "")
546
+ else:
547
+ end_token = end_token.group()
548
+ start_token_escaped = re.escape(start_token)
549
+ end_token_escaped = re.escape(end_token)
550
+ content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
551
+ if content is not None:
552
+ content = content.group(1).strip()
553
+ if r"<s_" in content and r"</s_" in content: # non-leaf node
554
+ value = self.token2json(content, is_inner_value=True)
555
+ if value:
556
+ if len(value) == 1:
557
+ value = value[0]
558
+ output[key] = value
559
+ else: # leaf nodes
560
+ output[key] = []
561
+ for leaf in content.split(r"<sep/>"):
562
+ leaf = leaf.strip()
563
+ if (
564
+ leaf in self.decoder.tokenizer.get_added_vocab()
565
+ and leaf[0] == "<"
566
+ and leaf[-2:] == "/>"
567
+ ):
568
+ leaf = leaf[1:-2] # for categorical special tokens
569
+ output[key].append(leaf)
570
+ if len(output[key]) == 1:
571
+ output[key] = output[key][0]
572
+
573
+ tokens = tokens[tokens.find(end_token) + len(end_token) :].strip()
574
+ if tokens[:6] == r"<sep/>": # non-leaf nodes
575
+ return [output] + self.token2json(tokens[6:], is_inner_value=True)
576
+
577
+ if len(output):
578
+ return [output] if is_inner_value else output
579
+ else:
580
+ return [] if is_inner_value else {"text_sequence": tokens}
581
+
582
+ @classmethod
583
+ def from_pretrained(
584
+ cls,
585
+ pretrained_model_name_or_path: Union[str, bytes, os.PathLike],
586
+ *model_args,
587
+ **kwargs,
588
+ ):
589
+ r"""
590
+ Instantiate a pretrained donut model from a pre-trained model configuration
591
+
592
+ Args:
593
+ pretrained_model_name_or_path:
594
+ Name of a pretrained model registered on huggingface.co or saved locally,
595
+ e.g., `naver-clova-ix/donut-base`, or `naver-clova-ix/donut-base-finetuned-rvlcdip`
596
+ """
597
+ model = super(DonutModel, cls).from_pretrained(pretrained_model_name_or_path, revision="official", *model_args, **kwargs)
598
+
599
+ # truncate or interpolate position embeddings of the donut decoder
600
+ max_length = kwargs.get("max_length", model.config.max_position_embeddings)
601
+ if (
602
+ max_length != model.config.max_position_embeddings
603
+ ): # if max_length of the trained model differs from the max_length you want to train
604
+ model.decoder.model.model.decoder.embed_positions.weight = torch.nn.Parameter(
605
+ model.decoder.resize_bart_abs_pos_emb(
606
+ model.decoder.model.model.decoder.embed_positions.weight,
607
+ max_length
608
+ + 2, # https://github.com/huggingface/transformers/blob/v4.11.3/src/transformers/models/mbart/modeling_mbart.py#L118-L119
609
+ )
610
+ )
611
+ model.config.max_position_embeddings = max_length
612
+
613
+ return model
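A minimal usage sketch of the model defined above (illustrative, not part of the diff): it loads one of the fine-tuned checkpoints named in the `from_pretrained` docstring and runs `inference` on a sample image from `misc/`. The task prompt `<s_rvlcdip>` is an assumption and depends on how the checkpoint was trained.

```python
# Minimal inference sketch for DonutModel (assumptions: the checkpoint name
# exists on huggingface.co and "<s_rvlcdip>" matches its task start token).
from PIL import Image
import torch
from donut import DonutModel

model = DonutModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
if torch.cuda.is_available():
    model.half().to("cuda")  # inference() casts image tensors to half on CUDA
model.eval()

image = Image.open("misc/sample_image_donut_document.png")
output = model.inference(image=image, prompt="<s_rvlcdip>")
print(output["predictions"][0])  # structured output produced by token2json
```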
donut/util.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import json
7
+ import os
8
+ import random
9
+ from collections import defaultdict
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+ import torch
13
+ import zss
14
+ from datasets import load_dataset
15
+ from nltk import edit_distance
16
+ from torch.utils.data import Dataset
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from zss import Node
19
+
20
+
21
+ def save_json(write_path: Union[str, bytes, os.PathLike], save_obj: Any):
22
+ with open(write_path, "w") as f:
23
+ json.dump(save_obj, f)
24
+
25
+
26
+ def load_json(json_path: Union[str, bytes, os.PathLike]):
27
+ with open(json_path, "r") as f:
28
+ return json.load(f)
29
+
30
+
31
+ class DonutDataset(Dataset):
32
+ """
33
+ DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
34
+ Each row, consists of image path(png/jpg/jpeg) and gt data (json/jsonl/txt),
35
+ and it will be converted into input_tensor(vectorized image) and input_ids(tokenized string)
36
+
37
+ Args:
38
+ dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
39
+ ignore_id: ignore_index for torch.nn.CrossEntropyLoss
40
+ task_start_token: the special token to be fed to the decoder to conduct the target task
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ dataset_name_or_path: str,
46
+ donut_model: PreTrainedModel,
47
+ max_length: int,
48
+ split: str = "train",
49
+ ignore_id: int = -100,
50
+ task_start_token: str = "<s>",
51
+ prompt_end_token: str = None,
52
+ sort_json_key: bool = True,
53
+ ):
54
+ super().__init__()
55
+
56
+ self.donut_model = donut_model
57
+ self.max_length = max_length
58
+ self.split = split
59
+ self.ignore_id = ignore_id
60
+ self.task_start_token = task_start_token
61
+ self.prompt_end_token = prompt_end_token if prompt_end_token else task_start_token
62
+ self.sort_json_key = sort_json_key
63
+
64
+ self.dataset = load_dataset(dataset_name_or_path, split=self.split)
65
+ self.dataset_length = len(self.dataset)
66
+
67
+ self.gt_token_sequences = []
68
+ for sample in self.dataset:
69
+ ground_truth = json.loads(sample["ground_truth"])
70
+ if "gt_parses" in ground_truth: # when multiple ground truths are available, e.g., docvqa
71
+ assert isinstance(ground_truth["gt_parses"], list)
72
+ gt_jsons = ground_truth["gt_parses"]
73
+ else:
74
+ assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict)
75
+ gt_jsons = [ground_truth["gt_parse"]]
76
+
77
+ self.gt_token_sequences.append(
78
+ [
79
+ task_start_token
80
+ + self.donut_model.json2token(
81
+ gt_json,
82
+ update_special_tokens_for_json_key=self.split == "train",
83
+ sort_json_key=self.sort_json_key,
84
+ )
85
+ + self.donut_model.decoder.tokenizer.eos_token
86
+ for gt_json in gt_jsons # load json from list of json
87
+ ]
88
+ )
89
+
90
+ self.donut_model.decoder.add_special_tokens([self.task_start_token, self.prompt_end_token])
91
+ self.prompt_end_token_id = self.donut_model.decoder.tokenizer.convert_tokens_to_ids(self.prompt_end_token)
92
+
93
+ def __len__(self) -> int:
94
+ return self.dataset_length
95
+
96
+ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
97
+ """
98
+ Load image from image_path of given dataset_path and convert into input_tensor and labels.
99
+ Convert gt data into input_ids (tokenized string)
100
+
101
+ Returns:
102
+ input_tensor : preprocessed image
103
+ input_ids : tokenized gt_data
104
+ labels : masked labels (model doesn't need to predict prompt and pad token)
105
+ """
106
+ sample = self.dataset[idx]
107
+
108
+ # input_tensor
109
+ input_tensor = self.donut_model.encoder.prepare_input(sample["image"], random_padding=self.split == "train")
110
+
111
+ # input_ids
112
+ processed_parse = random.choice(self.gt_token_sequences[idx]) # can be more than one, e.g., DocVQA Task 1
113
+ input_ids = self.donut_model.decoder.tokenizer(
114
+ processed_parse,
115
+ add_special_tokens=False,
116
+ max_length=self.max_length,
117
+ padding="max_length",
118
+ truncation=True,
119
+ return_tensors="pt",
120
+ )["input_ids"].squeeze(0)
121
+
122
+ if self.split == "train":
123
+ labels = input_ids.clone()
124
+ labels[
125
+ labels == self.donut_model.decoder.tokenizer.pad_token_id
126
+ ] = self.ignore_id # model doesn't need to predict pad token
127
+ labels[
128
+ : torch.nonzero(labels == self.prompt_end_token_id).sum() + 1
129
+ ] = self.ignore_id # model doesn't need to predict prompt (for VQA)
130
+ return input_tensor, input_ids, labels
131
+ else:
132
+ prompt_end_index = torch.nonzero(
133
+ input_ids == self.prompt_end_token_id
134
+ ).sum() # return prompt end index instead of target output labels
135
+ return input_tensor, input_ids, prompt_end_index, processed_parse
136
+
137
+
138
+ class JSONParseEvaluator:
139
+ """
140
+ Calculate n-TED (Normalized Tree Edit Distance) based accuracy and F1 accuracy score
141
+ """
142
+
143
+ @staticmethod
144
+ def flatten(data: dict):
145
+ """
146
+ Convert Dictionary into Non-nested Dictionary
147
+ Example:
148
+ input(dict)
149
+ {
150
+ "menu": [
151
+ {"name" : ["cake"], "count" : ["2"]},
152
+ {"name" : ["juice"], "count" : ["1"]},
153
+ ]
154
+ }
155
+ output(list)
156
+ [
157
+ ("menu.name", "cake"),
158
+ ("menu.count", "2"),
159
+ ("menu.name", "juice"),
160
+ ("menu.count", "1"),
161
+ ]
162
+ """
163
+ flatten_data = list()
164
+
165
+ def _flatten(value, key=""):
166
+ if type(value) is dict:
167
+ for child_key, child_value in value.items():
168
+ _flatten(child_value, f"{key}.{child_key}" if key else child_key)
169
+ elif type(value) is list:
170
+ for value_item in value:
171
+ _flatten(value_item, key)
172
+ else:
173
+ flatten_data.append((key, value))
174
+
175
+ _flatten(data)
176
+ return flatten_data
177
+
178
+ @staticmethod
179
+ def update_cost(node1: Node, node2: Node):
180
+ """
181
+ Update cost for tree edit distance.
182
+ If both are leaf nodes, calculate the string edit distance between the two labels (the special token '<leaf>' is ignored).
183
+ If only one of them is a leaf node, the cost is the length of the string in the leaf node + 1.
184
+ If neither is a leaf node, the cost is 0 if label1 equals label2, otherwise 1
185
+ """
186
+ label1 = node1.label
187
+ label2 = node2.label
188
+ label1_leaf = "<leaf>" in label1
189
+ label2_leaf = "<leaf>" in label2
190
+ if label1_leaf and label2_leaf:
191
+ return edit_distance(label1.replace("<leaf>", ""), label2.replace("<leaf>", ""))
192
+ elif not label1_leaf and label2_leaf:
193
+ return 1 + len(label2.replace("<leaf>", ""))
194
+ elif label1_leaf and not label2_leaf:
195
+ return 1 + len(label1.replace("<leaf>", ""))
196
+ else:
197
+ return int(label1 != label2)
198
+
199
+ @staticmethod
200
+ def insert_and_remove_cost(node: Node):
201
+ """
202
+ Insert and remove cost for tree edit distance.
203
+ If it is a leaf node, the cost is the length of the label name.
204
+ Otherwise, 1
205
+ """
206
+ label = node.label
207
+ if "<leaf>" in label:
208
+ return len(label.replace("<leaf>", ""))
209
+ else:
210
+ return 1
211
+
212
+ def normalize_dict(self, data: Union[Dict, List, Any]):
213
+ """
214
+ Sort by value, while iterate over element if data is list
215
+ """
216
+ if not data:
217
+ return {}
218
+
219
+ if isinstance(data, dict):
220
+ new_data = dict()
221
+ for key in sorted(data.keys(), key=lambda k: (len(k), k)):
222
+ value = self.normalize_dict(data[key])
223
+ if value:
224
+ if not isinstance(value, list):
225
+ value = [value]
226
+ new_data[key] = value
227
+
228
+ elif isinstance(data, list):
229
+ if all(isinstance(item, dict) for item in data):
230
+ new_data = []
231
+ for item in data:
232
+ item = self.normalize_dict(item)
233
+ if item:
234
+ new_data.append(item)
235
+ else:
236
+ new_data = [str(item).strip() for item in data if type(item) in {str, int, float} and str(item).strip()]
237
+ else:
238
+ new_data = [str(data).strip()]
239
+
240
+ return new_data
241
+
242
+ def cal_f1(self, preds: List[dict], answers: List[dict]):
243
+ """
244
+ Calculate global F1 accuracy score (field-level, micro-averaged) by counting all true positives, false negatives and false positives
245
+ """
246
+ total_tp, total_fn_or_fp = 0, 0
247
+ for pred, answer in zip(preds, answers):
248
+ pred, answer = self.flatten(self.normalize_dict(pred)), self.flatten(self.normalize_dict(answer))
249
+ for field in pred:
250
+ if field in answer:
251
+ total_tp += 1
252
+ answer.remove(field)
253
+ else:
254
+ total_fn_or_fp += 1
255
+ total_fn_or_fp += len(answer)
256
+ return total_tp / (total_tp + total_fn_or_fp / 2)
257
+
258
+ def construct_tree_from_dict(self, data: Union[Dict, List], node_name: str = None):
259
+ """
260
+ Convert Dictionary into Tree
261
+
262
+ Example:
263
+ input(dict)
264
+
265
+ {
266
+ "menu": [
267
+ {"name" : ["cake"], "count" : ["2"]},
268
+ {"name" : ["juice"], "count" : ["1"]},
269
+ ]
270
+ }
271
+
272
+ output(tree)
273
+ <root>
274
+ |
275
+ menu
276
+ / \
277
+ <subtree> <subtree>
278
+ / | | \
279
+ name count name count
280
+ / | | \
281
+ <leaf>cake <leaf>2 <leaf>juice <leaf>1
282
+ """
283
+ if node_name is None:
284
+ node_name = "<root>"
285
+
286
+ node = Node(node_name)
287
+
288
+ if isinstance(data, dict):
289
+ for key, value in data.items():
290
+ kid_node = self.construct_tree_from_dict(value, key)
291
+ node.addkid(kid_node)
292
+ elif isinstance(data, list):
293
+ if all(isinstance(item, dict) for item in data):
294
+ for item in data:
295
+ kid_node = self.construct_tree_from_dict(
296
+ item,
297
+ "<subtree>",
298
+ )
299
+ node.addkid(kid_node)
300
+ else:
301
+ for item in data:
302
+ node.addkid(Node(f"<leaf>{item}"))
303
+ else:
304
+ raise Exception(data, node_name)
305
+ return node
306
+
307
+ def cal_acc(self, pred: dict, answer: dict):
308
+ """
309
+ Calculate normalized tree edit distance (nTED) based accuracy.
310
+ 1) Construct tree from dict,
311
+ 2) Get tree distance with insert/remove/update cost,
312
+ 3) Divide distance with GT tree size (i.e., nTED),
313
+ 4) Calculate nTED-based accuracy, i.e., max(1 - nTED, 0).
314
+ """
315
+ pred = self.construct_tree_from_dict(self.normalize_dict(pred))
316
+ answer = self.construct_tree_from_dict(self.normalize_dict(answer))
317
+ return max(
318
+ 0,
319
+ 1
320
+ - (
321
+ zss.distance(
322
+ pred,
323
+ answer,
324
+ get_children=zss.Node.get_children,
325
+ insert_cost=self.insert_and_remove_cost,
326
+ remove_cost=self.insert_and_remove_cost,
327
+ update_cost=self.update_cost,
328
+ return_operations=False,
329
+ )
330
+ / zss.distance(
331
+ self.construct_tree_from_dict(self.normalize_dict({})),
332
+ answer,
333
+ get_children=zss.Node.get_children,
334
+ insert_cost=self.insert_and_remove_cost,
335
+ remove_cost=self.insert_and_remove_cost,
336
+ update_cost=self.update_cost,
337
+ return_operations=False,
338
+ )
339
+ ),
340
+ )
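For a quick sense of what these metrics measure, a toy comparison of a prediction against its ground truth (values are illustrative only):

```python
# Toy example of the metrics implemented above: field-level F1 and nTED-based accuracy.
from donut.util import JSONParseEvaluator

evaluator = JSONParseEvaluator()
answer = {"menu": [{"name": "cake", "count": "2"}, {"name": "juice", "count": "1"}]}
pred = {"menu": [{"name": "cake", "count": "2"}, {"name": "juice", "count": "3"}]}

print(evaluator.cal_f1([pred], [answer]))  # micro-averaged F1 over flattened fields
print(evaluator.cal_acc(pred, answer))     # max(1 - nTED, 0)
```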
lightning_module.py ADDED
@@ -0,0 +1,198 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import math
7
+ import random
8
+ import re
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+ import pytorch_lightning as pl
13
+ import torch
14
+ from nltk import edit_distance
15
+ from pytorch_lightning.utilities import rank_zero_only
16
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
17
+ from torch.nn.utils.rnn import pad_sequence
18
+ from torch.optim.lr_scheduler import LambdaLR
19
+ from torch.utils.data import DataLoader
20
+
21
+ from donut import DonutConfig, DonutModel
22
+
23
+
24
+ class DonutModelPLModule(pl.LightningModule):
25
+ def __init__(self, config):
26
+ super().__init__()
27
+ self.config = config
28
+
29
+ if self.config.get("pretrained_model_name_or_path", False):
30
+ self.model = DonutModel.from_pretrained(
31
+ self.config.pretrained_model_name_or_path,
32
+ input_size=self.config.input_size,
33
+ max_length=self.config.max_length,
34
+ align_long_axis=self.config.align_long_axis,
35
+ ignore_mismatched_sizes=True,
36
+ )
37
+ else:
38
+ self.model = DonutModel(
39
+ config=DonutConfig(
40
+ input_size=self.config.input_size,
41
+ max_length=self.config.max_length,
42
+ align_long_axis=self.config.align_long_axis,
43
+ # with DonutConfig, the architecture customization is available, e.g.,
44
+ # encoder_layer=[2,2,14,2], decoder_layer=4, ...
45
+ )
46
+ )
47
+ self.pytorch_lightning_version_is_1 = int(pl.__version__[0]) < 2
48
+ self.num_of_loaders = len(self.config.dataset_name_or_paths)
49
+
50
+ def training_step(self, batch, batch_idx):
51
+ image_tensors, decoder_input_ids, decoder_labels = list(), list(), list()
52
+ for batch_data in batch:
53
+ image_tensors.append(batch_data[0])
54
+ decoder_input_ids.append(batch_data[1][:, :-1])
55
+ decoder_labels.append(batch_data[2][:, 1:])
56
+ image_tensors = torch.cat(image_tensors)
57
+ decoder_input_ids = torch.cat(decoder_input_ids)
58
+ decoder_labels = torch.cat(decoder_labels)
59
+ loss = self.model(image_tensors, decoder_input_ids, decoder_labels)[0]
60
+ self.log_dict({"train_loss": loss}, sync_dist=True)
61
+ if not self.pytorch_lightning_version_is_1:
62
+ self.log('loss', loss, prog_bar=True)
63
+ return loss
64
+
65
+ def on_validation_epoch_start(self) -> None:
66
+ super().on_validation_epoch_start()
67
+ self.validation_step_outputs = [[] for _ in range(self.num_of_loaders)]
68
+ return
69
+
70
+ def validation_step(self, batch, batch_idx, dataloader_idx=0):
71
+ image_tensors, decoder_input_ids, prompt_end_idxs, answers = batch
72
+ decoder_prompts = pad_sequence(
73
+ [input_id[: end_idx + 1] for input_id, end_idx in zip(decoder_input_ids, prompt_end_idxs)],
74
+ batch_first=True,
75
+ )
76
+
77
+ preds = self.model.inference(
78
+ image_tensors=image_tensors,
79
+ prompt_tensors=decoder_prompts,
80
+ return_json=False,
81
+ return_attentions=False,
82
+ )["predictions"]
83
+
84
+ scores = list()
85
+ for pred, answer in zip(preds, answers):
86
+ pred = re.sub(r"(?:(?<=>) | (?=</s_))", "", pred)
87
+ answer = re.sub(r"<.*?>", "", answer, count=1)
88
+ answer = answer.replace(self.model.decoder.tokenizer.eos_token, "")
89
+ scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))
90
+
91
+ if self.config.get("verbose", False) and len(scores) == 1:
92
+ self.print(f"Prediction: {pred}")
93
+ self.print(f" Answer: {answer}")
94
+ self.print(f" Normed ED: {scores[0]}")
95
+
96
+ self.validation_step_outputs[dataloader_idx].append(scores)
97
+
98
+ return scores
99
+
100
+ def on_validation_epoch_end(self):
101
+ assert len(self.validation_step_outputs) == self.num_of_loaders
102
+ cnt = [0] * self.num_of_loaders
103
+ total_metric = [0] * self.num_of_loaders
104
+ val_metric = [0] * self.num_of_loaders
105
+ for i, results in enumerate(self.validation_step_outputs):
106
+ for scores in results:
107
+ cnt[i] += len(scores)
108
+ total_metric[i] += np.sum(scores)
109
+ val_metric[i] = total_metric[i] / cnt[i]
110
+ val_metric_name = f"val_metric_{i}th_dataset"
111
+ self.log_dict({val_metric_name: val_metric[i]}, sync_dist=True)
112
+ self.log_dict({"val_metric": np.sum(total_metric) / np.sum(cnt)}, sync_dist=True)
113
+
114
+ def configure_optimizers(self):
115
+
116
+ max_iter = None
117
+
118
+ if int(self.config.get("max_epochs", -1)) > 0:
119
+ assert len(self.config.train_batch_sizes) == 1, "Set max_epochs only if the number of datasets is 1"
120
+ max_iter = (self.config.max_epochs * self.config.num_training_samples_per_epoch) / (
121
+ self.config.train_batch_sizes[0] * torch.cuda.device_count() * self.config.get("num_nodes", 1)
122
+ )
123
+
124
+ if int(self.config.get("max_steps", -1)) > 0:
125
+ max_iter = min(self.config.max_steps, max_iter) if max_iter is not None else self.config.max_steps
126
+
127
+ assert max_iter is not None
128
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.config.lr)
129
+ scheduler = {
130
+ "scheduler": self.cosine_scheduler(optimizer, max_iter, self.config.warmup_steps),
131
+ "name": "learning_rate",
132
+ "interval": "step",
133
+ }
134
+ return [optimizer], [scheduler]
135
+
136
+ @staticmethod
137
+ def cosine_scheduler(optimizer, training_steps, warmup_steps):
138
+ def lr_lambda(current_step):
139
+ if current_step < warmup_steps:
140
+ return current_step / max(1, warmup_steps)
141
+ progress = current_step - warmup_steps
142
+ progress /= max(1, training_steps - warmup_steps)
143
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
144
+
145
+ return LambdaLR(optimizer, lr_lambda)
146
+
147
+ @rank_zero_only
148
+ def on_save_checkpoint(self, checkpoint):
149
+ save_path = Path(self.config.result_path) / self.config.exp_name / self.config.exp_version
150
+ self.model.save_pretrained(save_path)
151
+ self.model.decoder.tokenizer.save_pretrained(save_path)
152
+
153
+
154
+ class DonutDataPLModule(pl.LightningDataModule):
155
+ def __init__(self, config):
156
+ super().__init__()
157
+ self.config = config
158
+ self.train_batch_sizes = self.config.train_batch_sizes
159
+ self.val_batch_sizes = self.config.val_batch_sizes
160
+ self.train_datasets = []
161
+ self.val_datasets = []
162
+ self.g = torch.Generator()
163
+ self.g.manual_seed(self.config.seed)
164
+
165
+ def train_dataloader(self):
166
+ loaders = list()
167
+ for train_dataset, batch_size in zip(self.train_datasets, self.train_batch_sizes):
168
+ loaders.append(
169
+ DataLoader(
170
+ train_dataset,
171
+ batch_size=batch_size,
172
+ num_workers=self.config.num_workers,
173
+ pin_memory=True,
174
+ worker_init_fn=self.seed_worker,
175
+ generator=self.g,
176
+ shuffle=True,
177
+ )
178
+ )
179
+ return loaders
180
+
181
+ def val_dataloader(self):
182
+ loaders = list()
183
+ for val_dataset, batch_size in zip(self.val_datasets, self.val_batch_sizes):
184
+ loaders.append(
185
+ DataLoader(
186
+ val_dataset,
187
+ batch_size=batch_size,
188
+ pin_memory=True,
189
+ shuffle=False,
190
+ )
191
+ )
192
+ return loaders
193
+
194
+ @staticmethod
195
+ def seed_worker(worker_id):
196
+ worker_seed = torch.initial_seed() % 2 ** 32
197
+ np.random.seed(worker_seed)
198
+ random.seed(worker_seed)
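The scheduler returned by `cosine_scheduler` ramps the learning rate linearly over `warmup_steps` and then decays it with a half-cosine toward zero at `training_steps`. A small sanity-check sketch (assumes the repository requirements are installed; step counts and learning rate are illustrative):

```python
# Inspect the warmup + cosine decay schedule used by DonutModelPLModule.
import torch
from lightning_module import DonutModelPLModule

optimizer = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=3e-5)
scheduler = DonutModelPLModule.cosine_scheduler(optimizer, training_steps=1000, warmup_steps=100)

for step in range(1000):
    optimizer.step()
    scheduler.step()
    if step in (49, 99, 499, 999):
        print(step + 1, scheduler.get_last_lr()[0])  # rises to 3e-5, then decays toward 0
```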
misc/overview.png ADDED
misc/sample_image_cord_test_receipt_00004.png ADDED

Git LFS Details

  • SHA256: 8f3eee7068c96e86cdb2e4b5c53085cb5e1439462edd55c373548cb1962801ad
  • Pointer size: 132 Bytes
  • Size of remote file: 1.64 MB
misc/sample_image_donut_document.png ADDED
misc/sample_synthdog.png ADDED

Git LFS Details

  • SHA256: 26ca7665ceb4cb850e19aaf6f4cbc9b37ea5780c5e9d512764dad6a83b7931f1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.44 MB
misc/screenshot_gradio_demos.png ADDED

Git LFS Details

  • SHA256: f0f063308ddc48feb5a493560a18d057c68f8989fdc00eb91c171e0e9b552f3e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.39 MB
result/.gitkeep ADDED
@@ -0,0 +1 @@
1
+
setup.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import os
7
+ from setuptools import find_packages, setup
8
+
9
+ ROOT = os.path.abspath(os.path.dirname(__file__))
10
+
11
+
12
+ def read_version():
13
+ data = {}
14
+ path = os.path.join(ROOT, "donut", "_version.py")
15
+ with open(path, "r", encoding="utf-8") as f:
16
+ exec(f.read(), data)
17
+ return data["__version__"]
18
+
19
+
20
+ def read_long_description():
21
+ path = os.path.join(ROOT, "README.md")
22
+ with open(path, "r", encoding="utf-8") as f:
23
+ text = f.read()
24
+ return text
25
+
26
+
27
+ setup(
28
+ name="donut-python",
29
+ version=read_version(),
30
+ description="OCR-free Document Understanding Transformer",
31
+ long_description=read_long_description(),
32
+ long_description_content_type="text/markdown",
33
+ author="Geewook Kim, Teakgyu Hong, Moonbin Yim, JeongYeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park",
34
+ author_email="gwkim.rsrch@gmail.com",
35
+ url="https://github.com/clovaai/donut",
36
+ license="MIT",
37
+ packages=find_packages(
38
+ exclude=[
39
+ "config",
40
+ "dataset",
41
+ "misc",
42
+ "result",
43
+ "synthdog",
44
+ "app.py",
45
+ "lightning_module.py",
46
+ "README.md",
47
+ "train.py",
48
+ "test.py",
49
+ ]
50
+ ),
51
+ python_requires=">=3.7",
52
+ install_requires=[
53
+ "transformers>=4.11.3",
54
+ "timm",
55
+ "datasets[vision]",
56
+ "pytorch-lightning>=1.6.4",
57
+ "nltk",
58
+ "sentencepiece",
59
+ "zss",
60
+ "sconf>=0.2.3",
61
+ ],
62
+ classifiers=[
63
+ "Intended Audience :: Developers",
64
+ "Intended Audience :: Information Technology",
65
+ "Intended Audience :: Science/Research",
66
+ "License :: OSI Approved :: MIT License",
67
+ "Programming Language :: Python",
68
+ "Programming Language :: Python :: 3",
69
+ "Programming Language :: Python :: 3.7",
70
+ "Programming Language :: Python :: 3.8",
71
+ "Programming Language :: Python :: 3.9",
72
+ "Programming Language :: Python :: 3.10",
73
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
74
+ "Topic :: Software Development :: Libraries",
75
+ "Topic :: Software Development :: Libraries :: Python Modules",
76
+ ],
77
+ )
synthdog/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # SynthDoG 🐶: Synthetic Document Generator
2
+
3
+ SynthDoG is a synthetic document generator for visual document understanding (VDU).
4
+
5
+ ![image](../misc/sample_synthdog.png)
6
+
7
+ ## Prerequisites
8
+
9
+ - python>=3.6
10
+ - [synthtiger](https://github.com/clovaai/synthtiger) (`pip install synthtiger`)
11
+
12
+ ## Usage
13
+
14
+ ```bash
15
+ # Set environment variable (for macOS)
16
+ $ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
17
+
18
+ synthtiger -o ./outputs/SynthDoG_en -c 50 -w 4 -v template.py SynthDoG config_en.yaml
19
+
20
+ {'config': 'config_en.yaml',
21
+ 'count': 50,
22
+ 'name': 'SynthDoG',
23
+ 'output': './outputs/SynthDoG_en',
24
+ 'script': 'template.py',
25
+ 'verbose': True,
26
+ 'worker': 4}
27
+ {'aspect_ratio': [1, 2],
28
+ .
29
+ .
30
+ 'quality': [50, 95],
31
+ 'short_size': [720, 1024]}
32
+ Generated 1 data (task 3)
33
+ Generated 2 data (task 0)
34
+ Generated 3 data (task 1)
35
+ .
36
+ .
37
+ Generated 49 data (task 48)
38
+ Generated 50 data (task 49)
39
+ 46.32 seconds elapsed
40
+ ```
41
+
42
+ Some important arguments:
43
+
44
+ - `-o` : directory path to save data.
45
+ - `-c` : number of data to generate.
46
+ - `-w` : number of workers.
47
+ - `-s` : random seed.
48
+ - `-v` : print error messages.
49
+
50
+ To generate ECJK samples:
51
+ ```bash
52
+ # english
53
+ synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_en.yaml
54
+
55
+ # chinese
56
+ synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_zh.yaml
57
+
58
+ # japanese
59
+ synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ja.yaml
60
+
61
+ # korean
62
+ synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ko.yaml
63
+ ```
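The same generation command can also be driven from Python when runs need to be scripted; the sketch below is a thin wrapper around the CLI invocation shown above (paths and counts are taken from that example, and synthtiger is assumed to be installed with the working directory set to `synthdog/`):

```python
# Invoke the synthtiger CLI from Python, mirroring the shell command above.
import subprocess

subprocess.run(
    [
        "synthtiger",
        "-o", "./outputs/SynthDoG_en",
        "-c", "50",
        "-w", "4",
        "-v",
        "template.py", "SynthDoG", "config_en.yaml",
    ],
    check=True,
)
```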
synthdog/config_en.yaml ADDED
@@ -0,0 +1,119 @@
1
+ quality: [50, 95]
2
+ landscape: 0.5
3
+ short_size: [720, 1024]
4
+ aspect_ratio: [1, 2]
5
+
6
+ background:
7
+ image:
8
+ paths: [resources/background]
9
+ weights: [1]
10
+
11
+ effect:
12
+ args:
13
+ # gaussian blur
14
+ - prob: 1
15
+ args:
16
+ sigma: [0, 10]
17
+
18
+ document:
19
+ fullscreen: 0.5
20
+ landscape: 0.5
21
+ short_size: [480, 1024]
22
+ aspect_ratio: [1, 2]
23
+
24
+ paper:
25
+ image:
26
+ paths: [resources/paper]
27
+ weights: [1]
28
+ alpha: [0, 0.2]
29
+ grayscale: 1
30
+ crop: 1
31
+
32
+ content:
33
+ margin: [0, 0.1]
34
+ text:
35
+ path: resources/corpus/enwiki.txt
36
+ font:
37
+ paths: [resources/font/en]
38
+ weights: [1]
39
+ bold: 0
40
+ layout:
41
+ text_scale: [0.0334, 0.1]
42
+ max_row: 10
43
+ max_col: 3
44
+ fill: [0.5, 1]
45
+ full: 0.1
46
+ align: [left, right, center]
47
+ stack_spacing: [0.0334, 0.0334]
48
+ stack_fill: [0.5, 1]
49
+ stack_full: 0.1
50
+ textbox:
51
+ fill: [0.5, 1]
52
+ textbox_color:
53
+ prob: 0.2
54
+ args:
55
+ gray: [0, 64]
56
+ colorize: 1
57
+ content_color:
58
+ prob: 0.2
59
+ args:
60
+ gray: [0, 64]
61
+ colorize: 1
62
+
63
+ effect:
64
+ args:
65
+ # elastic distortion
66
+ - prob: 1
67
+ args:
68
+ alpha: [0, 1]
69
+ sigma: [0, 0.5]
70
+ # gaussian noise
71
+ - prob: 1
72
+ args:
73
+ scale: [0, 8]
74
+ per_channel: 0
75
+ # perspective
76
+ - prob: 1
77
+ args:
78
+ weights: [750, 50, 50, 25, 25, 25, 25, 50]
79
+ args:
80
+ - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
81
+ - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
82
+ - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
83
+ - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
84
+ - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
85
+ - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
86
+ - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
87
+ - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
88
+
89
+ effect:
90
+ args:
91
+ # color
92
+ - prob: 0.2
93
+ args:
94
+ rgb: [[0, 255], [0, 255], [0, 255]]
95
+ alpha: [0, 0.2]
96
+ # shadow
97
+ - prob: 1
98
+ args:
99
+ intensity: [0, 160]
100
+ amount: [0, 1]
101
+ smoothing: [0.5, 1]
102
+ bidirectional: 0
103
+ # contrast
104
+ - prob: 1
105
+ args:
106
+ alpha: [1, 1.5]
107
+ # brightness
108
+ - prob: 1
109
+ args:
110
+ beta: [-48, 0]
111
+ # motion blur
112
+ - prob: 0.5
113
+ args:
114
+ k: [3, 5]
115
+ angle: [0, 360]
116
+ # gaussian blur
117
+ - prob: 1
118
+ args:
119
+ sigma: [0, 1.5]
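The four language configs share essentially the same structure, differing in the corpus and font paths. A short sketch (assumes PyYAML is available) for inspecting a config before launching a run:

```python
# Load and inspect synthdog/config_en.yaml (illustrative; PyYAML assumed).
import yaml

with open("synthdog/config_en.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print(config["quality"])                      # [50, 95] JPEG quality range
print(config["document"]["content"]["text"])  # {'path': 'resources/corpus/enwiki.txt'}
print(config["document"]["content"]["font"])  # {'paths': ['resources/font/en'], 'weights': [1], 'bold': 0}
```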
synthdog/config_ja.yaml ADDED
@@ -0,0 +1,119 @@
1
+ quality: [50, 95]
2
+ landscape: 0.5
3
+ short_size: [720, 1024]
4
+ aspect_ratio: [1, 2]
5
+
6
+ background:
7
+ image:
8
+ paths: [resources/background]
9
+ weights: [1]
10
+
11
+ effect:
12
+ args:
13
+ # gaussian blur
14
+ - prob: 1
15
+ args:
16
+ sigma: [0, 10]
17
+
18
+ document:
19
+ fullscreen: 0.5
20
+ landscape: 0.5
21
+ short_size: [480, 1024]
22
+ aspect_ratio: [1, 2]
23
+
24
+ paper:
25
+ image:
26
+ paths: [resources/paper]
27
+ weights: [1]
28
+ alpha: [0, 0.2]
29
+ grayscale: 1
30
+ crop: 1
31
+
32
+ content:
33
+ margin: [0, 0.1]
34
+ text:
35
+ path: resources/corpus/jawiki.txt
36
+ font:
37
+ paths: [resources/font/ja]
38
+ weights: [1]
39
+ bold: 0
40
+ layout:
41
+ text_scale: [0.0334, 0.1]
42
+ max_row: 10
43
+ max_col: 3
44
+ fill: [0.5, 1]
45
+ full: 0.1
46
+ align: [left, right, center]
47
+ stack_spacing: [0.0334, 0.0334]
48
+ stack_fill: [0.5, 1]
49
+ stack_full: 0.1
50
+ textbox:
51
+ fill: [0.5, 1]
52
+ textbox_color:
53
+ prob: 0.2
54
+ args:
55
+ gray: [0, 64]
56
+ colorize: 1
57
+ content_color:
58
+ prob: 0.2
59
+ args:
60
+ gray: [0, 64]
61
+ colorize: 1
62
+
63
+ effect:
64
+ args:
65
+ # elastic distortion
66
+ - prob: 1
67
+ args:
68
+ alpha: [0, 1]
69
+ sigma: [0, 0.5]
70
+ # gaussian noise
71
+ - prob: 1
72
+ args:
73
+ scale: [0, 8]
74
+ per_channel: 0
75
+ # perspective
76
+ - prob: 1
77
+ args:
78
+ weights: [750, 50, 50, 25, 25, 25, 25, 50]
79
+ args:
80
+ - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
81
+ - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
82
+ - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
83
+ - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
84
+ - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
85
+ - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
86
+ - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
87
+ - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
88
+
89
+ effect:
90
+ args:
91
+ # color
92
+ - prob: 0.2
93
+ args:
94
+ rgb: [[0, 255], [0, 255], [0, 255]]
95
+ alpha: [0, 0.2]
96
+ # shadow
97
+ - prob: 1
98
+ args:
99
+ intensity: [0, 160]
100
+ amount: [0, 1]
101
+ smoothing: [0.5, 1]
102
+ bidirectional: 0
103
+ # contrast
104
+ - prob: 1
105
+ args:
106
+ alpha: [1, 1.5]
107
+ # brightness
108
+ - prob: 1
109
+ args:
110
+ beta: [-48, 0]
111
+ # motion blur
112
+ - prob: 0.5
113
+ args:
114
+ k: [3, 5]
115
+ angle: [0, 360]
116
+ # gaussian blur
117
+ - prob: 1
118
+ args:
119
+ sigma: [0, 1.5]
synthdog/config_ko.yaml ADDED
@@ -0,0 +1,119 @@
1
+ quality: [50, 95]
2
+ landscape: 0.5
3
+ short_size: [720, 1024]
4
+ aspect_ratio: [1, 2]
5
+
6
+ background:
7
+ image:
8
+ paths: [resources/background]
9
+ weights: [1]
10
+
11
+ effect:
12
+ args:
13
+ # gaussian blur
14
+ - prob: 1
15
+ args:
16
+ sigma: [0, 10]
17
+
18
+ document:
19
+ fullscreen: 0.5
20
+ landscape: 0.5
21
+ short_size: [480, 1024]
22
+ aspect_ratio: [1, 2]
23
+
24
+ paper:
25
+ image:
26
+ paths: [resources/paper]
27
+ weights: [1]
28
+ alpha: [0, 0.2]
29
+ grayscale: 1
30
+ crop: 1
31
+
32
+ content:
33
+ margin: [0, 0.1]
34
+ text:
35
+ path: resources/corpus/kowiki.txt
36
+ font:
37
+ paths: [resources/font/ko]
38
+ weights: [1]
39
+ bold: 0
40
+ layout:
41
+ text_scale: [0.0334, 0.1]
42
+ max_row: 10
43
+ max_col: 3
44
+ fill: [0.5, 1]
45
+ full: 0.1
46
+ align: [left, right, center]
47
+ stack_spacing: [0.0334, 0.0334]
48
+ stack_fill: [0.5, 1]
49
+ stack_full: 0.1
50
+ textbox:
51
+ fill: [0.5, 1]
52
+ textbox_color:
53
+ prob: 0.2
54
+ args:
55
+ gray: [0, 64]
56
+ colorize: 1
57
+ content_color:
58
+ prob: 0.2
59
+ args:
60
+ gray: [0, 64]
61
+ colorize: 1
62
+
63
+ effect:
64
+ args:
65
+ # elastic distortion
66
+ - prob: 1
67
+ args:
68
+ alpha: [0, 1]
69
+ sigma: [0, 0.5]
70
+ # gaussian noise
71
+ - prob: 1
72
+ args:
73
+ scale: [0, 8]
74
+ per_channel: 0
75
+ # perspective
76
+ - prob: 1
77
+ args:
78
+ weights: [750, 50, 50, 25, 25, 25, 25, 50]
79
+ args:
80
+ - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
81
+ - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
82
+ - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
83
+ - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
84
+ - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
85
+ - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
86
+ - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
87
+ - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
88
+
89
+ effect:
90
+ args:
91
+ # color
92
+ - prob: 0.2
93
+ args:
94
+ rgb: [[0, 255], [0, 255], [0, 255]]
95
+ alpha: [0, 0.2]
96
+ # shadow
97
+ - prob: 1
98
+ args:
99
+ intensity: [0, 160]
100
+ amount: [0, 1]
101
+ smoothing: [0.5, 1]
102
+ bidirectional: 0
103
+ # contrast
104
+ - prob: 1
105
+ args:
106
+ alpha: [1, 1.5]
107
+ # brightness
108
+ - prob: 1
109
+ args:
110
+ beta: [-48, 0]
111
+ # motion blur
112
+ - prob: 0.5
113
+ args:
114
+ k: [3, 5]
115
+ angle: [0, 360]
116
+ # gaussian blur
117
+ - prob: 1
118
+ args:
119
+ sigma: [0, 1.5]
synthdog/config_zh.yaml ADDED
@@ -0,0 +1,119 @@
1
+ quality: [50, 95]
2
+ landscape: 0.5
3
+ short_size: [720, 1024]
4
+ aspect_ratio: [1, 2]
5
+
6
+ background:
7
+ image:
8
+ paths: [resources/background]
9
+ weights: [1]
10
+
11
+ effect:
12
+ args:
13
+ # gaussian blur
14
+ - prob: 1
15
+ args:
16
+ sigma: [0, 10]
17
+
18
+ document:
19
+ fullscreen: 0.5
20
+ landscape: 0.5
21
+ short_size: [480, 1024]
22
+ aspect_ratio: [1, 2]
23
+
24
+ paper:
25
+ image:
26
+ paths: [resources/paper]
27
+ weights: [1]
28
+ alpha: [0, 0.2]
29
+ grayscale: 1
30
+ crop: 1
31
+
32
+ content:
33
+ margin: [0, 0.1]
34
+ text:
35
+ path: resources/corpus/zhwiki.txt
36
+ font:
37
+ paths: [resources/font/zh]
38
+ weights: [1]
39
+ bold: 0
40
+ layout:
41
+ text_scale: [0.0334, 0.1]
42
+ max_row: 10
43
+ max_col: 3
44
+ fill: [0.5, 1]
45
+ full: 0.1
46
+ align: [left, right, center]
47
+ stack_spacing: [0.0334, 0.0334]
48
+ stack_fill: [0.5, 1]
49
+ stack_full: 0.1
50
+ textbox:
51
+ fill: [0.5, 1]
52
+ textbox_color:
53
+ prob: 0.2
54
+ args:
55
+ gray: [0, 64]
56
+ colorize: 1
57
+ content_color:
58
+ prob: 0.2
59
+ args:
60
+ gray: [0, 64]
61
+ colorize: 1
62
+
63
+ effect:
64
+ args:
65
+ # elastic distortion
66
+ - prob: 1
67
+ args:
68
+ alpha: [0, 1]
69
+ sigma: [0, 0.5]
70
+ # gaussian noise
71
+ - prob: 1
72
+ args:
73
+ scale: [0, 8]
74
+ per_channel: 0
75
+ # perspective
76
+ - prob: 1
77
+ args:
78
+ weights: [750, 50, 50, 25, 25, 25, 25, 50]
79
+ args:
80
+ - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
81
+ - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
82
+ - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
83
+ - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
84
+ - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
85
+ - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
86
+ - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
87
+ - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
88
+
89
+ effect:
90
+ args:
91
+ # color
92
+ - prob: 0.2
93
+ args:
94
+ rgb: [[0, 255], [0, 255], [0, 255]]
95
+ alpha: [0, 0.2]
96
+ # shadow
97
+ - prob: 1
98
+ args:
99
+ intensity: [0, 160]
100
+ amount: [0, 1]
101
+ smoothing: [0.5, 1]
102
+ bidirectional: 0
103
+ # contrast
104
+ - prob: 1
105
+ args:
106
+ alpha: [1, 1.5]
107
+ # brightness
108
+ - prob: 1
109
+ args:
110
+ beta: [-48, 0]
111
+ # motion blur
112
+ - prob: 0.5
113
+ args:
114
+ k: [3, 5]
115
+ angle: [0, 360]
116
+ # gaussian blur
117
+ - prob: 1
118
+ args:
119
+ sigma: [0, 1.5]
synthdog/elements/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ from elements.background import Background
7
+ from elements.content import Content
8
+ from elements.document import Document
9
+ from elements.paper import Paper
10
+ from elements.textbox import TextBox
11
+
12
+ __all__ = ["Background", "Content", "Document", "Paper", "TextBox"]
synthdog/elements/background.py ADDED
@@ -0,0 +1,24 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ from synthtiger import components, layers
7
+
8
+
9
+ class Background:
10
+ def __init__(self, config):
11
+ self.image = components.BaseTexture(**config.get("image", {}))
12
+ self.effect = components.Iterator(
13
+ [
14
+ components.Switch(components.GaussianBlur()),
15
+ ],
16
+ **config.get("effect", {})
17
+ )
18
+
19
+ def generate(self, size):
20
+ bg_layer = layers.RectLayer(size, (255, 255, 255, 255))
21
+ self.image.apply([bg_layer])
22
+ self.effect.apply([bg_layer])
23
+
24
+ return bg_layer
synthdog/elements/content.py ADDED
@@ -0,0 +1,118 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ from collections import OrderedDict
7
+
8
+ import numpy as np
9
+ from synthtiger import components
10
+
11
+ from elements.textbox import TextBox
12
+ from layouts import GridStack
13
+
14
+
15
+ class TextReader:
16
+ def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
17
+ self.fp = open(path, "r", encoding="utf-8")
18
+ self.length = 0
19
+ self.offsets = [0]
20
+ self.cache = OrderedDict()
21
+ self.cache_size = cache_size
22
+ self.block_size = block_size
23
+ self.bucket_size = cache_size // block_size
24
+ self.idx = 0
25
+
26
+ while True:
27
+ text = self.fp.read(self.block_size)
28
+ if not text:
29
+ break
30
+ self.length += len(text)
31
+ self.offsets.append(self.fp.tell())
32
+
33
+ def __len__(self):
34
+ return self.length
35
+
36
+ def __iter__(self):
37
+ return self
38
+
39
+ def __next__(self):
40
+ char = self.get()
41
+ self.next()
42
+ return char
43
+
44
+ def move(self, idx):
45
+ self.idx = idx
46
+
47
+ def next(self):
48
+ self.idx = (self.idx + 1) % self.length
49
+
50
+ def prev(self):
51
+ self.idx = (self.idx - 1) % self.length
52
+
53
+ def get(self):
54
+ key = self.idx // self.block_size
55
+
56
+ if key in self.cache:
57
+ text = self.cache[key]
58
+ else:
59
+ if len(self.cache) >= self.bucket_size:
60
+ self.cache.popitem(last=False)
61
+
62
+ offset = self.offsets[key]
63
+ self.fp.seek(offset, 0)
64
+ text = self.fp.read(self.block_size)
65
+ self.cache[key] = text
66
+
67
+ self.cache.move_to_end(key)
68
+ char = text[self.idx % self.block_size]
69
+ return char
70
+
71
+
72
+ class Content:
73
+ def __init__(self, config):
74
+ self.margin = config.get("margin", [0, 0.1])
75
+ self.reader = TextReader(**config.get("text", {}))
76
+ self.font = components.BaseFont(**config.get("font", {}))
77
+ self.layout = GridStack(config.get("layout", {}))
78
+ self.textbox = TextBox(config.get("textbox", {}))
79
+ self.textbox_color = components.Switch(components.Gray(), **config.get("textbox_color", {}))
80
+ self.content_color = components.Switch(components.Gray(), **config.get("content_color", {}))
81
+
82
+ def generate(self, size):
83
+ width, height = size
84
+
85
+ layout_left = width * np.random.uniform(self.margin[0], self.margin[1])
86
+ layout_top = height * np.random.uniform(self.margin[0], self.margin[1])
87
+ layout_width = max(width - layout_left * 2, 0)
88
+ layout_height = max(height - layout_top * 2, 0)
89
+ layout_bbox = [layout_left, layout_top, layout_width, layout_height]
90
+
91
+ text_layers, texts = [], []
92
+ layouts = self.layout.generate(layout_bbox)
93
+ self.reader.move(np.random.randint(len(self.reader)))
94
+
95
+ for layout in layouts:
96
+ font = self.font.sample()
97
+
98
+ for bbox, align in layout:
99
+ x, y, w, h = bbox
100
+ text_layer, text = self.textbox.generate((w, h), self.reader, font)
101
+ self.reader.prev()
102
+
103
+ if text_layer is None:
104
+ continue
105
+
106
+ text_layer.center = (x + w / 2, y + h / 2)
107
+ if align == "left":
108
+ text_layer.left = x
109
+ if align == "right":
110
+ text_layer.right = x + w
111
+
112
+ self.textbox_color.apply([text_layer])
113
+ text_layers.append(text_layer)
114
+ texts.append(text)
115
+
116
+ self.content_color.apply(text_layers)
117
+
118
+ return text_layers, texts
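`TextReader` above treats the corpus file as a circular character stream backed by a block-based LRU cache, which is how `Content` can start sampling text from an arbitrary offset. A small illustration (run from the `synthdog/` directory so that `elements` is importable; requires synthtiger):

```python
# Read a short snippet from a random offset using the TextReader defined above.
import numpy as np
from elements.content import TextReader  # importable when cwd is synthdog/

reader = TextReader("resources/corpus/enwiki.txt")
reader.move(np.random.randint(len(reader)))      # jump to a random character index
snippet = "".join(next(reader) for _ in range(80))
print(snippet)
```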
synthdog/elements/document.py ADDED
@@ -0,0 +1,65 @@
1
+ """
2
+ Donut
3
+ Copyright (c) 2022-present NAVER Corp.
4
+ MIT License
5
+ """
6
+ import numpy as np
7
+ from synthtiger import components
8
+
9
+ from elements.content import Content
10
+ from elements.paper import Paper
11
+
12
+
13
+ class Document:
14
+ def __init__(self, config):
15
+ self.fullscreen = config.get("fullscreen", 0.5)
16
+ self.landscape = config.get("landscape", 0.5)
17
+ self.short_size = config.get("short_size", [480, 1024])
18
+ self.aspect_ratio = config.get("aspect_ratio", [1, 2])
19
+ self.paper = Paper(config.get("paper", {}))
20
+ self.content = Content(config.get("content", {}))
21
+ self.effect = components.Iterator(
22
+ [
23
+ components.Switch(components.ElasticDistortion()),
24
+ components.Switch(components.AdditiveGaussianNoise()),
25
+ components.Switch(
26
+ components.Selector(
27
+ [
28
+ components.Perspective(),
29
+ components.Perspective(),
30
+ components.Perspective(),
31
+ components.Perspective(),
32
+ components.Perspective(),
33
+ components.Perspective(),
34
+ components.Perspective(),
35
+ components.Perspective(),
36
+ ]
37
+ )
38
+ ),
39
+ ],
40
+ **config.get("effect", {}),
41
+ )
42
+
43
+ def generate(self, size):
44
+ width, height = size
45
+ fullscreen = np.random.rand() < self.fullscreen
46
+
47
+ if not fullscreen:
48
+ landscape = np.random.rand() < self.landscape
49
+ max_size = width if landscape else height
50
+ short_size = np.random.randint(
51
+ min(width, height, self.short_size[0]),
52
+ min(width, height, self.short_size[1]) + 1,
53
+ )
54
+ aspect_ratio = np.random.uniform(
55
+ min(max_size / short_size, self.aspect_ratio[0]),
56
+ min(max_size / short_size, self.aspect_ratio[1]),
57
+ )
58
+ long_size = int(short_size * aspect_ratio)
59
+ size = (long_size, short_size) if landscape else (short_size, long_size)
60
+
61
+ text_layers, texts = self.content.generate(size)
62
+ paper_layer = self.paper.generate(size)
63
+ self.effect.apply([*text_layers, paper_layer])
64
+
65
+ return paper_layer, text_layers, texts
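`Document.generate` either fills the whole canvas with paper or samples a smaller page: it picks an orientation, draws a short side between `short_size[0]` and `short_size[1]` (capped by the canvas), and clamps the aspect ratio so the long side still fits. Below is a standalone sketch of just that size-sampling step, using the class defaults; the function name and the example canvas are illustrative and no synthtiger is required.

```python
import numpy as np


def sample_paper_size(canvas_size, fullscreen_prob=0.5, landscape_prob=0.5,
                      short_size=(480, 1024), aspect_ratio=(1, 2)):
    width, height = canvas_size
    if np.random.rand() < fullscreen_prob:
        return canvas_size  # paper fills the whole canvas
    landscape = np.random.rand() < landscape_prob
    max_size = width if landscape else height
    short = np.random.randint(
        min(width, height, short_size[0]),
        min(width, height, short_size[1]) + 1,
    )
    # Clamp the aspect ratio so the long side never exceeds the canvas.
    ratio = np.random.uniform(
        min(max_size / short, aspect_ratio[0]),
        min(max_size / short, aspect_ratio[1]),
    )
    long = int(short * ratio)
    return (long, short) if landscape else (short, long)


if __name__ == "__main__":
    np.random.seed(0)
    print(sample_paper_size((1280, 960)))  # e.g. a portrait page inside a 1280x960 canvas
```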
synthdog/elements/paper.py ADDED
@@ -0,0 +1,17 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ from synthtiger import components, layers
+
+
+ class Paper:
+     def __init__(self, config):
+         self.image = components.BaseTexture(**config.get("image", {}))
+
+     def generate(self, size):
+         paper_layer = layers.RectLayer(size, (255, 255, 255, 255))
+         self.image.apply([paper_layer])
+
+         return paper_layer
synthdog/elements/textbox.py ADDED
@@ -0,0 +1,43 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ import numpy as np
+ from synthtiger import layers
+
+
+ class TextBox:
+     def __init__(self, config):
+         self.fill = config.get("fill", [1, 1])
+
+     def generate(self, size, text, font):
+         width, height = size
+
+         char_layers, chars = [], []
+         fill = np.random.uniform(self.fill[0], self.fill[1])
+         width = np.clip(width * fill, height, width)
+         font = {**font, "size": int(height)}
+         left, top = 0, 0
+
+         for char in text:
+             if char in "\r\n":
+                 continue
+
+             char_layer = layers.TextLayer(char, **font)
+             char_scale = height / char_layer.height
+             char_layer.bbox = [left, top, *(char_layer.size * char_scale)]
+             if char_layer.right > width:
+                 break
+
+             char_layers.append(char_layer)
+             chars.append(char)
+             left = char_layer.right
+
+         text = "".join(chars).strip()
+         if len(char_layers) == 0 or len(text) == 0:
+             return None, None
+
+         text_layer = layers.Group(char_layers).merge()
+
+         return text_layer, text
synthdog/layouts/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ from layouts.grid import Grid
+ from layouts.grid_stack import GridStack
+
+ __all__ = ["Grid", "GridStack"]
synthdog/layouts/grid.py ADDED
@@ -0,0 +1,68 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ import numpy as np
+
+
+ class Grid:
+     def __init__(self, config):
+         self.text_scale = config.get("text_scale", [0.05, 0.1])
+         self.max_row = config.get("max_row", 5)
+         self.max_col = config.get("max_col", 3)
+         self.fill = config.get("fill", [0, 1])
+         self.full = config.get("full", 0)
+         self.align = config.get("align", ["left", "right", "center"])
+
+     def generate(self, bbox):
+         left, top, width, height = bbox
+
+         text_scale = np.random.uniform(self.text_scale[0], self.text_scale[1])
+         text_size = min(width, height) * text_scale
+         grids = np.random.permutation(self.max_row * self.max_col)
+
+         for grid in grids:
+             row = grid // self.max_col + 1
+             col = grid % self.max_col + 1
+             if text_size * (col * 2 - 1) <= width and text_size * row <= height:
+                 break
+         else:
+             return None
+
+         bound = max(1 - text_size / width * (col - 1), 0)
+         full = np.random.rand() < self.full
+         fill = np.random.uniform(self.fill[0], self.fill[1])
+         fill = 1 if full else fill
+         fill = np.clip(fill, 0, bound)
+
+         padding = np.random.randint(4) if col > 1 else np.random.randint(1, 4)
+         padding = (bool(padding // 2), bool(padding % 2))
+
+         weights = np.zeros(col * 2 + 1)
+         weights[1:-1] = text_size / width
+         probs = 1 - np.random.rand(col * 2 + 1)
+         probs[0] = 0 if not padding[0] else probs[0]
+         probs[-1] = 0 if not padding[-1] else probs[-1]
+         probs[1::2] *= max(fill - sum(weights[1::2]), 0) / sum(probs[1::2])
+         probs[::2] *= max(1 - fill - sum(weights[::2]), 0) / sum(probs[::2])
+         weights += probs
+
+         widths = [width * weights[c] for c in range(col * 2 + 1)]
+         heights = [text_size for _ in range(row)]
+
+         xs = np.cumsum([0] + widths)
+         ys = np.cumsum([0] + heights)
+
+         layout = []
+
+         for c in range(col):
+             align = self.align[np.random.randint(len(self.align))]
+
+             for r in range(row):
+                 x, y = xs[c * 2 + 1], ys[r]
+                 w, h = xs[c * 2 + 2] - x, ys[r + 1] - y
+                 bbox = [left + x, top + y, w, h]
+                 layout.append((bbox, align))
+
+         return layout
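The trickiest part of `Grid.generate` is how the horizontal space is divided: the `col * 2 + 1` slots alternate gap / text column / gap, every interior slot is guaranteed at least `text_size` of width, and the leftover space is split randomly so that the text slots sum to roughly `fill` of the width and the gaps to the rest. The helper below isolates that computation so it can be inspected on its own; the function name and example numbers are illustrative, not part of the commit.

```python
import numpy as np


def split_columns(width, col, text_size, fill, padding=(True, True)):
    weights = np.zeros(col * 2 + 1)            # slots: gap, text, gap, text, ..., gap
    weights[1:-1] = text_size / width          # minimum share for every interior slot
    probs = 1 - np.random.rand(col * 2 + 1)
    probs[0] = probs[0] if padding[0] else 0   # optionally drop the outer margins
    probs[-1] = probs[-1] if padding[1] else 0
    probs[1::2] *= max(fill - sum(weights[1::2]), 0) / sum(probs[1::2])   # text slots
    probs[::2] *= max(1 - fill - sum(weights[::2]), 0) / sum(probs[::2])  # gap slots
    weights += probs
    return [width * w for w in weights]


if __name__ == "__main__":
    np.random.seed(0)
    widths = split_columns(width=800, col=2, text_size=40, fill=0.6)
    print([round(w, 1) for w in widths], "sum =", round(sum(widths), 1))  # sums to the full width
```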
synthdog/layouts/grid_stack.py ADDED
@@ -0,0 +1,74 @@
+ """
+ Donut
+ Copyright (c) 2022-present NAVER Corp.
+ MIT License
+ """
+ import numpy as np
+
+ from layouts import Grid
+
+
+ class GridStack:
+     def __init__(self, config):
+         self.text_scale = config.get("text_scale", [0.05, 0.1])
+         self.max_row = config.get("max_row", 5)
+         self.max_col = config.get("max_col", 3)
+         self.fill = config.get("fill", [0, 1])
+         self.full = config.get("full", 0)
+         self.align = config.get("align", ["left", "right", "center"])
+         self.stack_spacing = config.get("stack_spacing", [0, 0.05])
+         self.stack_fill = config.get("stack_fill", [1, 1])
+         self.stack_full = config.get("stack_full", 0)
+         self._grid = Grid(
+             {
+                 "text_scale": self.text_scale,
+                 "max_row": self.max_row,
+                 "max_col": self.max_col,
+                 "align": self.align,
+             }
+         )
+
+     def generate(self, bbox):
+         left, top, width, height = bbox
+
+         stack_spacing = np.random.uniform(self.stack_spacing[0], self.stack_spacing[1])
+         stack_spacing *= min(width, height)
+
+         stack_full = np.random.rand() < self.stack_full
+         stack_fill = np.random.uniform(self.stack_fill[0], self.stack_fill[1])
+         stack_fill = 1 if stack_full else stack_fill
+
+         full = np.random.rand() < self.full
+         fill = np.random.uniform(self.fill[0], self.fill[1])
+         fill = 1 if full else fill
+         self._grid.fill = [fill, fill]
+
+         layouts = []
+         line = 0
+
+         while True:
+             grid_size = (width, height * stack_fill - line)
+             text_scale = np.random.uniform(self.text_scale[0], self.text_scale[1])
+             text_size = min(width, height) * text_scale
+             text_scale = text_size / min(grid_size)
+             self._grid.text_scale = [text_scale, text_scale]
+
+             layout = self._grid.generate([left, top + line, *grid_size])
+             if layout is None:
+                 break
+
+             line = max(y + h - top for (_, y, _, h), _ in layout) + stack_spacing
+             layouts.append(layout)
+
+         line = max(line - stack_spacing, 0)
+         space = max(height - line, 0)
+         spaces = np.random.rand(len(layouts) + 1)
+         spaces *= space / sum(spaces) if sum(spaces) > 0 else 0
+         spaces = np.cumsum(spaces)
+
+         for layout, space in zip(layouts, spaces):
+             for bbox, _ in layout:
+                 x, y, w, h = bbox
+                 bbox[:] = [x, y + space, w, h]
+
+         return layouts
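`GridStack` repeatedly asks the single-grid generator for another block of text boxes, advancing a running `line` offset until the next grid no longer fits, then distributes the leftover vertical space randomly between the stacked grids. Since the `layouts` package only depends on NumPy, it can be exercised on its own; the snippet below is a hypothetical quick check (run from the `synthdog/` directory so that `layouts` is importable), with an illustrative seed, config, and bounding box.

```python
import numpy as np

from layouts import GridStack

np.random.seed(0)  # illustrative seed for a repeatable layout

stack = GridStack({"max_row": 4, "max_col": 2, "text_scale": [0.04, 0.08]})
layouts = stack.generate([0, 0, 800, 1200])  # [left, top, width, height]

for i, layout in enumerate(layouts):
    boxes = [bbox for bbox, _ in layout]
    print(f"grid {i}: {len(boxes)} text boxes, top at y={boxes[0][1]:.1f}")
```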
synthdog/resources/background/bedroom_83.jpg ADDED
synthdog/resources/background/bob+dylan_83.jpg ADDED
synthdog/resources/background/coffee_122.jpg ADDED
synthdog/resources/background/coffee_18.jpeg ADDED

Git LFS Details

  • SHA256: 3be69b618a13243f755bb686b14cc5ded952d328f3fd06ed0932599aa993e27c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.78 MB
synthdog/resources/background/crater_141.jpg ADDED

Git LFS Details

  • SHA256: 8993258d37d02a95c3d4de7a25c81af44c86281086631fdd3edfdf8b94f0844b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.82 MB
synthdog/resources/background/cream_124.jpg ADDED

Git LFS Details

  • SHA256: a12e36c3edbb8eae45ceada56b3e38963398e85618fc582a9910fbdb63156ff9
  • Pointer size: 132 Bytes
  • Size of remote file: 2.24 MB
synthdog/resources/background/eagle_110.jpg ADDED
synthdog/resources/background/farm_25.jpg ADDED
synthdog/resources/background/hiking_18.jpg ADDED
synthdog/resources/corpus/enwiki.txt ADDED
The diff for this file is too large to render. See raw diff