Spaces:

uartimcs
/

donut-booking-gradio

Running

App Files Files Community

uartimcs commited on 2 days ago

Commit

608a96e

•

1 Parent(s): 001cc92

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +26 -29
config/train_docvqa.yaml +23 -0
config/train_invoices.yaml +22 -0
config/train_rvlcdip.yaml +23 -0
config/train_zhtrainticket.yaml +22 -0
dataset/.gitkeep +1 -0
misc/overview.png +0 -0
misc/sample_image_cord_test_receipt_00004.png +3 -0
misc/sample_image_donut_document.png +0 -0
misc/sample_synthdog.png +3 -0
misc/screenshot_gradio_demos.png +3 -0
result/.gitkeep +1 -0
synthdog/README.md +63 -0
synthdog/config_en.yaml +119 -0
synthdog/config_ja.yaml +119 -0
synthdog/config_ko.yaml +119 -0
synthdog/config_zh.yaml +119 -0
synthdog/elements/__init__.py +12 -0
synthdog/elements/background.py +24 -0
synthdog/elements/content.py +118 -0
synthdog/elements/document.py +65 -0
synthdog/elements/paper.py +17 -0
synthdog/elements/textbox.py +43 -0
synthdog/layouts/__init__.py +9 -0
synthdog/layouts/grid.py +68 -0
synthdog/layouts/grid_stack.py +74 -0
synthdog/resources/background/bedroom_83.jpg +0 -0
synthdog/resources/background/bob+dylan_83.jpg +0 -0
synthdog/resources/background/coffee_122.jpg +0 -0
synthdog/resources/background/coffee_18.jpeg +3 -0
synthdog/resources/background/crater_141.jpg +3 -0
synthdog/resources/background/cream_124.jpg +3 -0
synthdog/resources/background/eagle_110.jpg +0 -0
synthdog/resources/background/farm_25.jpg +0 -0
synthdog/resources/background/hiking_18.jpg +0 -0
synthdog/resources/corpus/enwiki.txt +0 -0
synthdog/resources/corpus/jawiki.txt +0 -0
synthdog/resources/corpus/kowiki.txt +0 -0
synthdog/resources/corpus/zhwiki.txt +0 -0
synthdog/resources/font/en/NotoSans-Regular.ttf +0 -0
synthdog/resources/font/en/NotoSerif-Regular.ttf +0 -0
synthdog/resources/font/ja/NotoSansJP-Regular.otf +3 -0
synthdog/resources/font/ja/NotoSerifJP-Regular.otf +3 -0
synthdog/resources/font/ko/NotoSansKR-Regular.otf +3 -0
synthdog/resources/font/ko/NotoSerifKR-Regular.otf +3 -0
synthdog/resources/font/zh/NotoSansSC-Regular.otf +3 -0
synthdog/resources/font/zh/NotoSerifSC-Regular.otf +3 -0
synthdog/resources/paper/paper_1.jpg +3 -0
synthdog/resources/paper/paper_2.jpg +3 -0
synthdog/resources/paper/paper_3.jpg +3 -0

app.py CHANGED Viewed

@@ -1,29 +1,26 @@
-import gradio as gr
-import argparse
-import torch
-from PIL import Image
-from donut import DonutModel
-def demo_process(input_img):
-    global model, task_prompt, task_name
-    input_img = Image.fromarray(input_img)
-    output = model.inference(image=input_img, prompt=task_prompt)["predictions"][0]
-    return output
-parser = argparse.ArgumentParser()
-parser.add_argument("--task", type=str, default="Booking")
-parser.add_argument("--pretrained_path", type=str, default="uartimcs/donut-booking-extract")
-args, left_argv = parser.parse_known_args()
-task_name = args.task
-task_prompt = f"<s_{task_name}>"
-image = Image.open("./sample-booking/CMA_150.jpg")
-image.save("CMA_sample.jpg")
-image = Image.open("./sample-booking/COSCO_150.jpg")
-image.save("COSCO_sample.jpg")
-image = Image.open("./sample-booking/ONEY_150.jpg")
-image.save("ONEY_sample.jpg")
-model = DonutModel.from_pretrained("uartimcs/donut-booking-extract")
-model.eval()
-demo = gr.Interface(fn=demo_process,inputs="image",outputs="json", title=f"Donut 🍩 demonstration for `{task_name}` task", examples=[["CMA_sample.jpg"], ["COSCO_sample.jpg"], ["ONEY_sample.jpg"]],)
-demo.launch()

+import gradio as gr
+import argparse
+import torch
+from PIL import Image
+from donut import DonutModel
def demo_process(input_img):
    """Run Donut inference on one uploaded image and return the first prediction.

    ``input_img`` is converted with ``Image.fromarray`` before being passed to
    the module-level ``model`` together with the module-level ``task_prompt``.
    Both globals are only read here, so no ``global`` declaration is needed.
    """
    pil_image = Image.fromarray(input_img)
    result = model.inference(image=pil_image, prompt=task_prompt)
    return result["predictions"][0]
# Command-line options. parse_known_args() is used so that extra arguments
# supplied by the hosting environment do not abort start-up.
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, default="Booking")
parser.add_argument("--pretrained_path", type=str, default="result/train_booking/20241112_150925")
args, left_argv = parser.parse_known_args()

# The task name selects the start-of-sequence prompt token used at inference.
task_name = args.task
task_prompt = f"<s_{task_name}>"

# Load the checkpoint chosen via --pretrained_path. The path used to be
# hard-coded here, which silently ignored the command-line option; the default
# value points at the same checkpoint directory as before.
model = DonutModel.from_pretrained(args.pretrained_path)

if torch.cuda.is_available():
    # GPU available: switch the whole model to half precision and move it over.
    model.half()
    device = torch.device("cuda")
    model.to(device)
else:
    # CPU only: just the encoder is cast to bfloat16.
    model.encoder.to(torch.bfloat16)

model.eval()

demo = gr.Interface(fn=demo_process,inputs="image",outputs="json", title=f"Donut 🍩 demonstration for `{task_name}` task",)
demo.launch(debug=True)

config/train_docvqa.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+resume_from_checkpoint_path: null
+result_path: "./result"
+pretrained_model_name_or_path: "naver-clova-ix/donut-base"
+dataset_name_or_paths: ["./dataset/docvqa"] # should be prepared from https://rrc.cvc.uab.es/?ch=17
+sort_json_key: True
+train_batch_sizes: [2]
+val_batch_sizes: [4]
+input_size: [2560, 1920]
+max_length: 128
+align_long_axis: False
+# num_nodes: 8 # memo: donut-base-finetuned-docvqa was trained with 8 nodes
+num_nodes: 1
+seed: 2022
+lr: 3e-5
+warmup_steps: 10000
+num_training_samples_per_epoch: 39463
+max_epochs: 300
+max_steps: -1
+num_workers: 8
+val_check_interval: 1.0
+check_val_every_n_epoch: 1
+gradient_clip_val: 0.25
+verbose: True

config/train_invoices.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+resume_from_checkpoint_path: null # only used for resume_from_checkpoint option in PL
+result_path: "./result"
+pretrained_model_name_or_path: "naver-clova-ix/donut-base" # loading a pre-trained model (from the model hub or a local path)
+dataset_name_or_paths: ["./dataset/SGSInvoice"] # loading datasets (from the model hub or a local path)
+sort_json_key: False # cord dataset is preprocessed, and publicly available at https://huggingface.co/datasets/naver-clova-ix/cord-v2
+train_batch_sizes: [2]
+val_batch_sizes: [1]
+input_size: [1280, 960] # when the input resolution differs from the pre-training setting, some weights will be newly initialized (but the model training would be okay)
+max_length: 768
+align_long_axis: False
+num_nodes: 1
+seed: 2022
+lr: 3e-5
+warmup_steps: 60 # 800/8*30/10, 10%
+num_training_samples_per_epoch: 800
+max_epochs: 10
+max_steps: -1
+num_workers: 2
+val_check_interval: 1.0
+check_val_every_n_epoch: 3
+gradient_clip_val: 1.0
+verbose: True

config/train_rvlcdip.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+resume_from_checkpoint_path: null
+result_path: "./result"
+pretrained_model_name_or_path: "naver-clova-ix/donut-base"
+dataset_name_or_paths: ["./dataset/rvlcdip"] # should be prepared from https://www.cs.cmu.edu/~aharley/rvl-cdip/
+sort_json_key: True
+train_batch_sizes: [2]
+val_batch_sizes: [4]
+input_size: [2560, 1920]
+max_length: 8
+align_long_axis: False
+# num_nodes: 8 # memo: donut-base-finetuned-rvlcdip was trained with 8 nodes
+num_nodes: 1
+seed: 2022
+lr: 2e-5
+warmup_steps: 10000
+num_training_samples_per_epoch: 320000
+max_epochs: 100
+max_steps: -1
+num_workers: 8
+val_check_interval: 1.0
+check_val_every_n_epoch: 1
+gradient_clip_val: 1.0
+verbose: True

config/train_zhtrainticket.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+resume_from_checkpoint_path: null
+result_path: "./result"
+pretrained_model_name_or_path: "naver-clova-ix/donut-base"
+dataset_name_or_paths: ["./dataset/zhtrainticket"] # should be prepared from https://github.com/beacandler/EATEN
+sort_json_key: True
+train_batch_sizes: [8]
+val_batch_sizes: [1]
+input_size: [960, 1280]
+max_length: 256
+align_long_axis: False
+num_nodes: 1
+seed: 2022
+lr: 3e-5
+warmup_steps: 300
+num_training_samples_per_epoch: 1368
+max_epochs: 10
+max_steps: -1
+num_workers: 8
+val_check_interval: 1.0
+check_val_every_n_epoch: 1
+gradient_clip_val: 1.0
+verbose: True

dataset/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+

misc/overview.png ADDED Viewed

misc/sample_image_cord_test_receipt_00004.png ADDED Viewed

Git LFS Details

SHA256: 8f3eee7068c96e86cdb2e4b5c53085cb5e1439462edd55c373548cb1962801ad
Pointer size: 132 Bytes
Size of remote file: 1.64 MB

misc/sample_image_donut_document.png ADDED Viewed

misc/sample_synthdog.png ADDED Viewed

Git LFS Details

SHA256: 26ca7665ceb4cb850e19aaf6f4cbc9b37ea5780c5e9d512764dad6a83b7931f1
Pointer size: 132 Bytes
Size of remote file: 1.44 MB

misc/screenshot_gradio_demos.png ADDED Viewed

Git LFS Details

SHA256: f0f063308ddc48feb5a493560a18d057c68f8989fdc00eb91c171e0e9b552f3e
Pointer size: 132 Bytes
Size of remote file: 1.39 MB

result/.gitkeep ADDED Viewed

	@@ -0,0 +1 @@


1	+

synthdog/README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+# SynthDoG 🐶: Synthetic Document Generator
+SynthDoG is a synthetic document generator for visual document understanding (VDU).
+![image](../misc/sample_synthdog.png)
+## Prerequisites
+- python>=3.6
+- [synthtiger](https://github.com/clovaai/synthtiger) (`pip install synthtiger`)
+## Usage
+```bash
+# Set environment variable (for macOS)
+$ export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
+synthtiger -o ./outputs/SynthDoG_en -c 50 -w 4 -v template.py SynthDoG config_en.yaml
+{'config': 'config_en.yaml',
+ 'count': 50,
+ 'name': 'SynthDoG',
+ 'output': './outputs/SynthDoG_en',
+ 'script': 'template.py',
+ 'verbose': True,
+ 'worker': 4}
+{'aspect_ratio': [1, 2],
+     .
+     .
+ 'quality': [50, 95],
+ 'short_size': [720, 1024]}
+Generated 1 data (task 3)
+Generated 2 data (task 0)
+Generated 3 data (task 1)
+     .
+     .
+Generated 49 data (task 48)
+Generated 50 data (task 49)
+46.32 seconds elapsed
+```
+Some important arguments:
+- `-o` : directory path to save data.
+- `-c` : number of data to generate.
+- `-w` : number of workers.
+- `-s` : random seed.
+- `-v` : print error messages.
+To generate English, Chinese, Japanese, and Korean (ECJK) samples:
+```bash
+# english
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_en.yaml
+# chinese
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_zh.yaml
+# japanese
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ja.yaml
+# korean
+synthtiger -o {dataset_path} -c {num_of_data} -w {num_of_workers} -v template.py SynthDoG config_ko.yaml
+```

synthdog/config_en.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+quality: [50, 95]
+landscape: 0.5
+short_size: [720, 1024]
+aspect_ratio: [1, 2]
+background:
+  image:
+    paths: [resources/background]
+    weights: [1]
+  effect:
+    args:
+      # gaussian blur
+      - prob: 1
+        args:
+          sigma: [0, 10]
+document:
+  fullscreen: 0.5
+  landscape: 0.5
+  short_size: [480, 1024]
+  aspect_ratio: [1, 2]
+  paper:
+    image:
+      paths: [resources/paper]
+      weights: [1]
+      alpha: [0, 0.2]
+      grayscale: 1
+      crop: 1
+  content:
+    margin: [0, 0.1]
+    text:
+      path: resources/corpus/enwiki.txt
+    font:
+      paths: [resources/font/en]
+      weights: [1]
+      bold: 0
+    layout:
+      text_scale: [0.0334, 0.1]
+      max_row: 10
+      max_col: 3
+      fill: [0.5, 1]
+      full: 0.1
+      align: [left, right, center]
+      stack_spacing: [0.0334, 0.0334]
+      stack_fill: [0.5, 1]
+      stack_full: 0.1
+    textbox:
+      fill: [0.5, 1]
+    textbox_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+    content_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+  effect:
+    args:
+      # elastic distortion
+      - prob: 1
+        args:
+          alpha: [0, 1]
+          sigma: [0, 0.5]
+      # gaussian noise
+      - prob: 1
+        args:
+          scale: [0, 8]
+          per_channel: 0
+      # perspective
+      - prob: 1
+        args:
+          weights: [750, 50, 50, 25, 25, 25, 25, 50]
+          args:
+            - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
+effect:
+  args:
+    # color
+    - prob: 0.2
+      args:
+        rgb: [[0, 255], [0, 255], [0, 255]]
+        alpha: [0, 0.2]
+    # shadow
+    - prob: 1
+      args:
+        intensity: [0, 160]
+        amount: [0, 1]
+        smoothing: [0.5, 1]
+        bidirectional: 0
+    # contrast
+    - prob: 1
+      args:
+        alpha: [1, 1.5]
+    # brightness
+    - prob: 1
+      args:
+        beta: [-48, 0]
+    # motion blur
+    - prob: 0.5
+      args:
+        k: [3, 5]
+        angle: [0, 360]
+    # gaussian blur
+    - prob: 1
+      args:
+        sigma: [0, 1.5]

synthdog/config_ja.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+quality: [50, 95]
+landscape: 0.5
+short_size: [720, 1024]
+aspect_ratio: [1, 2]
+background:
+  image:
+    paths: [resources/background]
+    weights: [1]
+  effect:
+    args:
+      # gaussian blur
+      - prob: 1
+        args:
+          sigma: [0, 10]
+document:
+  fullscreen: 0.5
+  landscape: 0.5
+  short_size: [480, 1024]
+  aspect_ratio: [1, 2]
+  paper:
+    image:
+      paths: [resources/paper]
+      weights: [1]
+      alpha: [0, 0.2]
+      grayscale: 1
+      crop: 1
+  content:
+    margin: [0, 0.1]
+    text:
+      path: resources/corpus/jawiki.txt
+    font:
+      paths: [resources/font/ja]
+      weights: [1]
+      bold: 0
+    layout:
+      text_scale: [0.0334, 0.1]
+      max_row: 10
+      max_col: 3
+      fill: [0.5, 1]
+      full: 0.1
+      align: [left, right, center]
+      stack_spacing: [0.0334, 0.0334]
+      stack_fill: [0.5, 1]
+      stack_full: 0.1
+    textbox:
+      fill: [0.5, 1]
+    textbox_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+    content_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+  effect:
+    args:
+      # elastic distortion
+      - prob: 1
+        args:
+          alpha: [0, 1]
+          sigma: [0, 0.5]
+      # gaussian noise
+      - prob: 1
+        args:
+          scale: [0, 8]
+          per_channel: 0
+      # perspective
+      - prob: 1
+        args:
+          weights: [750, 50, 50, 25, 25, 25, 25, 50]
+          args:
+            - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
+effect:
+  args:
+    # color
+    - prob: 0.2
+      args:
+        rgb: [[0, 255], [0, 255], [0, 255]]
+        alpha: [0, 0.2]
+    # shadow
+    - prob: 1
+      args:
+        intensity: [0, 160]
+        amount: [0, 1]
+        smoothing: [0.5, 1]
+        bidirectional: 0
+    # contrast
+    - prob: 1
+      args:
+        alpha: [1, 1.5]
+    # brightness
+    - prob: 1
+      args:
+        beta: [-48, 0]
+    # motion blur
+    - prob: 0.5
+      args:
+        k: [3, 5]
+        angle: [0, 360]
+    # gaussian blur
+    - prob: 1
+      args:
+        sigma: [0, 1.5]

synthdog/config_ko.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+quality: [50, 95]
+landscape: 0.5
+short_size: [720, 1024]
+aspect_ratio: [1, 2]
+background:
+  image:
+    paths: [resources/background]
+    weights: [1]
+  effect:
+    args:
+      # gaussian blur
+      - prob: 1
+        args:
+          sigma: [0, 10]
+document:
+  fullscreen: 0.5
+  landscape: 0.5
+  short_size: [480, 1024]
+  aspect_ratio: [1, 2]
+  paper:
+    image:
+      paths: [resources/paper]
+      weights: [1]
+      alpha: [0, 0.2]
+      grayscale: 1
+      crop: 1
+  content:
+    margin: [0, 0.1]
+    text:
+      path: resources/corpus/kowiki.txt
+    font:
+      paths: [resources/font/ko]
+      weights: [1]
+      bold: 0
+    layout:
+      text_scale: [0.0334, 0.1]
+      max_row: 10
+      max_col: 3
+      fill: [0.5, 1]
+      full: 0.1
+      align: [left, right, center]
+      stack_spacing: [0.0334, 0.0334]
+      stack_fill: [0.5, 1]
+      stack_full: 0.1
+    textbox:
+      fill: [0.5, 1]
+    textbox_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+    content_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+  effect:
+    args:
+      # elastic distortion
+      - prob: 1
+        args:
+          alpha: [0, 1]
+          sigma: [0, 0.5]
+      # gaussian noise
+      - prob: 1
+        args:
+          scale: [0, 8]
+          per_channel: 0
+      # perspective
+      - prob: 1
+        args:
+          weights: [750, 50, 50, 25, 25, 25, 25, 50]
+          args:
+            - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
+effect:
+  args:
+    # color
+    - prob: 0.2
+      args:
+        rgb: [[0, 255], [0, 255], [0, 255]]
+        alpha: [0, 0.2]
+    # shadow
+    - prob: 1
+      args:
+        intensity: [0, 160]
+        amount: [0, 1]
+        smoothing: [0.5, 1]
+        bidirectional: 0
+    # contrast
+    - prob: 1
+      args:
+        alpha: [1, 1.5]
+    # brightness
+    - prob: 1
+      args:
+        beta: [-48, 0]
+    # motion blur
+    - prob: 0.5
+      args:
+        k: [3, 5]
+        angle: [0, 360]
+    # gaussian blur
+    - prob: 1
+      args:
+        sigma: [0, 1.5]

synthdog/config_zh.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+quality: [50, 95]
+landscape: 0.5
+short_size: [720, 1024]
+aspect_ratio: [1, 2]
+background:
+  image:
+    paths: [resources/background]
+    weights: [1]
+  effect:
+    args:
+      # gaussian blur
+      - prob: 1
+        args:
+          sigma: [0, 10]
+document:
+  fullscreen: 0.5
+  landscape: 0.5
+  short_size: [480, 1024]
+  aspect_ratio: [1, 2]
+  paper:
+    image:
+      paths: [resources/paper]
+      weights: [1]
+      alpha: [0, 0.2]
+      grayscale: 1
+      crop: 1
+  content:
+    margin: [0, 0.1]
+    text:
+      path: resources/corpus/zhwiki.txt
+    font:
+      paths: [resources/font/zh]
+      weights: [1]
+      bold: 0
+    layout:
+      text_scale: [0.0334, 0.1]
+      max_row: 10
+      max_col: 3
+      fill: [0.5, 1]
+      full: 0.1
+      align: [left, right, center]
+      stack_spacing: [0.0334, 0.0334]
+      stack_fill: [0.5, 1]
+      stack_full: 0.1
+    textbox:
+      fill: [0.5, 1]
+    textbox_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+    content_color:
+      prob: 0.2
+      args:
+        gray: [0, 64]
+        colorize: 1
+  effect:
+    args:
+      # elastic distortion
+      - prob: 1
+        args:
+          alpha: [0, 1]
+          sigma: [0, 0.5]
+      # gaussian noise
+      - prob: 1
+        args:
+          scale: [0, 8]
+          per_channel: 0
+      # perspective
+      - prob: 1
+        args:
+          weights: [750, 50, 50, 25, 25, 25, 25, 50]
+          args:
+            - percents: [[0.75, 1], [0.75, 1], [0.75, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [0.75, 1]]
+            - percents: [[0.75, 1], [1, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [0.75, 1], [1, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [0.75, 1], [1, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [0.75, 1]]
+            - percents: [[1, 1], [1, 1], [1, 1], [1, 1]]
+effect:
+  args:
+    # color
+    - prob: 0.2
+      args:
+        rgb: [[0, 255], [0, 255], [0, 255]]
+        alpha: [0, 0.2]
+    # shadow
+    - prob: 1
+      args:
+        intensity: [0, 160]
+        amount: [0, 1]
+        smoothing: [0.5, 1]
+        bidirectional: 0
+    # contrast
+    - prob: 1
+      args:
+        alpha: [1, 1.5]
+    # brightness
+    - prob: 1
+      args:
+        beta: [-48, 0]
+    # motion blur
+    - prob: 0.5
+      args:
+        k: [3, 5]
+        angle: [0, 360]
+    # gaussian blur
+    - prob: 1
+      args:
+        sigma: [0, 1.5]

synthdog/elements/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+from elements.background import Background
+from elements.content import Content
+from elements.document import Document
+from elements.paper import Paper
+from elements.textbox import TextBox
+__all__ = ["Background", "Content", "Document", "Paper", "TextBox"]

synthdog/elements/background.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+from synthtiger import components, layers
class Background:
    """Background element: a textured rectangle followed by a blur stage."""

    def __init__(self, config):
        """Build the texture and effect components from ``config``.

        The ``"image"`` entry is forwarded as keyword arguments to
        ``components.BaseTexture`` and the ``"effect"`` entry to
        ``components.Iterator``; both default to an empty dict.
        """
        texture_kwargs = config.get("image", {})
        effect_kwargs = config.get("effect", {})

        self.image = components.BaseTexture(**texture_kwargs)
        blur_stage = components.Switch(components.GaussianBlur())
        self.effect = components.Iterator([blur_stage], **effect_kwargs)

    def generate(self, size):
        """Return a new rectangle layer of ``size`` with texture and effect applied."""
        canvas = layers.RectLayer(size, (255, 255, 255, 255))
        for stage in (self.image, self.effect):
            # Texture first, then the effect chain, each applied in place.
            stage.apply([canvas])
        return canvas

synthdog/elements/content.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+from collections import OrderedDict
+import numpy as np
+from synthtiger import components
+from elements.textbox import TextBox
+from layouts import GridStack
+class TextReader:
+    def __init__(self, path, cache_size=2 ** 28, block_size=2 ** 20):
+        self.fp = open(path, "r", encoding="utf-8")
+        self.length = 0
+        self.offsets = [0]
+        self.cache = OrderedDict()
+        self.cache_size = cache_size
+        self.block_size = block_size
+        self.bucket_size = cache_size // block_size
+        self.idx = 0
+        while True:
+            text = self.fp.read(self.block_size)
+            if not text:
+                break
+            self.length += len(text)
+            self.offsets.append(self.fp.tell())
+    def __len__(self):
+        return self.length
+    def __iter__(self):
+        return self
+    def __next__(self):
+        char = self.get()
+        self.next()
+        return char
+    def move(self, idx):
+        self.idx = idx
+    def next(self):
+        self.idx = (self.idx + 1) % self.length
+    def prev(self):
+        self.idx = (self.idx - 1) % self.length
+    def get(self):
+        key = self.idx // self.block_size
+        if key in self.cache:
+            text = self.cache[key]
+        else:
+            if len(self.cache) >= self.bucket_size:
+                self.cache.popitem(last=False)
+            offset = self.offsets[key]
+            self.fp.seek(offset, 0)
+            text = self.fp.read(self.block_size)
+            self.cache[key] = text
+        self.cache.move_to_end(key)
+        char = text[self.idx % self.block_size]
+        return char
class Content:
    """Text content of a document: lays out text boxes and fills them from a corpus."""

    def __init__(self, config):
        """Create the reader, font, layout, text box and colour components.

        Each component receives its own section of ``config`` (``"text"``,
        ``"font"``, ``"layout"``, ``"textbox"``, ``"textbox_color"``,
        ``"content_color"``); missing sections default to an empty dict and
        ``"margin"`` defaults to ``[0, 0.1]``.
        """
        self.margin = config.get("margin", [0, 0.1])

        text_cfg = config.get("text", {})
        self.reader = TextReader(**text_cfg)

        font_cfg = config.get("font", {})
        self.font = components.BaseFont(**font_cfg)

        self.layout = GridStack(config.get("layout", {}))
        self.textbox = TextBox(config.get("textbox", {}))

        box_color_cfg = config.get("textbox_color", {})
        self.textbox_color = components.Switch(components.Gray(), **box_color_cfg)

        all_color_cfg = config.get("content_color", {})
        self.content_color = components.Switch(components.Gray(), **all_color_cfg)

    def generate(self, size):
        """Generate text layers for a page of ``size`` (width, height).

        Returns:
            A pair ``(text_layers, texts)`` of equal length: the positioned
            layers and the string rendered in each of them.
        """
        page_w, page_h = size
        lo, hi = self.margin[0], self.margin[1]

        # Independent random margins for the horizontal and vertical direction;
        # the margin is removed on both sides and the result clamped at zero.
        margin_x = page_w * np.random.uniform(lo, hi)
        margin_y = page_h * np.random.uniform(lo, hi)
        region = [
            margin_x,
            margin_y,
            max(page_w - margin_x * 2, 0),
            max(page_h - margin_y * 2, 0),
        ]

        placed_layers = []
        placed_texts = []
        stacks = self.layout.generate(region)

        # Start reading the corpus at a random character.
        self.reader.move(np.random.randint(len(self.reader)))

        for stack in stacks:
            font = self.font.sample()

            for (x, y, w, h), align in stack:
                layer, text = self.textbox.generate((w, h), self.reader, font)
                # Step back one character after every box — presumably to
                # re-use the character the text box read but did not place;
                # verify against TextBox.generate.
                self.reader.prev()

                if layer is None:
                    continue

                # Centre in the cell first, then snap to an edge if requested.
                layer.center = (x + w / 2, y + h / 2)
                if align == "left":
                    layer.left = x
                elif align == "right":
                    layer.right = x + w

                self.textbox_color.apply([layer])
                placed_layers.append(layer)
                placed_texts.append(text)

        self.content_color.apply(placed_layers)

        return placed_layers, placed_texts

synthdog/elements/document.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+import numpy as np
+from synthtiger import components
+from elements.content import Content
+from elements.paper import Paper
class Document:
    """A document: paper plus text content, with shared post-processing effects."""

    def __init__(self, config):
        """Read sizing options and build the paper, content and effect components."""
        self.fullscreen = config.get("fullscreen", 0.5)
        self.landscape = config.get("landscape", 0.5)
        self.short_size = config.get("short_size", [480, 1024])
        self.aspect_ratio = config.get("aspect_ratio", [1, 2])
        self.paper = Paper(config.get("paper", {}))
        self.content = Content(config.get("content", {}))

        # Effect chain: elastic distortion, additive noise, then one of eight
        # perspective variants picked by the selector.
        stages = [
            components.Switch(components.ElasticDistortion()),
            components.Switch(components.AdditiveGaussianNoise()),
        ]
        perspective_choices = [components.Perspective() for _ in range(8)]
        stages.append(components.Switch(components.Selector(perspective_choices)))

        self.effect = components.Iterator(stages, **config.get("effect", {}))

    def _sample_size(self, width, height):
        """Draw a random (width, height) for a document smaller than the canvas."""
        landscape = np.random.rand() < self.landscape
        max_size = width if landscape else height

        # Short side: within the configured range but never beyond the canvas.
        short_lo = min(width, height, self.short_size[0])
        short_hi = min(width, height, self.short_size[1]) + 1
        short_size = np.random.randint(short_lo, short_hi)

        # Aspect ratio: capped so the long side still fits on the canvas.
        ratio_cap = max_size / short_size
        aspect_ratio = np.random.uniform(
            min(ratio_cap, self.aspect_ratio[0]),
            min(ratio_cap, self.aspect_ratio[1]),
        )
        long_size = int(short_size * aspect_ratio)

        if landscape:
            return (long_size, short_size)
        return (short_size, long_size)

    def generate(self, size):
        """Generate the paper layer, text layers and texts for a canvas of ``size``.

        With probability ``fullscreen`` the document takes ``size`` unchanged;
        otherwise a smaller size is sampled.

        Returns:
            ``(paper_layer, text_layers, texts)``.
        """
        width, height = size
        is_fullscreen = np.random.rand() < self.fullscreen

        if not is_fullscreen:
            size = self._sample_size(width, height)

        text_layers, texts = self.content.generate(size)
        paper_layer = self.paper.generate(size)
        # Effects run over the text layers and the paper together.
        self.effect.apply(text_layers + [paper_layer] if isinstance(text_layers, list) else [*text_layers, paper_layer])

        return paper_layer, text_layers, texts

synthdog/elements/paper.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+from synthtiger import components, layers
class Paper:
    """Paper element: a rectangle layer with a texture applied to it."""

    def __init__(self, config):
        """Build the texture component from the ``"image"`` section of ``config``."""
        texture_kwargs = config.get("image", {})
        self.image = components.BaseTexture(**texture_kwargs)

    def generate(self, size):
        """Return a new textured rectangle layer of the given ``size``."""
        fill = (255, 255, 255, 255)
        sheet = layers.RectLayer(size, fill)
        self.image.apply([sheet])
        return sheet

synthdog/elements/textbox.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+import numpy as np
+from synthtiger import layers
class TextBox:
    """Renders a single line of text, character by character, into a box."""

    def __init__(self, config):
        """Read the ``"fill"`` range (fraction of the box width to use)."""
        self.fill = config.get("fill", [1, 1])

    def generate(self, size, text, font):
        """Render characters from ``text`` until the box is full.

        Args:
            size: ``(width, height)`` of the box.
            text: Iterable of characters; carriage returns and newlines are skipped.
            font: Keyword arguments for ``layers.TextLayer``; ``"size"`` is
                overridden with the integer box height.

        Returns:
            ``(text_layer, text)``, or ``(None, None)`` when nothing was
            placed or the placed text is blank after stripping.
        """
        box_width, box_height = size
        glyphs = []
        kept_chars = []

        ratio = np.random.uniform(self.fill[0], self.fill[1])
        # Usable width: the filled fraction, bounded by [height, width].
        limit = np.clip(box_width * ratio, box_height, box_width)
        sized_font = {**font, "size": int(box_height)}
        cursor_x, origin_y = 0, 0

        for char in text:
            if char in "\r\n":
                continue

            glyph = layers.TextLayer(char, **sized_font)
            # Scale every glyph so that it is exactly as tall as the box.
            scale = box_height / glyph.height
            glyph.bbox = [cursor_x, origin_y, *(glyph.size * scale)]
            if glyph.right > limit:
                break

            glyphs.append(glyph)
            kept_chars.append(char)
            cursor_x = glyph.right

        rendered = "".join(kept_chars).strip()
        if not glyphs or not rendered:
            return None, None

        merged = layers.Group(glyphs).merge()
        return merged, rendered

synthdog/layouts/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+from layouts.grid import Grid
+from layouts.grid_stack import GridStack
+__all__ = ["Grid", "GridStack"]

synthdog/layouts/grid.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+import numpy as np
class Grid:
    """Random grid layout: ``row`` lines of text arranged in ``col`` columns.

    Config keys (all optional): ``text_scale`` (range of text height as a
    fraction of the shorter region side), ``max_row``, ``max_col``, ``fill``
    (range of the width fraction given to the columns), ``full`` (probability
    of forcing ``fill`` to 1) and ``align`` (alignment choices per column).
    """

    def __init__(self, config):
        self.text_scale = config.get("text_scale", [0.05, 0.1])
        self.max_row = config.get("max_row", 5)
        self.max_col = config.get("max_col", 3)
        self.fill = config.get("fill", [0, 1])
        self.full = config.get("full", 0)
        self.align = config.get("align", ["left", "right", "center"])

    def generate(self, bbox):
        """Generate a grid of text cells inside ``bbox``.

        Args:
            bbox: ``[left, top, width, height]`` of the available region.

        Returns:
            A list of ``(cell_bbox, align)`` tuples ordered column by column,
            or ``None`` when no grid fits or the region has no width.
        """
        left, top, width, height = bbox

        text_scale = np.random.uniform(self.text_scale[0], self.text_scale[1])
        text_size = min(width, height) * text_scale
        grids = np.random.permutation(self.max_row * self.max_col)

        # Try the (row, col) combinations in random order and keep the first
        # one that fits: each column needs one text_size of width plus one
        # text_size of gap between neighbours, each row one text_size of height.
        for grid in grids:
            row = grid // self.max_col + 1
            col = grid % self.max_col + 1
            if text_size * (col * 2 - 1) <= width and text_size * row <= height:
                break
        else:
            return None

        # A region without width "fits" trivially (text_size is 0) but the
        # width ratios below would divide by zero. The check sits after the
        # random draws so valid inputs consume the random stream as before.
        if width <= 0:
            return None

        # Upper bound on the fill so the mandatory inter-column gaps still fit.
        bound = max(1 - text_size / width * (col - 1), 0)
        full = np.random.rand() < self.full
        fill = np.random.uniform(self.fill[0], self.fill[1])
        fill = 1 if full else fill
        fill = np.clip(fill, 0, bound)

        # Two bits decide whether left/right outer padding is allowed; a
        # single column always gets padding on at least one side.
        padding = np.random.randint(4) if col > 1 else np.random.randint(1, 4)
        padding = (bool(padding // 2), bool(padding % 2))

        # weights alternates gap, column, gap, ..., column, gap (fractions of
        # width). Columns and inner gaps start at the minimum text_size/width.
        weights = np.zeros(col * 2 + 1)
        weights[1:-1] = text_size / width
        probs = 1 - np.random.rand(col * 2 + 1)
        probs[0] = 0 if not padding[0] else probs[0]
        probs[-1] = 0 if not padding[-1] else probs[-1]
        # Distribute the remaining column share (odd slots) and gap share
        # (even slots) proportionally to the random weights.
        probs[1::2] *= max(fill - sum(weights[1::2]), 0) / sum(probs[1::2])
        probs[::2] *= max(1 - fill - sum(weights[::2]), 0) / sum(probs[::2])
        weights += probs

        widths = [width * weights[c] for c in range(col * 2 + 1)]
        heights = [text_size for _ in range(row)]

        xs = np.cumsum([0] + widths)
        ys = np.cumsum([0] + heights)

        layout = []

        for c in range(col):
            # One alignment per column, shared by all of its rows.
            align = self.align[np.random.randint(len(self.align))]

            for r in range(row):
                x, y = xs[c * 2 + 1], ys[r]
                w, h = xs[c * 2 + 2] - x, ys[r + 1] - y
                bbox = [left + x, top + y, w, h]
                layout.append((bbox, align))

        return layout

synthdog/layouts/grid_stack.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""
+Donut
+Copyright (c) 2022-present NAVER Corp.
+MIT License
+"""
+import numpy as np
+from layouts import Grid
class GridStack:
    """Vertical stack of ``Grid`` layouts filling a region from top to bottom.

    Accepts the same config keys as ``Grid`` plus ``stack_spacing`` (range of
    the gap between grids, as a fraction of the shorter region side),
    ``stack_fill`` (range of the height fraction the stack may use) and
    ``stack_full`` (probability of forcing ``stack_fill`` to 1).
    """

    def __init__(self, config):
        self.text_scale = config.get("text_scale", [0.05, 0.1])
        self.max_row = config.get("max_row", 5)
        self.max_col = config.get("max_col", 3)
        self.fill = config.get("fill", [0, 1])
        self.full = config.get("full", 0)
        self.align = config.get("align", ["left", "right", "center"])
        self.stack_spacing = config.get("stack_spacing", [0, 0.05])
        self.stack_fill = config.get("stack_fill", [1, 1])
        self.stack_full = config.get("stack_full", 0)
        self._grid = Grid(
            {
                "text_scale": self.text_scale,
                "max_row": self.max_row,
                "max_col": self.max_col,
                "align": self.align,
            }
        )

    def generate(self, bbox):
        """Generate stacked grid layouts inside ``bbox``.

        Args:
            bbox: ``[left, top, width, height]`` of the available region.

        Returns:
            A list of layouts, each a list of ``(cell_bbox, align)`` tuples as
            produced by the grid generator. The list is empty when nothing fits.
        """
        left, top, width, height = bbox

        stack_spacing = np.random.uniform(self.stack_spacing[0], self.stack_spacing[1])
        stack_spacing *= min(width, height)

        stack_full = np.random.rand() < self.stack_full
        stack_fill = np.random.uniform(self.stack_fill[0], self.stack_fill[1])
        stack_fill = 1 if stack_full else stack_fill

        # One fill value is drawn per stack and shared by every grid in it.
        full = np.random.rand() < self.full
        fill = np.random.uniform(self.fill[0], self.fill[1])
        fill = 1 if full else fill
        self._grid.fill = [fill, fill]

        layouts = []
        line = 0  # vertical offset (from top) where the next grid starts

        while True:
            grid_size = (width, height * stack_fill - line)

            # Nothing left to fill (or a region with no width/height): stop
            # here, the rescaling below would divide by zero. Only the exact
            # zero case is handled so every other input draws the same random
            # numbers as before.
            if min(grid_size) == 0:
                break

            # Text size is chosen relative to the whole region, then expressed
            # relative to the remaining area, which is what the grid scales by.
            text_scale = np.random.uniform(self.text_scale[0], self.text_scale[1])
            text_size = min(width, height) * text_scale
            text_scale = text_size / min(grid_size)
            self._grid.text_scale = [text_scale, text_scale]

            layout = self._grid.generate([left, top + line, *grid_size])
            if layout is None:
                break

            # Next grid starts below the lowest cell of this one, plus spacing.
            line = max(y + h - top for (_, y, _, h), _ in layout) + stack_spacing
            layouts.append(layout)

        # Drop the trailing spacing, then spread the unused height randomly
        # above and between the grids (the last share stays below the stack).
        line = max(line - stack_spacing, 0)
        space = max(height - line, 0)
        spaces = np.random.rand(len(layouts) + 1)
        spaces *= space / sum(spaces) if sum(spaces) > 0 else 0
        spaces = np.cumsum(spaces)

        for layout, space in zip(layouts, spaces):
            for bbox, _ in layout:
                x, y, w, h = bbox
                # Cells are shifted in place by the cumulative offset.
                bbox[:] = [x, y + space, w, h]

        return layouts