diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..2e28544b708beea6c0570b12b6597a2239843b04 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +res0.png filter=lfs diff=lfs merge=lfs -text +table_drawn_bbox_with_extra.png filter=lfs diff=lfs merge=lfs -text +unitable/website/unitable-demo.gif filter=lfs diff=lfs merge=lfs -text +unitable/website/unitable-demo.mp4 filter=lfs diff=lfs merge=lfs -text +unitable/website/wandb_screenshot.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2f1579976b64c9a40e5c851eb391b33c88af7cca --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ + +unitable/experiments/unitable_weights/** + +res/** + +TestingFiles/** +TestingFilesImages/** + +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..579eff034c61234268a8921cdf93d4033947527d --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,33 @@ +variables: + GIT_STRATEGY: fetch + GIT_SSL_NO_VERIFY: "true" + GIT_LFS_SKIP_SMUDGE: 1 + DOCKER_BUILDKIT: 1 + +stages: + - build + +image_build: + stage: build + image: docker:stable + before_script: + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN http://$CI_REGISTRY + script: | + CI_COMMIT_SHA_7=$(echo $CI_COMMIT_SHA | cut -c1-7) + DATE=$(date +%Y-%m-%d) + docker build --tag $CI_REGISTRY_IMAGE/$CI_COMMIT_BRANCH:latest \ + --tag $CI_REGISTRY_IMAGE/$CI_COMMIT_BRANCH:$CI_COMMIT_SHA_7 \ + --tag $CI_REGISTRY_IMAGE/$CI_COMMIT_BRANCH:$DATE \ + -f Dockerfile . 
+ docker push $CI_REGISTRY_IMAGE/$CI_COMMIT_BRANCH:latest + docker push $CI_REGISTRY_IMAGE/$CI_COMMIT_BRANCH:$CI_COMMIT_SHA_7 + docker push $CI_REGISTRY_IMAGE/$CI_COMMIT_BRANCH:$DATE + # Run only when Dockerfile has changed + rules: + - if: $CI_PIPELINE_SOURCE == "push" + changes: + - Dockerfile + # Set to `on_success` to automatically rebuild + # Set to `manual` to trigger the build manually using Gitlab UI + when: on_success + allow_failure: true diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000000000000000000000000000000..7a73a41bfdf76d6f793007240d80983a52f15f97 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,2 @@ +{ +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a00bd474b24ba27f9b6917473909de5660647486 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,55 @@ +ARG BASE_IMAGE="nvidia/cuda:12.2.2-devel-ubuntu22.04" + +FROM ${BASE_IMAGE} +ARG HOMEDIRECTORY="/myhome" +ENV HOMEDIRECTORY=$HOMEDIRECTORY + +USER root +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + python3 \ + python3-pip \ + python3-dev \ + poppler-utils \ + gcc \ + git \ + git-lfs \ + htop \ + libgl1 \ + libglib2.0-0 \ + ncdu \ + openssh-client \ + openssh-server \ + psmisc \ + rsync \ + screen \ + sudo \ + tmux \ + unzip \ + vim \ + wget && \ + wget -q https://github.com/justjanne/powerline-go/releases/download/v1.24/powerline-go-linux-"$(dpkg --print-architecture)" -O /usr/local/bin/powerline-shell && \ + chmod a+x /usr/local/bin/powerline-shell + +RUN ln -s /usr/bin/python3 /usr/bin/python +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu117 + + +# setup ssh +RUN ssh-keygen -A +RUN sed -i 's/#*PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +EXPOSE 22 + +# Make the root user's home directory /myhome (the default for run.ai), +# and allow to login with password 'root'. +RUN echo 'root:root' | chpasswd +RUN sed -i 's|:root:/root:|:root:/myhome:|' /etc/passwd + +ENTRYPOINT sudo service ssh start && /bin/bash + + + + + diff --git a/README.md b/README.md index dd9befb09f074acbf513f0b8d6d16fcc9acc5bfe..fc2d8ae150dcef5769325cdc6aa0751ba8a89901 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,118 @@ --- -title: Alps -emoji: 👀 -colorFrom: purple -colorTo: green +title: alps +app_file: app.py sdk: gradio sdk_version: 4.44.0 -app_file: app.py -pinned: false --- +# Alps + +Pipeline for OCRing PDFs and tables + +This repository contains different OCR methods using various libraries/models. + +## Running gradio: +`python app.py` in terminal + + +## Installation : +Build the docker image and run the container + +Clone this repository and install the required dependencies: +``` +pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu117 + +apt install weasyprint + +``` +Note: You need a GPU to run this code. + +## Example Usage + +Run python main.py inside the directory. Provide the path to the test file (the file must be placed inside the repository, and the file path should be relative to the repository (alps)). Next, provide the path to save intermediate outputs from the run (draw cell bounding boxes on the table, show table detection results in pdf), and specify which component to run. 
+ +outputs are printed in terminal + +``` +usage: main.py [-h] [--test_file TEST_FILE] [--debug_folder DEBUG_FOLDER] [--englishFlag ENGLISHFLAG] [--denoise DENOISE] ocr + +``` +Description of the component: + +### ocr1 + +ocr1 +Input: Path to a PDF file +Output: Dictionary of each page and list of line_annotations. List of LineAnnotations contains bboxes for each line and List of its children wordAnnotation. Each wordAnnotation contains bboxes and text inside. +What it does: Runs Ragflow textline detector + OCR with DocTR + +Example: +``` +python main.py ocr1 --test_file TestingFiles/OCRTest1German.pdf --debug_folder ./res/ocrdebug1/ +python main.py ocr1 --test_file TestingFiles/OCRTest3English.pdf --debug_folder ./res/ocrdebug1/ --englishFlag True +``` + +### table1 +Input : file path to an image of a cropped table +Output: Parsed table in HTML form +What it does: Uses Unitable + DocTR + +``` +python main.py table1 --test_file cropped_table.png --debug_folder ./res/table1/ + +``` + +### table2 +Input: File path to an image of a cropped table +Output: Parsed table in HTML form +What it does: Uses Unitable + +``` +python main.py table2 --test_file cropped_table.png --debug_folder ./res/table2/ + +``` +### pdftable1 +Input: PDF file path +Output: Parsed table in HTML form +What it does: Uses Unitable + DocTR + + +``` +python main.py pdftable1 --test_file TestingFiles/OCRTest5English.pdf --debug_folder ./res/table_debug1/ + +python main.py pdftable3 --test_file TestingFiles/TableOCRTestEnglish.pdf --debug_folder ./res/poor_relief2 +``` + + +### pdftable2 : +Input: PDF file path +Output: Parsed table in HTML form +What it does: Detects table and parses them, Runs Full Unitable Table detection + +``` +python main.py pdftable2 --test_file TestingFiles/OCRTest5English.pdf --debug_folder ./res/table_debug2/ +``` + + +### pdftable3 +Input: PDF file path +Output: Parsed table in HTML form +What it does: Detects table with YOLO, Unitable + DocTR + + + +### pdftable4 +Input: 
PDF file path +Output: Parsed table in HTML form +What it does: Detects table with YOLO, Runs Full doctr Table detection + +python main.py pdftable4 --test_file TestingFiles/TableOCRTestEasier.pdf --debug_folder ./res/table_debug3/ + + +## bbox +They are ordered as [xmin,ymin,xmax,ymax], because the coordinates start from (0,0) of the image, which is the upper left corner + +xmin ymin - upper left corner +xmax ymax - lower right corner + +![alt text](image-2.png) -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/abstractClass.py b/abstractClass.py new file mode 100644 index 0000000000000000000000000000000000000000..47102109a27a37e5704d6c0d89e91bf7e035d16e --- /dev/null +++ b/abstractClass.py @@ -0,0 +1,59 @@ + +from typing import Any, List, Literal, Mapping, Optional, Tuple +from abc import ABC, abstractmethod + +import numpy as np +import cv2 +from PIL import Image +from abc import ABC, abstractmethod + +from utils import cropImage + + +class OCRComponent: + """ + Wrapper class for cropping images and giving it to OCR Predictor + """ + def predict_pdf(self, pdf_name:str="", page:int=None, bbx:List[List[float]]=None)-> List[List[float]]: + #TODO: Preprocessing to crop interest region + pass + + +class TextDetector(ABC): + """ + Abstract base class for text detectors that takes in bounding boxes, pdf name, and page + and returns bounding boxes results on them. 
+ """ + + def __init__(self): + + pass + + """ + This is for predicting given an already cropped image + """ + @abstractmethod + def predict_img(self, img:np.ndarray=None)-> List[List[float]]: + # do something with self.input and return bbx + pass + +class textRecognizer(ABC): + """ + class of textRecognizer that takes in bounding boxes, pdf name and page and returns + OCR results on them + """ + + def __init__(self): + + pass + + + """ + This is for predicting given text line detection result form text line detector + """ + @abstractmethod + def predict_img(self, bxs:List[List[float]], img:Image.Image)-> List[List[float]]: + # do something with self.input and return bbx + pass + + \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..9f0db95a2d29bf05734f7334902e7bbce9c35962 --- /dev/null +++ b/app.py @@ -0,0 +1,406 @@ + +import os +import traceback +import argparse +from typing import List, Tuple, Set, Dict + +import time +from PIL import Image +import numpy as np +from doctr.models import ocr_predictor +import logging +import pandas as pd +from bs4 import BeautifulSoup +import gradio + +from utils import cropImages +from utils import draw_only_box,draw_box_with_text,getlogger,Annotation +from ocr_component1 import OCRComponent1 +from detectionAndOcrTable1 import DetectionAndOcrTable1 +from detectionAndOcrTable2 import DetectionAndOcrTable2 +from detectionAndOcrTable3 import DetectionAndOcrTable3 +from detectionAndOcrTable4 import DetectionAndOcrTable4 +from ocrTable1 import OcrTable1 +from ocrTable2 import OcrTable2 +from pdf2image import convert_from_path + + +def convertHTMLToCSV(html:str,output_path:str)->str: + + # empty list + data = [] + + # for getting the header from + # the HTML file + list_header = [] + soup = BeautifulSoup(html,'html.parser') + header = soup.find_all("table")[0].find("tr") + + for items in header: + try: + list_header.append(items.get_text()) + except: + continue + 
+ # for getting the data + HTML_data = soup.find_all("table")[0].find_all("tr")[1:] + + for element in HTML_data: + sub_data = [] + for sub_element in element: + try: + sub_data.append(sub_element.get_text()) + except: + continue + data.append(sub_data) + + # Storing the data into Pandas + # DataFrame + dataFrame = pd.DataFrame(data = data, columns = list_header) + + # Converting Pandas DataFrame + # into CSV file + dataFrame.to_csv(output_path) + +def saveResults(image_list, results, labels, output_dir='output/', threshold=0.5): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + for idx, im in enumerate(image_list): + im = draw_only_box(im, results[idx], labels, threshold=threshold) + + out_path = os.path.join(output_dir, f"{idx}.jpg") + im.save(out_path, quality=95) + print("save result to: " + out_path) + +def InputToImages(input_path:str,resolution=300)-> List[Image.Image]: + """ + input is file location to image + return : List of Pillow image objects + """ + images=[] + try: + img =Image.open(input_path) + if img.mode == 'RGBA': + img = img.convert('RGB') + images.append(img) + except Exception as e: + traceback.print_exc() + return images + +def drawTextDetRes(bxs :List[List[float]],img:Image.Image,output_path:str): + """ + draw layout analysis results + """ + """bxs_draw is xmin, ymin, xmax, ymax""" + bxs_draw = [[b[0][0], b[0][1], b[1][0], b[-1][1]] for b in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]] + + #images_to_recognizer = cropImage(bxs, img) + img_to_save = draw_only_box(img, bxs_draw) + img_to_save.save(output_path, quality=95) + +def test_ocr_component1(test_file="TestingFiles/OCRTest1German.pdf", debug_folder = './res/table1/',englishFlag = False): + #Takes as input image of a single page and returns the detected lines and words + + images = convert_from_path(test_file) + ocr = OCRComponent1(englishFlag) + ocr_results = {} + + all_text_in_pages = {} + for page_number,img in enumerate(images): + text_in_page = "" + + 
line_annotations= ocr.predict(img = np.array(img)) + ocr_results[page_number] = line_annotations + + """ + boxes_to_draw =[] + for list_of_ann in word_annotations: + for ann in list_of_ann: + logger.info(ann.text) + b = ann.box + boxes_to_draw.append(b) + + img_to_save = draw_only_box(img,boxes_to_draw) + img_to_save.save("res/12June_2_lines.png", quality=95) + """ + + line_boxes_to_draw =[] + #print("Detected lines are ") + #print(len(line_annotations.items())) + for index,ann in line_annotations.items(): + + b = ann.box + line_boxes_to_draw.append(b) + line_words = "" + #print("detected words per line") + #print(len(ann.words)) + for wordann in ann.words: + line_words += wordann.text +" " + print(line_words) + text_in_page += line_words +"\n" + + img_to_save1 = draw_only_box(img,line_boxes_to_draw) + imgname = test_file.split("/")[-1][:-4] + img_to_save1.save(debug_folder+imgname+"_"+str(page_number)+"_bbox_detection.png", quality=95) + + all_text_in_pages[page_number] = text_in_page + + return ocr_results, all_text_in_pages + + +def test_tableOcrOnly1(test_file :Image.Image , debug_folder = './res/table1/',denoise = False,englishFlag = False): + #Hybrid Unitable +DocTR + #Good at these kind of tables - with a lot of texts + table = OcrTable1(englishFlag) + image = test_file.convert("RGB") + """ + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_" + + table_code = table.predict([image],debugfolder_filename_page_name,denoise = denoise) + with open(debugfolder_filename_page_name+'output.txt', 'w') as file: + file.write(table_code) + """ + + table_code = table.predict([image],denoise = denoise) + return table_code + + +def test_tableOcrOnly2(test_file:Image.Image, debug_folder = './res/table2/'): + table = OcrTable2() + #FullUnitable + #Good at these kind of tables - with not much text + + image = test_file.convert("RGB") + table.predict([image],debug_folder) + +def test_table_component1(test_file = 
'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/',denoise = False,englishFlag = True): + table_predictor = DetectionAndOcrTable1(englishFlag) + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + + #print(img.mode) + print("Looking at page:") + print(page_number) + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name,denoise = denoise) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w') as file: + file.write(table_code) + return table_codes + +def test_table_component2(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/'): + #This components can take in entire pdf page as input , scan for tables and return the table in html format + #Uses the full unitable model + + table_predictor = DetectionAndOcrTable2() + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + print("Looking at page:") + print(page_number) + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w') as file: + file.write(table_code) + return table_codes + +def test_table_component3(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/',denoise = False,englishFlag = True): + table_predictor = DetectionAndOcrTable3(englishFlag) + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + #print(img.mode) + print("Looking at page:") + print(page_number) 
+ parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w') as file: + file.write(table_code) + return table_codes + + + +def test_table_component4(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/'): + table_predictor = DetectionAndOcrTable4() + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + #print(img.mode) + print("Looking at page:") + print(page_number) + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w') as file: + file.write(table_code) + return table_codes + + +""" + parser = argparse.ArgumentParser(description='Process some strings.') + parser.add_argument('ocr', type=str, help='type in id of the component to test') + parser.add_argument('--test_file',type=str, help='path to the testing file') + parser.add_argument('--debug_folder',type=str, help='path to the folder you want to save your results in') + parser.add_argument('--englishFlag',type=bool, help='Whether your pdf is in english => could lead to better results ') + parser.add_argument('--denoise',type=bool, help='preprocessing for not clean scans ') + + args = parser.parse_args() + start = time.time() + if args.ocr == "ocr1": + test_ocr_component1(args.test_file,args.debug_folder, args.englishFlag) + elif args.ocr == "table1": + 
test_tableOcrOnly1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) + elif args.ocr == "table2": + test_tableOcrOnly2(args.test_file,args.debug_folder) + elif args.ocr =="pdftable1": + test_table_component1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) + elif args.ocr =="pdftable2": + test_table_component2(args.test_file,args.debug_folder) + elif args.ocr =="pdftable3": + test_table_component3(args.test_file,args.debug_folder,args.englishFlag,args.denoise) + elif args.ocr =="pdftable4": + test_table_component4(args.test_file,args.debug_folder) + +""" +import gradio as gr +from gradio_pdf import PDF + +with gr.Blocks() as demo: + gr.Markdown("# OCR component") + inputs_for_ocr = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="English Document?",value =False)] + ocr_btn = gr.Button("Run ocr") + + gr.Examples( + examples=[["TestingFiles/OCRTest1German.pdf",'./res/table1/',False]], + inputs=inputs_for_ocr + ) + + outputs_for_ocr = [gr.Textbox(label="List of annotation objects"), gr.Textbox("Text in page")] + + ocr_btn.click(fn=test_ocr_component1, + inputs = inputs_for_ocr, + outputs = outputs_for_ocr, + api_name="OCR" + ) + + gr.Markdown("# Table OCR components that takes a pdf, extract table and return their html code ") + gr.Markdown("## Component 1 uses table transformer and doctr +Unitable") + inputs_for_pdftable1 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="Denoise?",value =False),gr.Checkbox(label ="English Document?",value =False)] + table1_btn = gr.Button("Run pdftable1") + + gr.Examples( + examples=[["TestingFiles/OCRTest5English.pdf",'./res/table1/',False]], + inputs=inputs_for_pdftable1 + ) + outputs_for_pdftable1 = [gr.Textbox(label="Table code")] + + table1_btn.click(fn=test_table_component1, + inputs = inputs_for_pdftable1, + outputs = outputs_for_pdftable1, + api_name="pdfTable1" + 
) + + gr.Markdown("## Component 2 uses table transformer and Unitable") + inputs_for_pdftable2 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/")] + table2_btn = gr.Button("Run pdftable2") + + gr.Examples( + examples=[["TestingFiles/OCRTest5English.pdf",'./res/table1/',False]], + inputs=inputs_for_pdftable1 + ) + outputs_for_pdftable2 = [gr.Textbox(label="Table code")] + + table2_btn.click(fn=test_table_component2, + inputs = inputs_for_pdftable2, + outputs = outputs_for_pdftable2, + api_name="pdfTable2" + ) + + gr.Markdown("## Component 3 uses Yolo and Unitable+doctr") + inputs_for_pdftable3 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="Denoise?",value =False),gr.Checkbox(label ="English Document?",value =False)] + table3_btn = gr.Button("Run pdftable3") + + + gr.Examples( + examples=[["TestingFiles/TableOCRTestEnglish.pdf",'./res/table1/',False]], + inputs=inputs_for_pdftable1 + ) + outputs_for_pdftable3 = [gr.Textbox(label="Table code")] + + table3_btn.click(fn=test_table_component3, + inputs = inputs_for_pdftable3, + outputs = outputs_for_pdftable3, + api_name="pdfTable3" + ) + + gr.Markdown("## Component 4 uses Yolo and Unitable") + inputs_for_pdftable4 = [PDF(label="Document"), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/")] + table4_btn = gr.Button("Run pdftable4") + + gr.Examples( + examples=[["TestingFiles/TableOCRTestEasier.pdf",'./res/table1/',False]], + inputs=inputs_for_pdftable1 + ) + outputs_for_pdftable4 = [gr.Textbox(label="Table code")] + + + table4_btn.click(fn=test_table_component4, + inputs = inputs_for_pdftable4, + outputs = outputs_for_pdftable4, + api_name="pdfTable4" + ) + + + gr.Markdown("# Table OCR component that takes image of an cropped tavle, extract table and return their html code ") + + inputs_for_table1 = [gr.Image(label="Image of cropped table",type='pil'), gr.Textbox(label="internal 
debug folder",placeholder = "./res/table1/"),gr.Checkbox(label ="Denoise?",value =False),gr.Checkbox(label ="English Document?",value =False)] + onlytable1_btn = gr.Button("Run table1") + + gr.Examples( + examples=[[Image.open("cropped_table.png"),'./res/table1/',False]], + inputs=inputs_for_table1 + ) + outputs_for_table1 = [gr.HTML(label="Table code")] + + + onlytable1_btn.click(fn=test_tableOcrOnly1, + inputs = inputs_for_table1, + outputs = outputs_for_table1, + api_name="table1" + ) + + gr.Markdown("## Another Table OCR component that takes image of an cropped table, extract table and return their html code ") + + inputs_for_table2 = [gr.Image(label="Image of cropped table",type='pil'), gr.Textbox(label="internal debug folder",placeholder = "./res/table1/")] + onlytable2_btn = gr.Button("Run table2") + + + gr.Examples( + examples=[[Image.open("cropped_table.png"),'./res/table1/',False]], + inputs=inputs_for_table2 + ) + outputs_for_table2 = [gr.HTML(label="Table code")] + + onlytable2_btn.click(fn=test_tableOcrOnly2, + inputs = inputs_for_table2, + outputs = outputs_for_table2, + api_name="table2" + ) + + + + +demo.launch(share=True) \ No newline at end of file diff --git a/cropped_table.png b/cropped_table.png new file mode 100644 index 0000000000000000000000000000000000000000..05708c5e5a87d3c493aa66649e0c8656fb1fc403 Binary files /dev/null and b/cropped_table.png differ diff --git a/cropped_table_0.png b/cropped_table_0.png new file mode 100644 index 0000000000000000000000000000000000000000..f4cd408ba94ebbf9062240165cfa4f22486337d8 Binary files /dev/null and b/cropped_table_0.png differ diff --git a/cropped_table_1.png b/cropped_table_1.png new file mode 100644 index 0000000000000000000000000000000000000000..4c2e2bb5ec81a55f0dd581e475320d9ac80d7238 Binary files /dev/null and b/cropped_table_1.png differ diff --git a/deepdoc/README.md b/deepdoc/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..5c7235566f532e85e697f12d3a549022ac6ea4a9 --- /dev/null +++ b/deepdoc/README.md @@ -0,0 +1,122 @@ +English | [简体中文](./README_zh.md) + +# *Deep*Doc + +- [1. Introduction](#1) +- [2. Vision](#2) +- [3. Parser](#3) + + +## 1. Introduction + +With a bunch of documents from various domains with various formats and along with diverse retrieval requirements, +an accurate analysis becomes a very challenging task. *Deep*Doc is born for that purpose. +There are 2 parts in *Deep*Doc so far: vision and parser. +You can run the following test programs if you're interested in our results of OCR, layout recognition and TSR. +```bash +python deepdoc/vision/t_ocr.py -h +usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] + +options: + -h, --help show this help message and exit + --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF + --output_dir OUTPUT_DIR + Directory where to store the output images. Default: './ocr_outputs' +``` +```bash +python deepdoc/vision/t_recognizer.py -h +usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}] + +options: + -h, --help show this help message and exit + --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF + --output_dir OUTPUT_DIR + Directory where to store the output images. Default: './layouts_outputs' + --threshold THRESHOLD + A threshold to filter out detections. Default: 0.5 + --mode {layout,tsr} Task mode: layout recognition or table structure recognition +``` + +Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!! +```bash +export HF_ENDPOINT=https://hf-mirror.com +``` + + +## 2. Vision + +We use vision information to resolve problems as human beings do. + - OCR. 
Since a lot of documents are presented as images, or can at least be converted to images, + OCR is an essential, fundamental and nearly universal solution for text extraction. + ```bash + python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result + ``` + The input can be a directory of images or PDFs, or a single image or PDF. + You can look into the folder 'path_to_store_result', which contains images that show the positions of the results + and txt files that contain the OCR text. +
+ +
+ + - Layout recognition. Documents from different domains may have various layouts; + for example, newspapers, magazines, books and résumés are distinct in terms of layout. + Only when the machine has an accurate layout analysis can it decide whether text parts are successive or not, + whether a part needs Table Structure Recognition (TSR) to process, or whether a part is a figure described by a caption. + We have 10 basic layout components which cover most cases: + - Text + - Title + - Figure + - Figure caption + - Table + - Table caption + - Header + - Footer + - Reference + - Equation + + Try the following command to see the layout detection results. + ```bash + python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result + ``` + The input can be a directory of images or PDFs, or a single image or PDF. + You can look into the folder 'path_to_store_result', which contains images that show the detection results, as follows: +
+ +
+ + - Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, including numbers or text. + The structure of a table can be very complex, with hierarchical headers, spanning cells and projected row headers. + Along with TSR, we also reassemble the content into sentences which can be well comprehended by an LLM. + We have five labels for the TSR task: + - Column + - Row + - Column header + - Projected row header + - Spanning cell + + Try the following command to see the table structure recognition results. + ```bash + python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result + ``` + The input can be a directory of images or PDFs, or a single image or PDF. + You can look into the folder 'path_to_store_result', which contains both images and html pages that show the detection results, as follows: +
+ +
+ + +## 3. Parser + +Four kinds of document formats as PDF, DOCX, EXCEL and PPT have their corresponding parser. +The most complex one is PDF parser since PDF's flexibility. The output of PDF parser includes: + - Text chunks with their own positions in PDF(page number and rectangular positions). + - Tables with cropped image from the PDF, and contents which has already translated into natural language sentences. + - Figures with caption and text in the figures. + +### Résumé + +The résumé is a very complicated kind of document. A résumé which is composed of unstructured text +with various layouts could be resolved into structured data composed of nearly a hundred of fields. +We haven't opened the parser yet, as we open the processing method after parsing procedure. + + \ No newline at end of file diff --git a/deepdoc/__init__.py b/deepdoc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2b1cff12fe96e0b6bf64f2ec8a211a7e1f38497d --- /dev/null +++ b/deepdoc/__init__.py @@ -0,0 +1,8 @@ + +""" +In deepdoc/__init__.py, import the class from ragFlow.py and make it available for import from the deepdoc package: +""" + +from .vision import RagFlow + +__all__ = ['RagFlow'] \ No newline at end of file diff --git a/deepdoc/models/.gitattributes b/deepdoc/models/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/deepdoc/models/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs 
diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/deepdoc/models/README.md b/deepdoc/models/README.md new file mode 100644 index 0000000000000000000000000000000000000000..154df8298fab5ecf322016157858e08cd1bccbe1 --- /dev/null +++ b/deepdoc/models/README.md @@ -0,0 +1,3 @@ +--- +license: apache-2.0 +--- diff --git a/deepdoc/models/det.onnx b/deepdoc/models/det.onnx new file mode 100644 index 0000000000000000000000000000000000000000..d8e1a035c9a98358b8a7969a757f6e50f49bb3cd --- /dev/null +++ b/deepdoc/models/det.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a86f5731181461d08021402766601e4302a9b9b9666be8aff402696339cdff +size 4745517 diff --git a/deepdoc/models/layout.laws.onnx b/deepdoc/models/layout.laws.onnx new file mode 100644 index 0000000000000000000000000000000000000000..86ad0bda0883e703e6065f2bf3c91354349d5b30 --- /dev/null +++ b/deepdoc/models/layout.laws.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:485a7ddf6889ef15a150bded7091ec1ea5467871f50a88f5f4297c66c1ecef1e +size 12246134 diff --git a/deepdoc/models/layout.manual.onnx b/deepdoc/models/layout.manual.onnx new file mode 100644 index 0000000000000000000000000000000000000000..86ad0bda0883e703e6065f2bf3c91354349d5b30 --- /dev/null +++ b/deepdoc/models/layout.manual.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:485a7ddf6889ef15a150bded7091ec1ea5467871f50a88f5f4297c66c1ecef1e +size 12246134 diff --git a/deepdoc/models/layout.onnx b/deepdoc/models/layout.onnx new file mode 100644 index 0000000000000000000000000000000000000000..86ad0bda0883e703e6065f2bf3c91354349d5b30 --- /dev/null +++ b/deepdoc/models/layout.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:485a7ddf6889ef15a150bded7091ec1ea5467871f50a88f5f4297c66c1ecef1e +size 12246134 diff --git a/deepdoc/models/layout.paper.onnx b/deepdoc/models/layout.paper.onnx new file mode 100644 index 0000000000000000000000000000000000000000..86ad0bda0883e703e6065f2bf3c91354349d5b30 --- /dev/null +++ b/deepdoc/models/layout.paper.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:485a7ddf6889ef15a150bded7091ec1ea5467871f50a88f5f4297c66c1ecef1e +size 12246134 diff --git a/deepdoc/models/ocr.res b/deepdoc/models/ocr.res new file mode 100644 index 0000000000000000000000000000000000000000..84b885d8352226e49b1d5d791b8f43a663e246aa --- /dev/null +++ b/deepdoc/models/ocr.res @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 
+哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 
+几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 
+於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? +郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 
+裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 
+隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 
+掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 
+薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 
+疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 
+鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/deepdoc/models/rec.onnx b/deepdoc/models/rec.onnx new file mode 100644 index 0000000000000000000000000000000000000000..44523663b32151961641b417ce564f92cf2041ed --- /dev/null +++ b/deepdoc/models/rec.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7cf60de2afd728d512f4190cf37455092b45f06175365c6fc58d8cd7e2a68b +size 10826336 diff --git a/deepdoc/models/tsr.onnx b/deepdoc/models/tsr.onnx new file mode 100644 index 0000000000000000000000000000000000000000..f5b9bf39d95da4118b9d24a8e60361c8427c0b82 --- /dev/null +++ b/deepdoc/models/tsr.onnx 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04c14c3e41802450a1f437a3865ce1a3186046262ea4d75c8975289687a43223 +size 12243020 diff --git a/deepdoc/vision/__init__.py b/deepdoc/vision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80c2b06da7034abbb278383ea2fbd0f7738e493b --- /dev/null +++ b/deepdoc/vision/__init__.py @@ -0,0 +1,3 @@ +from .ragFlow import RagFlow + +__all__ = ['ragFlow'] diff --git a/deepdoc/vision/ocr.res b/deepdoc/vision/ocr.res new file mode 100644 index 0000000000000000000000000000000000000000..b62de66190de02c68df57fb21de1e2da9bd92fea --- /dev/null +++ b/deepdoc/vision/ocr.res @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 
+哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 
+几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 
+於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? +郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 
+裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 
+隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 
+掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 
+薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 
+疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 
+鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/deepdoc/vision/operators.py b/deepdoc/vision/operators.py new file mode 100644 index 0000000000000000000000000000000000000000..382fe3635ff827799f4907d77f150cca72f2560c --- /dev/null +++ b/deepdoc/vision/operators.py @@ -0,0 +1,711 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import six +import cv2 +import numpy as np +import math +from PIL import Image + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, + img_mode='RGB', + channel_first=False, + ignore_orientation=False, + **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + self.ignore_orientation = ignore_orientation + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert isinstance(img, str) and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert isinstance(img, bytes) and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + if self.ignore_orientation: + img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION | + cv2.IMREAD_COLOR) + else: + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class StandardizeImage(object): + """normalize image + Args: + mean (list): im - mean + std (list): im / std + is_scale (bool): whether need im / 255 + norm_type (str): type in ['mean_std', 'none'] + """ + + def __init__(self, mean, std, is_scale=True, norm_type='mean_std'): + self.mean = mean + self.std = std + self.is_scale = is_scale + self.norm_type = norm_type + + def __call__(self, im, im_info): + """ + Args: + im 
(np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.astype(np.float32, copy=False) + if self.is_scale: + scale = 1.0 / 255.0 + im *= scale + + if self.norm_type == 'mean_std': + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im -= mean + im /= std + return im, im_info + + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class Fasttext(object): + def __init__(self, path="None", **kwargs): + import fasttext + self.fast_model = fasttext.load_model(path) + + def __call__(self, data): + label = data['label'] + fast_label = self.fast_model[label] + data['fast_label'] = fast_label + return data + + +class KeepKeys(object): + def __init__(self, 
keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class Pad(object): + def __init__(self, size=None, size_div=32, **kwargs): + if size is not None and not isinstance(size, (int, list, tuple)): + raise TypeError("Type of target_size is invalid. Now is {}".format( + type(size))) + if isinstance(size, int): + size = [size, size] + self.size = size + self.size_div = size_div + + def __call__(self, data): + + img = data['image'] + img_h, img_w = img.shape[0], img.shape[1] + if self.size: + resize_h2, resize_w2 = self.size + assert ( + img_h < resize_h2 and img_w < resize_w2 + ), '(h, w) of target size should be greater than (img_h, img_w)' + else: + resize_h2 = max( + int(math.ceil(img.shape[0] / self.size_div) * self.size_div), + self.size_div) + resize_w2 = max( + int(math.ceil(img.shape[1] / self.size_div) * self.size_div), + self.size_div) + img = cv2.copyMakeBorder( + img, + 0, + resize_h2 - img_h, + 0, + resize_w2 - img_w, + cv2.BORDER_CONSTANT, + value=0) + data['image'] = img + return data + + +class LinearResize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interp = interp + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = 
self.generate_scale(im) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = im.shape[2] + if self.keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + +class Resize(object): + def __init__(self, size=(640, 640), **kwargs): + self.size = size + + def resize_image(self, img): + resize_h, resize_w = self.size + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + return img, [ratio_h, ratio_w] + + def __call__(self, data): + img = data['image'] + if 'polys' in data: + text_polys = data['polys'] + + img_resize, [ratio_h, ratio_w] = self.resize_image(img) + if 'polys' in data: + new_boxes = [] + for box in text_polys: + new_box = [] + for cord in box: + new_box.append([cord[0] * ratio_w, cord[1] * ratio_h]) + new_boxes.append(new_box) + data['polys'] = np.array(new_boxes, dtype=np.float32) + data['image'] = img_resize + return data + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + 
super(DetResizeForTest, self).__init__() + self.resize_type = 0 + self.keep_ratio = False + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + if 'keep_ratio' in kwargs: + self.keep_ratio = kwargs['keep_ratio'] + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if sum([src_h, src_w]) < 64: + img = self.image_padding(img) + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def image_padding(self, im, value=0): + h, w, c = im.shape + im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value + im_pad[:h, :w, :] = im + return im_pad + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + if self.keep_ratio is True: + resize_w = ori_w * resize_h / ori_h + N = math.ceil(resize_w / 32) + resize_w = N * 32 + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = 
self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h, w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except BaseException: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = 
self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +class KieResize(object): + def __init__(self, **kwargs): + super(KieResize, self).__init__() + self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[ + 'img_scale'][1] + + def __call__(self, data): + img = data['image'] + points = data['points'] + 
src_h, src_w, _ = img.shape + im_resized, scale_factor, [ratio_h, ratio_w + ], [new_h, new_w] = self.resize_image(img) + resize_points = self.resize_boxes(img, points, scale_factor) + data['ori_image'] = img + data['ori_boxes'] = points + data['points'] = resize_points + data['image'] = im_resized + data['shape'] = np.array([new_h, new_w]) + return data + + def resize_image(self, img): + norm_img = np.zeros([1024, 1024, 3], dtype='float32') + scale = [512, 1024] + h, w = img.shape[:2] + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float( + scale_factor) + 0.5) + max_stride = 32 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(img, (resize_w, resize_h)) + new_h, new_w = im.shape[:2] + w_scale = new_w / w + h_scale = new_h / h + scale_factor = np.array( + [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) + norm_img[:new_h, :new_w, :] = im + return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w] + + def resize_boxes(self, im, points, scale_factor): + points = points * scale_factor + img_shape = im.shape[:2] + points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1]) + points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0]) + return points + + +class SRResize(object): + def __init__(self, + imgH=32, + imgW=128, + down_sample_scale=4, + keep_ratio=False, + min_ratio=1, + mask=False, + infer_mode=False, + **kwargs): + self.imgH = imgH + self.imgW = imgW + self.keep_ratio = keep_ratio + self.min_ratio = min_ratio + self.down_sample_scale = down_sample_scale + self.mask = mask + self.infer_mode = infer_mode + + def __call__(self, data): + imgH = self.imgH + imgW = self.imgW + images_lr = data["image_lr"] + transform2 = ResizeNormalize( + (imgW // self.down_sample_scale, imgH // 
self.down_sample_scale)) + images_lr = transform2(images_lr) + data["img_lr"] = images_lr + if self.infer_mode: + return data + + images_HR = data["image_hr"] + label_strs = data["label"] + transform = ResizeNormalize((imgW, imgH)) + images_HR = transform(images_HR) + data["img_hr"] = images_HR + return data + + +class ResizeNormalize(object): + def __init__(self, size, interpolation=Image.BICUBIC): + self.size = size + self.interpolation = interpolation + + def __call__(self, img): + img = img.resize(self.size, self.interpolation) + img_numpy = np.array(img).astype("float32") + img_numpy = img_numpy.transpose((2, 0, 1)) / 255 + return img_numpy + + +class GrayImageChannelFormat(object): + """ + format gray scale image's channel: (3,h,w) -> (1,h,w) + Args: + inverse: inverse gray image + """ + + def __init__(self, inverse=False, **kwargs): + self.inverse = inverse + + def __call__(self, data): + img = data['image'] + img_single_channel = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + img_expanded = np.expand_dims(img_single_channel, 0) + + if self.inverse: + data['image'] = np.abs(img_expanded - 1) + else: + data['image'] = img_expanded + + data['src_image'] = img + return data + + +class Permute(object): + """permute image + Args: + to_bgr (bool): whether convert RGB to BGR + channel_first (bool): whether convert HWC to CHW + """ + + def __init__(self, ): + super(Permute, self).__init__() + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.transpose((2, 0, 1)).copy() + return im, im_info + + +class PadStride(object): + """ padding image for model with FPN, instead PadBatch(pad_to_stride) in original config + Args: + stride (bool): model with FPN need image shape % stride == 0 + """ + + def __init__(self, stride=0): + self.coarsest_stride = stride + + def __call__(self, im, im_info): + """ 
+ Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + coarsest_stride = self.coarsest_stride + if coarsest_stride <= 0: + return im, im_info + im_c, im_h, im_w = im.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + return padding_im, im_info + + +def decode_image(im_file, im_info): + """read rgb image + Args: + im_file (str|np.ndarray): input can be image path or np.ndarray + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + if isinstance(im_file, str): + with open(im_file, 'rb') as f: + im_read = f.read() + data = np.frombuffer(im_read, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + else: + im = im_file + im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return im, im_info + + +def preprocess(im, preprocess_ops): + # process image by preprocess_ops + im_info = { + 'scale_factor': np.array( + [1., 1.], dtype=np.float32), + 'im_shape': None, + } + im, im_info = decode_image(im, im_info) + for operator in preprocess_ops: + im, im_info = operator(im, im_info) + return im, im_info diff --git a/deepdoc/vision/postprocess.py b/deepdoc/vision/postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..ec6f69d08c4f0e0eeaabf94264b72fc5649db35b --- /dev/null +++ b/deepdoc/vision/postprocess.py @@ -0,0 +1,353 @@ +import copy + +import numpy as np +import cv2 +from shapely.geometry import Polygon +import pyclipper + + +def build_post_process(config, global_config=None): + support_dict = 
['DBPostProcess', 'CTCLabelDecode'] + + config = copy.deepcopy(config) + module_name = config.pop('name') + if module_name == "None": + return + if global_config is not None: + config.update(global_config) + assert module_name in support_dict, Exception( + 'post process only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class + + +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB). + """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + box_type='quad', + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + self.box_type = box_type + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + boxes = [] + scores = [] + + contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), + cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + + for contour in contours[:self.max_candidates]: + epsilon = 0.002 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + points = approx.reshape((-1, 2)) + if points.shape[0] < 4: + continue + + score = self.box_score_fast(pred, points.reshape(-1, 2)) + if self.box_thresh > score: + continue + + if points.shape[0] > 2: + box = self.unclip(points, self.unclip_ratio) + if len(box) > 1: + continue + else: + continue + box = box.reshape(-1, 2) + + _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2))) + if sside < self.min_size + 2: + 
continue + + box = np.array(box) + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.tolist()) + scores.append(score) + return boxes, scores + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score = self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype("int32")) + scores.append(score) + return np.array(boxes, dtype="int32"), scores + + def unclip(self, box, unclip_ratio): + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + 
points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + contour = contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if not isinstance(pred, np.ndarray): + pred = pred.numpy() + pred = pred[:, 0, :, :] + 
segmentation = pred > self.thresh + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + if self.box_type == 'poly': + boxes, scores = self.polygons_from_bitmap(pred[batch_index], + mask, src_w, src_h) + elif self.box_type == 'quad': + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, + src_w, src_h) + else: + raise ValueError( + "box_type can only be one of ['quad', 'poly']") + + boxes_batch.append({'points': boxes}) + return boxes_batch + + +class BaseRecLabelDecode(object): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False): + self.beg_str = "sos" + self.end_str = "eos" + self.reverse = False + self.character_str = [] + + if character_dict_path is None: + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + else: + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + if 'arabic' in character_dict_path: + self.reverse = True + + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def pred_reverse(self, pred): + pred_re = [] + c_current = '' + for c in pred: + if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)): + if c_current != '': + pred_re.append(c_current) + pred_re.append(c) + c_current = '' + else: + c_current += c + if c_current != '': + pred_re.append(c_current) + + return ''.join(pred_re[::-1]) + + def add_special_char(self, 
dict_character): + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + selection = np.ones(len(text_index[batch_idx]), dtype=bool) + if is_remove_duplicate: + selection[1:] = text_index[batch_idx][1:] != text_index[ + batch_idx][:-1] + for ignored_token in ignored_tokens: + selection &= text_index[batch_idx] != ignored_token + + char_list = [ + self.character[text_id] + for text_id in text_index[batch_idx][selection] + ] + if text_prob is not None: + conf_list = text_prob[batch_idx][selection] + else: + conf_list = [1] * len(selection) + if len(conf_list) == 0: + conf_list = [0] + + text = ''.join(char_list) + + if self.reverse: # for arabic rec + text = self.pred_reverse(text) + + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, tuple) or isinstance(preds, list): + preds = preds[-1] + if not isinstance(preds, np.ndarray): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character diff --git a/deepdoc/vision/ragFlow.py b/deepdoc/vision/ragFlow.py new file mode 100644 index 
0000000000000000000000000000000000000000..c64a4e6ab733228daa4b806b2fe4f0b8b09bd4d7 --- /dev/null +++ b/deepdoc/vision/ragFlow.py @@ -0,0 +1,313 @@ +import copy +import time +import os + +from huggingface_hub import snapshot_download + +from .operators import * +import numpy as np +import onnxruntime as ort +import logging +from .postprocess import build_post_process + +from typing import List + +def get_deepdoc_directory(): + PROJECT_BASE = os.path.abspath( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + os.pardir + ) + ) + return PROJECT_BASE +def transform(data, ops=None): + """ transform """ + if ops is None: + ops = [] + for op in ops: + data = op(data) + if data is None: + return None + return data + + +def create_operators(op_param_list, global_config=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance( + op_param_list, list), ('operator config should be a list') + ops = [] + for operator in op_param_list: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + if global_config is not None: + param.update(global_config) + op = eval(op_name)(**param) + ops.append(op) + return ops + + +def load_model(model_dir, nm): + model_file_path = os.path.join(model_dir, nm + ".onnx") + if not os.path.exists(model_file_path): + raise ValueError("not find model file path {}".format( + model_file_path)) + + options = ort.SessionOptions() + options.enable_cpu_mem_arena = False + options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + options.intra_op_num_threads = 2 + options.inter_op_num_threads = 2 + if False and ort.get_device() == "GPU": + sess = ort.InferenceSession( + model_file_path, + options=options, + providers=['CUDAExecutionProvider']) + else: + sess = ort.InferenceSession( + model_file_path, + options=options, + 
providers=['CPUExecutionProvider']) + print(model_file_path) + print(sess.get_modelmeta().description) + return sess, sess.get_inputs()[0] + + +class RagFlowTextDetector: + """ + The class depends on TextDetector to perform its primary function of detecting text and retrieving bounding boxes. + """ + def __init__(self, model_dir): + pre_process_list = [{ + 'DetResizeForTest': { + 'limit_side_len': 960, + 'limit_type': "max", + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + postprocess_params = {"name": "DBPostProcess", "thresh": 0.3, "box_thresh": 0.5, "max_candidates": 1000, + "unclip_ratio": 1.5, "use_dilation": False, "score_mode": "fast", "box_type": "quad"} + + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor = load_model(model_dir, 'det') + + img_h, img_w = self.input_tensor.shape[2:] + if isinstance(img_h, str) or isinstance(img_w, str): + pass + elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0: + pre_process_list[0] = { + 'DetResizeForTest': { + 'image_shape': [img_h, img_w] + } + } + self.preprocess_op = create_operators(pre_process_list) + + def order_points_clockwise(self, pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0) + diff = np.diff(np.array(tmp), axis=1) + rect[1] = tmp[np.argmin(diff)] + rect[3] = tmp[np.argmax(diff)] + return rect + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res(self, dt_boxes, image_shape): + img_height, 
img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + if isinstance(box, list): + box = np.array(box) + box = self.order_points_clockwise(box) + box = self.clip_det_res(box, img_height, img_width) + rect_width = int(np.linalg.norm(box[0] - box[1])) + rect_height = int(np.linalg.norm(box[0] - box[3])) + if rect_width <= 3 or rect_height <= 3: + continue + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + if isinstance(box, list): + box = np.array(box) + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def __call__(self, img): + ori_im = img.copy() + data = {'image': img} + + st = time.time() + data = transform(data, self.preprocess_op) + img, shape_list = data + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + shape_list = np.expand_dims(shape_list, axis=0) + img = img.copy() + input_dict = {} + input_dict[self.input_tensor.name] = img + for i in range(100000): + try: + outputs = self.predictor.run(None, input_dict) + break + except Exception as e: + if i >= 3: + raise e + time.sleep(5) + + post_result = self.postprocess_op({"maps": outputs[0]}, shape_list) + dt_boxes = post_result[0]['points'] + dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) + + return dt_boxes, time.time() - st + + +class RagFlow(): + def __init__(self, model_dir=None): + + if not model_dir: + try: + model_dir = os.path.join( + get_deepdoc_directory(), + "models") + self.text_detector = RagFlowTextDetector(model_dir) + + + except Exception as e: + model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", + local_dir=os.path.join(get_deepdoc_directory(), "models"), + local_dir_use_symlinks=False) + self.text_detector = RagFlowTextDetector(model_dir) + + + self.drop_score = 0.5 
+ self.crop_image_res_index = 0 + + def get_rotate_crop_image(self, img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + def sorted_boxes(self, dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + for j in range(i, -1, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < _boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break + return _boxes + + def detect(self, img): + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + + if img is None: + return None, None, time_dict + + start = time.time() + dt_boxes, elapse = self.text_detector(img) + time_dict['det'] = elapse + + + 
return zip(self.sorted_boxes(dt_boxes), [ + ("", 0) for _ in range(len(dt_boxes))]) + + def recognize(self, ori_im, box): + img_crop = self.get_rotate_crop_image(ori_im, box) + + rec_res, elapse = self.text_recognizer([img_crop]) + text, score = rec_res[0] + if score < self.drop_score: + return "" + return text + + def predict(self,img:np.ndarray=None)-> List[List[float]]: + """ + Return np array of bounding boxes - for each box 4 points of 2 coordinates + """ + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + + dt_boxes, elapse = self.text_detector(img) + time_dict['det'] = elapse + + + dt_boxes = self.sorted_boxes(dt_boxes) + + + return dt_boxes + + diff --git a/detectionAndOcrTable1.py b/detectionAndOcrTable1.py new file mode 100644 index 0000000000000000000000000000000000000000..d2e1b8ab860448691f2da693351da7120ac62533 --- /dev/null +++ b/detectionAndOcrTable1.py @@ -0,0 +1,425 @@ +from typing import Tuple, List, Sequence, Optional, Union +from torchvision import transforms +from torch import nn, Tensor +from PIL import Image +from pathlib import Path +from bs4 import BeautifulSoup as bs + +import numpy as np +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +from transformers import AutoModelForObjectDetection +import torch +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.patches import Patch + +from unitable import UnitablePredictor +from doctrfiles import DoctrWordDetector,DoctrTextRecognizer +from utils import crop_an_Image,cropImageExtraMargin +from utils import denoisingAndSharpening + +#based on this notebook:https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Inference_with_Table_Transformer_(TATR)_for_parsing_tables.ipynb +class MaxResize(object): + def __init__(self, max_size=800): + self.max_size = max_size + + def __call__(self, image): + width, height = image.size + current_max_size = max(width, height) + scale = self.max_size / 
current_max_size + resized_image = image.resize((int(round(scale*width)), int(round(scale*height)))) + + return resized_image + + +html_table_template = ( + + lambda table: f""" + + + + + {table} +
""" +) + +class DetectionAndOcrTable1(): + def __init__(self,englishFlag=True): + self.unitablePredictor = UnitablePredictor() + self.wordDetector = DoctrWordDetector(architecture="db_resnet50", + path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt", + path_config_json ="doctrfiles/models/db_resnet50_config.json") + + + if englishFlag: + self.textRecognizer = DoctrTextRecognizer(architecture="master", path_weights="./doctrfiles/models/master-fde31e4a.pt", + path_config_json="./doctrfiles/models/master.json") + else: + self.textRecognizer = DoctrTextRecognizer(architecture="parseq", path_weights="./doctrfiles/models/doctr-multilingual-parseq.bin", + path_config_json="./doctrfiles/models/multilingual-parseq-config.json") + + + @staticmethod + def build_table_from_html_and_cell( + structure: List[str], content: List[str] = None + ) -> List[str]: + """Build table from html and cell token list""" + assert structure is not None + html_code = list() + + # deal with empty table + if content is None: + content = ["placeholder"] * len(structure) + + for tag in structure: + if tag in ("[]", ">[]"): + if len(content) == 0: + continue + cell = content.pop(0) + html_code.append(tag.replace("[]", cell)) + else: + html_code.append(tag) + + return html_code + + @staticmethod + def save_detection(detected_lines_images:List[ImageType], prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + + @staticmethod + # for output bounding box post-processing + def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=1) + + @staticmethod + def rescale_bboxes(out_bbox, size): + img_w, img_h = size + b = DetectionAndOcrTable1.box_cxcywh_to_xyxy(out_bbox) + b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32) + return b + + @staticmethod + def outputs_to_objects(outputs, 
img_size, id2label): + m = outputs.logits.softmax(-1).max(-1) + pred_labels = list(m.indices.detach().cpu().numpy())[0] + pred_scores = list(m.values.detach().cpu().numpy())[0] + pred_bboxes = outputs['pred_boxes'].detach().cpu()[0] + pred_bboxes = [elem.tolist() for elem in DetectionAndOcrTable1.rescale_bboxes(pred_bboxes, img_size)] + + objects = [] + for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes): + class_label = id2label[int(label)] + if not class_label == 'no object': + objects.append({'label': class_label, 'score': float(score), + 'bbox': [float(elem) for elem in bbox]}) + + return objects + + @staticmethod + def fig2img(fig): + """Convert a Matplotlib figure to a PIL Image and return it""" + import io + buf = io.BytesIO() + fig.savefig(buf) + buf.seek(0) + img = Image.open(buf) + return img + #For that, the TATR authors employ some padding to make sure the borders of the table are included. + + @staticmethod + def objects_to_crops(img, tokens, objects, class_thresholds, padding=10): + """ + Process the bounding boxes produced by the table detection model into + cropped table images and cropped tokens. 
+ """ + + table_crops = [] + for obj in objects: + # abit unecessary here cause i crop them anywyas + if obj['score'] < class_thresholds[obj['label']]: + continue + + cropped_table = {} + + bbox = obj['bbox'] + bbox = [bbox[0]-padding, bbox[1]-padding, bbox[2]+padding, bbox[3]+padding] + + cropped_img = img.crop(bbox) + + # Add padding to the cropped image + padded_width = cropped_img.width + 40 + padded_height = cropped_img.height +40 + + new_img_np = np.full((padded_height, padded_width, 3), fill_value=255, dtype=np.uint8) + y_offset = (padded_height - cropped_img.height) // 2 + x_offset = (padded_width - cropped_img.width) // 2 + new_img_np[y_offset:y_offset + cropped_img.height, x_offset:x_offset+cropped_img.width] = np.array(cropped_img) + + padded_img = Image.fromarray(new_img_np,'RGB') + + + table_tokens = [token for token in tokens if iob(token['bbox'], bbox) >= 0.5] + for token in table_tokens: + token['bbox'] = [token['bbox'][0]-bbox[0] + padding, + token['bbox'][1]-bbox[1] + padding, + token['bbox'][2]-bbox[0] + padding, + token['bbox'][3]-bbox[1] + padding] + + # If table is predicted to be rotated, rotate cropped image and tokens/words: + if obj['label'] == 'table rotated': + padded_img = padded_img.rotate(270, expand=True) + for token in table_tokens: + bbox = token['bbox'] + bbox = [padded_img.size[0]-bbox[3]-1, + bbox[0], + padded_img.size[0]-bbox[1]-1, + bbox[2]] + token['bbox'] = bbox + + cropped_table['image'] = padded_img + cropped_table['tokens'] = table_tokens + + table_crops.append(cropped_table) + + return table_crops + + @staticmethod + def visualize_detected_tables(img, det_tables, out_path=None): + plt.imshow(img, interpolation="lanczos") + fig = plt.gcf() + fig.set_size_inches(20, 20) + ax = plt.gca() + + for det_table in det_tables: + bbox = det_table['bbox'] + + if det_table['label'] == 'table': + facecolor = (1, 0, 0.45) + edgecolor = (1, 0, 0.45) + alpha = 0.3 + linewidth = 2 + hatch='//////' + elif det_table['label'] == 'table 
rotated': + facecolor = (0.95, 0.6, 0.1) + edgecolor = (0.95, 0.6, 0.1) + alpha = 0.3 + linewidth = 2 + hatch='//////' + else: + continue + + rect = patches.Rectangle(bbox[:2], bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=linewidth, + edgecolor='none',facecolor=facecolor, alpha=0.1) + ax.add_patch(rect) + rect = patches.Rectangle(bbox[:2], bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=linewidth, + edgecolor=edgecolor,facecolor='none',linestyle='-', alpha=alpha) + ax.add_patch(rect) + rect = patches.Rectangle(bbox[:2], bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=0, + edgecolor=edgecolor,facecolor='none',linestyle='-', hatch=hatch, alpha=0.2) + ax.add_patch(rect) + + plt.xticks([], []) + plt.yticks([], []) + + legend_elements = [Patch(facecolor=(1, 0, 0.45), edgecolor=(1, 0, 0.45), + label='Table', hatch='//////', alpha=0.3), + Patch(facecolor=(0.95, 0.6, 0.1), edgecolor=(0.95, 0.6, 0.1), + label='Table (rotated)', hatch='//////', alpha=0.3)] + plt.legend(handles=legend_elements, bbox_to_anchor=(0.5, -0.02), loc='upper center', borderaxespad=0, + fontsize=10, ncol=2) + plt.gcf().set_size_inches(10, 10) + plt.axis('off') + + if out_path is not None: + plt.savefig(out_path, bbox_inches='tight', dpi=150) + + return fig + + + def predict(self,image:Image.Image,debugfolder_filename_page_name,denoise=False): + + + """ + 0. Locate the table using Table detection + 1. Unitable + """ + print("Running table transformer + Unitable Hybrid Model") + + # Step 0 : Locate the table using Table detection TODO + + #First we load a Table Transformer pre-trained for table detection. We use the "no_timm" version here to load the checkpoint with a Transformers-native backbone. 
+ model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm") + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + + #Preparing the image for the model + detection_transform = transforms.Compose([ + MaxResize(800), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + pixel_values = detection_transform(image).unsqueeze(0) + pixel_values = pixel_values.to(device) + + # Next, we forward the pixel values through the model. + # The model outputs logits of shape (batch_size, num_queries, num_labels + 1). The +1 is for the "no object" class. + with torch.no_grad(): + outputs = model(pixel_values) + + # update id2label to include "no object" + id2label = model.config.id2label + id2label[len(model.config.id2label)] = "no object" + + #[{'label': 'table', 'score': 0.9999570846557617, 'bbox': [110.24547576904297, 73.31171417236328, 1024.609130859375, 308.7159423828125]}] + objects = DetectionAndOcrTable1.outputs_to_objects(outputs, image.size, id2label) + + #Only do these for objects with score greater than 0.8 + objects = [obj for obj in objects if obj['score'] > 0.95] + + print("detected object from the table transformers are") + print(objects) + if objects: + + #Next, we crop the table out of the image. For that, the TATR authors employ some padding to make sure the borders of the table are included. 
+ + + tokens = [] + detection_class_thresholds = { + "table": 0.95, #this is a bit double cause we do up there another filtering but didn't want to modify too much from original code + "table rotated": 0.95, + "no object": 10 + } + crop_padding = 10 + + + tables_crops = DetectionAndOcrTable1.objects_to_crops(image, tokens, objects, detection_class_thresholds, padding=crop_padding) + + cropped_tables =[] + for i in range (len(tables_crops)): + cropped_table = tables_crops[i]['image'].convert("RGB") + cropped_table.save(debugfolder_filename_page_name+"cropped_table_"+str(i)+".png") + cropped_tables.append(cropped_table) + + # Step 1: Unitable + #This take PIL Images as input + if denoise: + cropped_tables =denoisingAndSharpening(cropped_tables) + pred_htmls, pred_bboxs = self.unitablePredictor.predict(cropped_tables,debugfolder_filename_page_name) + + table_codes = [] + for k in range(len(cropped_tables)): + pred_html =pred_htmls[k] + pred_bbox = pred_bboxs[k] + + # Some tabless have a lot of words in their header + # So for the headers, give doctr word ddetector doesn't work when the images aren't square + table_header_cells = 0 + header_exists = False + for cell in pred_html: + if cell=='>[]' or cell == '[]': + table_header_cells += 1 + if cell =='': + header_exists = True + break + if not header_exists: + table_header_cells = 0 + pred_cell = [] + cell_imgs_to_viz = [] + cell_img_num=0 + + # Find what one line should be if there is a cell with a single line + one_line_height = 100000 + for i in range(table_header_cells): + box = pred_bbox[i] + xmin, ymin, xmax, ymax = box + current_box_height = abs(ymax-ymin) + if current_box_height 0 and current_box_height>one_line_height+5: + + cell_img= cropImageExtraMargin([fourbytwo],cropped_tables[k],margin=1.4)[0] + table_header_cells -= 1 + + #List of 4 x 2 + detection_results = self.wordDetector.predict(cell_img,sort_vertical=True) + + input_to_recog = [] + if detection_results == []: + input_to_recog.append(cell_img) + 
else: + + for wordbox in detection_results: + + cropped_image= crop_an_Image(wordbox.box,cell_img) + if cropped_image.shape[0] >0 and cropped_image.shape[1]>0: + input_to_recog.append(cropped_image) + else: + print("Empty image") + else: + cell_img = crop_an_Image(fourbytwo,cropped_tables[k]) + if table_header_cells>0: + table_header_cells -= 1 + if cell_img.shape[0] >0 and cell_img.shape[1]>0: + input_to_recog =[cell_img] + + cell_imgs_to_viz.append(cell_img) + + + if input_to_recog != []: + words = self.textRecognizer.predict_for_tables(input_to_recog) + cell_output = " ".join(words) + pred_cell.append(cell_output) + else: + #Don't lose empty cell + pred_cell.append("") + + + print(pred_cell) + #Step3 : + pred_code = self.build_table_from_html_and_cell(pred_html, pred_cell) + pred_code = "".join(pred_code) + pred_code = html_table_template(pred_code) + + + soup = bs(pred_code) + #formatted and indented) string representation of the HTML document + table_code = soup.prettify() + print(table_code) + + # Append extracted table to table_codes + table_codes.append(table_code) + + return table_codes + + + + + + + + diff --git a/detectionAndOcrTable2.py b/detectionAndOcrTable2.py new file mode 100644 index 0000000000000000000000000000000000000000..ce2769c5259bf98d3ea5cc1df164d7b84e63c56c --- /dev/null +++ b/detectionAndOcrTable2.py @@ -0,0 +1,306 @@ +from typing import Tuple, List, Sequence, Optional, Union +from torchvision import transforms +from torch import nn, Tensor +from PIL import Image +from pathlib import Path +from bs4 import BeautifulSoup as bs + +import numpy as np +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +from transformers import AutoModelForObjectDetection +import torch +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.patches import Patch + +from unitable import UnitableFullPredictor + +#based on this 
notebook:https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Inference_with_Table_Transformer_(TATR)_for_parsing_tables.ipynb +class MaxResize(object): + def __init__(self, max_size=800): + self.max_size = max_size + + def __call__(self, image): + width, height = image.size + current_max_size = max(width, height) + scale = self.max_size / current_max_size + resized_image = image.resize((int(round(scale*width)), int(round(scale*height)))) + + return resized_image + +def iob(boxA, boxB): + """ + Calculate the Intersection over Bounding Box (IoB) of two bounding boxes. + + Parameters: + - boxA: list or tuple with [xmin, ymin, xmax, ymax] of the first box + - boxB: list or tuple with [xmin, ymin, xmax, ymax] of the second box + + Returns: + - iob: float, the IoB ratio + """ + # Determine the coordinates of the intersection rectangle + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + + # Compute the area of intersection rectangle + interWidth = max(0, xB - xA) + interHeight = max(0, yB - yA) + interArea = interWidth * interHeight + + # Compute the area of boxB (the second box) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + + # Compute the Intersection over Bounding Box (IoB) ratio + iob = interArea / float(boxBArea) + + return iob + +class DetectionAndOcrTable2(): + #This components can take in entire pdf page as input , scan for tables and return the table in html format + #Uses the full unitable model - different to DetectionAndOcrTable1 + def __init__(self): + self.unitableFullPredictor = UnitableFullPredictor() + + + @staticmethod + def save_detection(detected_lines_images:List[ImageType], prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + + @staticmethod + # for output bounding box post-processing + def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) 
+ b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=1) + + @staticmethod + def rescale_bboxes(out_bbox, size): + img_w, img_h = size + b = DetectionAndOcrTable2.box_cxcywh_to_xyxy(out_bbox) + b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32) + return b + + @staticmethod + def outputs_to_objects(outputs, img_size, id2label): + m = outputs.logits.softmax(-1).max(-1) + pred_labels = list(m.indices.detach().cpu().numpy())[0] + pred_scores = list(m.values.detach().cpu().numpy())[0] + pred_bboxes = outputs['pred_boxes'].detach().cpu()[0] + pred_bboxes = [elem.tolist() for elem in DetectionAndOcrTable2.rescale_bboxes(pred_bboxes, img_size)] + + objects = [] + for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes): + class_label = id2label[int(label)] + if not class_label == 'no object': + objects.append({'label': class_label, 'score': float(score), + 'bbox': [float(elem) for elem in bbox]}) + + return objects + + + @staticmethod + def visualize_detected_tables(img, det_tables, out_path=None): + plt.imshow(img, interpolation="lanczos") + fig = plt.gcf() + fig.set_size_inches(20, 20) + ax = plt.gca() + + for det_table in det_tables: + bbox = det_table['bbox'] + + if det_table['label'] == 'table': + facecolor = (1, 0, 0.45) + edgecolor = (1, 0, 0.45) + alpha = 0.3 + linewidth = 2 + hatch='//////' + elif det_table['label'] == 'table rotated': + facecolor = (0.95, 0.6, 0.1) + edgecolor = (0.95, 0.6, 0.1) + alpha = 0.3 + linewidth = 2 + hatch='//////' + else: + continue + + rect = patches.Rectangle(bbox[:2], bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=linewidth, + edgecolor='none',facecolor=facecolor, alpha=0.1) + ax.add_patch(rect) + rect = patches.Rectangle(bbox[:2], bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=linewidth, + edgecolor=edgecolor,facecolor='none',linestyle='-', alpha=alpha) + ax.add_patch(rect) + rect = patches.Rectangle(bbox[:2], bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=0, 
+ edgecolor=edgecolor,facecolor='none',linestyle='-', hatch=hatch, alpha=0.2) + ax.add_patch(rect) + + plt.xticks([], []) + plt.yticks([], []) + + legend_elements = [Patch(facecolor=(1, 0, 0.45), edgecolor=(1, 0, 0.45), + label='Table', hatch='//////', alpha=0.3), + Patch(facecolor=(0.95, 0.6, 0.1), edgecolor=(0.95, 0.6, 0.1), + label='Table (rotated)', hatch='//////', alpha=0.3)] + plt.legend(handles=legend_elements, bbox_to_anchor=(0.5, -0.02), loc='upper center', borderaxespad=0, + fontsize=10, ncol=2) + plt.gcf().set_size_inches(10, 10) + plt.axis('off') + + if out_path is not None: + plt.savefig(out_path, bbox_inches='tight', dpi=150) + + return fig + + #For that, the TATR authors employ some padding to make sure the borders of the table are included. + @staticmethod + def objects_to_crops(img, tokens, objects, class_thresholds, padding=10): + """ + Process the bounding boxes produced by the table detection model into + cropped table images and cropped tokens. + """ + + table_crops = [] + for obj in objects: + # abit unecessary here cause i crop them anywyas + if obj['score'] < class_thresholds[obj['label']]: + print('skipping object with score', obj['score']) + continue + + cropped_table = {} + + bbox = obj['bbox'] + bbox = [bbox[0]-padding, bbox[1]-padding, bbox[2]+padding, bbox[3]+padding] + + cropped_img = img.crop(bbox) + + # Add padding to the cropped image + padded_width = cropped_img.width + 40 + padded_height = cropped_img.height +40 + + new_img_np = np.full((padded_height, padded_width, 3), fill_value=255, dtype=np.uint8) + y_offset = (padded_height - cropped_img.height) // 2 + x_offset = (padded_width - cropped_img.width) // 2 + new_img_np[y_offset:y_offset + cropped_img.height, x_offset:x_offset+cropped_img.width] = np.array(cropped_img) + + padded_img = Image.fromarray(new_img_np,'RGB') + + + table_tokens = [token for token in tokens if iob(token['bbox'], bbox) >= 0.5] + for token in table_tokens: + token['bbox'] = [token['bbox'][0]-bbox[0] + 
padding, + token['bbox'][1]-bbox[1] + padding, + token['bbox'][2]-bbox[0] + padding, + token['bbox'][3]-bbox[1] + padding] + + # If table is predicted to be rotated, rotate cropped image and tokens/words: + if obj['label'] == 'table rotated': + padded_img = padded_img.rotate(270, expand=True) + for token in table_tokens: + bbox = token['bbox'] + bbox = [padded_img.size[0]-bbox[3]-1, + bbox[0], + padded_img.size[0]-bbox[1]-1, + bbox[2]] + token['bbox'] = bbox + + cropped_table['image'] = padded_img + cropped_table['tokens'] = table_tokens + + table_crops.append(cropped_table) + + return table_crops + + def predict(self,image:Image.Image,debugfolder_filename_page_name): + + + """ + 0. Locate the table using Table detection + 1. Unitable + """ + + # Step 0 : Locate the table using Table detection TODO + + #First we load a Table Transformer pre-trained for table detection. We use the "no_timm" version here to load the checkpoint with a Transformers-native backbone. + model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm") + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + + #Preparing the image for the model + detection_transform = transforms.Compose([ + MaxResize(800), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + pixel_values = detection_transform(image).unsqueeze(0) + pixel_values = pixel_values.to(device) + + # Next, we forward the pixel values through the model. + # The model outputs logits of shape (batch_size, num_queries, num_labels + 1). The +1 is for the "no object" class. 
+ with torch.no_grad(): + outputs = model(pixel_values) + + # update id2label to include "no object" + id2label = model.config.id2label + id2label[len(model.config.id2label)] = "no object" + + #[{'label': 'table', 'score': 0.9999570846557617, 'bbox': [110.24547576904297, 73.31171417236328, 1024.609130859375, 308.7159423828125]}] + objects = DetectionAndOcrTable2.outputs_to_objects(outputs, image.size, id2label) + + #Only do these for objects with score greater than 0.8 + objects = [obj for obj in objects if obj['score'] > 0.95] + + print(objects) + if objects: + fig = DetectionAndOcrTable2.visualize_detected_tables(image, objects,out_path = "./res/table_debug/table_former_detection.jpg") + + #Next, we crop the table out of the image. For that, the TATR authors employ some padding to make sure the borders of the table are included. + + + tokens = [] + detection_class_thresholds = { + "table": 0.95, + "table rotated": 0.95, + "no object": 10 + } + crop_padding = 10 + + + tables_crops = DetectionAndOcrTable2.objects_to_crops(image, tokens, objects, detection_class_thresholds, padding=crop_padding) + + #[{'image': , 'tokens': []}] + #print(tables_crops) + + #TODO: Handle the case where there are multiple tables + cropped_tables =[] + for i in range (len(tables_crops)): + cropped_table = tables_crops[i]['image'].convert("RGB") + cropped_table.save(debugfolder_filename_page_name +"cropped_table_"+str(i)+".png") + cropped_tables.append(cropped_table) + + print("number of cropped tables found: "+str(len(cropped_tables))) + + + # Step 1: Unitable + #This take PIL Images as input + table_codes = self.unitableFullPredictor.predict(cropped_tables,debugfolder_filename_page_name) + + else: + return + + + + + + + + + + diff --git a/detectionAndOcrTable3.py b/detectionAndOcrTable3.py new file mode 100644 index 0000000000000000000000000000000000000000..4b345512f950b873aea2e5df7a2ddf4a4c2fce5b --- /dev/null +++ b/detectionAndOcrTable3.py @@ -0,0 +1,267 @@ +from typing import Tuple, 
List, Sequence, Optional, Union +from torchvision import transforms +from torch import nn, Tensor +from PIL import Image +from pathlib import Path +from bs4 import BeautifulSoup as bs + +import numpy as np +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +from transformers import AutoModelForObjectDetection +import torch +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.patches import Patch +from utils import draw_only_box + +from unitable import UnitablePredictor +from ultralyticsplus import YOLO, render_result +from doctrfiles import DoctrWordDetector,DoctrTextRecognizer +from utils import crop_an_Image,cropImageExtraMargin +from utils import denoisingAndSharpening +""" +USES YOLO FOR DETECITON INSTEAD OF TABLE TRANSFORMER +Table TransFORMER +""" + + +html_table_template = ( + + lambda table: f""" + + + + + {table} +
""" +) + +class DetectionAndOcrTable3(): + #This components can take in entire pdf page as input , scan for tables and return the table in html format + #Uses the full unitable model - different to DetectionAndOcrTable1 + def __init__(self,englishFlag = True): + self.unitablePredictor = UnitablePredictor() + self.detector = YOLO('foduucom/table-detection-and-extraction') + # set model parameters + self.detector.overrides['conf'] = 0.25 # NMS confidence threshold + self.detector.overrides['iou'] = 0.45 # NMS IoU threshold + self.detector.overrides['agnostic_nms'] = False # NMS class-agnostic + self.detector.overrides['max_det'] = 1000 # maximum number of detections per image + + self.wordDetector = DoctrWordDetector(architecture="db_resnet50", + path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt", + path_config_json ="doctrfiles/models/db_resnet50_config.json") + + + if englishFlag: + self.textRecognizer = DoctrTextRecognizer(architecture="master", path_weights="./doctrfiles/models/master-fde31e4a.pt", + path_config_json="./doctrfiles/models/master.json") + else: + self.textRecognizer = DoctrTextRecognizer(architecture="parseq", path_weights="./doctrfiles/models/doctr-multilingual-parseq.bin", + path_config_json="./doctrfiles/models/multilingual-parseq-config.json") + + + + @staticmethod + def save_detection(detected_lines_images:List[ImageType], prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + + @staticmethod + def build_table_from_html_and_cell( + structure: List[str], content: List[str] = None + ) -> List[str]: + """Build table from html and cell token list""" + assert structure is not None + html_code = list() + + # deal with empty table + if content is None: + content = ["placeholder"] * len(structure) + + for tag in structure: + if tag in ("[]", ">[]"): + if len(content) == 0: + continue + cell = content.pop(0) + html_code.append(tag.replace("[]", cell)) 
+ else: + html_code.append(tag) + + return html_code + """ + Valid 'Boxes' object attributes and properties are: + + Attributes: + boxes (torch.Tensor) or (numpy.ndarray): A tensor or numpy array containing the detection boxes, + with shape (num_boxes, 6). + orig_shape (torch.Tensor) or (numpy.ndarray): Original image size, in the format (height, width). + + Properties: + xyxy (torch.Tensor) or (numpy.ndarray): The boxes in xyxy format. + conf (torch.Tensor) or (numpy.ndarray): The confidence values of the boxes. + cls (torch.Tensor) or (numpy.ndarray): The class values of the boxes. + xywh (torch.Tensor) or (numpy.ndarray): The boxes in xywh format. + xyxyn (torch.Tensor) or (numpy.ndarray): The boxes in xyxy format normalized by original image size. + xywhn (torch.Tensor) or (numpy.ndarray): The boxes in xywh format normalized by original image size. + """ + # Image is page image + def predict(self,image:Image.Image,debugfolder_filename_page_name = None,denoise =False): + + results = self.detector.predict(image) + + #Array of bboxes + bbxs = results[0].boxes.xyxy.int().tolist() + #Array of confidences + conf = results[0].boxes.conf.float().tolist() + print(bbxs) + print(conf) + + #images_to_recognizer = cropImage(bxs, img) + img_to_save = draw_only_box(image, bbxs) + img_to_save.save(debugfolder_filename_page_name+"detectionBoxRes.png", quality=95) + + # we need something to draw the detection + + + cropped_tables =[] + for i in range (len(bbxs)): + # TODO: find the right confidence and padding values + if conf[i]< 0.65: + continue + + padded = [bbxs[i][0]-10,bbxs[i][1]-10,bbxs[i][2]+10,bbxs[i][3]+10] + + cropped_table = image.convert("RGB").crop(padded) + cropped_table.save(debugfolder_filename_page_name +"yolo_cropped_table_"+str(i)+".png") + cropped_tables.append(cropped_table) + + print("number of cropped tables found: "+str(len(cropped_tables))) + + # Step 1: Unitable + #This take PIL Images as input + if cropped_tables != []: + if denoise: + cropped_tables 
=denoisingAndSharpening(cropped_tables) + pred_htmls, pred_bboxs = self.unitablePredictor.predict(cropped_tables,debugfolder_filename_page_name) + table_codes = [] + + for k in range(len(cropped_tables)): + pred_html =pred_htmls[k] + pred_bbox = pred_bboxs[k] + + # Some tabless have a lot of words in their header + # So for the headers, give doctr word ddetector doesn't work when the images aren't square + table_header_cells = 0 + header_exists = False + for cell in pred_html: + if cell=='>[]' or cell == '[]': + table_header_cells += 1 + if cell =='': + header_exists = True + break + if not header_exists: + table_header_cells = 0 + pred_cell = [] + cell_imgs_to_viz = [] + cell_img_num=0 + + # Find what one line should be if there is a cell with a single line + one_line_height = 100000 + for i in range(table_header_cells): + box = pred_bbox[i] + xmin, ymin, xmax, ymax = box + current_box_height = abs(ymax-ymin) + if current_box_height 0 and current_box_height>one_line_height+5: + + cell_img= cropImageExtraMargin([fourbytwo],cropped_tables[k],margin=1.4)[0] + table_header_cells -= 1 + + #List of 4 x 2 + detection_results = self.wordDetector.predict(cell_img,sort_vertical=True) + + input_to_recog = [] + if detection_results == []: + input_to_recog.append(cell_img) + else: + + for wordbox in detection_results: + + cropped_image= crop_an_Image(wordbox.box,cell_img) + if cropped_image.shape[0] >0 and cropped_image.shape[1]>0: + input_to_recog.append(cropped_image) + else: + print("Empty image") + else: + cell_img = crop_an_Image(fourbytwo,cropped_tables[k]) + if table_header_cells>0: + table_header_cells -= 1 + if cell_img.shape[0] >0 and cell_img.shape[1]>0: + input_to_recog =[cell_img] + + cell_imgs_to_viz.append(cell_img) + + if input_to_recog != []: + words = self.textRecognizer.predict_for_tables(input_to_recog) + cell_output = " ".join(words) + pred_cell.append(cell_output) + else: + #Don't lose empty cell + pred_cell.append("") + + + 
#self.save_detection(cell_imgs_to_viz,prefix = './res/test4/cell_imgs_') + print(pred_cell) + #Step3 : + pred_code = self.build_table_from_html_and_cell(pred_html, pred_cell) + pred_code = "".join(pred_code) + pred_code = html_table_template(pred_code) + + + soup = bs(pred_code) + #formatted and indented) string representation of the HTML document + table_code = soup.prettify() + print(table_code) + table_codes.append(table_code) + + return table_codes + return [] + + + + + + + + + + + + + diff --git a/detectionAndOcrTable4.py b/detectionAndOcrTable4.py new file mode 100644 index 0000000000000000000000000000000000000000..a18a588eab26b94433840d709b7a3dd31c45c4ca --- /dev/null +++ b/detectionAndOcrTable4.py @@ -0,0 +1,112 @@ +from typing import Tuple, List, Sequence, Optional, Union +from torchvision import transforms +from torch import nn, Tensor +from PIL import Image +from pathlib import Path +from bs4 import BeautifulSoup as bs + +import numpy as np +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +from transformers import AutoModelForObjectDetection +import torch +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.patches import Patch +from utils import draw_only_box + +from unitable import UnitableFullPredictor +from ultralyticsplus import YOLO, render_result +""" +USES YOLO FOR DETECITON INSTEAD OF TABLE TRANSFORMER +Table TransFORMER +""" + +class DetectionAndOcrTable4(): + #This components can take in entire pdf page as input , scan for tables and return the table in html format + #Uses the full unitable model - different to DetectionAndOcrTable1 + def __init__(self): + self.unitableFullPredictor = UnitableFullPredictor() + self.detector = YOLO('foduucom/table-detection-and-extraction') + # set model parameters + self.detector.overrides['conf'] = 0.25 # NMS confidence threshold + self.detector.overrides['iou'] = 0.45 # NMS IoU threshold + self.detector.overrides['agnostic_nms'] = False # 
NMS class-agnostic + self.detector.overrides['max_det'] = 1000 # maximum number of detections per image + + + + @staticmethod + def save_detection(detected_lines_images:List[ImageType], prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + """ + Valid 'Boxes' object attributes and properties are: + + Attributes: + boxes (torch.Tensor) or (numpy.ndarray): A tensor or numpy array containing the detection boxes, + with shape (num_boxes, 6). + orig_shape (torch.Tensor) or (numpy.ndarray): Original image size, in the format (height, width). + + Properties: + xyxy (torch.Tensor) or (numpy.ndarray): The boxes in xyxy format. + conf (torch.Tensor) or (numpy.ndarray): The confidence values of the boxes. + cls (torch.Tensor) or (numpy.ndarray): The class values of the boxes. + xywh (torch.Tensor) or (numpy.ndarray): The boxes in xywh format. + xyxyn (torch.Tensor) or (numpy.ndarray): The boxes in xyxy format normalized by original image size. + xywhn (torch.Tensor) or (numpy.ndarray): The boxes in xywh format normalized by original image size. 
+ """ + # Image is page image + def predict(self,image:Image.Image,debugfolder_filename_page_name = None): + + results = self.detector.predict(image) + + #Array of bboxes + bbxs = results[0].boxes.xyxy.int().tolist() + #Array of confidences + conf = results[0].boxes.conf.float().tolist() + print(bbxs) + print(conf) + + #images_to_recognizer = cropImage(bxs, img) + img_to_save = draw_only_box(image, bbxs) + img_to_save.save(debugfolder_filename_page_name+"detectionBoxRes.png", quality=95) + + # we need something to draw the detection + + + cropped_tables =[] + for i in range (len(bbxs)): + # TODO: find the right confidence and padding values + if conf[i]< 0.65: + continue + + padded = [bbxs[i][0]-10,bbxs[i][1]-10,bbxs[i][2]+10,bbxs[i][3]+10] + + cropped_table = image.convert("RGB").crop(padded) + cropped_table.save(debugfolder_filename_page_name +"yolo_cropped_table_"+str(i)+".png") + cropped_tables.append(cropped_table) + + print("number of cropped tables found: "+str(len(cropped_tables))) + + # Step 1: Unitable + #This take PIL Images as input + if cropped_tables != []: + table_codes = self.unitableFullPredictor.predict(cropped_tables,debugfolder_filename_page_name) + return table_codes + + + + + + + + + + + + diff --git a/doctrfiles/__init__.py b/doctrfiles/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a8c520f52ab0b969e24fe7ec70e4e9b6bea33864 --- /dev/null +++ b/doctrfiles/__init__.py @@ -0,0 +1,4 @@ +from .doctr_recognizer import DoctrTextRecognizer +from .word_detector import Wordboxes,DoctrWordDetector + +__all__ = ['DoctrTextRecognizer','DoctrWordDetector','Wordboxes'] \ No newline at end of file diff --git a/doctrfiles/doctr_recognizer.py b/doctrfiles/doctr_recognizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8b10c0d4714001f53e13ae7a346f15156d4fad1e --- /dev/null +++ b/doctrfiles/doctr_recognizer.py @@ -0,0 +1,183 @@ +import os +from abc import ABC +from pathlib import Path +from typing import 
Any, List, Literal, Mapping, Optional, Tuple +from zipfile import ZipFile +import json +from typing import Any, List, Literal, Mapping, Optional,Dict +import uuid +from doctr.models.preprocessor import PreProcessor +from doctr.models.recognition.predictor import RecognitionPredictor # pylint: disable=W0611 +from doctr.models.recognition.zoo import ARCHS, recognition +import torch +# Numpy image type +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] + +from utils import WordAnnotation,getlogger + +class DoctrTextRecognizer(): + + def __init__( + self, + architecture: str, + path_weights: str, + path_config_json: str = None, + ) -> None: + """ + :param architecture: DocTR supports various text recognition models, e.g. "crnn_vgg16_bn", + "crnn_mobilenet_v3_small". The full list can be found here: + https://github.com/mindee/doctr/blob/main/doctr/models/recognition/zoo.py#L16. + :param path_weights: Path to the weights of the model + :param device: "cpu" or "cuda". + :param lib: "TF" or "PT" or None. If None, env variables USE_TENSORFLOW, USE_PYTORCH will be used. + :param path_config_json: Path to a json file containing the configuration of the model. Useful, if you have + a model trained on custom vocab. 
+ """ + + self.architecture = architecture + self.path_weights = path_weights + + self.name = self.get_name(self.path_weights, self.architecture) + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.path_config_json = path_config_json + + self.built_model = self.build_model(self.architecture, self.path_config_json) + self.load_model(self.path_weights, self.built_model, self.device) + self.doctr_predictor = self.get_wrapped_model() + + def predict(self, inputs: Dict[uuid.UUID, Tuple[ImageType,WordAnnotation]]) -> List[WordAnnotation]: + + """ + Prediction on a batch of text lines + + :param images: Dictionary where key is word's object id and the value is tupe of cropped image and word annotation + :return: A list of DetectionResult + """ + if inputs: + + + predictor =self.doctr_predictor + device = self.device + + word_uuids = list(inputs.keys()) + cropped_images = [value[0] for value in inputs.values()] + + raw_output = predictor(list(cropped_images)) + det_results =[] + for uuid, output in zip(word_uuids, raw_output): + ann = inputs[uuid][1] + ann.text = output[0] + det_results.append(ann) + return det_results + return [] + + def predict_for_tables(self, inputs: List[ImageType]) -> List[str]: + + if inputs: + + predictor =self.doctr_predictor + device = self.device + + raw_output = predictor(list(inputs)) + det_results =[] + for output in raw_output: + det_results.append(output[0]) + return det_results + return [] + + @staticmethod + def load_model(path_weights: str, doctr_predictor: Any, device: torch.device) -> None: + """Loading model weights + 1. Load the State Dictionary: + state_dict = torch.load(path_weights, map_location=device) loads the state dictionary from the specified file path and maps it to the specified device. + 2. Modify Keys in the State Dictionary: + The code prepends "model." to each key in the state dictionary. This is likely necessary to match the keys expected by the doctr_predictor model. + 3. 
Load State Dictionary into Model: + doctr_predictor.load_state_dict(state_dict) loads the modified state dictionary into the model. + 4. Move Model to Device: + doctr_predictor.to(device) moves the model to the specified device. + """ + state_dict = torch.load(path_weights, map_location=device) + for key in list(state_dict.keys()): + state_dict["model." + key] = state_dict.pop(key) + doctr_predictor.load_state_dict(state_dict) + doctr_predictor.to(device) + + @staticmethod + def build_model(architecture: str, path_config_json: Optional[str] = None) -> "RecognitionPredictor": + """Building the model + 1. Specific keys (arch, url, task) are removed from custom_configs. + mean and std values are moved to recognition_configs. + 2. Creating model + Check Architecture Type: + Case 1 : + If architecture is a string, it checks if it's in the predefined set of architectures (ARCHS). + If valid, it creates an instance of the model using the specified architecture and custom configurations. + Handle Custom Architecture Instances: + Case 2 : + If architecture is not a string, it checks if it's an **instance** of one of the recognized model classes (e.g., recognition.CRNN, recognition.SAR, etc.). + If valid, it assigns the provided architecture to model. + Get Input Shape and Create RecognitionPredictor: + + 3. Retrieves the input_shape from the model's configuration. + 4. Returns an instance of RecognitionPredictor initialized with a PreProcessor and the model. 
+ """ + + # inspired and adapted from https://github.com/mindee/doctr/blob/main/doctr/models/recognition/zoo.py + custom_configs = {} + batch_size = 1024 + recognition_configs = {} + if path_config_json: + with open(path_config_json, "r", encoding="utf-8") as f: + custom_configs = json.load(f) + custom_configs.pop("arch", None) + custom_configs.pop("url", None) + custom_configs.pop("task", None) + recognition_configs["mean"] = custom_configs.pop("mean") + recognition_configs["std"] = custom_configs.pop("std") + #batch_size = custom_configs.pop("batch_size") + recognition_configs["batch_size"] = batch_size + + if isinstance(architecture, str): + if architecture not in ARCHS: + raise ValueError(f"unknown architecture '{architecture}'") + + model = recognition.__dict__[architecture](pretrained=True, pretrained_backbone=True, **custom_configs) + else: + if not isinstance( + architecture, + (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq), + ): + raise ValueError(f"unknown architecture: {type(architecture)}") + model = architecture + + input_shape = model.cfg["input_shape"][-2:] + """ + (class) PreProcessor + Implements an abstract preprocessor object which performs casting, resizing, batching and normalization. + + Args: + output_size: expected size of each page in format (H, W) + batch_size: the size of page batches + mean: mean value of the training distribution by channel + std: standard deviation of the training distribution by channel + """ + return RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **recognition_configs), model) + + + def get_wrapped_model(self) -> Any: + """ + Get the inner (wrapped) model. 
+ """ + doctr_predictor = self.build_model(self.architecture, self.path_config_json) + device_str = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.load_model(self.path_weights, doctr_predictor, device_str) + return doctr_predictor + + @staticmethod + def get_name(path_weights: str, architecture: str) -> str: + """Returns the name of the model""" + return f"doctr_{architecture}" + "_".join(Path(path_weights).parts[-2:]) diff --git a/doctrfiles/models/config-multi2.json b/doctrfiles/models/config-multi2.json new file mode 100644 index 0000000000000000000000000000000000000000..29cdd1e78236bd909120b0c1251f962e88b74cf8 --- /dev/null +++ b/doctrfiles/models/config-multi2.json @@ -0,0 +1,21 @@ +{ + "mean": [ + 0.694, + 0.695, + 0.693 + ], + "std": [ + 0.299, + 0.296, + 0.301 + ], + "input_shape": [ + 3, + 32, + 128 + ], + "vocab": "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞàâéèêëîïôùûçÀÂÉÈÊËÎÏÔÙÛǧ", + "url": "https://doctr-static.mindee.com/models?id=v0.3.1/crnn_vgg16_bn-9762b0b0.pt&src=0", + "arch": "crnn_vgg16_bn", + "task": "recognition" +} \ No newline at end of file diff --git a/doctrfiles/models/db_mobilenet_v3_large-81e9b152.pt b/doctrfiles/models/db_mobilenet_v3_large-81e9b152.pt new file mode 100644 index 0000000000000000000000000000000000000000..889ccf770ee8368df86669dbee9e0397fb4590a9 --- /dev/null +++ b/doctrfiles/models/db_mobilenet_v3_large-81e9b152.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e9b152c11e9681f5eb4a2ec72e5f5d67df8ab860a846e1004756badfa5d37a +size 16987510 diff --git a/doctrfiles/models/db_resnet34-cb6aed9e.pt b/doctrfiles/models/db_resnet34-cb6aed9e.pt new file mode 100644 index 0000000000000000000000000000000000000000..b52c30e04513d04976bb92bde241ff318af30113 --- /dev/null +++ b/doctrfiles/models/db_resnet34-cb6aed9e.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cb6aed9e4252c8a92d381de1b15e1e75461f7a125a4262ef16768a4b9f797347 +size 89991042 diff --git a/doctrfiles/models/db_resnet50-79bd7d70.pt b/doctrfiles/models/db_resnet50-79bd7d70.pt new file mode 100644 index 0000000000000000000000000000000000000000..093c08d03f1aee22081b3182a587b2df1b4fd1f8 --- /dev/null +++ b/doctrfiles/models/db_resnet50-79bd7d70.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79bd7d702506703b89cf11afa42d12aebf5cf25c3618e6ffd5f85772240ca483 +size 102021912 diff --git a/doctrfiles/models/db_resnet50_config.json b/doctrfiles/models/db_resnet50_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38f21c2e8cdab8872a5fb25625a2d541c388ea46 --- /dev/null +++ b/doctrfiles/models/db_resnet50_config.json @@ -0,0 +1,20 @@ +{ +"mean": [ + 0.798, + 0.785, + 0.772 +], +"std": [ + 0.264, + 0.2749, + 0.287 +], +"input_shape": [ + 3, + 1024, + 1024 +], +"url": "https://doctr-static.mindee.com/models?id=v0.7.0/parseq-56125471.pt&src=0", +"arch": "db_resnet50", +"task": "detection" +} \ No newline at end of file diff --git a/doctrfiles/models/doctr-multilingual-parseq.bin b/doctrfiles/models/doctr-multilingual-parseq.bin new file mode 100644 index 0000000000000000000000000000000000000000..4e1b2683bc3af57a31716a8b4f5f99bb4ec84589 --- /dev/null +++ b/doctrfiles/models/doctr-multilingual-parseq.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b1d3f3b9d8ab994e94c671c47828b9a4079f20b4288eb5f0ba3c6dacf6c237 +size 47872130 diff --git a/doctrfiles/models/master-fde31e4a.pt b/doctrfiles/models/master-fde31e4a.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f0fa380e83c1eb42a09ba04ee9cc5d33c365f11 --- /dev/null +++ b/doctrfiles/models/master-fde31e4a.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde31e4a9612670af83daf4b730dd9c56216806589546b09290abc347ca3a49d +size 243889428 diff --git a/doctrfiles/models/master.json 
b/doctrfiles/models/master.json new file mode 100644 index 0000000000000000000000000000000000000000..84ecb9726c08ee1760b4493128c2fb68a3151c2a --- /dev/null +++ b/doctrfiles/models/master.json @@ -0,0 +1,21 @@ +{ + "mean": [ + 0.694, + 0.695, + 0.693 + ], + "std": [ + 0.299, + 0.296, + 0.301 + ], + "input_shape": [ + 3, + 32, + 128 + ], + "vocab": "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ", + "url": null, + "arch": "master", + "task": "recognition" +} \ No newline at end of file diff --git a/doctrfiles/models/multi2.bin b/doctrfiles/models/multi2.bin new file mode 100644 index 0000000000000000000000000000000000000000..12d6b0244165525459dc91c5297e3645bdd5268b --- /dev/null +++ b/doctrfiles/models/multi2.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bdc3c6922cad527714504b84a9d0efaa6b679d8ca8050a003611076eb514757 +size 63310142 diff --git a/doctrfiles/models/multilingual-parseq-config.json b/doctrfiles/models/multilingual-parseq-config.json new file mode 100644 index 0000000000000000000000000000000000000000..caa15f66c229a8b79c715a73cf35064362d85a17 --- /dev/null +++ b/doctrfiles/models/multilingual-parseq-config.json @@ -0,0 +1,21 @@ +{ + "mean": [ + 0.694, + 0.695, + 0.693 + ], + "std": [ + 0.299, + 0.296, + 0.301 + ], + "input_shape": [ + 3, + 32, + 128 + ], + "vocab": "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇáãíóõúÁÃÍÓÕÚñÑ¡¿äößÄÖẞčďěňřšťůýžČĎĚŇŘŠŤŮÝŽąćęłńśźżĄĆĘŁŃŚŹŻìòÌÒæøåÆØŧ", + "url": "https://doctr-static.mindee.com/models?id=v0.7.0/parseq-56125471.pt&src=0", + "arch": "parseq", + "task": "recognition" +} \ No newline at end of file diff --git a/doctrfiles/word_detector.py b/doctrfiles/word_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..8b6538653561cd70dc780cf8cb5a1045b6a78c4b --- /dev/null +++ b/doctrfiles/word_detector.py @@ 
-0,0 +1,282 @@ + +import os +from abc import ABC +from pathlib import Path +from typing import Any, List, Literal, Mapping, Optional, Tuple, Union, Dict, Type, Sequence +import json +import logging +import torch +from doctr.models.preprocessor import PreProcessor +from doctr.models.detection.predictor import DetectionPredictor # pylint: disable=W0611 +from doctr.models.detection.zoo import detection_predictor,detection + +import numpy.typing as npt +import numpy as np +from numpy import uint8 +ImageType = npt.NDArray[uint8] + + +from utils import Annotation,getlogger,group_words_into_lines + +ARCHS = [ + "db_resnet34", + "db_resnet50", + "db_mobilenet_v3_large", + "linknet_resnet18", + "linknet_resnet34", + "linknet_resnet50", + "fast_tiny", + "fast_small", + "fast_base", + ] +class Wordboxes: + def __init__(self,score, box): + self.box = box + self.score = score + +class DoctrWordDetector(): + """ + A deepdoctection wrapper of DocTr text line detector. We model text line detection as ObjectDetector + and assume to use this detector in a ImageLayoutService. + DocTr supports several text line detection implementations but provides only a subset of pre-trained models. + The most usable one for document OCR for which a pre-trained model exists is DBNet as described in “Real-time Scene + Text Detection with Differentiable Binarization”, with a ResNet-50 backbone. This model can be used in either + Tensorflow or PyTorch. + Some other pre-trained models exist that have not been registered in `ModelCatalog`. Please check the DocTr library + and organize the download of the pre-trained model by yourself. 
+ + **Example:** + + path_weights_tl = ModelDownloadManager.maybe_download_weights_and_configs("doctr/db_resnet50/pt + /db_resnet50-ac60cadc.pt") + # Use "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip" for Tensorflow + + categories = ModelCatalog.get_profile("doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt").categories + det = DoctrTextlineDetector("db_resnet50",path_weights_tl,categories,"cpu") + layout = ImageLayoutService(det,to_image=True, crop_image=True) + + path_weights_tr = dd.ModelDownloadManager.maybe_download_weights_and_configs("doctr/crnn_vgg16_bn + /pt/crnn_vgg16_bn-9762b0b0.pt") + rec = DoctrTextRecognizer("crnn_vgg16_bn", path_weights_tr, "cpu") + text = TextExtractionService(rec, extract_from_roi="word") + + analyzer = DoctectionPipe(pipeline_component_list=[layout,text]) + + path = "/path/to/image_dir" + df = analyzer.analyze(path = path) + + for dp in df: + ... + """ + + def __init__( + self, + architecture: str, + path_weights: str, + path_config_json:str + ) -> None: + """ + :param architecture: DocTR supports various text line detection models, e.g. "db_resnet50", + "db_mobilenet_v3_large". The full list can be found here: + https://github.com/mindee/doctr/blob/main/doctr/models/detection/zoo.py#L20 + :param path_weights: Path to the weights of the model + :param categories: A dict with the model output label and value + :param device: "cpu" or "cuda" or any tf.device or torch.device. The device must be compatible with the dll + :param lib: "TF" or "PT" or None. If None, env variables USE_TENSORFLOW, USE_PYTORCH will be used. 
+ """ + self.architecture = architecture + self.path_weights = path_weights + self.path_config_json =path_config_json + + # Ensure the correct device is chosen (either CPU or CUDA if available) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize the model with the given architecture and path to weights + self.doctr_predictor = self.get_wrapped_model() + + """ + Two static method so that they can be called without creating an instance of the class + Also, they don't require any instance specific data + """ + + def get_wrapped_model( + self + ) -> Any: + """ + Get the inner (wrapped) model. + + :param architecture: DocTR supports various text line detection models, e.g. "db_resnet50", + "db_mobilenet_v3_large". The full list can be found here: + https://github.com/mindee/doctr/blob/main/doctr/models/detection/zoo.py#L20 + :param path_weights: Path to the weights of the model + + :return: Inner model which is a "nn.Module" in PyTorch or a "tf.keras.Model" in Tensorflow + """ + + """ + (function) detection_predictor: ((arch: Any = "db_resnet50", pretrained: bool = False, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor) + """ + #doctr_predictor = detection_predictor(arch=architecture, pretrained=False, pretrained_backbone=False) + #doctr_predictor = detection_predictor(arch=architecture, pretrained=False) + + doctr_predictor = self.build_model(self.architecture, self.path_config_json) + + self.load_model(self.path_weights, doctr_predictor, self.device) + return doctr_predictor + @staticmethod + def build_model(arch: str, pretrained = False,assume_straight_pages=True, path_config_json: Optional[str] = None) -> "DetectionPredictor": + """Building the model + 1. Specific keys (arch, url, task) are removed from custom_configs. + mean and std values are moved to recognition_configs. + 2. 
Creating model + Check Architecture Type: + Case 1 : + If architecture is a string, it checks if it's in the predefined set of architectures (ARCHS). + If valid, it creates an instance of the model using the specified architecture and custom configurations. + Handle Custom Architecture Instances: + Case 2 : + If architecture is not a string, it checks if it's an **instance** of one of the recognized model classes (e.g., recognition.CRNN, recognition.SAR, etc.). + If valid, it assigns the provided architecture to model. + Get Input Shape and Create RecognitionPredictor: + + 3. Retrieves the input_shape from the model's configuration. + 4. Returns an instance of RecognitionPredictor initialized with a PreProcessor and the model. + """ + + custom_configs = {} + batch_size = 4 + detection_configs = {} + if path_config_json: + with open(path_config_json, "r", encoding="utf-8") as f: + custom_configs = json.load(f) + custom_configs.pop("arch", None) + custom_configs.pop("url", None) + custom_configs.pop("task", None) + detection_configs["mean"] = custom_configs.pop("mean") + detection_configs["std"] = custom_configs.pop("std") + #batch_size = custom_configs.pop("batch_size") + detection_configs["batch_size"] = batch_size + if isinstance(arch, str): + if arch not in ARCHS: + raise ValueError(f"unknown architecture '{arch}'") + + model = detection.__dict__[arch]( + pretrained=pretrained, + assume_straight_pages=assume_straight_pages + ) + + else: + if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)): + raise ValueError(f"unknown architecture: {type(arch)}") + + model = arch + model.assume_straight_pages = assume_straight_pages + + input_shape = model.cfg["input_shape"][-2:] + + predictor = DetectionPredictor( + PreProcessor(input_shape, batch_size=batch_size,**detection_configs), + model + ) + return predictor + + @staticmethod + def load_model(path_weights: str, doctr_predictor: Any, device: torch.device) -> None: + """Loading model weights + 1. 
Load the State Dictionary: + state_dict = torch.load(path_weights, map_location=device) loads the state dictionary from the specified file path and maps it to the specified device. + 2. Modify Keys in the State Dictionary: + The code prepends "model." to each key in the state dictionary. This is likely necessary to match the keys expected by the doctr_predictor model. + 3. Load State Dictionary into Model: + doctr_predictor.load_state_dict(state_dict) loads the modified state dictionary into the model. + 4. Move Model to Device: + doctr_predictor.to(device) moves the model to the specified device. + """ + state_dict = torch.load(path_weights, map_location=device) + for key in list(state_dict.keys()): + state_dict["model." + key] = state_dict.pop(key) + doctr_predictor.load_state_dict(state_dict) + doctr_predictor.to(device) + + + def predict(self, np_img: ImageType,sort_vertical = False) -> List[Wordboxes]: + """ + Prediction per image. + + :param np_img: image as numpy array + + :return: A list of DetectionResult + """ + + raw_output =self.doctr_predictor([np_img]) + height, width = np_img.shape[:2] + + """ + raw_output is arrary of dictionary with just one key "words" + 1-4th element : coordinates You take first 4 elements in this array by doing box[:4] + 5th element - score + But those are 4 point and we need 4X2 + type(raw_output[0]["words"]) are numpy arrary + Okay hypothesis :xmin, ymin, xmax, ymax + Points should be ordered in this order :left_lower, right_lower, right_upper, left_upper + """ + + logger = getlogger("array") + # Check if the logger has any handlers + if (logger.hasHandlers()): + logger.handlers.clear() + + # Create a handler + handler = logging.StreamHandler() + + # Create a formatter and add it to the handler + formatter = logging.Formatter('%(levelname)s:%(message)s') + handler.setFormatter(formatter) + + # Add the handler to the logger + logger.addHandler(handler) + #logger.info(raw_output[0]["words"]) + + #array is numpy array of shape 
(n,5) where n is number of words and 5 is size of each element(array) with coordinate(xmin,ymin,xmax,ymax) + score + + array = raw_output[0]["words"] + if not sort_vertical: + #Only When input has one line + sorted_array = array[array[:, 0].argsort()] + else: + #When input can have multiple lines + sorted_array = group_words_into_lines(array) + #logger.info(sorted_array) + + + detection_results = [] + for box in sorted_array: + xmin, ymin, xmax, ymax = box[:4] + xmin = xmin*width + ymin = ymin*height + xmax = xmax*width + ymax = ymax*height + newb = np.array([ + [xmin, ymin], + [xmax, ymin], + [xmax, ymax], + [xmin, ymax] + ], dtype=np.float32) + assert newb.shape == (4, 2), f"Points array must be of shape (4, 2), but got {box.shape}" + assert newb.dtype == np.float32, f"Points array must be of dtype float32, but got {box.dtype}" + + w = Wordboxes( + score=box[4], + box = newb + ) + + detection_results.append(w) + + return detection_results + + + + + + + diff --git a/image-1.png b/image-1.png new file mode 100644 index 0000000000000000000000000000000000000000..9e6e84c8925ce0e1176d559ae4b05d26240d995a Binary files /dev/null and b/image-1.png differ diff --git a/image-2.png b/image-2.png new file mode 100644 index 0000000000000000000000000000000000000000..2027dd96b8b986186aff152f27f655ce20d3e8a0 Binary files /dev/null and b/image-2.png differ diff --git a/image.png b/image.png new file mode 100644 index 0000000000000000000000000000000000000000..7cd5f37270123801a5c753003a8c7ca960ca301e Binary files /dev/null and b/image.png differ diff --git a/june11.jpg b/june11.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e244b174a15cc5a926b8b7c72d520c9746d790a1 Binary files /dev/null and b/june11.jpg differ diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..2a92e0ec41c001720138309223680ae9f93b3faa --- /dev/null +++ b/main.py @@ -0,0 +1,295 @@ + +import os +import traceback +import argparse +from typing 
import List, Tuple, Set, Dict + +import time +from PIL import Image +import numpy as np +import logging +import pandas as pd +from bs4 import BeautifulSoup + +from utils import cropImages +from utils import draw_only_box,draw_box_with_text,getlogger,Annotation +from ocr_component1 import OCRComponent1 +from detectionAndOcrTable1 import DetectionAndOcrTable1 +from detectionAndOcrTable2 import DetectionAndOcrTable2 +from detectionAndOcrTable3 import DetectionAndOcrTable3 +from detectionAndOcrTable4 import DetectionAndOcrTable4 +from ocrTable1 import OcrTable1 +from ocrTable2 import OcrTable2 +from pdf2image import convert_from_path + + + +def convertHTMLToCSV(html:str,output_path:str)->str: + + # empty list + data = [] + + # for getting the header from + # the HTML file + list_header = [] + soup = BeautifulSoup(html,'html.parser') + header = soup.find_all("table")[0].find("tr") + + for items in header: + try: + list_header.append(items.get_text()) + except: + continue + + # for getting the data + HTML_data = soup.find_all("table")[0].find_all("tr")[1:] + + for element in HTML_data: + sub_data = [] + for sub_element in element: + try: + sub_data.append(sub_element.get_text()) + except: + continue + data.append(sub_data) + + # Storing the data into Pandas + # DataFrame + dataFrame = pd.DataFrame(data = data, columns = list_header) + + # Converting Pandas DataFrame + # into CSV file + dataFrame.to_csv(output_path) + +def saveResults(image_list, results, labels, output_dir='output/', threshold=0.5): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + for idx, im in enumerate(image_list): + im = draw_only_box(im, results[idx], labels, threshold=threshold) + + out_path = os.path.join(output_dir, f"{idx}.jpg") + im.save(out_path, quality=95) + print("save result to: " + out_path) + +def InputToImages(input_path:str,resolution=300)-> List[Image.Image]: + """ + input is file location to image + return : List of Pillow image objects + """ + images=[] + try: + img 
=Image.open(input_path) + if img.mode == 'RGBA': + img = img.convert('RGB') + images.append(img) + except Exception as e: + traceback.print_exc() + return images + +def drawTextDetRes(bxs :List[List[float]],img:Image.Image,output_path:str): + """ + draw layout analysis results + """ + """bxs_draw is xmin, ymin, xmax, ymax""" + bxs_draw = [[b[0][0], b[0][1], b[1][0], b[-1][1]] for b in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]] + + #images_to_recognizer = cropImage(bxs, img) + img_to_save = draw_only_box(img, bxs_draw) + img_to_save.save(output_path, quality=95) + +def test_ocr_component1(test_file="TestingFiles/OCRTest1German.pdf", debug_folder = './res/table1/',englishFlag = False): + #Takes as input image of a single page and returns the detected lines and words + # + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + images = convert_from_path(test_file) + ocr = OCRComponent1(englishFlag) + ocr_results = {} + for page_number,img in enumerate(images): + + + line_annotations= ocr.predict(img = np.array(img)) + ocr_results[page_number] = line_annotations + + """boxes_to_draw =[] + for list_of_ann in word_annotations: + for ann in list_of_ann: + logger.info(ann.text) + b = ann.box + boxes_to_draw.append(b) + + img_to_save = draw_only_box(img,boxes_to_draw) + img_to_save.save("res/12June_2_lines.png", quality=95) + """ + + line_boxes_to_draw =[] + #print("Detected lines are ") + #print(len(line_annotations.items())) + for index,ann in line_annotations.items(): + + b = ann.box + line_boxes_to_draw.append(b) + line_words = "" + #print("detected words per line") + #print(len(ann.words)) + for wordann in ann.words: + line_words += wordann.text +" " + print(line_words) + + img_to_save1 = draw_only_box(img,line_boxes_to_draw) + imgname = test_file.split("/")[-1][:-4] + img_to_save1.save(debug_folder+imgname+"_"+str(page_number)+"_bbox_detection.png", quality=95) + + return ocr_results + + +def test_tableOcrOnly1(test_file = './cropped_table_0.png' , 
debug_folder = './res/table1/',denoise = False,englishFlag = False): + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + #Hybrid Unitable +DocTR + #Good at these kind of tables - with a lot of texts + table = OcrTable1(englishFlag) + image = Image.open(test_file).convert("RGB") + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_" + + table_code = table.predict([image],debugfolder_filename_page_name,denoise = denoise) + with open(debugfolder_filename_page_name+'output.txt', 'w') as file: + file.write(table_code) + return table_code + +def test_tableOcrOnly2(test_file = './cropped_table_1.png' , debug_folder = './res/table2/'): + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + table = OcrTable2() + + #FullUnitable + #Good at these kind of tables - with not much text + + image = Image.open(test_file).convert("RGB") + table.predict([image],debug_folder) + +def test_table_component1(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/',denoise = False,englishFlag = True): + table_predictor = DetectionAndOcrTable1(englishFlag) + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + + #print(img.mode) + print("Looking at page:") + print(page_number) + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name,denoise = denoise) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w') as file: + file.write(table_code) + +def test_table_component2(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/'): + #This components can take in entire pdf page as input , 
scan for tables and return the table in html format + #Uses the full unitable model + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + table_predictor = DetectionAndOcrTable2() + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + print("Looking at page:") + print(page_number) + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w',encoding='utf-8') as file: + file.write(table_code) + +def test_table_component3(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/',denoise = False,englishFlag = True): + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + table_predictor = DetectionAndOcrTable3(englishFlag) + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + #print(img.mode) + print("Looking at page:") + print(page_number) + parts = test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w',encoding='utf-8') as file: + file.write(table_code) + + + +def test_table_component4(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/'): + if not os.path.exists(debug_folder): + os.makedirs(debug_folder) + table_predictor = DetectionAndOcrTable4() + + images = convert_from_path(test_file) + for page_number,img in enumerate(images): + #print(img.mode) + print("Looking at page:") + print(page_number) + parts 
= test_file.split("/") + filename = parts[-1][:-4] + debugfolder_filename_page_name= debug_folder+filename+"_"+ str(page_number)+'_' + table_codes = table_predictor.predict(img,debugfolder_filename_page_name=debugfolder_filename_page_name) + for index, table_code in enumerate(table_codes): + with open(debugfolder_filename_page_name+str(index)+'output.xls', 'w',encoding='utf-8') as file: + file.write(table_code) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Process some strings.') + parser.add_argument('ocr', type=str, help='type in id of the component to test') + parser.add_argument('--test_file',type=str, help='path to the testing file') + parser.add_argument('--debug_folder',type=str, help='path to the folder you want to save your results in') + parser.add_argument('--englishFlag',type=bool, help='Whether your pdf is in english => could lead to better results ') + parser.add_argument('--denoise',type=bool, help='preprocessing for not clean scans ') + + args = parser.parse_args() + start = time.time() + if args.ocr == "ocr1": + test_ocr_component1(args.test_file,args.debug_folder, args.englishFlag) + elif args.ocr == "table1": + test_tableOcrOnly1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) + elif args.ocr == "table2": + test_tableOcrOnly2(args.test_file,args.debug_folder) + elif args.ocr =="pdftable1": + test_table_component1(args.test_file,args.debug_folder,args.englishFlag,args.denoise) + elif args.ocr =="pdftable2": + test_table_component2(args.test_file,args.debug_folder) + elif args.ocr =="pdftable3": + test_table_component3(args.test_file,args.debug_folder,args.englishFlag,args.denoise) + elif args.ocr =="pdftable4": + test_table_component4(args.test_file,args.debug_folder) + + #test_table_component1() + #test_ocr_component1("TestingFilesImages/OCRTest3English_0.jpg") + #test_tableOcrOnly2() + #test_tableOcrOnly2() + #test_tableOcrOnly2_singleImage() + + # Example run + # python main.py pdftable2 
--test_file TestingFiles/TableOCRTestEnglish.pdf --debug_foler ./res/table_debug2/ + # python main.py ocr1 --test_file TestingFiles/OCRTest1German.pdf --debug_repo ./res/ocrdebug1/ + # python main.py table1 --test_file ./cropped_table_0.png --debug_repo ./res/table1/ + # python main.py table2 --test_file ./cropped_table_1.png --debug_repo ./res/table2/ + # python main.py pdftable1 --test_file TestingFiles/TableOCRTestEnglish.pdf --debug_foler ./res/table_debug2/ + + + + + + + end = time.time() + print("The entire pipeline took " , end-start) + diff --git a/nougat.py b/nougat.py new file mode 100644 index 0000000000000000000000000000000000000000..e093d151ca96442ee2e1f5f1eb95d67154dcd78e --- /dev/null +++ b/nougat.py @@ -0,0 +1,43 @@ +from typing import List, Tuple, Set, Dict + +from huggingface_hub import hf_hub_download +import re +from PIL import Image +from transformers import NougatProcessor, VisionEncoderDecoderModel +from datasets import load_dataset +import torch +from doctrfiles import DetectionResult +# Numpy image type +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] + + +def run_nougat(inputs: List[Tuple[int, ImageType]])-> List[DetectionResult]: + processor = NougatProcessor.from_pretrained("facebook/nougat-base") + model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model.to(device) + + detection_results =[] + for index, np_img in inputs: + image = Image.fromarray(np_img) + pixel_values = processor(image, return_tensors="pt").pixel_values + + # generate transcription (here we only generate 30 tokens) + outputs = model.generate( + pixel_values.to(device), + min_length=1, + max_new_tokens=30, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + ) + + sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0] + sequence = processor.post_process_generation(sequence, fix_markdown=False) + # note: we're using repr here such 
for the sake of printing the \n characters, feel free to just print the sequence + text = sequence + detection_results.append(DetectionResult(score=1, text=text, index=index)) + + return detection_results + diff --git a/ocrTable1.py b/ocrTable1.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd584e2fa99bcd37964aab0054ebc255be9520b --- /dev/null +++ b/ocrTable1.py @@ -0,0 +1,209 @@ +from typing import Tuple, List, Sequence, Optional, Union +from torchvision import transforms +from torch import nn, Tensor +from PIL import Image +from pathlib import Path +from bs4 import BeautifulSoup as bs +from unitable import UnitablePredictor +from doctrfiles import DoctrWordDetector,DoctrTextRecognizer +import numpy as np +from utils import crop_an_Image,cropImageExtraMargin +from utils import denoisingAndSharpening +import numpy.typing as npt +from numpy import uint8 + + +ImageType = npt.NDArray[uint8] + +html_table_template = ( + + lambda table: f""" + + + + + {table} +
""" +) + +class OcrTable1(): + def __init__(self,englishFlag = True): + self.wordDetector = DoctrWordDetector(architecture="db_resnet50", + path_weights="./doctrfiles/models/db_resnet50-79bd7d70.pt", + path_config_json ="./doctrfiles/models/db_resnet50_config.json") + + self.unitablePredictor = UnitablePredictor() + + if englishFlag: + self.textRecognizer = DoctrTextRecognizer(architecture="master", path_weights="./doctrfiles/models/master-fde31e4a.pt", + path_config_json="./doctrfiles/models/master.json") + else: + self.textRecognizer = DoctrTextRecognizer(architecture="parseq", path_weights="./doctrfiles/models/doctr-multilingual-parseq.bin", + path_config_json="./doctrfiles/models/multilingual-parseq-config.json") + + + @staticmethod + def build_table_from_html_and_cell( + structure: List[str], content: List[str] = None + ) -> List[str]: + """Build table from html and cell token list""" + assert structure is not None + html_code = list() + + # deal with empty table + if content is None: + content = ["placeholder"] * len(structure) + + for tag in structure: + if tag in ("[]", ">[]"): + if len(content) == 0: + continue + cell = content.pop(0) + html_code.append(tag.replace("[]", cell)) + else: + html_code.append(tag) + + return html_code + + @staticmethod + def save_detection(detected_lines_images:List[ImageType],prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + + + def predict(self,images:List[ImageType],debug_folder="./res",denoise=False): + + """ + this hardcodes 0 into images and bbxs cause they are made to get multiple images but this component will only get one image + """ + + # Step 0 : Locate the table using Table detection TODO + # PreProcessing + if denoise: + images = denoisingAndSharpening(images) + else: + images = images + + pred_htmls, bbxs = self.unitablePredictor.predict(images,debug_folder) + + #pred_htmlbbxs = [[608, 33, 820, 106], [72, 125, 
1353, 212], [377, 255, 654, 340], [709, 255, 989, 340], [1044, 255, 1330, 340], [166, 364, 254, 394], [351, 451, 517, 484], [520, 424, 676, 538], [689, 451, 839, 484], [859, 424, 1011, 538], [1024, 424, 1181, 511], [1194, 424, 1353, 538], [420, 614, 446, 644], [592, 614, 615, 644], [761, 614, 784, 644], [930, 614, 953, 644], [1096, 614, 1119, 644], [1262, 614, 1285, 644], [72, 671, 185, 701], [315, 671, 351, 701], [394, 671, 462, 701], [595, 671, 631, 701], [728, 671, 797, 701], [930, 671, 966, 701], [1063, 671, 1132, 701], [1268, 671, 1304, 701], [72, 698, 205, 728], [315, 698, 351, 728], [416, 698, 462, 728], [589, 698, 631, 728], [748, 698, 790, 728], [924, 698, 966, 728], [1089, 698, 1132, 728], [1259, 698, 1304, 728], [72, 725, 208, 755], [315, 725, 351, 755], [416, 725, 462, 755], [595, 725, 631, 755], [751, 725, 797, 755], [930, 725, 966, 755], [1063, 725, 1135, 755], [1268, 725, 1304, 755], [72, 752, 211, 782], [315, 752, 351, 782], [416, 752, 462, 782], [595, 752, 631, 782], [764, 752, 797, 782], [946, 752, 966, 782], [1089, 752, 1132, 782], [1268, 752, 1304, 782], [72, 780, 179, 810], [315, 780, 351, 810], [416, 780, 462, 810], [595, 780, 631, 810], [764, 780, 797, 810], [946, 780, 966, 810], [1089, 780, 1132, 810], [1268, 780, 1304, 810], [72, 807, 182, 837], [315, 807, 351, 837], [416, 807, 462, 837], [595, 807, 631, 837], [751, 807, 797, 837], [946, 807, 966, 837], [1089, 807, 1132, 837], [1268, 807, 1304, 837], [72, 834, 169, 864], [315, 834, 351, 864], [416, 834, 462, 864], [595, 834, 631, 864], [764, 834, 797, 864], [946, 834, 966, 864], [1089, 834, 1132, 864], [1268, 834, 1304, 864], [72, 861, 189, 891], [315, 861, 351, 891], [416, 861, 462, 891], [595, 861, 631, 891], [764, 861, 797, 891], [946, 861, 966, 891], [1089, 861, 1132, 891], [1268, 861, 1304, 891], [72, 888, 189, 918], [315, 888, 351, 918], [416, 888, 462, 918], [595, 888, 631, 918], [751, 888, 797, 918], [946, 888, 966, 918], [1089, 888, 1132, 918], [1268, 888, 1304, 918], [72, 915, 
179, 945], [315, 915, 351, 945], [416, 915, 462, 945], [595, 915, 631, 945], [764, 915, 797, 945], [946, 915, 966, 945], [1089, 915, 1132, 945], [1268, 915, 1304, 945], [72, 943, 241, 973], [315, 943, 351, 973], [416, 943, 462, 973], [595, 943, 631, 973], [764, 943, 797, 973], [946, 943, 966, 973], [1089, 943, 1132, 973], [1268, 943, 1304, 973], [72, 970, 231, 1000], [315, 970, 351, 1000], [394, 970, 462, 1000], [595, 970, 631, 1000], [751, 970, 797, 1000], [930, 970, 966, 1000], [1063, 970, 1132, 1000], [1268, 970, 1304, 1000], [72, 997, 211, 1027], [315, 997, 351, 1027], [416, 997, 462, 1027], [595, 997, 631, 1027], [764, 997, 797, 1027], [946, 997, 966, 1027], [1089, 997, 1132, 1027], [1268, 997, 1304, 1027], [72, 1024, 198, 1054], [315, 1024, 351, 1054], [394, 1024, 462, 1054], [595, 1024, 631, 1054], [764, 1024, 797, 1054], [946, 1024, 966, 1054], [1063, 1024, 1132, 1054], [1268, 1024, 1304, 1054], [72, 1051, 231, 1081], [315, 1051, 351, 1081], [394, 1051, 462, 1081], [595, 1051, 631, 1081], [764, 1051, 797, 1081], [946, 1051, 966, 1081], [1063, 1051, 1132, 1081], [1268, 1051, 1304, 1081], [124, 1108, 195, 1138], [315, 1108, 351, 1138], [381, 1108, 462, 1138], [595, 1108, 631, 1138], [728, 1108, 797, 1138], [946, 1108, 966, 1138], [1054, 1108, 1135, 1138], [1268, 1108, 1304, 1138]] + + #Step2: Crop the images from the returned bboxes + pred_cell = [] + cell_imgs_to_viz = [] + cell_img_num=0 + + # Some tabless have a lot of words in their header + # So for the headers, give doctr word ddetector doesn't work when the images aren't square + table_header_cells = 0 + header_exists = False + for cell in pred_html: + if cell=='>[]' or cell == '[]': + table_header_cells += 1 + if cell =='': + header_exists = True + break + if not header_exists: + table_header_cells = 0 + pred_cell = [] + cell_imgs_to_viz = [] + cell_img_num=0 + + one_line_height = 100000 + for i in range(table_header_cells): + box = bbxs[0][i] + xmin, ymin, xmax, ymax = box + current_box_height = 
abs(ymax-ymin) + if current_box_height 0 and current_box_height>one_line_height+5: + + cell_img = cropImageExtraMargin([fourbytwo],images[0])[0] + table_header_cells -= 1 + + #List of 4 x 2 + detection_results = self.wordDetector.predict(cell_img,sort_vertical=True) + + input_to_recog = [] + if detection_results == []: + input_to_recog.append(cell_img) + else: + #print("Debugging the issue") + for wordbox in detection_results: + #print(wordbox.box) + #print(cell_img.shape) + cropped_image= crop_an_Image(wordbox.box,cell_img) + #print(cropped_image.shape) + if cropped_image.shape[0] >0 and cropped_image.shape[1]>0: + input_to_recog.append(cropped_image) + else: + print("Empty image") + + else:# For normal cells don't do word detection! + cell_img = crop_an_Image(fourbytwo,images[0]) + if table_header_cells>0: + table_header_cells -= 1 + if cell_img.shape[0] >0 and cell_img.shape[1]>0: + input_to_recog =[cell_img] + + + cell_imgs_to_viz.append(cell_img) + cell_img_num = cell_img_num+1 + + + if input_to_recog != []: + words = self.textRecognizer.predict_for_tables(input_to_recog) + cell_output = " ".join(words) + pred_cell.append(cell_output) + else: + #Don't lose empty cell + pred_cell.append("") + + + self.save_detection(cell_imgs_to_viz,prefix = './res/test1/cell_imgs_') + + + print(pred_cell) + #Step3 : + pred_html = pred_htmls[0] + pred_code = self.build_table_from_html_and_cell(pred_html, pred_cell) + print(pred_code) + pred_code = "".join(pred_code) + pred_code = html_table_template(pred_code) + + # Display the HTML table + soup = bs(pred_code) + #formatted and indented) string representation of the HTML document + table_code = soup.prettify() + print(table_code) + return table_code + + + + + + + + diff --git a/ocrTable2.py b/ocrTable2.py new file mode 100644 index 0000000000000000000000000000000000000000..9d8813439c44fb401dac5c03644244a781aa38cd --- /dev/null +++ b/ocrTable2.py @@ -0,0 +1,50 @@ +from typing import Tuple, List, Sequence, Optional, Union +from 
torchvision import transforms +from torch import nn, Tensor +from PIL import Image +from pathlib import Path +from bs4 import BeautifulSoup as bs +from unitable import UnitableFullPredictor +from unitable import UnitableFullSinglePredictor +from doctrfiles import DoctrWordDetector,DoctrTextRecognizer +import numpy as np +from utils import crop_an_Image,cropImageExtraMargin +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] + + +class OcrTable2(): + #Takes as input the table image - no table detection + def __init__(self): + + self.unitablePredictor = UnitableFullPredictor() + #self.unitablePredictor = UnitableFullSinglePredictor() + + + @staticmethod + def save_detection(detected_lines_images:List[ImageType], prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + + def predict(self,images,debug_repo="./res/test1"): + + + # Step 1: Get table structure and bbox for cell contents from unitable + + table_code = self.unitablePredictor.predict(images,debug_repo) + + + + + + + + + + + + diff --git a/ocr_component1.py b/ocr_component1.py new file mode 100644 index 0000000000000000000000000000000000000000..d5bd00df06e2f262ec9541699d858904fd60b4f5 --- /dev/null +++ b/ocr_component1.py @@ -0,0 +1,219 @@ +from typing import Any, List, Literal, Mapping, Optional, Tuple +import time + +from PIL import Image +# Numpy image type +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] + +import numpy as np +import uuid + +from doctrfiles import DoctrWordDetector,DoctrTextRecognizer,Wordboxes +from deepdoc import RagFlow +from utils import LineAnnotation,WordAnnotation,getlogger,cropImageExtraMargin,crop_an_Image,cropImages,get_new_coord +from numpy.typing import NDArray + +MARGIN_FACTOR = 1.4 +class OCRComponent1(): + """ + This component uses RagFlow as text line detector + Uses DocTR's word detector and text recognizer + 
""" + def __init__(self,englishflag =False): + logger = getlogger("1") + start_time = time.time() + self.textlineDetector = RagFlow() + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"time to initialize Ragflow: {execution_time} seconds") + + + start_time = time.time() + """ + self.wordDetector = DoctrWordDetector(architecture="db_resnet50", + path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt") + + """ + + self.wordDetector = DoctrWordDetector(architecture="db_resnet50", + path_weights="doctrfiles/models/db_resnet50-79bd7d70.pt", + path_config_json ="doctrfiles/models/db_resnet50_config.json") + + + end_time = time.time() + + execution_time = end_time - start_time + logger.info(f"time to initialize DoctrWordDetectorDebug: {execution_time} seconds") + start_time = time.time() + if not englishflag: + self.textRecognizer = DoctrTextRecognizer(architecture="parseq", path_weights="doctrfiles/models/doctr-multilingual-parseq.bin", + path_config_json="doctrfiles/models/multilingual-parseq-config.json") + else: + self.textRecognizer = DoctrTextRecognizer(architecture="master", path_weights="doctrfiles/models/master-fde31e4a.pt", + path_config_json="doctrfiles/models/master.json") + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"time to initialize DoctrTextRecognizer: {execution_time} seconds") + + + @staticmethod + def save_detection(detected_lines_images:List[ImageType], prefix = './res/test1/res_'): + i = 0 + for img in detected_lines_images: + pilimg = Image.fromarray(img) + pilimg.save(prefix+str(i)+'.png') + i=i+1 + + @staticmethod + def convert_coordinates(original_coord = NDArray[np.float32],detection_res = NDArray[np.float32])-> NDArray[np.float32]: + """ + Type if original_coord : np.array([ + [xmin, ymin], + [xmax, ymin], + [xmax, ymax], + [xmin, ymax] + ] + """ + height = original_coord[3][1] - original_coord[0][1] + width = original_coord[1][0] - original_coord[0][0] + if width/height<1.6: + 
bigger = max(height,width) + new_height = int(bigger *3) + new_width = int(bigger*3) + else: + bigger = max(height,width) + new_height = int(bigger *MARGIN_FACTOR) + new_width = int(bigger*MARGIN_FACTOR) + + y_offset = (new_height - height) // 2 + x_offset = (new_width - width) // 2 + #new_img[y_offset:y_offset + height, x_offset:x_offset+width] = dst_img + #x,y offsets are the min x and y + + # Calculate relative coordinate to the original image in the padded image + + rel = np.array( + [ + [detection_res[0][0] - x_offset, detection_res[0][1]-y_offset], + [detection_res[1][0] - x_offset, detection_res[1][1]-y_offset], + [detection_res[2][0] - x_offset, detection_res[2][1]-y_offset], + [detection_res[3][0] - x_offset, detection_res[3][1]-y_offset], + ] + ) + xmin = original_coord[0][0] + ymin = original_coord[0][1] + xmax = original_coord[1][0] + ymax = original_coord[2][1] + #This used to return 4 x 2 array + #rel_in_page =[[xmin+b[0],ymin+b[1]] for b in rel] + #Now returns 4x1 array + rel_in_page = np.array([xmin+rel[0][0],ymin+rel[0][1], xmin +rel[1][0], ymin +rel[2][1]]) + return rel_in_page + + + + def predict(self, img:ImageType)->Tuple[List[LineAnnotation],List[WordAnnotation]]: + + logger = getlogger("1") + start_time = time.time() + + """ + bxs : Text line detection results - bounding boxes + Each element looks like : [array([[ 90., 98.], + [313., 100.], + [312., 129.], + [ 90., 127.]], dtype=float32) + [left_lower, right_lower, right_upper, left_upper] + """ + # 4x2 array + bxs:List[NDArray[np.float32]] = self.textlineDetector.predict(img = np.array(img)) + + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"time to detecttextline: {execution_time} seconds") + + line_annotations = {} + straightboxs = [] + for points in bxs: + xmin, ymin, xmax, ymax = get_new_coord(img.shape[1],img.shape[0],points) + b = np.array([ + [xmin, ymin], + [xmax, ymin], + [xmax, ymax], + [xmin, ymax] + ], dtype=np.float32) + straightboxs.append(b) + 
ann = LineAnnotation(box =[xmin, ymin, xmax, ymax]) + line_annotations[ann.index] = ann + + """ + detected_lines_images : cropped images of detected lines + """ + # Double computation in line 117 - we calculate the straight lines again + #Straightboxes : 4x 2 array + detected_lines_images:List[ImageType] = cropImageExtraMargin(straightboxs, img,margin =MARGIN_FACTOR,straight=True) + #self.save_detection(detected_lines_images,prefix = './res/12June_two_Line_') + start_time = time.time() + word_annotations =[] + + #viz_word_detection =[] + for uuid, lineimg in zip(line_annotations.keys(),detected_lines_images): + + original_coord = line_annotations[uuid].box + xmin, ymin, xmax, ymax = original_coord + original_coord_b = np.array([ + [xmin, ymin], + [xmax, ymin], + [xmax, ymax], + [xmin, ymax] + ], dtype=np.float32) + + #List of 4 x 2 + detection_results :List[Wordboxes]= self.wordDetector.predict(lineimg) + + input_Word_recog ={} + + for wordbox in detection_results: + #So i think cropped_image's expected form is different that what is being returned + #takes in 4x2 array : box + cropped_image= crop_an_Image(wordbox.box,lineimg) + """ + We need to convert coordintes in wordbox.box to the original image + wordbox.box = np.array(wordbox.box) + """ + #original_coord_b :4x2 array + #coord_in_page :4 x 1 array + coord_in_page = self.convert_coordinates(original_coord_b,wordbox.box) + #logger.info("returned coordinate in page ") + #logger.info(coord_in_page) + + + wordAnn = WordAnnotation(box = coord_in_page, text = None) + word_uuid = wordAnn.index + input_Word_recog[word_uuid]= [cropped_image,wordAnn] + #print("uuid is ") + #print(uuid) + #print(len(line_annotations[uuid].words)) + line_annotations[uuid].words.append(wordAnn) + + #viz_word_detection.append(cropped_image) + + + #input_Word_recog contains only word detection + #It is dictionary of annotation id as key, than as values - list of cropped_image and Annotation Instance with key as uuid + + 
word_annotations_in_line = self.textRecognizer.predict(input_Word_recog) + word_annotations.append(word_annotations_in_line) + + #self.save_detection(viz_word_detection,prefix = './res/test4/rel_page_') + end_time = time.time() + execution_time = end_time - start_time + logger.info(f"Entire DocTR pipeline: {execution_time} seconds") + return line_annotations + + + + + diff --git a/ocr_component2.py b/ocr_component2.py new file mode 100644 index 0000000000000000000000000000000000000000..0f4577af3f09660ec27d706bea84bd07c66cbd96 --- /dev/null +++ b/ocr_component2.py @@ -0,0 +1,24 @@ + +import pdfplumber + +if __name__ == '__main__': + #input can be path to your PDF file or file object, loaded as bytes + with pdfplumber.open('/myhome/alps/TestingFiles/OCRTest1German.pdf') as pdf: + for page in pdf.pages: + im = page.to_image() + #extract_words() - Returns a list of all word-looking things and their bounding boxes + #Example: + #[{'text': 'Inhaltsverzeichnis', 'x0': 33.99, 'x1': 111.77713499999999, 'top': 36.59723999999994, 'doctop': 36.59723999999994, 'bottom': 46.58723999999995, 'upright': True, 'height': 9.990000000000009, 'width': 77.78713499999998, 'direction': 'ltr'}, {'text': 'Übersicht', 'x0': 33.99, 'x1': 71.4912, 't + + extracted_words = page.extract_words() + print(extracted_words) + for word in extracted_words: + print(word['text']) + word['x0'] + + + """ + Using the Page.extract_text(...) 
method, we grab every character on the page, line by line, using keep_blank_chars=True to retain all those whitespace characters as literal characters: + + text = p0.extract_text(keep_blank_chars=True) + """ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..20bcabbc0057f16b3e62ddd6fc1506e77e9ea461 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,47 @@ +onnxruntime-gpu==1.17.1 +#from deepdoctection +pdfplumber==0.11.0 +numpy>=1.21 +Pillow== 10.3.0 +huggingface_hub==0.22.2 +six==1.16.0 +opencv-python==4.9.0.80 +shapely==2.0.3 +pyclipper==1.3.0.post5 +python-doctr==0.7.0 +torch==1.13.1+cu117 +torchvision==0.14.1+cu117 +transformers==4.39.3 +datasets +pymupdf +python-Levenshtein +nltk +torch==1.13.1+cu117 +torchvision==0.14.1+cu117 +torchaudio==0.13.1 + +# Add this line as a comment to remind the user of the extra index URL +# Install with: pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu117 + +#unitable +torchtext==0.14.1 +jsonlines +beautifulsoup4 +matplotlib +hydra-core +hydra_colorlog +apted +Distance +lxml==4.9.3 +torchmetrics +wandb +einops +ptflops +tokenizers +pycocotools +faster-coco-eval +pdf2image +ultralyticsplus==0.0.28 +ultralytics==8.0.43 +gradio +gradio-pdf diff --git a/res0.png b/res0.png new file mode 100644 index 0000000000000000000000000000000000000000..264a223f0a47c0400aa584d7b11632c4abed2ccb --- /dev/null +++ b/res0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ea95af507e071c8ecf0d1b63744cf493e03082f39f41167e80c9d3fcd7470a +size 1578582 diff --git a/table_drawn_bbox_with_extra.png b/table_drawn_bbox_with_extra.png new file mode 100644 index 0000000000000000000000000000000000000000..20554f9cd53797a36fcf3f34056f8a4a6152f552 --- /dev/null +++ b/table_drawn_bbox_with_extra.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add5dacbb1a28a4094d260617a328ed460f1569a464a20f20cb58193a8b8c8aa +size 
1290568 diff --git a/table_resize_with_padding.png b/table_resize_with_padding.png new file mode 100644 index 0000000000000000000000000000000000000000..d8333398e93761cddc23313a56c4087b067e8fc4 Binary files /dev/null and b/table_resize_with_padding.png differ diff --git a/unitable/CONFIG.mk b/unitable/CONFIG.mk new file mode 100644 index 0000000000000000000000000000000000000000..e33fa44f8a89aa74a8b2c8108f8dad1132ce1695 --- /dev/null +++ b/unitable/CONFIG.mk @@ -0,0 +1,337 @@ +################################################## +# Configurations # +################################################## + +# +# Datasets +# + +# label type +LABEL_IMAGE = ++trainer.label_type="image" +LABEL_HTML = ++trainer.label_type="html" "++trainer.train.loss_weights.html=1" +LABEL_CELL = ++trainer.label_type="cell" "++trainer.train.loss_weights.cell=1" +LABEL_BBOX = ++trainer.label_type="bbox" "++trainer.train.loss_weights.bbox=1" +MEAN = [0.86597056,0.88463002,0.87491087] +STD = [0.20686628,0.18201602,0.18485524] + +# augmentation +AUG_VQVAE = dataset/augmentation=vqvae +AUG_BEIT = dataset/augmentation=beit \ + ++dataset.augmentation.mean=$(MEAN) ++dataset.augmentation.std=$(STD) +AUG_RESIZE_NORM = dataset/augmentation=resize_normalize \ + ++dataset.augmentation.transforms.2.mean=$(MEAN) ++dataset.augmentation.transforms.2.std=$(STD) + +# single dataset +DATA_SINGLE = dataset=single_dataset +PUBTABNET = $(DATA_SINGLE) \ + +dataset/pubtabnet@dataset.train_dataset=train_dataset \ + +dataset/pubtabnet@dataset.valid_dataset=valid_dataset \ + +dataset/pubtabnet@dataset.test_dataset=test_dataset +MINIPUBTABNET = $(DATA_SINGLE) \ + +dataset/mini_pubtabnet@dataset.train_dataset=train_dataset \ + +dataset/mini_pubtabnet@dataset.valid_dataset=valid_dataset \ + +dataset/mini_pubtabnet@dataset.test_dataset=test_dataset + +# multiple datasets +DATA_MULTI = dataset=concat_dataset +PUBTABNET_M = +dataset/pubtabnet@dataset.train.d1=train_dataset \ + +dataset/pubtabnet@dataset.valid.d1=valid_dataset \ 
+ +dataset/pubtabnet@dataset.test.d1=test_dataset +SYN_MARKET_M = +dataset/synthtabnet_marketing@dataset.train.d2=train_dataset \ + +dataset/synthtabnet_marketing@dataset.valid.d2=valid_dataset \ + +dataset/synthtabnet_marketing@dataset.test.d2=test_dataset +SYN_FIN_M = +dataset/synthtabnet_fintabnet@dataset.train.d3=train_dataset \ + +dataset/synthtabnet_fintabnet@dataset.valid.d3=valid_dataset \ + +dataset/synthtabnet_fintabnet@dataset.test.d3=test_dataset +SYN_SPARSE_M = +dataset/synthtabnet_sparse@dataset.train.d4=train_dataset \ + +dataset/synthtabnet_sparse@dataset.valid.d4=valid_dataset \ + +dataset/synthtabnet_sparse@dataset.test.d4=test_dataset +SYN_PUB_M = +dataset/synthtabnet_pubtabnet@dataset.train.d5=train_dataset \ + +dataset/synthtabnet_pubtabnet@dataset.valid.d5=valid_dataset \ + +dataset/synthtabnet_pubtabnet@dataset.test.d5=test_dataset +PUBTABLES_M = +dataset/pubtables1m@dataset.train.d7=train_dataset \ + +dataset/pubtables1m@dataset.valid.d7=valid_dataset \ + +dataset/pubtables1m@dataset.test.d7=test_dataset +TABLEBANK_M = +dataset/tablebank@dataset.train.d8=train_dataset \ + +dataset/tablebank@dataset.valid.d8=valid_dataset \ + +dataset/tablebank@dataset.test.d8=test_dataset +FINTABNET_M = +dataset/fintabnet@dataset.train.d9=train_dataset \ + +dataset/fintabnet@dataset.valid.d9=valid_dataset \ + +dataset/fintabnet@dataset.test.d9=test_dataset + +DATA_VQVAE_1M = $(DATA_MULTI) \ + $(PUBTABNET_M) $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) +DATA_VQVAE_2M = $(DATA_MULTI) \ + $(PUBTABNET_M) $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M) \ + $(PUBTABLES_M) $(TABLEBANK_M) + +PUBTABLES1M = $(DATA_MULTI) $(PUBTABLES_M) +FINTABNET = $(DATA_MULTI) $(FINTABNET_M) + +PUB_SYN = $(DATA_MULTI) \ + $(PUBTABNET_M) $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M) + +PUB_SYN_FIN = $(DATA_MULTI) $(PUBTABNET_M) $(FINTABNET_M) \ + $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M) + +PUB_SYN_PUB1M = $(DATA_MULTI) $(PUBTABNET_M) 
$(PUBTABLES_M) \ + $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M) + +SYN = $(DATA_MULTI) $(SYN_MARKET_M) $(SYN_FIN_M) $(SYN_SPARSE_M) $(SYN_PUB_M) + +SYN_fin = $(DATA_MULTI) $(SYN_FIN_M) +SYN_market = $(DATA_MULTI) $(SYN_MARKET_M) +SYN_pub = $(DATA_MULTI) $(SYN_PUB_M) +SYN_sparse = $(DATA_MULTI) $(SYN_SPARSE_M) + +# +# Vocab +# +VOCAB_NONE = vocab=empty +VOCAB_HTML = vocab=html +VOCAB_BBOX = vocab=bbox +VOCAB_CELL = vocab=cell + + +# +# Trainer +# + +# trainer type +TRAINER_VQVAE = trainer=vqvae +TRAINER_BEIT = trainer=beit +TRAINER_TABLE = trainer=table + +# input image size +I224 = ++trainer.img_size=[224,224] +I448 = ++trainer.img_size=[448,448] +I112_448 = ++trainer.img_size=[112,448] + +# max sequence length +SEQ200 = trainer.max_seq_len=200 +SEQ512 = trainer.max_seq_len=512 +SEQ1024 = trainer.max_seq_len=1024 + +# batch size + epoch +BATCH24 = ++trainer.train.dataloader.batch_size=24 ++trainer.valid.dataloader.batch_size=24 +BATCH48 = ++trainer.train.dataloader.batch_size=48 ++trainer.valid.dataloader.batch_size=48 +BATCH72 = ++trainer.train.dataloader.batch_size=72 ++trainer.valid.dataloader.batch_size=72 +BATCH80 = ++trainer.train.dataloader.batch_size=80 ++trainer.valid.dataloader.batch_size=80 +BATCH96 = ++trainer.train.dataloader.batch_size=96 ++trainer.valid.dataloader.batch_size=96 +BATCH256 = ++trainer.train.dataloader.batch_size=256 ++trainer.valid.dataloader.batch_size=256 +BATCH384 = ++trainer.train.dataloader.batch_size=384 ++trainer.valid.dataloader.batch_size=384 + +EPOCH24 = ++trainer.train.epochs=24 +EPOCH30 = ++trainer.train.epochs=30 +EPOCH48 = ++trainer.train.epochs=48 + +# optimizer +OPT_ADAMW = trainer/train/optimizer=adamw +OPT_WD5e2 = ++trainer.train.optimizer.weight_decay=5e-2 + +# lr + scheduler +LR_5e4 = ++trainer.train.optimizer.lr=5e-4 +LR_3e4 = ++trainer.train.optimizer.lr=3e-4 +LR_1e4 = ++trainer.train.optimizer.lr=1e-4 +LR_8e5 = ++trainer.train.optimizer.lr=8e-5 + +LR_cosine = trainer/train/lr_scheduler=cosine 
++trainer.train.lr_scheduler.lr_lambda.min_ratio=5e-3 +LR_cosine93k_warm6k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=93400 ++trainer.train.lr_scheduler.lr_lambda.warmup=5800 +LR_cosine77k_warm8k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=76600 ++trainer.train.lr_scheduler.lr_lambda.warmup=7660 +LR_cosine30k_warm4k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=30500 ++trainer.train.lr_scheduler.lr_lambda.warmup=4000 +LR_cosine8k_warm1k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=7600 ++trainer.train.lr_scheduler.lr_lambda.warmup=800 +LR_cosine44k_warm6k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=44100 ++trainer.train.lr_scheduler.lr_lambda.warmup=5500 +LR_cosine118k_warm15k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=117800 ++trainer.train.lr_scheduler.lr_lambda.warmup=14700 +LR_cosine216k_warm27k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=216000 ++trainer.train.lr_scheduler.lr_lambda.warmup=27000 +LR_cosine32k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=32000 ++trainer.train.lr_scheduler.lr_lambda.warmup=0 +LR_cosine118k = $(LR_cosine) ++trainer.train.lr_scheduler.lr_lambda.total_step=118000 ++trainer.train.lr_scheduler.lr_lambda.warmup=0 + +GRAD_CLIP12 = ++trainer.train.grad_clip=12 + +# vqvae +VQVAE_TEMP_1M = ++trainer.train.starting_temp=1. \ + ++trainer.train.temp_min=5e-3 ++trainer.train.temp_anneal_rate=1e-3 +VQVAE_TEMP_2M = ++trainer.train.starting_temp=1. 
\ + ++trainer.train.temp_min=1e-3 ++trainer.train.temp_anneal_rate=2e-4 + +# pretraining specific +TRANS448_VQVAE224_GRID28_MASK300 = ++trainer.trans_size=[448,448] ++trainer.vqvae_size=[224,224] ++trainer.grid_size=28 ++trainer.num_mask_patches=300 +VQVAE1M_WEIGHTS = $(MODEL_VQVAE) ++trainer.vqvae_weights="../unitable_weights/vqvae_1m.pt" +VQVAE2M_WEIGHTS = $(MODEL_VQVAE_L) ++trainer.vqvae_weights="../unitable_weights/vqvae_2m.pt" + +# finetuning specific +WEIGHTS_mtim_1m_base = ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_1m_base.pt" +WEIGHTS_mtim_1m_large = ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_1m_large.pt" +WEIGHTS_mtim_2m_base = ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_2m_base.pt" +WEIGHTS_mtim_2m_large = ++trainer.trainer.beit_pretrained_weights="../unitable_weights/ssp_2m_large.pt" +LOCK_MTIM_4 = ++trainer.trainer.freeze_beit_epoch=4 + +# +# Models +# + +# model type +MODEL_VQVAE = model=vqvae +MODEL_VQVAE_L = $(MODEL_VQVAE) ++model.codebook_tokens=16384 ++model.hidden_dim=512 +MODEL_BEIT = model=beit +MODEL_ENCODER_DECODER = model=encoderdecoder + +# backbone for input preprocessing: resnet, linear projection, and convstem +IMGCNN = model/model/backbone=imgcnn +IMGLINEAR = model/model/backbone=imglinear +IMGCONVSTEM = model/model/backbone=imgconvstem + +# number of layers +E4 = ++model.model.encoder.nlayer=4 +E12 = ++model.model.encoder.nlayer=12 +E24 = ++model.model.encoder.nlayer=24 +D4 = ++model.model.decoder.nlayer=4 + +# transformer layer: attention heads, hidden size, activation, norm +FF4 = ++model.ff_ratio=4 + +NHEAD8 = ++model.nhead=8 +NHEAD12 = ++model.nhead=12 + +NORM_FIRST = ++model.norm_first=true +NORM_LAST = ++model.norm_first=false + +ACT_RELU = ++model.activation="relu" +ACT_GELU = ++model.activation="gelu" + +D_MODEL512 = ++model.d_model=512 +D_MODEL768 = ++model.d_model=768 + +# regularization +REG_d00 = ++model.dropout=0.0 +REG_d02 = ++model.dropout=0.2 + +# 
linear projection patch size +P16 = ++model.backbone_downsampling_factor=16 +P28 = ++model.backbone_downsampling_factor=28 +P32 = ++model.backbone_downsampling_factor=32 + +# cnn backbone +R18 = ++model.model.backbone.backbone._target_=torchvision.models.resnet18 \ + ++model.model.backbone.output_channels=512 + +MTIM_BASE = $(MODEL_BEIT) $(IMGLINEAR) $(NHEAD8) $(FF4) $(ACT_GELU) \ + $(NORM_FIRST) $(D_MODEL512) $(REG_d02) $(P16) $(E4) +MTIM_LARGE = $(MODEL_BEIT) $(IMGLINEAR) $(NHEAD12) $(FF4) $(ACT_GELU) \ + $(NORM_FIRST) $(D_MODEL768) $(REG_d02) $(P16) $(E12) + +ARCH_BASE = $(MTIM_BASE) $(MODEL_ENCODER_DECODER) $(D4) +ARCH_LARGE = $(MTIM_LARGE) $(MODEL_ENCODER_DECODER) $(D4) + + +############################################### +# Experiments # +############################################### + +TRAIN_vqvae := $(VOCAB_NONE) \ + $(LABEL_IMAGE) $(AUG_VQVAE) $(I224) \ + $(TRAINER_VQVAE) $(OPT_ADAMW) $(LR_1e4) $(EPOCH24) + +TRAIN_mtim := $(VOCAB_NONE) \ + $(LABEL_IMAGE) $(AUG_BEIT) \ + $(TRAINER_BEIT) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_5e4) \ + $(TRANS448_VQVAE224_GRID28_MASK300) + +# +# mini_pubtabnet pretraining example (dataset code: mini) +# + +# vq-vae +# > make experiments/vqvae_mini/.done_pretrain +EXP_vqvae_mini := $(TRAIN_vqvae) $(MINIPUBTABNET) $(VQVAE_TEMP_2M) $(BATCH80) $(MODEL_VQVAE) $(LR_cosine32k) + +# visual encoder pretraining - masked tabular image modeling (MTIM) +# > make experiments/mtim_mini_base/.done_pretrain +EXP_mtim_mini_base := $(TRAIN_mtim) $(MINIPUBTABNET) $(VQVAE2M_WEIGHTS) $(MTIM_BASE) \ + $(BATCH384) $(LR_cosine8k_warm1k) $(EPOCH24) + +# +# mini_pubtabnet finetuning example +# + +# table structure (task code: html) +# > make experiments/ssp_2m_mini_html_base/.done_finetune +TRAIN_mini_html := $(VOCAB_HTML) \ + $(MINIPUBTABNET) $(LABEL_HTML) $(AUG_RESIZE_NORM) \ + $(TRAINER_TABLE) $(I448) $(SEQ512) \ + $(EPOCH48) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5) + +EXP_ssp_2m_mini_html_base := $(TRAIN_mini_html) $(ARCH_BASE) \ + $(WEIGHTS_mtim_2m_base) 
$(LOCK_MTIM_4) $(BATCH72) $(LR_cosine93k_warm6k) + +# table cell bbox (task code: bbox) +# > make experiments/ssp_2m_mini_bbox_base/.done_finetune +TRAIN_mini_bbox := $(VOCAB_BBOX) \ + $(MINIPUBTABNET) $(LABEL_BBOX) $(AUG_RESIZE_NORM) \ + $(TRAINER_TABLE) $(I448) $(SEQ1024) \ + $(EPOCH30) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_3e4) $(GRAD_CLIP12) + +EXP_ssp_2m_mini_bbox_base := $(TRAIN_mini_bbox) $(ARCH_BASE) \ + $(WEIGHTS_mtim_2m_base) $(LOCK_MTIM_4) $(BATCH48) $(LR_cosine77k_warm8k) + +# table cell content (task code: cell) +# > make experiments/ssp_2m_mini_cell_base/.done_finetune +TRAIN_mini_cell := $(VOCAB_CELL) \ + $(MINIPUBTABNET) $(LABEL_CELL) $(AUG_RESIZE_NORM) \ + $(TRAINER_TABLE) $(I112_448) $(SEQ200) \ + $(EPOCH24) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5) $(GRAD_CLIP12) + +EXP_ssp_2m_mini_cell_base := $(TRAIN_mini_cell) $(ARCH_BASE) \ + $(WEIGHTS_mtim_2m_base) $(LOCK_MTIM_4) $(BATCH24) $(LR_cosine216k_warm27k) + +# +# cross-dataset pretraining +# + +# vq-vae +EXP_vqvae_1M := $(TRAIN_vqvae) $(DATA_VQVAE_1M) $(VQVAE_TEMP_1M) $(BATCH80) $(MODEL_VQVAE) $(LR_cosine32k) +EXP_vqvae_2M := $(TRAIN_vqvae) $(DATA_VQVAE_2M) $(VQVAE_TEMP_2M) $(BATCH48) $(MODEL_VQVAE_L) $(LR_cosine118k) + +# visual encoder pretraining +EXP_mtim_1M_base := $(TRAIN_mtim) $(PUB_SYN) $(VQVAE1M_WEIGHTS) $(MTIM_BASE) \ + $(BATCH384) $(LR_cosine8k_warm1k) $(EPOCH24) +EXP_mtim_1M_large := $(TRAIN_mtim) $(PUB_SYN) $(VQVAE1M_WEIGHTS) $(MTIM_LARGE) \ + $(BATCH96) $(LR_cosine30k_warm4k) $(EPOCH24) +EXP_mtim_2M_base := $(TRAIN_mtim) $(DATA_VQVAE_2M) $(VQVAE2M_WEIGHTS) $(MTIM_BASE) \ + $(BATCH256) $(LR_cosine44k_warm6k) $(EPOCH48) +EXP_mtim_2M_large := $(TRAIN_mtim) $(DATA_VQVAE_2M) $(VQVAE2M_WEIGHTS) $(MTIM_LARGE) \ + $(BATCH96) $(LR_cosine118k_warm15k) $(EPOCH48) + +# +# cross-dataset finetuning +# + +# table structure +# > make experiments/ssp_2m_syn_pub_html_medium/.done_finetune +TRAIN_syn_pub_html := $(VOCAB_HTML) \ + $(PUB_SYN) $(LABEL_HTML) $(AUG_RESIZE_NORM) \ + $(TRAINER_TABLE) $(I448) $(SEQ512) \ + 
$(EPOCH48) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5) + +EXP_ssp_2m_syn_pub_html_large := $(TRAIN_syn_pub_html) $(ARCH_LARGE) \ + $(WEIGHTS_mtim_2m_large) $(LOCK_MTIM_4) $(BATCH72) $(LR_cosine93k_warm6k) + +# table cell bbox +# > make experiments/ssp_2m_syn_pub_bbox_medium/.done_finetune +TRAIN_syn_pub_bbox := $(VOCAB_BBOX) \ + $(PUB_SYN) $(LABEL_BBOX) $(AUG_RESIZE_NORM) \ + $(TRAINER_TABLE) $(I448) $(SEQ1024) \ + $(EPOCH30) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_3e4) $(GRAD_CLIP12) + +EXP_ssp_2m_syn_pub_bbox_large := $(TRAIN_syn_pub_bbox) $(ARCH_LARGE) \ + $(WEIGHTS_mtim_2m_large) $(LOCK_MTIM_4) $(BATCH48) $(LR_cosine77k_warm8k) + +# table cell content +# > make experiments/syn_pub_pub1m_cell_medium/.done_finetune +TRAIN_syn_pub_pub1m_cell := $(VOCAB_CELL) \ + $(PUB_SYN_PUB1M) $(LABEL_CELL) $(AUG_RESIZE_NORM) \ + $(TRAINER_TABLE) $(I112_448) $(SEQ200) \ + $(EPOCH24) $(OPT_ADAMW) $(OPT_WD5e2) $(LR_8e5) $(GRAD_CLIP12) + +EXP_ssp_2m_syn_pub_pub1m_cell_large := $(TRAIN_syn_pub_pub1m_cell) $(ARCH_LARGE) \ + $(WEIGHTS_mtim_2m_base) $(LOCK_MTIM_4) $(BATCH24) $(LR_cosine216k_warm27k) \ No newline at end of file diff --git a/unitable/LICENSE b/unitable/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..0d212aaeaf2eef890387f73499c34d3352241edc --- /dev/null +++ b/unitable/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 ShengYun (Anthony) Peng. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/unitable/Makefile b/unitable/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4bbbf490f7aa735a3c8fd3a41ce5103548fc25d4 --- /dev/null +++ b/unitable/Makefile @@ -0,0 +1,85 @@ +SHELL := /bin/bash +VENV_NAME := unitable +CONDA_ACTIVATE := source $$(conda info --base)/etc/profile.d/conda.sh && conda activate $(VENV_NAME) +PYTHON := $(CONDA_ACTIVATE) && python +PIP := $(CONDA_ACTIVATE) && pip3 +# Stacked single-node multi-worker: https://pytorch.org/docs/stable/elastic/run.html#stacked-single-node-multi-worker +TORCHRUN = $(CONDA_ACTIVATE) && torchrun --rdzv-backend=c10d --rdzv_endpoint localhost:0 --nnodes=1 --nproc_per_node=$(NGPU) + +# Taken from https://tech.davis-hansson.com/p/make/ +ifeq ($(origin .RECIPEPREFIX), undefined) + $(error This Make does not support .RECIPEPREFIX. Please use GNU Make 4.0 or later) +endif +.RECIPEPREFIX = > + +# +# Virtual Environment Targets +# +clean: +> rm -f .venv_done + +.done_venv: clean +> conda create -n $(VENV_NAME) python=3.9 -y +> $(PIP) install -r requirements.txt +> $(PIP) install -e . 
+> touch $@ + +# +# Download pretrained and UniTable model weights +# +WEIGHTS_PATH = experiments/unitable_weights +M_VQVAE_1M = $(WEIGHTS_PATH)/vqvae_1m.pt +M_VQVAE_2M = $(WEIGHTS_PATH)/vqvae_2m.pt +M_SSP_1M_BASE = $(WEIGHTS_PATH)/ssp_1m_base.pt +M_SSP_1M_LARGE = $(WEIGHTS_PATH)/ssp_1m_large.pt +M_SSP_2M_BASE = $(WEIGHTS_PATH)/ssp_2m_base.pt +M_SSP_2M_LARGE = $(WEIGHTS_PATH)/ssp_2m_large.pt +UNITABLE_HTML = $(WEIGHTS_PATH)/unitable_large_structure.pt +UNITABLE_BBOX = $(WEIGHTS_PATH)/unitable_large_bbox.pt +UNITABLE_CELL = $(WEIGHTS_PATH)/unitable_large_content.pt + +.done_download_weights: +ifeq ("$(words $(wildcard $(WEIGHTS_PATH)/*.pt))", "9") +> $(info All 9 model weights have already been downloaded to $(WEIGHTS_PATH).) +else +> $(info There should be 9 weights file under $(WEIGHTS_PATH), but only $(words $(wildcard $(WEIGHTS_PATH)/*.pt)) are found.) +> $(info Begin downloading weights from HuggingFace ...) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/vqvae_1m.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/vqvae_2m.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/ssp_1m_base.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/ssp_1m_large.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/ssp_2m_base.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/ssp_2m_large.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/unitable_large_structure.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/unitable_large_bbox.pt -P $(WEIGHTS_PATH) +> wget -c https://huggingface.co/poloclub/UniTable/resolve/main/unitable_large_content.pt -P $(WEIGHTS_PATH) +> $(info Completed!) 
+endif + +# +# Python Targets +# +include CONFIG.mk +SRC := src +BEST_MODEL = "../$(word 1,$(subst -, ,$*))/model/best.pt" +RESULT_JSON := html.json +TEDS_STRUCTURE = -f "../experiments/$*/$(RESULT_JSON)" -s + +###################### +NGPU := 1 # number of gpus used in the experiments + +.SECONDARY: + +# vq-vae and self-supervised pretraining +experiments/%/.done_pretrain: +> @echo "Using experiment configurations from variable EXP_$*" +> cd $(SRC) && $(TORCHRUN) -m main ++name=$* $(EXP_$*) ++trainer.mode="train" +> touch $@ + +# finetuning from SSP weights for table structure, cell bbox and cell content +experiments/%/.done_finetune: +> @echo "Finetuning phase 1 - using experiment configurations from variable EXP_$*" +> cd $(SRC) && $(TORCHRUN) -m main ++name=$* $(EXP_$*) ++trainer.mode="train" +> @echo "Finetuning phase 2 - starting from epoch 4" +> cd $(SRC) && $(TORCHRUN) -m main ++name=$* $(EXP_$*) ++trainer.mode="train" ++trainer.trainer.snapshot="epoch3_snapshot.pt" ++trainer.trainer.beit_pretrained_weights=null +> touch $@ \ No newline at end of file diff --git a/unitable/README.md b/unitable/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a3a2cb6d44e2246133bd17b947f85022657c96d2 --- /dev/null +++ b/unitable/README.md @@ -0,0 +1,14 @@ +# UniTable: Towards a Unified Table Foundation Model + +![alt text](image1.png) + +1. 📈 [High-Performance Transformers for Table Structure Recognition Need Early Convolutions](https://arxiv.org/abs/2311.05565). ShengYun Peng, Seongmin Lee, Xiaojing Wang, Rajarajeswari Balasubramaniyan, Duen Horng Chau. In *NeurIPS Second Table Representation Learning Workshop*, 2023. (Oral) +2. 🚀 [Self-Supervised Pretraining for Table Structure Recognition Transformer](https://arxiv.org/abs/2402.15578). ShengYun Peng, Seongmin Lee, Xiaojing Wang, Rajarajeswari Balasubramaniyan, Duen Horng Chau. In *AAAI Scientific Document Understanding Workshop*, 2024. (Oral) +3. 
🆕 [UniTable: Towards a Unified Framework for Table Structure Recognition via Self-Supervised Pretraining](https://arxiv.org/abs/2403.04822). ShengYun Peng, Seongmin Lee, Xiaojing Wang, Rajarajeswari Balasubramaniyan, Duen Horng Chau. ArXiv, 2024. + +## Implementation details + +![alt text](image2.png) + +![alt text](image3.png) + diff --git a/unitable/__init__.py b/unitable/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..93718b10f0df24d732e58f2986f010185a2f4d01 --- /dev/null +++ b/unitable/__init__.py @@ -0,0 +1,5 @@ +from .unitable_predictor import UnitablePredictor +from .unitable_full import UnitableFullPredictor +from .unitable_full_singleimage import UnitableFullSinglePredictor + +__all__ = ['UnitablePredictor','UnitableFullPredictor','UnitableFullSinglePredictor'] \ No newline at end of file diff --git a/unitable/configs/dataset/augmentation/beit.yaml b/unitable/configs/dataset/augmentation/beit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20ab02ece03eac0871578ea48288457570547470 --- /dev/null +++ b/unitable/configs/dataset/augmentation/beit.yaml @@ -0,0 +1,7 @@ +_target_: src.datamodule.augmentation.AugmentationForMIM +mean: [0.86597056, 0.88463002, 0.87491087] +std: [0.20686628, 0.18201602, 0.18485524] +trans_size: ${trainer.trans_size} +vqvae_size: ${trainer.vqvae_size} +trans_interpolation: bicubic +vqvae_interpolation: lanczos \ No newline at end of file diff --git a/unitable/configs/dataset/augmentation/resize_normalize.yaml b/unitable/configs/dataset/augmentation/resize_normalize.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64511c87c1a4f9c62f45d2e3ffb6caa3e4956396 --- /dev/null +++ b/unitable/configs/dataset/augmentation/resize_normalize.yaml @@ -0,0 +1,9 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: ${trainer.img_size} + - _target_: torchvision.transforms.ToTensor + - _target_: 
torchvision.transforms.Normalize + mean: [0.86597056, 0.88463002, 0.87491087] + std: [0.20686628, 0.18201602, 0.18485524] + diff --git a/unitable/configs/dataset/augmentation/vqvae.yaml b/unitable/configs/dataset/augmentation/vqvae.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f421374ba98362278f23cf559e36993dd995c877 --- /dev/null +++ b/unitable/configs/dataset/augmentation/vqvae.yaml @@ -0,0 +1,5 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: ${trainer.img_size} + - _target_: torchvision.transforms.ToTensor \ No newline at end of file diff --git a/unitable/configs/dataset/concat_dataset.yaml b/unitable/configs/dataset/concat_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d015291136221b4740ca85cce7150e34a752796a --- /dev/null +++ b/unitable/configs/dataset/concat_dataset.yaml @@ -0,0 +1,29 @@ +defaults: + - _self_ + - augmentation: beit + # - pubtabnet@train.d1: train_dataset + # - pubtabnet@valid.d1: valid_dataset + # - synthtabnet_marketing@train.d2: train_dataset + # - synthtabnet_marketing@valid.d2: valid_dataset + # - synthtabnet_fintabnet@train.d3: train_dataset + # - synthtabnet_fintabnet@valid.d3: valid_dataset + # - synthtabnet_sparse@train.d4: train_dataset + # - synthtabnet_sparse@valid.d4: valid_dataset + # - synthtabnet_pubtabnet@train.d5: train_dataset + # - synthtabnet_pubtabnet@valid.d5: valid_dataset + + +label_type: ${trainer.label_type} +cell_limit: 10 + +train_dataset: + _target_: torch.utils.data.ConcatDataset + datasets: ${oc.dict.values:..train} + +valid_dataset: + _target_: torch.utils.data.ConcatDataset + datasets: ${oc.dict.values:..valid} + +test_dataset: + _target_: torch.utils.data.ConcatDataset + datasets: ${oc.dict.values:..test} \ No newline at end of file diff --git a/unitable/configs/dataset/fintabnet/test_dataset.yaml b/unitable/configs/dataset/fintabnet/test_dataset.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..f660ab55e1415ceec352c78e145bc797678b8d23 --- /dev/null +++ b/unitable/configs/dataset/fintabnet/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - valid_dataset + +jsonl_filename: clean_FinTabNet_1.0.0_cell_test.jsonl \ No newline at end of file diff --git a/unitable/configs/dataset/fintabnet/train_dataset.yaml b/unitable/configs/dataset/fintabnet/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cff986413d35552ba699759b5a0b139fccfcbb1b --- /dev/null +++ b/unitable/configs/dataset/fintabnet/train_dataset.yaml @@ -0,0 +1,5 @@ +_target_: src.datamodule.FinTabNet +root_dir: ../../../../DATASETS/finTabNet +label_type: ${dataset.label_type} +jsonl_filename: clean_FinTabNet_1.0.0_cell_train.jsonl +transform: ${dataset.augmentation} \ No newline at end of file diff --git a/unitable/configs/dataset/fintabnet/valid_dataset.yaml b/unitable/configs/dataset/fintabnet/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..490c758fa25c6f5485e45c9c29a8bc7d053be839 --- /dev/null +++ b/unitable/configs/dataset/fintabnet/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +jsonl_filename: clean_FinTabNet_1.0.0_cell_val.jsonl \ No newline at end of file diff --git a/unitable/configs/dataset/icdar/test_dataset.yaml b/unitable/configs/dataset/icdar/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5eaa17a3ff1530a8122002f4e7b1735e35e1be2a --- /dev/null +++ b/unitable/configs/dataset/icdar/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - valid_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/icdar/train_dataset.yaml b/unitable/configs/dataset/icdar/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe712216d824c2de44b2c4b0d36662cff7f0def8 --- /dev/null +++ b/unitable/configs/dataset/icdar/train_dataset.yaml @@ -0,0 +1,5 @@ +_target_: 
src.datamodule.ICDAR +root_dir: ../../../../DATASETS/ICDAR-2013 +label_type: ${dataset.label_type} +split: train +transform: ${dataset.augmentation} \ No newline at end of file diff --git a/unitable/configs/dataset/icdar/valid_dataset.yaml b/unitable/configs/dataset/icdar/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/icdar/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/mini_pubtabnet/test_dataset.yaml b/unitable/configs/dataset/mini_pubtabnet/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9200cab491d7aef23296a82f59bee6adfb5ccaa8 --- /dev/null +++ b/unitable/configs/dataset/mini_pubtabnet/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - valid_dataset + +cell_limit: 256 \ No newline at end of file diff --git a/unitable/configs/dataset/mini_pubtabnet/train_dataset.yaml b/unitable/configs/dataset/mini_pubtabnet/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..248173846adc12dfec93b2b88e7e5ad7a40d4226 --- /dev/null +++ b/unitable/configs/dataset/mini_pubtabnet/train_dataset.yaml @@ -0,0 +1,8 @@ + +_target_: src.datamodule.pubtabnet.PubTabNet +root_dir: ../../dataset/mini_pubtabnet +label_type: ${dataset.label_type} +split: train +json_html: mini_pubtabnet_examples.jsonl +transform: ${dataset.augmentation} +cell_limit: 150 \ No newline at end of file diff --git a/unitable/configs/dataset/mini_pubtabnet/valid_dataset.yaml b/unitable/configs/dataset/mini_pubtabnet/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..414f84109a18c5455892546255dff6cf5e1c4b70 --- /dev/null +++ b/unitable/configs/dataset/mini_pubtabnet/valid_dataset.yaml @@ -0,0 +1,2 @@ +defaults: + - train_dataset \ No newline at end of file diff --git 
a/unitable/configs/dataset/pubtables1m/test_dataset.yaml b/unitable/configs/dataset/pubtables1m/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5eaa17a3ff1530a8122002f4e7b1735e35e1be2a --- /dev/null +++ b/unitable/configs/dataset/pubtables1m/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - valid_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/pubtables1m/train_dataset.yaml b/unitable/configs/dataset/pubtables1m/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6450375e5c2405d81e8cf247dc5cd31db9d33550 --- /dev/null +++ b/unitable/configs/dataset/pubtables1m/train_dataset.yaml @@ -0,0 +1,6 @@ +_target_: src.datamodule.PubTables +root_dir: ../../../../DATASETS/pubtables1m/PubTables-1M-Structure +label_type: ${dataset.label_type} +split: train +transform: ${dataset.augmentation} +cell_limit: ${dataset.cell_limit} \ No newline at end of file diff --git a/unitable/configs/dataset/pubtables1m/valid_dataset.yaml b/unitable/configs/dataset/pubtables1m/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/pubtables1m/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/pubtabnet/test_dataset.yaml b/unitable/configs/dataset/pubtabnet/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9200cab491d7aef23296a82f59bee6adfb5ccaa8 --- /dev/null +++ b/unitable/configs/dataset/pubtabnet/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - valid_dataset + +cell_limit: 256 \ No newline at end of file diff --git a/unitable/configs/dataset/pubtabnet/train_dataset.yaml b/unitable/configs/dataset/pubtabnet/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eaed8a932ab4310dae89dcc8b1be498f521a4314 --- 
/dev/null +++ b/unitable/configs/dataset/pubtabnet/train_dataset.yaml @@ -0,0 +1,7 @@ +_target_: src.datamodule.PubTabNet +root_dir: ../../../../DATASETS/pubtabnet +label_type: ${dataset.label_type} +split: train +json_html: clean_html_PubTabNet_2.0.0.jsonl +transform: ${dataset.augmentation} +cell_limit: ${dataset.cell_limit} \ No newline at end of file diff --git a/unitable/configs/dataset/pubtabnet/valid_dataset.yaml b/unitable/configs/dataset/pubtabnet/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/pubtabnet/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/single_dataset.yaml b/unitable/configs/dataset/single_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf0edafd9d16c816814cafae40aec27298379616 --- /dev/null +++ b/unitable/configs/dataset/single_dataset.yaml @@ -0,0 +1,9 @@ +defaults: + - _self_ + - augmentation: beit + # - pubtabnet@train_dataset: train_dataset + # - pubtabnet@valid_dataset: valid_dataset + # - pubtabnet@test_dataset: test_dataset + +label_type: ${trainer.label_type} +cell_limit: 10 \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_fintabnet/test_dataset.yaml b/unitable/configs/dataset/synthtabnet_fintabnet/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d7c26c1ec5e0d2312a9f767f72127b6c52c8751 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_fintabnet/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_fintabnet/train_dataset.yaml b/unitable/configs/dataset/synthtabnet_fintabnet/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e20e9e2ae424f01daffd9b7b6f69beb4ed34a56f --- 
/dev/null +++ b/unitable/configs/dataset/synthtabnet_fintabnet/train_dataset.yaml @@ -0,0 +1,7 @@ +_target_: src.datamodule.Synthtabnet +root_dir: ../../../../DATASETS/synthtabnet/fintabnet +label_type: ${dataset.label_type} +split: train +json_html: clean_html_synthetic_data.jsonl +transform: ${dataset.augmentation} +cell_limit: ${dataset.cell_limit} \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_fintabnet/valid_dataset.yaml b/unitable/configs/dataset/synthtabnet_fintabnet/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_fintabnet/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_marketing/test_dataset.yaml b/unitable/configs/dataset/synthtabnet_marketing/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d7c26c1ec5e0d2312a9f767f72127b6c52c8751 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_marketing/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_marketing/train_dataset.yaml b/unitable/configs/dataset/synthtabnet_marketing/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96e11ce0790dcdcf50b8704a30d71042abe94b56 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_marketing/train_dataset.yaml @@ -0,0 +1,7 @@ +_target_: src.datamodule.Synthtabnet +root_dir: ../../../../DATASETS/synthtabnet/marketing +label_type: ${dataset.label_type} +split: train +json_html: clean_html_synthetic_data.jsonl +transform: ${dataset.augmentation} +cell_limit: ${dataset.cell_limit} \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_marketing/valid_dataset.yaml 
b/unitable/configs/dataset/synthtabnet_marketing/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_marketing/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_pubtabnet/test_dataset.yaml b/unitable/configs/dataset/synthtabnet_pubtabnet/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d7c26c1ec5e0d2312a9f767f72127b6c52c8751 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_pubtabnet/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_pubtabnet/train_dataset.yaml b/unitable/configs/dataset/synthtabnet_pubtabnet/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..816764b42516b2281b220acc9804a3a77c1e7704 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_pubtabnet/train_dataset.yaml @@ -0,0 +1,7 @@ +_target_: src.datamodule.Synthtabnet +root_dir: ../../../../DATASETS/synthtabnet/pubtabnet +label_type: ${dataset.label_type} +split: train +json_html: clean_html_synthetic_data.jsonl +transform: ${dataset.augmentation} +cell_limit: ${dataset.cell_limit} \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_pubtabnet/valid_dataset.yaml b/unitable/configs/dataset/synthtabnet_pubtabnet/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_pubtabnet/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_sparse/test_dataset.yaml b/unitable/configs/dataset/synthtabnet_sparse/test_dataset.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..0d7c26c1ec5e0d2312a9f767f72127b6c52c8751 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_sparse/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_sparse/train_dataset.yaml b/unitable/configs/dataset/synthtabnet_sparse/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95a5223f912db09a39b9dd440064fbebb6de4569 --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_sparse/train_dataset.yaml @@ -0,0 +1,7 @@ +_target_: src.datamodule.Synthtabnet +root_dir: ../../../../DATASETS/synthtabnet/sparse +label_type: ${dataset.label_type} +split: train +json_html: clean_html_synthetic_data.jsonl +transform: ${dataset.augmentation} +cell_limit: ${dataset.cell_limit} \ No newline at end of file diff --git a/unitable/configs/dataset/synthtabnet_sparse/valid_dataset.yaml b/unitable/configs/dataset/synthtabnet_sparse/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/synthtabnet_sparse/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/dataset/tablebank/test_dataset.yaml b/unitable/configs/dataset/tablebank/test_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5eaa17a3ff1530a8122002f4e7b1735e35e1be2a --- /dev/null +++ b/unitable/configs/dataset/tablebank/test_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - valid_dataset + +split: test \ No newline at end of file diff --git a/unitable/configs/dataset/tablebank/train_dataset.yaml b/unitable/configs/dataset/tablebank/train_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcd7cca9542e26e0774a2512df266cf002422293 --- /dev/null +++ 
b/unitable/configs/dataset/tablebank/train_dataset.yaml @@ -0,0 +1,5 @@ +_target_: src.datamodule.TableBank +root_dir: ../../../../DATASETS/tablebank/Recognition +label_type: ${dataset.label_type} +split: train +transform: ${dataset.augmentation} diff --git a/unitable/configs/dataset/tablebank/valid_dataset.yaml b/unitable/configs/dataset/tablebank/valid_dataset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f649896a9e21eb1eb3b1e0d0edfe92b8d65e94af --- /dev/null +++ b/unitable/configs/dataset/tablebank/valid_dataset.yaml @@ -0,0 +1,4 @@ +defaults: + - train_dataset + +split: val \ No newline at end of file diff --git a/unitable/configs/main.yaml b/unitable/configs/main.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2088f6a949188afdeec43d7cc51289e5379f5ae --- /dev/null +++ b/unitable/configs/main.yaml @@ -0,0 +1,23 @@ +defaults: + - _self_ + - dataset: mini_pubtabnet + - model: encoderdecoder + - trainer: table + - vocab: html + - override hydra/job_logging: colorlog + - override hydra/hydra_logging: colorlog + + +hydra: + run: + dir: ../experiments/${name} + sweep: + dir: ../experiments/${name} + job: + name: ${name} + chdir: true + +wandb: + project: UniTable + +seed: 1234 \ No newline at end of file diff --git a/unitable/configs/model/beit.yaml b/unitable/configs/model/beit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be193c1f6ede8b778abeef28b985a5d9372b2743 --- /dev/null +++ b/unitable/configs/model/beit.yaml @@ -0,0 +1,35 @@ +defaults: + - _self_ + - model/backbone: imglinear + - model/encoder: transformer + +nhead: 12 +ff_ratio: 4 +activation: gelu +norm_first: true +d_model: 768 +dropout: 0.0 +backbone_downsampling_factor: 16 + +codebook_tokens: 8192 +hidden_dim: 256 + +model: + _target_: src.model.beit.BeitEncoder + d_model: ${model.d_model} + codebook_tokens: ${model.codebook_tokens} + dropout: ${model.dropout} + norm_layer: + _partial_: true + _target_: torch.nn.LayerNorm + 
eps: 1e-6 + +model_vqvae: + _target_: src.model.vqvae.DiscreteVAE + image_size: ${trainer.vqvae_size} + codebook_tokens: ${model.codebook_tokens} + codebook_dim: 512 + num_layers: 3 + hidden_dim: ${model.hidden_dim} + smooth_l1_loss: false + kl_div_loss_weight: 0.0 \ No newline at end of file diff --git a/unitable/configs/model/encoderdecoder.yaml b/unitable/configs/model/encoderdecoder.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe49d97979a2858df45d87b37510c2e2bb54cf1e --- /dev/null +++ b/unitable/configs/model/encoderdecoder.yaml @@ -0,0 +1,29 @@ +defaults: + - _self_ + - model/backbone: imgcnn + - model/encoder: transformer + - model/decoder: transformer + + +nhead: 4 +ff_ratio: 2 +activation: relu +norm_first: false +d_model: 512 +dropout: 0.5 +backbone_downsampling_factor: 16 + + +model: + _target_: src.model.EncoderDecoder + vocab_size: -1 + d_model: ${model.d_model} + padding_idx: -1 + max_seq_len: ${trainer.max_seq_len} + dropout: ${model.dropout} + norm_layer: + _partial_: true + _target_: torch.nn.LayerNorm + eps: 1e-6 + + diff --git a/unitable/configs/model/model/backbone/imgcnn.yaml b/unitable/configs/model/model/backbone/imgcnn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b177db3e545a342d6324d3be9048573dac8a09b8 --- /dev/null +++ b/unitable/configs/model/model/backbone/imgcnn.yaml @@ -0,0 +1,9 @@ +_target_: src.model.components.ImgCnnBackbone +backbone: + _target_: torchvision.models.resnet18 +output_channels: 512 +d_model: ${model.d_model} +drop_layer: + - 3 + - 8 + - 9 \ No newline at end of file diff --git a/unitable/configs/model/model/backbone/imgconvstem.yaml b/unitable/configs/model/model/backbone/imgconvstem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..023e1af851817ed9f5ae8ff556693211bfc2a4f6 --- /dev/null +++ b/unitable/configs/model/model/backbone/imgconvstem.yaml @@ -0,0 +1,5 @@ +_target_: src.model.components.ImgConvStemBackbone +d_model: 
${model.d_model} +downsample_factor: ${model.backbone_downsampling_factor} +output_channels: 192 +kernel_size: 3 \ No newline at end of file diff --git a/unitable/configs/model/model/backbone/imglinear.yaml b/unitable/configs/model/model/backbone/imglinear.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dabed69e8fcd5843538666b92188fd2b7e739f41 --- /dev/null +++ b/unitable/configs/model/model/backbone/imglinear.yaml @@ -0,0 +1,3 @@ +_target_: src.model.components.ImgLinearBackbone +d_model: ${model.d_model} +patch_size: ${model.backbone_downsampling_factor} \ No newline at end of file diff --git a/unitable/configs/model/model/decoder/transformer.yaml b/unitable/configs/model/model/decoder/transformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e16c6d562847cd8650cb00c056ece3f1bf2b163f --- /dev/null +++ b/unitable/configs/model/model/decoder/transformer.yaml @@ -0,0 +1,8 @@ +_target_: src.model.components.Decoder +d_model: ${model.d_model} +nhead: ${model.nhead} +dropout: ${model.dropout} +activation: ${model.activation} +norm_first: ${model.norm_first} +nlayer: 4 +ff_ratio: ${model.ff_ratio} \ No newline at end of file diff --git a/unitable/configs/model/model/encoder/transformer.yaml b/unitable/configs/model/model/encoder/transformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cdcde30dd06b34e2d80fdd8c7f45060779e8087 --- /dev/null +++ b/unitable/configs/model/model/encoder/transformer.yaml @@ -0,0 +1,8 @@ +_target_: src.model.components.Encoder +d_model: ${model.d_model} +nhead: ${model.nhead} +dropout: ${model.dropout} +activation: ${model.activation} +norm_first: ${model.norm_first} +nlayer: 2 +ff_ratio: ${model.ff_ratio} \ No newline at end of file diff --git a/unitable/configs/model/vqvae.yaml b/unitable/configs/model/vqvae.yaml new file mode 100644 index 0000000000000000000000000000000000000000..294740cb59e4cbe24b5a5393d3dea6d75dcee3c9 --- /dev/null +++ 
b/unitable/configs/model/vqvae.yaml @@ -0,0 +1,15 @@ +defaults: + - _self_ + +codebook_tokens: 8192 +hidden_dim: 256 + +model: + _target_: src.model.vqvae.DiscreteVAE + image_size: ${trainer.img_size} + codebook_tokens: ${model.codebook_tokens} + codebook_dim: 512 + num_layers: 3 + hidden_dim: ${model.hidden_dim} + smooth_l1_loss: false + kl_div_loss_weight: 0.0 \ No newline at end of file diff --git a/unitable/configs/trainer/beit.yaml b/unitable/configs/trainer/beit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bde1f1751d8127b0bcbe4e2e10e41d6987e4fb6c --- /dev/null +++ b/unitable/configs/trainer/beit.yaml @@ -0,0 +1,40 @@ +defaults: + - _self_ + - train/lr_scheduler: exponential + - train/optimizer: adam + +mode: train +trans_size: 448 +vqvae_size: 224 +grid_size: 28 +num_mask_patches: 300 +min_num_patches: 16 +max_seq_len: null + +vqvae_weights: null + +train: + epochs: 20 + grad_clip: 5 + save_every: 3 + dataloader: + _target_: src.datamodule.dataloader.dataloader_beit + batch_size: 48 + grid_size: ${trainer.grid_size} + num_mask_patches: ${trainer.num_mask_patches} + min_num_patches: ${trainer.min_num_patches} +valid: + dataloader: + _target_: src.datamodule.dataloader.dataloader_beit + batch_size: 48 + grid_size: ${trainer.grid_size} + num_mask_patches: ${trainer.num_mask_patches} + min_num_patches: ${trainer.min_num_patches} +test: + metrics: null + + +trainer: + _target_: src.trainer.BeitTrainer + snapshot: null + model_weights: null \ No newline at end of file diff --git a/unitable/configs/trainer/table.yaml b/unitable/configs/trainer/table.yaml new file mode 100644 index 0000000000000000000000000000000000000000..07b1248b5e9284b1b4b3a9bea92f337e1290a245 --- /dev/null +++ b/unitable/configs/trainer/table.yaml @@ -0,0 +1,56 @@ +defaults: + - _self_ + - train/lr_scheduler: step + - train/optimizer: adam + + +mode: train +img_size: [448,448] +max_seq_len: 512 +label_type: html+cell+bbox + +train: + target: ${trainer.label_type} + 
img_size: ${trainer.img_size} + loss_weights: + table: 0 + html: 0 + cell: 0 + bbox: 0 + grad_clip: 5 + epochs: 24 + save_every: 1 + max_seq_len: ${trainer.max_seq_len} + dataloader: + _target_: src.datamodule.dataloader_html + batch_size: 48 + label_type: ${trainer.label_type} +valid: + target: ${trainer.label_type} + img_size: ${trainer.img_size} + loss_weights: ${trainer.train.loss_weights} + max_seq_len: ${trainer.max_seq_len} + dataloader: + _target_: src.datamodule.dataloader_html + batch_size: 48 + label_type: ${trainer.label_type} +test: + target: ${trainer.train.target} + img_size: ${trainer.img_size} + loss_weights: ${trainer.train.loss_weights} + metrics: teds + max_seq_len: ${trainer.max_seq_len} + sampling: greedy + save_to_prefix: html_table_result + dataloader: + _target_: src.datamodule.dataloader_html + batch_size: 96 + label_type: ${trainer.label_type} + + +trainer: + _target_: src.trainer.TableTrainer + snapshot: null + model_weights: null + beit_pretrained_weights: null + freeze_beit_epoch: null \ No newline at end of file diff --git a/unitable/configs/trainer/train/lr_scheduler/cosine.yaml b/unitable/configs/trainer/train/lr_scheduler/cosine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7314232c59a71c2d0cd295c4f499cf73c6b75d4 --- /dev/null +++ b/unitable/configs/trainer/train/lr_scheduler/cosine.yaml @@ -0,0 +1,7 @@ +_target_: torch.optim.lr_scheduler.LambdaLR +lr_lambda: + _partial_: true + _target_: src.utils.cosine_schedule_with_warmup + warmup: 6 + min_ratio: 5e-3 + total_step: ${trainer.train.epochs} diff --git a/unitable/configs/trainer/train/lr_scheduler/exponential.yaml b/unitable/configs/trainer/train/lr_scheduler/exponential.yaml new file mode 100644 index 0000000000000000000000000000000000000000..632cb7c3b0af5a8bf101b2134daaff2822f7a98b --- /dev/null +++ b/unitable/configs/trainer/train/lr_scheduler/exponential.yaml @@ -0,0 +1,2 @@ +_target_: torch.optim.lr_scheduler.ExponentialLR +gamma: 0.98 \ No 
newline at end of file diff --git a/unitable/configs/trainer/train/lr_scheduler/step.yaml b/unitable/configs/trainer/train/lr_scheduler/step.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68ca3a58735941e1f2198978fded413acb6c4953 --- /dev/null +++ b/unitable/configs/trainer/train/lr_scheduler/step.yaml @@ -0,0 +1,3 @@ +_target_: torch.optim.lr_scheduler.StepLR +step_size: 12 +gamma: 0.1 \ No newline at end of file diff --git a/unitable/configs/trainer/train/optimizer/adam.yaml b/unitable/configs/trainer/train/optimizer/adam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa8a47a08b62ceb8915d67a33ca483aa0b4d71df --- /dev/null +++ b/unitable/configs/trainer/train/optimizer/adam.yaml @@ -0,0 +1,3 @@ +_target_: torch.optim.Adam +lr: 1e-4 +weight_decay: 1e-4 \ No newline at end of file diff --git a/unitable/configs/trainer/train/optimizer/adamw.yaml b/unitable/configs/trainer/train/optimizer/adamw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..660a00e487a5724de07106724517c71b35baa021 --- /dev/null +++ b/unitable/configs/trainer/train/optimizer/adamw.yaml @@ -0,0 +1,4 @@ +_target_: torch.optim.AdamW +lr: 1e-4 +betas: [0.9, 0.999] +weight_decay: 1e-4 \ No newline at end of file diff --git a/unitable/configs/trainer/vqvae.yaml b/unitable/configs/trainer/vqvae.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3954ab3b3d8568cbdf26cd1ac086f0414c3dfe35 --- /dev/null +++ b/unitable/configs/trainer/vqvae.yaml @@ -0,0 +1,33 @@ +defaults: + - _self_ + - train/lr_scheduler: exponential + - train/optimizer: adam + +mode: train +img_size: [256,256] +label_type: image +max_seq_len: null + +train: + epochs: 20 + grad_clip: 0.2 + starting_temp: 1. 
+ temp_min: 0.06 + temp_anneal_rate: 1e-6 + save_every: 3 + dataloader: + _target_: src.datamodule.dataloader.dataloader_vae + batch_size: 48 +valid: + dataloader: + _target_: src.datamodule.dataloader.dataloader_vae + batch_size: 48 +test: + metrics: null + + +trainer: + _target_: src.trainer.VqvaeTrainer + snapshot: null + model_weights: null + diff --git a/unitable/configs/vocab/bbox.yaml b/unitable/configs/vocab/bbox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40990e38e65a2159844f5bb950845dcad9c46d7d --- /dev/null +++ b/unitable/configs/vocab/bbox.yaml @@ -0,0 +1,3 @@ +need_vocab: true +type: html +dir: ${hydra:runtime.cwd}/../vocab/vocab_bbox.json \ No newline at end of file diff --git a/unitable/configs/vocab/cell.yaml b/unitable/configs/vocab/cell.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e3be3b77ef5ed58c9544eb7580a0806ec465e55 --- /dev/null +++ b/unitable/configs/vocab/cell.yaml @@ -0,0 +1,3 @@ +need_vocab: true +type: cell +dir: ${hydra:runtime.cwd}/../vocab/vocab_cell_6k.json \ No newline at end of file diff --git a/unitable/configs/vocab/empty.yaml b/unitable/configs/vocab/empty.yaml new file mode 100644 index 0000000000000000000000000000000000000000..751fba4b96057de156f218474e8a66da09d055c2 --- /dev/null +++ b/unitable/configs/vocab/empty.yaml @@ -0,0 +1 @@ +need_vocab: false \ No newline at end of file diff --git a/unitable/configs/vocab/html.yaml b/unitable/configs/vocab/html.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ed5f5efd2f042f24bd978a10718c5d48121f3a6 --- /dev/null +++ b/unitable/configs/vocab/html.yaml @@ -0,0 +1,3 @@ +need_vocab: true +type: html +dir: ${hydra:runtime.cwd}/../vocab/vocab_html.json \ No newline at end of file diff --git a/unitable/image1.png b/unitable/image1.png new file mode 100644 index 0000000000000000000000000000000000000000..4bdc016ccdbdafc386df9a0ece0cd8079a348461 Binary files /dev/null and b/unitable/image1.png differ diff 
--git a/unitable/image2.png b/unitable/image2.png new file mode 100644 index 0000000000000000000000000000000000000000..ba1c81a0bd5e761ccf24a5defa8b9151641d5db0 Binary files /dev/null and b/unitable/image2.png differ diff --git a/unitable/image3.png b/unitable/image3.png new file mode 100644 index 0000000000000000000000000000000000000000..ba6e158820016a2e8bad89c02b94705f014f6958 Binary files /dev/null and b/unitable/image3.png differ diff --git a/unitable/notebooks/full_pipeline.ipynb b/unitable/notebooks/full_pipeline.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ede798b561c7a889cd469e372f5c71256a0e7fae --- /dev/null +++ b/unitable/notebooks/full_pipeline.ipynb @@ -0,0 +1,2045 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.display import display, HTML\n", + "\n", + "from typing import Tuple, List, Sequence, Optional, Union\n", + "from pathlib import Path\n", + "import re\n", + "import torch\n", + "import tokenizers as tk\n", + "from PIL import Image\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib import patches\n", + "from torchvision import transforms\n", + "from torch import nn, Tensor\n", + "from functools import partial\n", + "from bs4 import BeautifulSoup as bs\n", + "import warnings\n", + "\n", + "from src.model import EncoderDecoder, ImgLinearBackbone, Encoder, Decoder\n", + "from src.utils import subsequent_mask, pred_token_within_range, greedy_sampling, bbox_str_to_token_list, cell_str_to_token_list, html_str_to_token_list, build_table_from_html_and_cell, html_table_template\n", + "from src.trainer.utils import VALID_HTML_TOKEN, VALID_BBOX_TOKEN, INVALID_CELL_TOKEN\n", + "\n", + "warnings.filterwarnings('ignore')\n", + "device = torch.device(\"cuda:0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 
Check all model weights have been downloaded to experiments/unitable_weights\n", + "MODEL_FILE_NAME = [\"unitable_large_structure.pt\", \"unitable_large_bbox.pt\", \"unitable_large_content.pt\"]\n", + "MODEL_DIR = Path(\"../experiments/unitable_weights\")\n", + "\n", + "assert all([(MODEL_DIR / name).is_file() for name in MODEL_FILE_NAME]), f\"Please download model weights from HuggingFace: https://huggingface.co/poloclub/UniTable/tree/main\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Load tabular image\n", + "image_name = \"PMC2838834_005_00.png\"\n", + "image_path = f\"../dataset/mini_pubtabnet/train/{image_name}\"\n", + "image = Image.open(image_path).convert(\"RGB\")\n", + "image_size = image.size\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 10))\n", + "ax.imshow(image)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# UniTable large model\n", + "d_model = 768\n", + "patch_size = 16\n", + "nhead = 12\n", + "dropout = 0.2\n", + "\n", + "backbone = ImgLinearBackbone(d_model=d_model, patch_size=patch_size)\n", + "encoder = Encoder(\n", + " d_model=d_model,\n", + " nhead=nhead,\n", + " dropout = dropout,\n", + " activation=\"gelu\",\n", + " norm_first=True,\n", + " nlayer=12,\n", + " ff_ratio=4,\n", + ")\n", + "decoder = Decoder(\n", + " d_model=d_model,\n", + " nhead=nhead,\n", + " dropout = dropout,\n", + " activation=\"gelu\",\n", + " norm_first=True,\n", + " nlayer=4,\n", + " ff_ratio=4,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def autoregressive_decode(\n", + " model: EncoderDecoder,\n", + " image: Tensor,\n", + " prefix: Sequence[int],\n", + " max_decode_len: int,\n", + " eos_id: int,\n", + " token_whitelist: Optional[Sequence[int]] = None,\n", + " token_blacklist: Optional[Sequence[int]] = None,\n", + ") -> Tensor:\n", + " model.eval()\n", + " with torch.no_grad():\n", + " memory = model.encode(image)\n", + " context = torch.tensor(prefix, dtype=torch.int32).repeat(image.shape[0], 1).to(device)\n", + "\n", + " for _ in range(max_decode_len):\n", + " eos_flag = [eos_id in k for k in context]\n", + " if all(eos_flag):\n", + " break\n", + "\n", + " with torch.no_grad():\n", + " causal_mask = subsequent_mask(context.shape[1]).to(device)\n", + " logits = model.decode(\n", + " memory, context, 
tgt_mask=causal_mask, tgt_padding_mask=None\n", + " )\n", + " logits = model.generator(logits)[:, -1, :]\n", + "\n", + " logits = pred_token_within_range(\n", + " logits.detach(),\n", + " white_list=token_whitelist,\n", + " black_list=token_blacklist,\n", + " )\n", + "\n", + " next_probs, next_tokens = greedy_sampling(logits)\n", + " context = torch.cat([context, next_tokens], dim=1)\n", + " return context\n", + "\n", + "def load_vocab_and_model(\n", + " vocab_path: Union[str, Path],\n", + " max_seq_len: int,\n", + " model_weights: Union[str, Path],\n", + ") -> Tuple[tk.Tokenizer, EncoderDecoder]:\n", + " vocab = tk.Tokenizer.from_file(vocab_path)\n", + " model = EncoderDecoder(\n", + " backbone=backbone,\n", + " encoder=encoder,\n", + " decoder=decoder,\n", + " vocab_size=vocab.get_vocab_size(),\n", + " d_model=d_model,\n", + " padding_idx=vocab.token_to_id(\"\"),\n", + " max_seq_len=max_seq_len,\n", + " dropout=dropout,\n", + " norm_layer=partial(nn.LayerNorm, eps=1e-6)\n", + " )\n", + "\n", + " model.load_state_dict(torch.load(model_weights, map_location=\"cpu\"))\n", + " model = model.to(device)\n", + " return vocab, model\n", + "\n", + "def image_to_tensor(image: Image, size: Tuple[int, int]) -> Tensor:\n", + " T = transforms.Compose([\n", + " transforms.Resize(size),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.86597056,0.88463002,0.87491087], std = [0.20686628,0.18201602,0.18485524])\n", + " ])\n", + " image_tensor = T(image)\n", + " image_tensor = image_tensor.to(device).unsqueeze(0)\n", + "\n", + " return image_tensor\n", + "\n", + "def rescale_bbox(\n", + " bbox: Sequence[Sequence[float]],\n", + " src: Tuple[int, int],\n", + " tgt: Tuple[int, int]\n", + ") -> Sequence[Sequence[float]]:\n", + " assert len(src) == len(tgt) == 2\n", + " ratio = [tgt[0] / src[0], tgt[1] / src[1]] * 2\n", + " bbox = [[int(round(i * j)) for i, j in zip(entry, ratio)] for entry in bbox]\n", + " return bbox" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Table structure extraction\n", + "vocab, model = load_vocab_and_model(\n", + " vocab_path=\"../vocab/vocab_html.json\",\n", + " max_seq_len=784,\n", + " model_weights=MODEL_DIR / MODEL_FILE_NAME[0],\n", + ")\n", + "\n", + "# Image transformation\n", + "image_tensor = image_to_tensor(image, size=(448, 448))\n", + "\n", + "# Inference\n", + "pred_html = autoregressive_decode(\n", + " model=model,\n", + " image=image_tensor,\n", + " prefix=[vocab.token_to_id(\"[html]\")],\n", + " max_decode_len=512,\n", + " eos_id=vocab.token_to_id(\"\"),\n", + " token_whitelist=[vocab.token_to_id(i) for i in VALID_HTML_TOKEN],\n", + " token_blacklist = None\n", + ")\n", + "\n", + "# Convert token id to token text\n", + "pred_html = pred_html.detach().cpu().numpy()[0]\n", + "pred_html = vocab.decode(pred_html, skip_special_tokens=False)\n", + "pred_html = html_str_to_token_list(pred_html)\n", + "\n", + "# print(pred_html)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Table cell bbox detection\n", + "vocab, model = load_vocab_and_model(\n", + " vocab_path=\"../vocab/vocab_bbox.json\",\n", + " max_seq_len=1024,\n", + " model_weights=MODEL_DIR / MODEL_FILE_NAME[1],\n", + ")\n", + "\n", + "# Image transformation\n", + "image_tensor = image_to_tensor(image, size=(448, 448))\n", + "\n", + "# Inference\n", + "pred_bbox = autoregressive_decode(\n", + " model=model,\n", + " image=image_tensor,\n", + " prefix=[vocab.token_to_id(\"[bbox]\")],\n", + " max_decode_len=1024,\n", + " eos_id=vocab.token_to_id(\"\"),\n", + " token_whitelist=[vocab.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]],\n", + " token_blacklist = None\n", + ")\n", + "\n", + "# Convert token id to token text\n", + "pred_bbox = pred_bbox.detach().cpu().numpy()[0]\n", + "pred_bbox = vocab.decode(pred_bbox, skip_special_tokens=False)\n", + "\n", + "# print(pred_bbox)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualize detected bbox\n", + "pred_bbox = bbox_str_to_token_list(pred_bbox)\n", + "pred_bbox = rescale_bbox(pred_bbox, src=(448, 448), tgt=image_size)\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 10))\n", + "for i in pred_bbox:\n", + " rect = patches.Rectangle(i[:2], i[2] - i[0], i[3] - i[1], linewidth=1, edgecolor='r', facecolor='none')\n", + " ax.add_patch(rect)\n", + "ax.set_axis_off()\n", + "ax.imshow(image)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Table cell content recognition\n", + "vocab, model = load_vocab_and_model(\n", + " vocab_path=\"../vocab/vocab_cell_6k.json\",\n", + " max_seq_len=200,\n", + " model_weights=MODEL_DIR / MODEL_FILE_NAME[2],\n", + ")\n", + "\n", + "# Cell image cropping and transformation\n", + "image_tensor = [image_to_tensor(image.crop(bbox), size=(112, 448)) for bbox in pred_bbox]\n", + "image_tensor = torch.cat(image_tensor, dim=0)\n", + "\n", + "# Inference\n", + "pred_cell = autoregressive_decode(\n", + " model=model,\n", + " image=image_tensor,\n", + " prefix=[vocab.token_to_id(\"[cell]\")],\n", + " max_decode_len=200,\n", + " eos_id=vocab.token_to_id(\"\"),\n", + " token_whitelist=None,\n", + " token_blacklist = [vocab.token_to_id(i) for i in INVALID_CELL_TOKEN]\n", + ")\n", + "\n", + "# Convert token id to token text\n", + "pred_cell = pred_cell.detach().cpu().numpy()\n", + "pred_cell = vocab.decode_batch(pred_cell, skip_special_tokens=False)\n", + "pred_cell = [cell_str_to_token_list(i) for i in pred_cell]\n", + "pred_cell = [re.sub(r'(\\d).\\s+(\\d)', r'\\1.\\2', i) for i in pred_cell]\n", + "\n", + "# print(pred_cell)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Main cellular process\n", + " \n", + " Modulated pathways\n", + " \n", + " P value\n", + " \n", + " Genes in pathway\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " Expressed\n", + " \n", + " total\n", + "
\n", + " \n", + " \n", + " + PMN\n", + " \n", + " - PMN\n", + " \n", + " + PMN\n", + " \n", + " PMN\n", + " \n", + "
\n", + " Cell cycle\n", + " \n", + " Role of APC in cell cycle regulation\n", + " \n", + " 1.040E - 09\n", + " \n", + " 8.149E - 08\n", + " \n", + " 15\n", + " \n", + " 12\n", + " \n", + " 32\n", + "
\n", + " \n", + " Chromosome condensation in prometaphase\n", + " \n", + " 4.131E - 06\n", + " \n", + " 8.392E - 11\n", + " \n", + " 9\n", + " \n", + " 12\n", + " \n", + " 20\n", + "
\n", + " \n", + " The metaphase checkpoint\n", + " \n", + " 4.423E - 06\n", + " \n", + " 1.474E - 04\n", + " \n", + " 12\n", + " \n", + " 9\n", + " \n", + " 36\n", + "
\n", + " \n", + " Spindle assembly and chromosome separation\n", + " \n", + " 3.170E - 04\n", + " \n", + " 1.937E - 03\n", + " \n", + " 9\n", + " \n", + " 7\n", + " \n", + " 32\n", + "
\n", + " \n", + " Start of DNA replication in early S phase\n", + " \n", + " 1.284E - 03\n", + " \n", + " 3.115E - 02\n", + " \n", + " 8\n", + " \n", + " 5\n", + " \n", + " 31\n", + "
\n", + " \n", + " Initiation of mitosis\n", + " \n", + " 1.544E - 03\n", + " \n", + " 2.483E - 03\n", + " \n", + " 7\n", + " \n", + " 6\n", + " \n", + " 25\n", + "
\n", + " \n", + " Sister chromatid cohesion\n", + " \n", + " 1.530E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 21\n", + "
\n", + " \n", + " Transition and termination of DNA replication\n", + " \n", + " \n", + " 1.523E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " 26\n", + "
\n", + " \n", + " Role of Nek in cell cycle regulation\n", + " \n", + " \n", + " 2.390E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " 29\n", + "
\n", + " \n", + " Nucleocytoplasmic transport of CDK / Cyclins\n", + " \n", + " \n", + " 4.386E - 02\n", + " \n", + " \n", + " 3\n", + " \n", + " 14\n", + "
\n", + " Immune response\n", + " \n", + " Alternative complement pathway\n", + " \n", + " 4.539E - 07\n", + " \n", + " 2.737E - 02\n", + " \n", + " 12\n", + " \n", + " 5\n", + " \n", + " 30\n", + "
\n", + " \n", + " Fc gamma R - mediated phagocytosis\n", + " \n", + " 1.606E - 03\n", + " \n", + " 9.058E - 03\n", + " \n", + " 8\n", + " \n", + " 6\n", + " \n", + " 32\n", + "
\n", + " \n", + " Antigen presentation by MHC class II\n", + " \n", + " 6.046E - 03\n", + " \n", + " 2.644E - 03\n", + " \n", + " 4\n", + " \n", + " 4\n", + " \n", + " 11\n", + "
\n", + " \n", + " Classic complement pathway\n", + " \n", + " 1.517E - 05\n", + " \n", + " \n", + " 12\n", + " \n", + " \n", + " 40\n", + "
\n", + " \n", + " Antiviral actions of Interferons\n", + " \n", + " 2.431E - 04\n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " 31\n", + "
\n", + " \n", + " CCR3 signalling\n", + " \n", + " 8.728E - 04\n", + " \n", + " \n", + " 12\n", + " \n", + " \n", + " 59\n", + "
\n", + " \n", + " Lectin Induced complement pathway\n", + " \n", + " 1.251E - 03\n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " 38\n", + "
\n", + " \n", + " Lipoxin inhibitory action on Superoxide production\n", + " \n", + " 1.544E - 03\n", + " \n", + " 2.483E - 03\n", + " \n", + " 7\n", + " \n", + " 6\n", + " \n", + " 25\n", + "
\n", + " \n", + " IFN alpha / beta signalling pathway\n", + " \n", + " 6.214E - 03\n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " 24\n", + "
\n", + " \n", + " IL - 10 signalling pathway\n", + " \n", + " 2.245E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 23\n", + "
\n", + " \n", + " Antigen presentation by MHC class I\n", + " \n", + " 3.675E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 26\n", + "
\n", + " \n", + " Transcription regulation of granulocyte development\n", + " \n", + " \n", + " 3.115E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " 31\n", + "
\n", + " Oxidative stress\n", + " \n", + " ROS production\n", + " \n", + " 8.932E - 04\n", + " \n", + " 4.113E - 02\n", + " \n", + " 7\n", + " \n", + " 4\n", + " \n", + " 23\n", + "
\n", + " Apoptosis\n", + " \n", + " Inhibition of ROS induced apoptosis\n", + " \n", + " 3.675E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 26\n", + "
\n", + " G protein signalling\n", + " \n", + " Rac2 regulation pathway\n", + " \n", + " 4.957E - 03\n", + " \n", + " 4.113E - 02\n", + " \n", + " 6\n", + " \n", + " 4\n", + " \n", + " 23\n", + "
\n", + " \n", + " RAC1 in cellular process\n", + " \n", + " 1.361E - 02\n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " 28\n", + "
\n", + " Cytoskeleton remodelling\n", + " \n", + " Regulation of actin cytoskeleton by Rho GTPases\n", + " \n", + " 8.972E - 03\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " 23\n", + "
\n", + " \n", + " Alpha - 1A adrenergic receptor - dependent inhibition of PI3K\n", + " \n", + " 2.887E - 02\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " 12\n", + "
\n", + " Metabolic process\n", + " \n", + " Lipoprotein metabolism I. Chylomicron, VLDL and LDL metabolism\n", + " \n", + " 1.630E - 02\n", + " \n", + " 9.007E - 07\n", + " \n", + " 3\n", + " \n", + " 6\n", + " \n", + " 8\n", + "
\n", + " \n", + " Lipoprotein metabolism II. HDL metabolism\n", + " \n", + " 1.630E - 02\n", + " \n", + " 9.007E - 07\n", + " \n", + " 3\n", + " \n", + " 6\n", + " \n", + " 8\n", + "
\n", + " \n", + " G - alpha ( q ) regulation of lipid metabolism\n", + " \n", + " 2.245E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 23\n", + "
\n", + " \n", + " Urea cycle\n", + " \n", + " 3.675E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 26\n", + "
\n", + " \n", + " LDL metabolism during development of fatty streak lesion\n", + " \n", + " 1.870E - 02\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " 4\n", + "
\n", + " \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Combine the table structure and cell content\n", + "pred_code = build_table_from_html_and_cell(pred_html, pred_cell)\n", + "pred_code = \"\".join(pred_code)\n", + "pred_code = html_table_template(pred_code)\n", + "\n", + "# Display the HTML table\n", + "soup = bs(pred_code)\n", + "table_code = soup.prettify()\n", + "display(HTML(table_code))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Main cellular process\n", + " \n", + " Modulated pathways\n", + " \n", + " P value\n", + " \n", + " Genes in pathway\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " Expressed\n", + " \n", + " total\n", + "
\n", + " \n", + " \n", + " + PMN\n", + " \n", + " - PMN\n", + " \n", + " + PMN\n", + " \n", + " PMN\n", + " \n", + "
\n", + " Cell cycle\n", + " \n", + " Role of APC in cell cycle regulation\n", + " \n", + " 1.040E - 09\n", + " \n", + " 8.149E - 08\n", + " \n", + " 15\n", + " \n", + " 12\n", + " \n", + " 32\n", + "
\n", + " \n", + " Chromosome condensation in prometaphase\n", + " \n", + " 4.131E - 06\n", + " \n", + " 8.392E - 11\n", + " \n", + " 9\n", + " \n", + " 12\n", + " \n", + " 20\n", + "
\n", + " \n", + " The metaphase checkpoint\n", + " \n", + " 4.423E - 06\n", + " \n", + " 1.474E - 04\n", + " \n", + " 12\n", + " \n", + " 9\n", + " \n", + " 36\n", + "
\n", + " \n", + " Spindle assembly and chromosome separation\n", + " \n", + " 3.170E - 04\n", + " \n", + " 1.937E - 03\n", + " \n", + " 9\n", + " \n", + " 7\n", + " \n", + " 32\n", + "
\n", + " \n", + " Start of DNA replication in early S phase\n", + " \n", + " 1.284E - 03\n", + " \n", + " 3.115E - 02\n", + " \n", + " 8\n", + " \n", + " 5\n", + " \n", + " 31\n", + "
\n", + " \n", + " Initiation of mitosis\n", + " \n", + " 1.544E - 03\n", + " \n", + " 2.483E - 03\n", + " \n", + " 7\n", + " \n", + " 6\n", + " \n", + " 25\n", + "
\n", + " \n", + " Sister chromatid cohesion\n", + " \n", + " 1.530E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 21\n", + "
\n", + " \n", + " Transition and termination of DNA replication\n", + " \n", + " \n", + " 1.523E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " 26\n", + "
\n", + " \n", + " Role of Nek in cell cycle regulation\n", + " \n", + " \n", + " 2.390E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " 29\n", + "
\n", + " \n", + " Nucleocytoplasmic transport of CDK / Cyclins\n", + " \n", + " \n", + " 4.386E - 02\n", + " \n", + " \n", + " 3\n", + " \n", + " 14\n", + "
\n", + " Immune response\n", + " \n", + " Alternative complement pathway\n", + " \n", + " 4.539E - 07\n", + " \n", + " 2.737E - 02\n", + " \n", + " 12\n", + " \n", + " 5\n", + " \n", + " 30\n", + "
\n", + " \n", + " Fc gamma R - mediated phagocytosis\n", + " \n", + " 1.606E - 03\n", + " \n", + " 9.058E - 03\n", + " \n", + " 8\n", + " \n", + " 6\n", + " \n", + " 32\n", + "
\n", + " \n", + " Antigen presentation by MHC class II\n", + " \n", + " 6.046E - 03\n", + " \n", + " 2.644E - 03\n", + " \n", + " 4\n", + " \n", + " 4\n", + " \n", + " 11\n", + "
\n", + " \n", + " Classic complement pathway\n", + " \n", + " 1.517E - 05\n", + " \n", + " \n", + " 12\n", + " \n", + " \n", + " 40\n", + "
\n", + " \n", + " Antiviral actions of Interferons\n", + " \n", + " 2.431E - 04\n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " 31\n", + "
\n", + " \n", + " CCR3 signalling\n", + " \n", + " 8.728E - 04\n", + " \n", + " \n", + " 12\n", + " \n", + " \n", + " 59\n", + "
\n", + " \n", + " Lectin Induced complement pathway\n", + " \n", + " 1.251E - 03\n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " 38\n", + "
\n", + " \n", + " Lipoxin inhibitory action on Superoxide production\n", + " \n", + " 1.544E - 03\n", + " \n", + " 2.483E - 03\n", + " \n", + " 7\n", + " \n", + " 6\n", + " \n", + " 25\n", + "
\n", + " \n", + " IFN alpha / beta signalling pathway\n", + " \n", + " 6.214E - 03\n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " 24\n", + "
\n", + " \n", + " IL - 10 signalling pathway\n", + " \n", + " 2.245E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 23\n", + "
\n", + " \n", + " Antigen presentation by MHC class I\n", + " \n", + " 3.675E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 26\n", + "
\n", + " \n", + " Transcription regulation of granulocyte development\n", + " \n", + " \n", + " 3.115E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " 31\n", + "
\n", + " Oxidative stress\n", + " \n", + " ROS production\n", + " \n", + " 8.932E - 04\n", + " \n", + " 4.113E - 02\n", + " \n", + " 7\n", + " \n", + " 4\n", + " \n", + " 23\n", + "
\n", + " Apoptosis\n", + " \n", + " Inhibition of ROS induced apoptosis\n", + " \n", + " 3.675E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 26\n", + "
\n", + " G protein signalling\n", + " \n", + " Rac2 regulation pathway\n", + " \n", + " 4.957E - 03\n", + " \n", + " 4.113E - 02\n", + " \n", + " 6\n", + " \n", + " 4\n", + " \n", + " 23\n", + "
\n", + " \n", + " RAC1 in cellular process\n", + " \n", + " 1.361E - 02\n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " 28\n", + "
\n", + " Cytoskeleton remodelling\n", + " \n", + " Regulation of actin cytoskeleton by Rho GTPases\n", + " \n", + " 8.972E - 03\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " 23\n", + "
\n", + " \n", + " Alpha - 1A adrenergic receptor - dependent inhibition of PI3K\n", + " \n", + " 2.887E - 02\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " 12\n", + "
\n", + " Metabolic process\n", + " \n", + " Lipoprotein metabolism I. Chylomicron, VLDL and LDL metabolism\n", + " \n", + " 1.630E - 02\n", + " \n", + " 9.007E - 07\n", + " \n", + " 3\n", + " \n", + " 6\n", + " \n", + " 8\n", + "
\n", + " \n", + " Lipoprotein metabolism II. HDL metabolism\n", + " \n", + " 1.630E - 02\n", + " \n", + " 9.007E - 07\n", + " \n", + " 3\n", + " \n", + " 6\n", + " \n", + " 8\n", + "
\n", + " \n", + " G - alpha ( q ) regulation of lipid metabolism\n", + " \n", + " 2.245E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 23\n", + "
\n", + " \n", + " Urea cycle\n", + " \n", + " 3.675E - 02\n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " 26\n", + "
\n", + " \n", + " LDL metabolism during development of fatty streak lesion\n", + " \n", + " 1.870E - 02\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " 4\n", + "
\n", + " \n", + "\n", + "\n" + ] + } + ], + "source": [ + "# Raw HTML table code\n", + "print(table_code)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation - Proceed only if you have the groundtruth annotation." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from src.utils.teds import TEDS\n", + "import jsonlines\n", + "from src.vocab.constant import CELL_SPECIAL" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Load groundtruth annotation\n", + "annotation_path = \"../dataset/mini_pubtabnet/mini_pubtabnet_examples.jsonl\"\n", + "with jsonlines.open(annotation_path) as f:\n", + " for obj in f:\n", + " if obj[\"filename\"] == image_name:\n", + " anno_html_raw = obj[\"html\"][\"structure\"][\"tokens\"]\n", + " anno_cell_raw = [\"\".join(cell[\"tokens\"]) for cell in obj[\"html\"][\"cells\"] if cell[\"tokens\"]]\n", + " break\n", + "\n", + "anno_html = []\n", + "idx = 0\n", + "while idx < len(anno_html_raw):\n", + " if \"[\" in anno_html_raw[idx]:\n", + " assert idx + 1 < len(anno_html_raw)\n", + " assert anno_html_raw[idx + 1] == \"]\"\n", + " anno_html.append(anno_html_raw[idx] + \"]\")\n", + " idx = idx + 2\n", + " else:\n", + " anno_html.append(anno_html_raw[idx])\n", + " idx = idx + 1\n", + "\n", + "anno_cell = []\n", + "for txt in anno_cell_raw:\n", + " for black in CELL_SPECIAL:\n", + " txt = txt.replace(black, \"\")\n", + " anno_cell.append(txt)\n", + "\n", + "anno_code = \"\".join(build_table_from_html_and_cell(anno_html, anno_cell))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Evaluate table structure only (S-TEDS)\n", + "teds = TEDS(structure_only=True)\n", + "teds.evaluate(pred_code, 
html_table_template(anno_code))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9523398767490218" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Evaluate both table structure and cell content (TEDS)\n", + "teds = TEDS(structure_only=False)\n", + "teds.evaluate(pred_code, html_table_template(anno_code))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "adp", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/unitable/requirements.txt b/unitable/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3c394d201ea73feab07804c53068cb53072292b --- /dev/null +++ b/unitable/requirements.txt @@ -0,0 +1,20 @@ +torch +torchvision +torchaudio +torchtext +jsonlines +beautifulsoup4 +matplotlib +hydra-core +hydra_colorlog +apted +Distance +lxml==4.9.3 +torchmetrics +wandb +einops +ptflops +tokenizers +pycocotools +torchmetrics +faster-coco-eval \ No newline at end of file diff --git a/unitable/setup.py b/unitable/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..ee6861148c8a639508709f5a158f5d32471b91ab --- /dev/null +++ b/unitable/setup.py @@ -0,0 +1,4 @@ +from setuptools import find_packages +from setuptools import setup + +setup(name="unitable", version="1.0.0", packages=find_packages()) \ No newline at end of file diff --git a/unitable/src/datamodule/__init__.py b/unitable/src/datamodule/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3c6456bbfa0e3f9973469d1135aab34010a7a993 --- /dev/null +++ 
b/unitable/src/datamodule/__init__.py @@ -0,0 +1,6 @@ +from .pubtabnet import PubTabNet +from .synthtabnet import Synthtabnet +from .dataloader import dataloader_vae, dataloader_beit, dataloader_html +from .pubtables1m import PubTables +from .tablebank import TableBank +from .fintabnet import FinTabNet diff --git a/unitable/src/datamodule/augmentation.py b/unitable/src/datamodule/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..0258c64d94b7687e01d8097c20c0122f7ede5788 --- /dev/null +++ b/unitable/src/datamodule/augmentation.py @@ -0,0 +1,108 @@ +from typing import Tuple, Any, Optional, Union +from torch import Tensor +import random +from PIL import Image +import torchvision.transforms.functional as F +from torchvision import datasets, transforms + +from torchvision.transforms.transforms import _setup_size + + +_PIL_INTERPOLATION = { + "bilinear": Image.BILINEAR, + "bicubic": Image.BICUBIC, + "lanczos": Image.LANCZOS, + "hamming": Image.HAMMING, +} + +get_interpolation = lambda method: _PIL_INTERPOLATION.get(method, Image.BILINEAR) + + +class RandomResizedCropAndInterpolationWithTwoPic(transforms.RandomResizedCrop): + """Ensure both crops of vqvae and visual encoder have the same scale and size.""" + + def __init__( + self, + size: Union[int, Tuple[int, int]], # transformer + second_size: Union[int, Tuple[int, int]], # vqvae + scale: Tuple[float, float] = (0.08, 1.0), + ratio: Tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0), + interpolation: str = "bilinear", + second_interpolation: str = "lanczos", + ): + self.second_size = _setup_size( + second_size, + error_msg="Please provide only two dimensions (h, w) for second size.", + ) + + if interpolation == "random": + interpolation = random.choice( + [get_interpolation("bilinear"), get_interpolation("bicubic")] + ) + else: + interpolation = get_interpolation(interpolation) + self.second_interpolation = get_interpolation(second_interpolation) + + super().__init__( + size=size, 
scale=scale, ratio=ratio, interpolation=interpolation + ) + + def forward(self, img: Image): + i, j, h, w = self.get_params(img, self.scale, self.ratio) + out = F.resized_crop(img, i, j, h, w, self.size, self.interpolation) + out_second = F.resized_crop( + img, i, j, h, w, self.second_size, self.second_interpolation + ) + + return out, out_second + + +class AugmentationForMIM(object): + def __init__( + self, + mean: float, + std: float, + trans_size: Union[int, Tuple[int, int]], + vqvae_size: Union[int, Tuple[int, int]], + trans_interpolation: str, + vqvae_interpolation: str, + ) -> None: + self.common_transform = transforms.Compose( + [ + transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), + transforms.RandomHorizontalFlip(p=0.5), + RandomResizedCropAndInterpolationWithTwoPic( + size=trans_size, + second_size=vqvae_size, + interpolation=trans_interpolation, + second_interpolation=vqvae_interpolation, + ), + ] + ) + + self.trans_transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)] + ) + + self.vqvae_transform = transforms.ToTensor() + + def __call__(self, img: Image) -> Tuple[Tensor, Tensor]: + trans_img, vqvae_img = self.common_transform(img) + trans_img = self.trans_transform(trans_img) + vqvae_img = self.vqvae_transform(vqvae_img) + + return trans_img, vqvae_img + + +if __name__ == "__main__": + mean = [240.380, 240.390, 240.486] + std = [45.735, 45.785, 45.756] + + T = RandomResizedCropAndInterpolationWithTwoPic( + size=(256, 256), + second_size=(256, 256), + interpolation="bicubic", + second_interpolation="lanczos", + ) + + print(T) diff --git a/unitable/src/datamodule/dataloader.py b/unitable/src/datamodule/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..25c44c10334e1400cd25b34ffe7e121edf656f31 --- /dev/null +++ b/unitable/src/datamodule/dataloader.py @@ -0,0 +1,133 @@ +from typing import Any +from torch.utils.data import DataLoader, Dataset, Sampler +from 
functools import partial +import tokenizers as tk +import torch +from torch.utils.data import default_collate +from ..utils.mask_generator import MaskGenerator +from ..utils import ( + prepare_html_seq, + prepare_cell_seq, + prepare_bbox_seq, +) + + +class Collator: + def __init__( + self, + vocab: tk.Tokenizer, + max_seq_len: int, + label_type: str, + ) -> None: + self.vocab = vocab + self.vocab.enable_truncation(max_seq_len) + self.label_type = label_type + + def __call__(self, batch) -> Any: + return self._collate_batch(batch, self.vocab, self.label_type) + + def _collate_batch( + self, + batch: list[dict], + vocab: tk.Tokenizer, + label_type: str, + ): + if "cell" in label_type: + image_list = [j for i in batch for j in i[0]] + else: + image_list = [i["image"] for i in batch] + image_list = default_collate(image_list) + + if "cell" in label_type: + filename = [(j["filename"], j["bbox_id"]) for i in batch for j in i[1]] + else: + filename = [i["filename"] for i in batch] + label = dict(filename=filename) + + if "html" in label_type: + html_list = ["".join(prepare_html_seq(i["html"])) for i in batch] + label["html"] = vocab.encode_batch(html_list) + + if "cell" in label_type: + cell_list = [ + " ".join(prepare_cell_seq(j["cell"])) for i in batch for j in i[1] + ] + label["cell"] = vocab.encode_batch(cell_list) + + if "bbox" in label_type: + bbox_list = [" ".join(prepare_bbox_seq(i["bbox"])) for i in batch] + label["bbox"] = vocab.encode_batch(bbox_list) + + return image_list, label + + +def generate_mask_for_batch_samples( + batch, grid_size: int, num_mask_patches: int, min_num_patches: int +): + N = len(batch) + mg = MaskGenerator( + input_size=grid_size, + num_mask_patches=num_mask_patches, + min_num_patches=min_num_patches, + ) + mask_list = [mg() for _ in range(N)] + return default_collate(batch), default_collate(mask_list) + + +def dataloader_vae( + dataset: Dataset, batch_size: int, sampler: Sampler = None, **kwargs +) -> DataLoader: + dataloader = 
DataLoader( + dataset, batch_size, sampler=sampler, num_workers=8, pin_memory=True + ) + + return dataloader + + +def dataloader_beit( + dataset: Dataset, + grid_size: int, + num_mask_patches: int, + min_num_patches: int, + batch_size: int, + sampler: Sampler = None, + **kwargs +): + dataloader = DataLoader( + dataset, + batch_size, + sampler=sampler, + collate_fn=partial( + generate_mask_for_batch_samples, + grid_size=grid_size, + num_mask_patches=num_mask_patches, + min_num_patches=min_num_patches, + ), + num_workers=8, + pin_memory=True, + ) + + return dataloader + + +def dataloader_html( + dataset: Dataset, + batch_size: int, + vocab: tk.Tokenizer, + max_seq_len: int, + label_type: str, + sampler=None, +) -> DataLoader: + collate_fn = Collator(vocab, max_seq_len, label_type) + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + num_workers=8, + collate_fn=collate_fn, + pin_memory=True, + sampler=sampler, + ) + + return dataloader diff --git a/unitable/src/datamodule/fintabnet.py b/unitable/src/datamodule/fintabnet.py new file mode 100644 index 0000000000000000000000000000000000000000..e6c65837da02d3bb9eb06862483c118a2a24ac72 --- /dev/null +++ b/unitable/src/datamodule/fintabnet.py @@ -0,0 +1,50 @@ +from typing import Any, Literal, Union +from pathlib import Path +import jsonlines +from PIL import Image +from torch import Tensor +from torch.utils.data import Dataset +import torchvision.transforms as transforms + + +class FinTabNet(Dataset): + """Load PubTabNet for different training purposes.""" + + def __init__( + self, + root_dir: Union[Path, str], + label_type: Literal["image", "html", "cell", "bbox"], + transform: transforms = None, + jsonl_filename: Union[Path, str] = None, + ) -> None: + super().__init__() + + self.root_dir = Path(root_dir) + self.label_type = label_type + self.transform = transform + + if label_type != "image": + jsonl_file = self.root_dir / jsonl_filename + with jsonlines.open(jsonl_file) as f: + 
self.image_label_pair = list(f) + + def __len__(self): + return len(self.image_label_pair) + + def __getitem__(self, index: int) -> Any: + if self.label_type == "image": + raise ValueError("FinTabNet is not used in pretraining.") + else: + obj = self.image_label_pair[index] + img_name = f"{obj['table_id']}.png" + img = Image.open(self.root_dir / "image" / img_name) + if self.transform: + img = self.transform(img) + + sample = dict(filename=obj["filename"], image=img) + + if self.label_type == "html": + sample["html"] = obj["html"]["structure"]["tokens"] + return sample + else: + raise ValueError("Task not supported in current dataset.") diff --git a/unitable/src/datamodule/pubtables1m.py b/unitable/src/datamodule/pubtables1m.py new file mode 100644 index 0000000000000000000000000000000000000000..633f4c379304f7b3b7973c7104c467f94297eb1b --- /dev/null +++ b/unitable/src/datamodule/pubtables1m.py @@ -0,0 +1,105 @@ +from typing import Any, Literal, Union +from pathlib import Path +import jsonlines +from PIL import Image +from torch import Tensor +from torch.utils.data import Dataset +import torchvision.transforms as transforms +import numpy as np +import os +import json + +from src.utils import bbox_augmentation_resize + + +class PubTables(Dataset): + """PubTables-1M-Structure""" + + def __init__( + self, + root_dir: Union[Path, str], + label_type: Literal["image", "cell", "bbox"], + split: Literal["train", "val", "test"], + transform: transforms = None, + cell_limit: int = 100, + ) -> None: + super().__init__() + + self.root_dir = Path(root_dir) + self.split = split + self.label_type = label_type + self.transform = transform + self.cell_limit = cell_limit + + tmp = os.listdir(self.root_dir / self.split) + + self.image_list = [i.split(".xml")[0] for i in tmp] + + def __len__(self): + return len(self.image_list) + + def __getitem__(self, index: int) -> Any: + name = self.image_list[index] + img = Image.open(os.path.join(self.root_dir, "images", name + ".jpg")) + + if 
self.label_type == "image": + if self.transform: + img = self.transform(img) + return img + elif "bbox" in self.label_type: + img_size = img.size + if self.transform: + img = self.transform(img) + tgt_size = img.shape[-1] + with open( + os.path.join(self.root_dir, "words", name + "_words.json"), "r" + ) as f: + obj = json.load(f) + + obj[:] = [ + v + for i in obj + if "bbox" in i.keys() + and all([i["bbox"][w + 2] > i["bbox"][w] for w in range(2)]) + for v in bbox_augmentation_resize( + [ + min(max(i["bbox"][0], 0), img_size[0]), + min(max(i["bbox"][1], 0), img_size[1]), + min(max(i["bbox"][2], 0), img_size[0]), + min(max(i["bbox"][3], 0), img_size[1]), + ], + img_size, + tgt_size, + ) + ] + + sample = {"filename": name, "image": img, "bbox": obj} + return sample + + elif "cell" in self.label_type: + img_size = img.size + with open( + os.path.join(self.root_dir, "words", name + "_words.json"), "r" + ) as f: + obj = json.load(f) + + bboxes_texts = [ + (i["bbox"], i["text"]) + for idx, i in enumerate(obj) + if "bbox" in i + and i["bbox"][0] < i["bbox"][2] + and i["bbox"][1] < i["bbox"][3] + and i["bbox"][0] >= 0 + and i["bbox"][1] >= 0 + and i["bbox"][2] < img_size[0] + and i["bbox"][3] < img_size[1] + and idx < self.cell_limit + ] + + img_bboxes = [self.transform(img.crop(bbox[0])) for bbox in bboxes_texts] + + text_bboxes = [ + {"filename": name, "bbox_id": i, "cell": j[1]} + for i, j in enumerate(bboxes_texts) + ] + return img_bboxes, text_bboxes diff --git a/unitable/src/datamodule/pubtabnet.py b/unitable/src/datamodule/pubtabnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3466955b191e04c372cf559a293b9b41fb73f941 --- /dev/null +++ b/unitable/src/datamodule/pubtabnet.py @@ -0,0 +1,104 @@ +from typing import Any, Literal, Union +from pathlib import Path +from PIL import Image +from torch import Tensor +from torch.utils.data import Dataset +import torchvision.transforms as transforms +import numpy as np +import os + +from src.utils import 
load_json_annotations, bbox_augmentation_resize + + +# average html annotation length: train: 181.327 149.753 +# samples train: 500777, val: 9115 +class PubTabNet(Dataset): + """Load PubTabNet for different training purposes.""" + + def __init__( + self, + root_dir: Union[Path, str], + label_type: Literal["image", "html", "cell", "bbox"], + split: Literal["train", "val"], + transform: transforms = None, + json_html: Union[Path, str] = None, + cell_limit: int = 150, + ) -> None: + super().__init__() + + self.root_dir = Path(root_dir) + self.split = split + self.label_type = label_type + self.transform = transform + self.cell_limit = cell_limit + + self.img_list = os.listdir(self.root_dir / self.split) + + if label_type != "image": + self.image_label_pair = load_json_annotations( + json_file_dir=Path(root_dir) / json_html, split=self.split + ) + + def __len__(self): + return len(self.img_list) + + def __getitem__(self, index: int) -> Any: + if self.label_type == "image": + img = Image.open(self.root_dir / self.split / self.img_list[index]) + if self.transform: + sample = self.transform(img) + return sample + else: + obj = self.image_label_pair[index] + img = Image.open(self.root_dir / self.split / obj[0]) + + if self.label_type == "html": + if self.transform: + img = self.transform(img) + sample = dict( + filename=obj[0], image=img, html=obj[1]["structure"]["tokens"] + ) + return sample + elif self.label_type == "cell": + bboxes_texts = [ + (i["bbox"], "".join(i["tokens"])) + for idx, i in enumerate(obj[1]["cells"]) + if "bbox" in i + and i["bbox"][0] < i["bbox"][2] + and i["bbox"][1] < i["bbox"][3] + and idx < self.cell_limit + ] + + img_bboxes = [ + self.transform(img.crop(bbox[0])) for bbox in bboxes_texts + ] + + text_bboxes = [ + {"filename": obj[0], "bbox_id": i, "cell": j[1]} + for i, j in enumerate(bboxes_texts) + ] + return img_bboxes, text_bboxes + else: + img_size = img.size + if self.transform: + img = self.transform(img) + tgt_size = img.shape[-1] + 
sample = dict(filename=obj[0], image=img) + + bboxes = [ + entry["bbox"] + for entry in obj[1]["cells"] + if "bbox" in entry + and entry["bbox"][0] < entry["bbox"][2] + and entry["bbox"][1] < entry["bbox"][3] + ] + + bboxes[:] = [ + i + for entry in bboxes + for i in bbox_augmentation_resize(entry, img_size, tgt_size) + ] + + sample["bbox"] = bboxes + + return sample diff --git a/unitable/src/datamodule/synthtabnet.py b/unitable/src/datamodule/synthtabnet.py new file mode 100644 index 0000000000000000000000000000000000000000..997b97134c1c68ea93e9b2ef5b538c3a0e8ffd85 --- /dev/null +++ b/unitable/src/datamodule/synthtabnet.py @@ -0,0 +1,115 @@ +from typing import Any, Literal, Union +from pathlib import Path +import jsonlines +from PIL import Image +from torch import Tensor +from torch.utils.data import Dataset +import torchvision.transforms as transforms +import numpy as np +import os + +from src.utils import load_json_annotations, bbox_augmentation_resize + +# invalid data pairs: image_000000_1634629424.098128.png has 4 channels +INVALID_DATA = [ + { + "dataset": "fintabnet", + "split": "train", + "image": "image_009379_1634631303.201671.png", + }, + { + "dataset": "marketing", + "split": "train", + "image": "image_000000_1634629424.098128.png", + }, +] + + +class Synthtabnet(Dataset): + def __init__( + self, + root_dir: Union[Path, str], + label_type: Literal["image", "html", "all"], + split: Literal["train", "val", "test"], + transform: transforms = None, + json_html: Union[Path, str] = None, + cell_limit: int = 100, + ) -> None: + super().__init__() + + self.root_dir = Path(root_dir) / "images" + self.split = split + self.label_type = label_type + self.transform = transform + self.cell_limit = cell_limit + + # SSP only needs image + self.img_list = os.listdir(self.root_dir / self.split) + if label_type != "image": + self.image_label_pair = load_json_annotations( + json_file_dir=Path(root_dir) / json_html, split=split + ) + + def __len__(self): + return 
len(self.img_list) + + def __getitem__(self, index: int) -> Any: + if self.label_type == "image": + img = Image.open(self.root_dir / self.split / self.img_list[index]) + if self.transform: + sample = self.transform(img) + return sample + else: + obj = self.image_label_pair[index] + img = Image.open(self.root_dir / self.split / obj[0]) + + if self.label_type == "html": + if self.transform: + img = self.transform(img) + sample = dict( + filename=obj[0], image=img, html=obj[1]["structure"]["tokens"] + ) + return sample + elif self.label_type == "cell": + bboxes_texts = [ + (i["bbox"], "".join(i["tokens"])) + for idx, i in enumerate(obj[1]["cells"]) + if "bbox" in i + and i["bbox"][0] < i["bbox"][2] + and i["bbox"][1] < i["bbox"][3] + and idx < self.cell_limit + ] + + img_bboxes = [ + self.transform(img.crop(bbox[0])) for bbox in bboxes_texts + ] # you can limit the total cropped cells to lower gpu memory + + text_bboxes = [ + {"filename": obj[0], "bbox_id": i, "cell": j[1]} + for i, j in enumerate(bboxes_texts) + ] + return img_bboxes, text_bboxes + else: + img_size = img.size + if self.transform: + img = self.transform(img) + tgt_size = img.shape[-1] + sample = dict(filename=obj[0], image=img) + + bboxes = [ + entry["bbox"] + for entry in obj[1]["cells"] + if "bbox" in entry + and entry["bbox"][0] < entry["bbox"][2] + and entry["bbox"][1] < entry["bbox"][3] + ] + + bboxes[:] = [ + i + for entry in bboxes + for i in bbox_augmentation_resize(entry, img_size, tgt_size) + ] + + sample["bbox"] = bboxes + + return sample diff --git a/unitable/src/datamodule/tablebank.py b/unitable/src/datamodule/tablebank.py new file mode 100644 index 0000000000000000000000000000000000000000..99942c71fa579e81ccaf8497ac62d89c3bf923cf --- /dev/null +++ b/unitable/src/datamodule/tablebank.py @@ -0,0 +1,47 @@ +from typing import Any, Literal, Union +from pathlib import Path +import jsonlines +from PIL import Image +from torch import Tensor +from torch.utils.data import Dataset +import 
torchvision.transforms as transforms +import numpy as np +import os +import json + + +class TableBank(Dataset): + """tablebank recognition""" + + def __init__( + self, + root_dir: Union[Path, str], + label_type: Literal["image"], + split: Literal["train", "val", "test"], + transform: transforms = None, + ) -> None: + super().__init__() + + assert label_type == "image", "No annotations" + + self.root_dir = Path(root_dir) + self.label_type = label_type + self.transform = transform + self.image_list = os.listdir(self.root_dir / "images") + + if split == "val" or split == "test": + self.image_list = self.image_list[:1000] + + def __len__(self): + return len(self.image_list) + + def __getitem__(self, index: int) -> Any: + name = self.image_list[index] + img = Image.open(os.path.join(self.root_dir, "images", name)) + if self.transform: + img = self.transform(img) + + if self.label_type == "image": + return img + else: + raise ValueError("TableBank doesn't have HTML annotations.") diff --git a/unitable/src/main.py b/unitable/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f50d17d94507dff3529734f3b52c0e39a5b62dfa --- /dev/null +++ b/unitable/src/main.py @@ -0,0 +1,170 @@ +from typing import Any +import hydra +import logging +import os +import wandb +import torch +import tokenizers as tk +from omegaconf import DictConfig, OmegaConf +from hydra.utils import get_original_cwd, instantiate +from pathlib import Path +import torch.multiprocessing as mp +from torch.utils.data.distributed import DistributedSampler +from torch.distributed import init_process_group, destroy_process_group + +from src.utils import printer, count_total_parameters + +log = logging.getLogger(__name__) + + +@hydra.main(config_path="../configs", config_name="main", version_base="1.3") +def main(cfg: DictConfig): + torch.manual_seed(cfg.seed) + ddp_setup() + device = int(os.environ["LOCAL_RANK"]) + cwd = Path(get_original_cwd()) + exp_dir = Path(os.getcwd()) # experiment 
directory + + if cfg.trainer.mode == "train": + (exp_dir / "snapshot").mkdir(parents=True, exist_ok=True) + (exp_dir / "model").mkdir(parents=True, exist_ok=True) + if device == 0: + wandb.init(project=cfg.wandb.project, name=cfg.name, resume=True) + + # vocab is used in finetuning, not in self-supervised pretraining + vocab = None + if cfg.vocab.need_vocab: + log.info( + printer( + device, + f"Loading {cfg.vocab.type} vocab from {(cwd / cfg.vocab.dir).resolve()}", + ) + ) + vocab = tk.Tokenizer.from_file(str(cwd / cfg.vocab.dir)) + + # dataset + if cfg.trainer.mode == "train": + log.info(printer(device, "Loading training dataset")) + train_dataset = instantiate(cfg.dataset.train_dataset) + + log.info(printer(device, "Loading validation dataset")) + valid_dataset = instantiate(cfg.dataset.valid_dataset) + + train_kwargs = { + "dataset": train_dataset, + "sampler": DistributedSampler(train_dataset), + "vocab": vocab, + "max_seq_len": cfg.trainer.max_seq_len, + } + + valid_kwargs = { + "dataset": valid_dataset, + "sampler": DistributedSampler(valid_dataset), + "vocab": vocab, + "max_seq_len": cfg.trainer.max_seq_len, + } + + train_dataloader = instantiate(cfg.trainer.train.dataloader, **train_kwargs) + valid_dataloader = instantiate(cfg.trainer.valid.dataloader, **valid_kwargs) + elif cfg.trainer.mode == "test": + # load testing dataset, same as valid for ssl + log.info(printer(device, "Loading testing dataset")) + test_dataset = instantiate(cfg.dataset.test_dataset) + + test_kwargs = { + "dataset": test_dataset, + "sampler": DistributedSampler(test_dataset), + "vocab": vocab, + "max_seq_len": cfg.trainer.max_seq_len, + } + + test_dataloader = instantiate(cfg.trainer.test.dataloader, **test_kwargs) + + # model + log.info(printer(device, "Loading model ...")) + model_name = str(cfg.model.model._target_).split(".")[-1] + if model_name == "DiscreteVAE": + model = instantiate(cfg.model.model) + elif model_name == "BeitEncoder": + max_seq_len = ( + 
cfg.trainer.trans_size[0] // cfg.model.backbone_downsampling_factor + ) * (cfg.trainer.trans_size[1] // cfg.model.backbone_downsampling_factor) + model = instantiate( + cfg.model.model, + max_seq_len=max_seq_len, + ) + # load pretrained vqvae + model_vqvae = instantiate(cfg.model.model_vqvae) + + log.info(printer(device, "Loading pretrained VQVAE model ...")) + assert Path( + cfg.trainer.vqvae_weights + ).is_file(), f"VQVAE weights doesn't exist: {cfg.trainer.vqvae_weights}" + model_vqvae.load_state_dict( + torch.load(cfg.trainer.vqvae_weights, map_location="cpu") + ) + elif model_name == "EncoderDecoder": + max_seq_len = max( + (cfg.trainer.img_size[0] // cfg.model.backbone_downsampling_factor) + * (cfg.trainer.img_size[1] // cfg.model.backbone_downsampling_factor), + cfg.trainer.max_seq_len, + ) # for positional embedding + model = instantiate( + cfg.model.model, + max_seq_len=max_seq_len, + vocab_size=vocab.get_vocab_size(), + padding_idx=vocab.token_to_id(""), + ) + + log.info( + printer(device, f"Total parameters: {count_total_parameters(model) / 1e6:.2f}M") + ) + + # trainer + log.info(printer(device, "Loading trainer ...")) + trainer_name = str(cfg.trainer.trainer._target_).split(".")[-1] + trainer_kwargs = { + "device": device, + "model": model, + "log": log, + "exp_dir": exp_dir, + "snapshot": ( + exp_dir / "snapshot" / cfg.trainer.trainer.snapshot + if cfg.trainer.trainer.snapshot + else None + ), + } + + if trainer_name == "VqvaeTrainer": + trainer = instantiate(cfg.trainer.trainer, **trainer_kwargs) + elif trainer_name == "BeitTrainer": + trainer_kwargs["model_vqvae"] = model_vqvae + trainer = instantiate(cfg.trainer.trainer, **trainer_kwargs) + elif trainer_name == "TableTrainer": + trainer_kwargs["vocab"] = vocab + trainer = instantiate(cfg.trainer.trainer, **trainer_kwargs) + else: + raise ValueError(f"The provided trainer type {trainer_name} is not supported.") + + if cfg.trainer.mode == "train": + log.info(printer(device, "Training starts ...")) + 
trainer.train( + train_dataloader, valid_dataloader, cfg.trainer.train, cfg.trainer.valid + ) + elif cfg.trainer.mode == "test": + log.info(printer(device, "Evaluation starts ...")) + save_to = exp_dir / cfg.name + save_to.mkdir(parents=True, exist_ok=True) + trainer.test(test_dataloader, cfg.trainer.test, save_to=save_to) + else: + raise NotImplementedError + + destroy_process_group() + + +def ddp_setup(): + init_process_group(backend="nccl") + + +if __name__ == "__main__": + main() diff --git a/unitable/src/model/__init__.py b/unitable/src/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e66b16bd55002a8bb24b221033da4a7838a03fca --- /dev/null +++ b/unitable/src/model/__init__.py @@ -0,0 +1,4 @@ +from .beit import BeitEncoder +from .vqvae import DiscreteVAE +from .encoderdecoder import EncoderDecoder +from .components import * \ No newline at end of file diff --git a/unitable/src/model/beit.py b/unitable/src/model/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..9443e5db8054c22f19bf74537421bfee72e2a1f6 --- /dev/null +++ b/unitable/src/model/beit.py @@ -0,0 +1,127 @@ +import math +import torch +from torch import nn, Tensor +from functools import partial + +from .components import ImgLinearBackbone, PositionEmbedding, Encoder + + +class BeitEncoder(nn.Module): + def __init__( + self, + d_model: int, # embed_dim + backbone: nn.Module, + max_seq_len: int, # for positional embedding + codebook_tokens: int, + dropout: float, + encoder: Encoder, + norm_layer: nn.Module, + init_std: float = 0.02, + ) -> None: + super().__init__() + + self.d_model = d_model + self.init_std = init_std + + self.backbone = backbone + self.pos_embed = PositionEmbedding( + max_seq_len=max_seq_len, d_model=d_model, dropout=dropout + ) + + self.encoder = encoder + self.norm = norm_layer(d_model) + self.generator = nn.Linear(d_model, codebook_tokens) + + self.trunc_normal = partial( + nn.init.trunc_normal_, std=init_std, a=-init_std, 
b=init_std + ) + self.apply(self._init_weights) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, d_model)) + + def _init_weights(self, m: nn.Module): + if isinstance(m, nn.Linear): + self.trunc_normal(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.weight, 1.0) + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, nn.Conv2d): + self.trunc_normal(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, PositionEmbedding): + self.trunc_normal(m.embedding.weight) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed"} + + def forward( + self, x: Tensor, bool_masked_pos: Tensor, return_all_tokens: bool = False + ): + x = self.backbone(x) + B, S, E = x.shape + assert E == self.d_model + + mask_token = self.mask_token.expand(B, S, -1) + + w = bool_masked_pos.unsqueeze(-1).type_as(mask_token) + x = x * (1 - w) + mask_token * w + + x = self.pos_embed(x) + + x = self.encoder(x) + x = self.norm(x) + + if return_all_tokens: + return self.generator(x) + else: + return self.generator(x[bool_masked_pos]) + + +if __name__ == "__main__": + d_model = 512 + patch_size = 16 + nhead = 8 + dropout = 0.0 + acitvation = "gelu" + norm_first = True + nlayer = 12 + ff_ratio = 4 + norm_layer = partial(nn.LayerNorm, eps=1e-6) + codebook_tokens = 8192 + + img_size = 448 + + max_seq_len = (img_size // patch_size) ** 2 + + backbone = ImgLinearBackbone(d_model=d_model, patch_size=patch_size) + encoder = Encoder( + d_model=d_model, + nhead=nhead, + dropout=dropout, + activation=acitvation, + norm_first=norm_first, + nlayer=nlayer, + ff_ratio=ff_ratio, + ) + + model = BeitEncoder( + d_model=d_model, + backbone=backbone, + max_seq_len=max_seq_len, + codebook_tokens=codebook_tokens, + dropout=dropout, + encoder=encoder, + norm_layer=norm_layer, + ) + + print(model) + + x = torch.rand((1, 3, img_size, img_size)) + bool_masked_pos = torch.rand((1, (img_size // patch_size) 
** 2)) < 0.5 + y = model(x, bool_masked_pos) + print(torch.sum(bool_masked_pos)) + print(y.shape) diff --git a/unitable/src/model/components.py b/unitable/src/model/components.py new file mode 100644 index 0000000000000000000000000000000000000000..9a6bc9ffc6aae0434e9881daeb499e9963f9cb60 --- /dev/null +++ b/unitable/src/model/components.py @@ -0,0 +1,232 @@ +from typing import Optional, Tuple +import torch +from torch import nn, Tensor +from torchvision.ops.misc import Conv2dNormActivation + + +__all__ = [ + "ImgCnnBackbone", + "ImgLinearBackbone", + "ImgConvStemBackbone", + "PositionEmbedding", + "Encoder", + "Decoder", + "TokenEmbedding", +] + + +class ImgCnnBackbone(nn.Module): + def __init__( + self, + backbone: nn.Module, + output_channels: int, + d_model: int, + drop_layer: Tuple = None, + ) -> None: + super().__init__() + + # drop layers for classification & maxpooling for higher feature resolution + layers = list(backbone.children()) + nlayer = len(layers) + keep_layer = set([i for i in range(nlayer)]) - set(drop_layer) + backbone = [layers[i] for i in keep_layer] + self.backbone = nn.Sequential(*backbone) + self.proj = nn.Linear(output_channels, d_model) + self.channels = output_channels + + def forward(self, x: Tensor) -> Tensor: + x = self.backbone(x) + x = x.flatten(start_dim=-2).transpose(1, 2) + assert x.shape[-1] == self.channels, "Image channels size mismatch." 
+ x = self.proj(x) + return x + + +class ImgLinearBackbone(nn.Module): + def __init__( + self, + d_model: int, + patch_size: int, + in_chan: int = 3, + ) -> None: + super().__init__() + + self.conv_proj = nn.Conv2d( + in_chan, out_channels=d_model, kernel_size=patch_size, stride=patch_size + ) + self.d_model = d_model + + def forward(self, x: Tensor) -> Tensor: + x = self.conv_proj(x) + x = x.flatten(start_dim=-2).transpose(1, 2) + return x + + +class ImgConvStemBackbone(nn.Module): + def __init__( + self, + d_model: int, + downsample_factor: int, + output_channels: int, + kernel_size: int, + ) -> None: + super().__init__() + + assert downsample_factor % 2 == 0 + assert output_channels % (downsample_factor // 2) == 0 + input_channels = output_channels // (downsample_factor // 2) + + layers = [ + Conv2dNormActivation( + 3, input_channels, kernel_size=kernel_size, stride=2, padding=1 + ) + ] + + while input_channels != output_channels: + layers.append( + Conv2dNormActivation( + input_channels, + input_channels * 2, + kernel_size=kernel_size, + stride=2, + padding=1, + ) + ) + input_channels = input_channels * 2 + + layers.append(nn.Conv2d(output_channels, d_model, kernel_size=1)) + + self.conv_stem = nn.Sequential(*layers) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv_stem(x) + x = x.flatten(start_dim=-2).transpose(1, 2) + return x + + +class Encoder(nn.Module): + def __init__( + self, + d_model: int, + nhead: int, + dropout: float, + activation: str, + norm_first: bool, + nlayer: int, + ff_ratio: int = 4, + ) -> None: + super().__init__() + + encoder_layer = nn.TransformerEncoderLayer( + d_model, + nhead=nhead, + dim_feedforward=ff_ratio * d_model, + dropout=dropout, + activation=activation, + batch_first=True, + norm_first=norm_first, + ) + + self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=nlayer) + + def forward(self, x: Tensor) -> Tensor: + x = self.encoder(x) + return x + + +class Decoder(nn.Module): + def __init__( + self, + 
d_model: int, + nhead: int, + dropout: float, + activation: str, + norm_first: bool, + nlayer: int, + ff_ratio: int = 4, + ) -> None: + super().__init__() + decoder_layer = nn.TransformerDecoderLayer( + d_model, + nhead, + dim_feedforward=ff_ratio * d_model, + dropout=dropout, + activation=activation, + batch_first=True, + norm_first=norm_first, + ) + + self.decoder = nn.TransformerDecoder(decoder_layer, nlayer) + + def forward( + self, x: Tensor, memory: Tensor, tgt_mask: Tensor, tgt_padding_mask: Tensor + ) -> Tensor: + x = self.decoder( + x, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask + ) + return x + + +class PositionEmbedding(nn.Module): + def __init__(self, max_seq_len: int, d_model: int, dropout: float) -> None: + super().__init__() + self.embedding = nn.Embedding(max_seq_len, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + # assume x is batch first + out = self.embedding(torch.arange(x.shape[1], device=x.device)) + return self.dropout(out + x) + + +class TokenEmbedding(nn.Module): + def __init__( + self, + vocab_size: int, + d_model: int, + padding_idx: int, + ) -> None: + super().__init__() + assert vocab_size > 0 + self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx) + + def forward(self, x: Tensor) -> Tensor: + return self.embedding(x) + + +class PrintLayer(nn.Module): + """Only for debugging when loss is nan.""" + + def __init__(self): + super().__init__() + + def forward(self, x): + print( + "torch.isfinite(x).all(): {}, min. {:.5f}, max. 
{:.5f}".format( + torch.isfinite(x).all(), x.min(), x.max() + ) + ) + return x + + +if __name__ == "__main__": + from torchvision import models + + x = torch.rand(1, 3, 392, 392) + model = ImgConvStemBackbone( + d_model=512, downsample_factor=16, output_channels=64, kernel_size=5 + ) + y = model(x) + print(model) + print(y.shape) + + model = ImgCnnBackbone( + backbone=models.resnet34(), + output_channels=512, + d_model=512, + drop_layer=(3, 8, 9), + ) + + # print(model) + y = model(x) + print(y.shape) diff --git a/unitable/src/model/encoderdecoder.py b/unitable/src/model/encoderdecoder.py new file mode 100644 index 0000000000000000000000000000000000000000..070bcc033b1efe541ba90db5eab6d6fa17ec3451 --- /dev/null +++ b/unitable/src/model/encoderdecoder.py @@ -0,0 +1,109 @@ +import torch +from torch import Tensor, nn +from functools import partial + +from .components import ( + ImgCnnBackbone, + ImgLinearBackbone, + ImgConvStemBackbone, + Encoder, + Decoder, + PositionEmbedding, + TokenEmbedding, +) + + +class EncoderDecoder(nn.Module): + """Encoder decoder architecture that takes in a tabular image and generates the text output. + Backbone serves as the image processor. There are three types of backbones: CNN, linear projection, and ConvStem. 
+ + Args: + ---- + backbone: tabular image processor + encoder: transformer encoder + decoder: transformer decoder + vocab_size: size of the vocabulary + d_model: feature size + padding_idx: index of in the vocabulary + max_seq_len: max sequence length of generated text + dropout: dropout rate + norm_layer: layernorm + init_std: std in weights initialization + """ + + def __init__( + self, + backbone: nn.Module, + encoder: nn.Module, + decoder: nn.Module, + vocab_size: int, + d_model: int, + padding_idx: int, + max_seq_len: int, + dropout: float, + norm_layer: nn.Module, + init_std: float = 0.02, + ): + super().__init__() + + self.backbone = backbone + self.encoder = encoder + self.decoder = decoder + self.norm = norm_layer(d_model) + self.token_embed = TokenEmbedding( + vocab_size=vocab_size, d_model=d_model, padding_idx=padding_idx + ) + self.pos_embed = PositionEmbedding( + max_seq_len=max_seq_len, d_model=d_model, dropout=dropout + ) + self.generator = nn.Linear(d_model, vocab_size) + + self.trunc_normal = partial( + nn.init.trunc_normal_, std=init_std, a=-init_std, b=init_std + ) + self.apply(self._init_weights) + + def _init_weights(self, m: nn.Module): + if isinstance(m, nn.Linear): + self.trunc_normal(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.weight, 1.0) + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, nn.Conv2d): + self.trunc_normal(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, PositionEmbedding): + self.trunc_normal(m.embedding.weight) + elif isinstance(m, TokenEmbedding): + self.trunc_normal(m.embedding.weight) + + @torch.jit.ignore + def no_weight_decay(self): + return {"token_embed", "pos_embed"} + + def encode(self, src: Tensor) -> Tensor: + src_feature = self.backbone(src) + src_feature = self.pos_embed(src_feature) + memory = self.encoder(src_feature) + memory = self.norm(memory) + return memory + + def decode( + self, 
memory: Tensor, tgt: Tensor, tgt_mask: Tensor, tgt_padding_mask: Tensor + ) -> Tensor: + tgt_feature = self.pos_embed(self.token_embed(tgt)) + tgt = self.decoder(tgt_feature, memory, tgt_mask, tgt_padding_mask) + + return tgt + + def forward( + self, src: Tensor, tgt: Tensor, tgt_mask: Tensor, tgt_padding_mask: Tensor + ) -> Tensor: + memory = self.encode(src) + tgt = self.decode(memory, tgt, tgt_mask, tgt_padding_mask) + tgt = self.generator(tgt) + + return tgt diff --git a/unitable/src/model/vqvae.py b/unitable/src/model/vqvae.py new file mode 100644 index 0000000000000000000000000000000000000000..edcfffc6d4717bb718a684a9de740e437ba495f9 --- /dev/null +++ b/unitable/src/model/vqvae.py @@ -0,0 +1,213 @@ +import torch +from torch import nn, Tensor, einsum +from typing import Optional, Tuple +import math +from functools import partial +from collections import OrderedDict +import torch.nn.functional as F +from einops import rearrange + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def eval_decorator(fn): + def inner(model, *args, **kwargs): + was_training = model.training + model.eval() + out = fn(model, *args, **kwargs) + model.train(was_training) + return out + + return inner + + +class ResBlock(nn.Module): + def __init__(self, chan_in, hidden_size, chan_out): + super().__init__() + self.net = nn.Sequential( + nn.Conv2d(chan_in, hidden_size, 3, padding=1), + nn.ReLU(), + nn.Conv2d(hidden_size, hidden_size, 3, padding=1), + nn.ReLU(), + nn.Conv2d(hidden_size, chan_out, 1), + ) + + def forward(self, x): + return self.net(x) + x + + +class BasicVAE(nn.Module): + def get_codebook_indices(self, images): + raise NotImplementedError() + + def decode(self, img_seq): + raise NotImplementedError() + + def get_codebook_probs(self, img_seq): + raise NotImplementedError() + + def get_image_tokens_size(self): + pass + + def get_image_size(self): + pass + + +class DiscreteVAE(BasicVAE): + def __init__( + self, + 
image_size: Tuple[int, int] = [256, 256], # input image size + codebook_tokens: int = 512, # codebook vocab size + codebook_dim: int = 512, # codebook embedding dimension + num_layers: int = 3, # layers of resnet blocks in encoder/decoder + hidden_dim: int = 64, # dimension in resnet blocks + channels: int = 3, # input channels + smooth_l1_loss: bool = False, # prevents exploding gradients + temperature: float = 0.9, # tau in gumbel softmax + straight_through: bool = False, # if True, the returned samples will be discretized as one-hot vectors, but will be differentiated as if it is the soft sample in autograd + kl_div_loss_weight: float = 0.0, + ): + super().__init__() + assert num_layers >= 1, "number of layers must be greater than or equal to 1" + + self.image_size = image_size + self.codebook_tokens = codebook_tokens + self.num_layers = num_layers + self.temperature = temperature + self.straight_through = straight_through + self.codebook = nn.Embedding(codebook_tokens, codebook_dim) + + encoder_layers = list() + decoder_layers = list() + + encoder_in = channels + decoder_in = codebook_dim + + for _ in range(num_layers): + encoder_layers.append( + nn.Sequential( + nn.Conv2d(encoder_in, hidden_dim, 4, stride=2, padding=1), nn.ReLU() + ) + ) + encoder_layers.append( + ResBlock( + chan_in=hidden_dim, hidden_size=hidden_dim, chan_out=hidden_dim + ) + ) + encoder_in = hidden_dim + + decoder_layers.append( + nn.Sequential( + nn.ConvTranspose2d(decoder_in, hidden_dim, 4, stride=2, padding=1), + nn.ReLU(), + ) + ) + decoder_layers.append( + ResBlock( + chan_in=hidden_dim, hidden_size=hidden_dim, chan_out=hidden_dim + ) + ) + decoder_in = hidden_dim + + encoder_layers.append(nn.Conv2d(hidden_dim, codebook_tokens, 1)) + decoder_layers.append(nn.Conv2d(hidden_dim, channels, 1)) + + self.encoder = nn.Sequential(*encoder_layers) + self.decoder = nn.Sequential(*decoder_layers) + + self.loss_fn = F.smooth_l1_loss if smooth_l1_loss else F.mse_loss + self.kl_div_loss_weight = 
kl_div_loss_weight + + def get_image_size(self): + return self.image_size + + def get_image_tokens_size(self) -> int: + ds_ratio = math.pow(2, self.num_layers) + return int((self.image_size[0] // ds_ratio) * (self.image_size[1] // ds_ratio)) + + @torch.no_grad() + @eval_decorator + def get_codebook_indices(self, images: Tensor): + logits = self.forward(images, return_logits=True) + codebook_indices = logits.argmax(dim=1) + return codebook_indices + + @torch.no_grad() + @eval_decorator + def get_codebook_probs(self, images: Tensor): + logits = self.forward(images, return_logits=True) + return nn.Softmax(dim=1)(logits) + + def decode(self, img_seq: Tensor): + image_embeds = self.codebook(img_seq) + image_embeds = image_embeds.permute((0, 3, 1, 2)).contiguous() + + # image_embeds = rearrange(image_embeds, "b h w d -> b d h w", h=h, w=w) + images = self.decoder(image_embeds) + return images + + def forward( + self, + img: Tensor, + return_loss: bool = False, + return_recons: bool = False, + return_logits: bool = False, + temp=None, + ) -> Tuple[Tensor, Optional[Tensor]]: + assert ( + img.shape[-1] == self.image_size[0] and img.shape[-2] == self.image_size[1] + ), f"input must have the correct image size {self.image_size}" + + logits = self.encoder(img) + + if return_logits: + return logits # return logits for getting hard image indices for DALL-E training + + temp = default(temp, self.temperature) + soft_one_hot = F.gumbel_softmax( + logits, tau=temp, dim=1, hard=self.straight_through + ) + sampled = einsum( + "b n h w, n d -> b d h w", soft_one_hot, self.codebook.weight + ).contiguous() + out = self.decoder(sampled) + + if not return_loss: + return out + + # reconstruction loss + recon_loss = self.loss_fn(img, out) + + # kl divergence + logits = rearrange(logits, "b n h w -> b (h w) n").contiguous() + qy = F.softmax(logits, dim=-1) + + log_qy = torch.log(qy + 1e-10) + log_uniform = torch.log( + torch.tensor([1.0 / self.codebook_tokens], device=img.device) + ) + kl_div 
= F.kl_div(log_uniform, log_qy, None, None, "batchmean", log_target=True) + + loss = recon_loss + (kl_div * self.kl_div_loss_weight) + + if not return_recons: + return loss + + return loss, out + + +if __name__ == "__main__": + input = torch.rand(1, 3, 256, 256) + model = DiscreteVAE() + loss, output = model(input, return_loss=True, return_recons=True) + + print(model) + print(model.get_image_tokens_size()) + print(model.get_codebook_indices(input).shape) + print(loss, output.shape, output.max(), output.min()) diff --git a/unitable/src/trainer/__init__.py b/unitable/src/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8672021a681e33691c8de196a1877221074ace --- /dev/null +++ b/unitable/src/trainer/__init__.py @@ -0,0 +1,3 @@ +from .train_beit import BeitTrainer +from .train_vqvae import VqvaeTrainer +from .train_table import TableTrainer \ No newline at end of file diff --git a/unitable/src/trainer/train_beit.py b/unitable/src/trainer/train_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..658c0fdf05664c9a87c184af1c61c91f537b174f --- /dev/null +++ b/unitable/src/trainer/train_beit.py @@ -0,0 +1,245 @@ +import wandb +from pathlib import Path +from typing import Tuple, List, Union, Dict +from omegaconf import DictConfig +from hydra.utils import instantiate +import logging +import torch +import time +from torch import nn, Tensor, autograd +from torch.utils.data import DataLoader +from torch.nn.parallel import DistributedDataParallel as DDP + +from ..utils import printer, compute_grad_norm +from ..trainer.utils import configure_optimizer_weight_decay + +SNAPSHOT_KEYS = set(["EPOCH", "STEP", "OPTIMIZER", "LR_SCHEDULER", "MODEL", "LOSS"]) + + +class BeitTrainer: + def __init__( + self, + device: int, + model: nn.Module, + model_vqvae: nn.Module, + log: logging.Logger, + exp_dir: Path, + snapshot: Path = None, + model_weights: Path = None, # only for testing + ) -> None: + self.device = device + self.log 
= log + self.exp_dir = exp_dir + self.criterion = nn.CrossEntropyLoss() + assert ( + snapshot is None or model_weights is None + ), "Snapshot and model weights cannot be set at the same time." + + self.model = model + if snapshot is not None and snapshot.is_file(): + self.snapshot = self.load_snapshot(snapshot) + self.model.load_state_dict(self.snapshot["MODEL"]) + self.start_epoch = self.snapshot["EPOCH"] + self.global_step = self.snapshot["STEP"] + elif model_weights is not None and model_weights.is_file(): + self.load_model(model_weights) + else: + self.snapshot = None + self.start_epoch = 0 + self.global_step = 0 + + self.model = self.model.to(device) + self.model = DDP(self.model, device_ids=[device]) + self.model_vqvae = model_vqvae.to(device) + + # https://discuss.pytorch.org/t/extra-10gb-memory-on-gpu-0-in-ddp-tutorial/118113 + torch.cuda.set_device(device) # master gpu takes up extra memory + torch.cuda.empty_cache() + + def train_epoch(self, epoch: int, grad_clip: float = None): + start = time.time() + total_loss = 0.0 + total_samples = 0 + + for i, obj in enumerate(self.train_dataloader): + (trans_image, vqvae_image), bool_mask_pos = obj + trans_image, vqvae_image, bool_mask_pos = ( + trans_image.to(self.device), + vqvae_image.to(self.device), + bool_mask_pos.to(self.device), + ) + + with torch.no_grad(): + input_ids = self.model_vqvae.get_codebook_indices(vqvae_image).flatten( + 1 + ) + bool_mask_pos = bool_mask_pos.flatten(1).to(torch.bool) + labels = input_ids[bool_mask_pos] + + with autograd.detect_anomaly(): + outputs = self.model( + trans_image, bool_mask_pos, return_all_tokens=False + ) + loss = self.criterion(outputs, labels) + + self.optimizer.zero_grad() + loss.backward() + if grad_clip: + nn.utils.clip_grad_norm_( + self.model.parameters(), max_norm=grad_clip + ) + self.optimizer.step() + + loss = loss.detach().cpu().data + total_loss += loss * trans_image.shape[0] + total_samples += trans_image.shape[0] + + self.lr_scheduler.step() + 
self.global_step += 1 + + if i % 10 == 0: + grad_norm = compute_grad_norm(self.model) + lr = self.optimizer.param_groups[0]["lr"] + elapsed = time.time() - start + self.log.info( + printer( + self.device, + f"Epoch {epoch} Step {i + 1}/{len(self.train_dataloader)} | Loss {loss:.4f} ({total_loss / total_samples:.4f}) | Grad norm {grad_norm:.3f} | {total_samples / elapsed:4.1f} images/s | lr {lr:5.1e}", + ) + ) + + if i % 100 == 0 and self.device == 0: + lr = self.optimizer.param_groups[0]["lr"] + log_info = { + "epoch": epoch, + "train_loss": loss, + "learning rate": lr, + "grad_norm": grad_norm, + } + + wandb.log( + log_info, + step=self.global_step, + ) + + return total_loss / total_samples + + def train( + self, + train_dataloader: DataLoader, + valid_dataloader: DataLoader, + train_cfg: DictConfig, + valid_cfg: DictConfig, + ): + self.train_dataloader = train_dataloader + self.valid_dataloader = valid_dataloader + + # ensure correct weight decay: https://github.com/karpathy/minGPT/blob/37baab71b9abea1b76ab957409a1cc2fbfba8a26/mingpt/model.py#L215 + optim_params = configure_optimizer_weight_decay( + self.model.module, weight_decay=train_cfg.optimizer.weight_decay + ) + self.optimizer = instantiate(train_cfg.optimizer, optim_params) + + self.lr_scheduler = instantiate( + train_cfg.lr_scheduler, optimizer=self.optimizer + ) + + if self.snapshot is not None: + self.optimizer.load_state_dict(self.snapshot["OPTIMIZER"]) + self.lr_scheduler.load_state_dict(self.snapshot["LR_SCHEDULER"]) + + best_loss = float("inf") + self.model.train() + for epoch in range(self.start_epoch, train_cfg.epochs): + train_dataloader.sampler.set_epoch(epoch) + train_loss = self.train_epoch(epoch, grad_clip=train_cfg.grad_clip) + + torch.cuda.empty_cache() + + valid_loss = self.valid(valid_cfg) + + if self.device == 0: + wandb.log( + { + "train loss (epoch)": train_loss, + "valid loss (epoch)": valid_loss, + }, + step=self.global_step, + ) + + if epoch % train_cfg.save_every == 0: + 
self.save_snapshot(epoch, best_loss) + if valid_loss < best_loss: + self.save_model(epoch) + best_loss = valid_loss + + def valid(self, cfg: DictConfig): + total_samples = 0 + total_loss = 0.0 + + self.model.eval() + for i, obj in enumerate(self.valid_dataloader): + (trans_image, vqvae_image), bool_mask_pos = obj + trans_image, vqvae_image, bool_mask_pos = ( + trans_image.to(self.device), + vqvae_image.to(self.device), + bool_mask_pos.to(self.device), + ) + + with torch.no_grad(): + input_ids = self.model_vqvae.get_codebook_indices(vqvae_image).flatten( + 1 + ) + bool_mask_pos = bool_mask_pos.flatten(1).to(torch.bool) + labels = input_ids[bool_mask_pos] + + outputs = self.model( + trans_image, bool_mask_pos, return_all_tokens=False + ) + loss = self.criterion(outputs, labels) + + loss = loss.detach().cpu().data + total_loss += loss * trans_image.shape[0] + total_samples += trans_image.shape[0] + + if i % 10 == 0: + self.log.info( + printer( + self.device, + f"Valid: Step {i + 1}/{len(self.valid_dataloader)} | Loss {loss:.4f} ({total_loss / total_samples:.4f})", + ) + ) + + return total_loss / total_samples + + def save_model(self, epoch: int): + filename = Path(self.exp_dir) / "model" / f"epoch{epoch}_model.pt" + torch.save(self.model.module.state_dict(), filename) + self.log.info(printer(self.device, f"Saving model to {filename}")) + filename = Path(self.exp_dir) / "model" / f"best.pt" + torch.save(self.model.module.state_dict(), filename) + + def load_model(self, path: Union[str, Path]): + self.model.load_state_dict(torch.load(path, map_location="cpu")) + self.log.info(printer(self.device, f"Loading model from {path}")) + + def save_snapshot(self, epoch: int, best_loss: float): + state_info = { + "EPOCH": epoch + 1, + "STEP": self.global_step, + "OPTIMIZER": self.optimizer.state_dict(), + "LR_SCHEDULER": self.lr_scheduler.state_dict(), + "MODEL": self.model.module.state_dict(), + "LOSS": best_loss, + } + + snapshot_path = Path(self.exp_dir) / "snapshot" / 
f"epoch{epoch}_snapshot.pt" + torch.save(state_info, snapshot_path) + + self.log.info(printer(self.device, f"Saving snapshot to {snapshot_path}")) + + def load_snapshot(self, path: Path): + self.log.info(printer(self.device, f"Loading snapshot from {path}")) + snapshot = torch.load(path, map_location="cpu") + assert SNAPSHOT_KEYS.issubset(snapshot.keys()) + return snapshot diff --git a/unitable/src/trainer/train_table.py b/unitable/src/trainer/train_table.py new file mode 100644 index 0000000000000000000000000000000000000000..a604fcc5e18b4d8ae025477259e193c27c3e3586 --- /dev/null +++ b/unitable/src/trainer/train_table.py @@ -0,0 +1,467 @@ +from typing import Tuple, List, Union, Dict, Optional +import torch +import wandb +import json +import os +from torch import nn, Tensor, autograd +from torch.utils.data import DataLoader +from omegaconf import DictConfig +from hydra.utils import instantiate +import logging +from pathlib import Path +from torch.nn.parallel import DistributedDataParallel as DDP +import tokenizers as tk +import torch.nn.functional as F + +from .utils import ( + Batch, + configure_optimizer_weight_decay, + turn_off_beit_grad, + VALID_HTML_TOKEN, + INVALID_CELL_TOKEN, + VALID_BBOX_TOKEN, +) +from ..utils import ( + printer, + compute_grad_norm, + count_total_parameters, + batch_autoregressive_decode, + combine_filename_pred_gt, +) + +SNAPSHOT_KEYS = set(["EPOCH", "STEP", "OPTIMIZER", "LR_SCHEDULER", "MODEL", "LOSS"]) + + +class TableTrainer: + """A trainer for table recognition. 
The supported tasks are: + 1) table structure extraction + 2) table cell bbox detection + 3) table cell content recognition + + Args: + ---- + device: gpu id + vocab: a vocab shared among all tasks + model: model architecture + log: logger + exp_dir: the experiment directory that saves logs, wandb files, model weights, and checkpoints (snapshots) + snapshot: specify which snapshot to use, only used in training + model_weights: specify which model weight to use, only used in testing + beit_pretrained_weights: load SSL pretrained visual encoder + freeze_beit_epoch: freeze beit weights for the first {freeze_beit_epoch} epochs + """ + + def __init__( + self, + device: int, + vocab: tk.Tokenizer, + model: nn.Module, + log: logging.Logger, + exp_dir: Path, + snapshot: Path = None, + model_weights: str = None, + beit_pretrained_weights: str = None, + freeze_beit_epoch: int = None, + ) -> None: + self.device = device + self.log = log + self.exp_dir = exp_dir + self.vocab = vocab + self.padding_idx = vocab.token_to_id("") + self.freeze_beit_epoch = freeze_beit_epoch + + # loss for training html, cell + self.criterion = nn.CrossEntropyLoss(ignore_index=self.padding_idx) + + self.model = model + + if ( + beit_pretrained_weights is not None + and Path(beit_pretrained_weights).is_file() + ): + self.load_pretrained_beit(Path(beit_pretrained_weights)) + + assert ( + snapshot is None or model_weights is None + ), "Cannot set snapshot and model_weights at the same time!" 
+ + if snapshot is not None and snapshot.is_file(): + self.snapshot = self.load_snapshot(snapshot) + self.model.load_state_dict(self.snapshot["MODEL"]) + self.start_epoch = self.snapshot["EPOCH"] + self.global_step = self.snapshot["STEP"] + elif model_weights is not None and Path(model_weights).is_file(): + self.load_model(Path(model_weights)) + else: + self.snapshot = None + self.start_epoch = 0 + self.global_step = 0 + + if freeze_beit_epoch and freeze_beit_epoch > 0: + self._freeze_beit() + + self.model = self.model.to(device) + self.model = DDP(self.model, device_ids=[device]) + + # https://discuss.pytorch.org/t/extra-10gb-memory-on-gpu-0-in-ddp-tutorial/118113 + torch.cuda.set_device(device) # master gpu takes up extra memory + torch.cuda.empty_cache() + + def _freeze_beit(self): + if self.start_epoch < self.freeze_beit_epoch: + turn_off_beit_grad(self.model) + self.log.info( + printer( + self.device, + f"Lock SSL params for {self.freeze_beit_epoch} epochs (params: {count_total_parameters(self.model) / 1e6:.2f}M) - Current epoch {self.start_epoch + 1}", + ) + ) + else: + self.log.info( + printer( + self.device, + f"Unlock all weights (params: {count_total_parameters(self.model) / 1e6:.2f}M) - Current epoch {self.start_epoch + 1}", + ) + ) + + def train_epoch( + self, + epoch: int, + target: str, + loss_weights: List[float], + grad_clip: float = None, + ): + avg_loss = 0.0 + + # load data from dataloader + for i, obj in enumerate(self.train_dataloader): + batch = Batch(device=self.device, target=target, vocab=self.vocab, obj=obj) + + with autograd.detect_anomaly(): + loss, _ = batch.inference( + self.model, + criterion=self.criterion, + criterion_bbox=self.criterion_bbox, + loss_weights=loss_weights, + ) + + total_loss = loss["total"] + + self.optimizer.zero_grad() + total_loss.backward() + if grad_clip: + nn.utils.clip_grad_norm_( + self.model.parameters(), max_norm=grad_clip + ) + self.optimizer.step() + + total_loss = total_loss.detach().cpu().data + 
avg_loss += total_loss + self.lr_scheduler.step() + self.global_step += 1 + + if i % 10 == 0: + grad_norm = compute_grad_norm(self.model) + lr = self.optimizer.param_groups[0]["lr"] + # elapsed = time.time() - start + + loss_info = f"Loss {total_loss:.3f} ({avg_loss / (i + 1):.3f})" + if not isinstance(loss["html"], int): + loss_info += f" Html {loss['html'].detach().cpu().data:.3f}" + if not isinstance(loss["cell"], int): + loss_info += f" Cell {loss['cell'].detach().cpu().data:.3f}" + if not isinstance(loss["bbox"], int): + loss_info += f" Bbox {loss['bbox'].detach().cpu().data:.3f}" + self.log.info( + printer( + self.device, + f"Epoch {epoch} Step {i + 1}/{len(self.train_dataloader)} | {loss_info} | Grad norm {grad_norm:.3f} | lr {lr:5.1e}", + ) + ) + + if i % 100 == 0 and self.device == 0: + log_info = { + "epoch": epoch, + "train_total_loss": total_loss, + "learning rate": lr, + "grad_norm": grad_norm, + } + + wandb.log( + log_info, + step=self.global_step, + ) + + def train( + self, + train_dataloader: DataLoader, + valid_dataloader: DataLoader, + train_cfg: DictConfig, + valid_cfg: DictConfig, + ): + self.train_dataloader = train_dataloader + self.valid_dataloader = valid_dataloader + + # ensure correct weight decay: https://github.com/karpathy/minGPT/blob/37baab71b9abea1b76ab957409a1cc2fbfba8a26/mingpt/model.py#L215 + optim_params = configure_optimizer_weight_decay( + self.model.module, weight_decay=train_cfg.optimizer.weight_decay + ) + + self.optimizer = instantiate(train_cfg.optimizer, optim_params) + + self.lr_scheduler = instantiate( + train_cfg.lr_scheduler, optimizer=self.optimizer + ) + + if self.snapshot is not None: + self.optimizer.load_state_dict(self.snapshot["OPTIMIZER"]) + self.lr_scheduler.load_state_dict(self.snapshot["LR_SCHEDULER"]) + + self.criterion_bbox = None + if "bbox" in train_cfg.target: + tmp = [ + self.vocab.token_to_id(i) + for i in VALID_BBOX_TOKEN[ + : train_cfg.img_size[0] + 2 + ] # +1 for +1 for bbox == img_size + ] + tmp = 
[1.0 if i in tmp else 0.0 for i in range(self.vocab.get_vocab_size())] + self.criterion_bbox = nn.CrossEntropyLoss( + weight=torch.tensor(tmp, device=self.device), + ignore_index=self.padding_idx, + ) + + best_loss = float("inf") + self.model.train() + + if self.freeze_beit_epoch and self.start_epoch < self.freeze_beit_epoch: + max_epoch = self.freeze_beit_epoch + else: + max_epoch = train_cfg.epochs + for epoch in range(self.start_epoch, max_epoch): + train_dataloader.sampler.set_epoch(epoch) + + self.train_epoch( + epoch, + grad_clip=train_cfg.grad_clip, + target=train_cfg.target, + loss_weights=train_cfg.loss_weights, + ) + + torch.cuda.empty_cache() + + valid_loss = self.valid(valid_cfg) + + if self.device == 0: + wandb.log( + {"valid loss (epoch)": valid_loss}, + step=self.global_step, + ) + + if epoch % train_cfg.save_every == 0: + self.save_snapshot(epoch, best_loss) + if valid_loss < best_loss: + self.save_model(epoch) + best_loss = valid_loss + + def valid(self, cfg: DictConfig): + total_loss = 0.0 + avg_loss = 0.0 + total_samples = 0 + + self.model.eval() + for i, obj in enumerate(self.valid_dataloader): + batch = Batch( + device=self.device, target=cfg.target, vocab=self.vocab, obj=obj + ) + with torch.no_grad(): + loss, _ = batch.inference( + self.model, + criterion=self.criterion, + criterion_bbox=self.criterion_bbox, + loss_weights=cfg.loss_weights, + ) + + total_loss = loss["total"] + total_loss = total_loss.detach().cpu().data + avg_loss += total_loss * batch.image.shape[0] + total_samples += batch.image.shape[0] + + if i % 10 == 0: + loss_info = f"Loss {total_loss:.3f} ({avg_loss / total_samples:.3f})" + if not isinstance(loss["html"], int): + loss_info += f" Html {loss['html'].detach().cpu().data:.3f}" + if not isinstance(loss["cell"], int): + loss_info += f" Cell {loss['cell'].detach().cpu().data:.3f}" + if not isinstance(loss["bbox"], int): + loss_info += f" Bbox {loss['bbox'].detach().cpu().data:.3f}" + self.log.info( + printer( + self.device, 
+ f"Valid: Step {i + 1}/{len(self.valid_dataloader)} | {loss_info}", + ) + ) + + return avg_loss / total_samples + + def test(self, test_dataloader: DataLoader, cfg: DictConfig, save_to: str): + total_result = dict() + for i, obj in enumerate(test_dataloader): + batch = Batch( + device=self.device, target=cfg.target, vocab=self.vocab, obj=obj + ) + + if cfg.target == "html": + prefix = [self.vocab.token_to_id("[html]")] + valid_token_whitelist = [ + self.vocab.token_to_id(i) for i in VALID_HTML_TOKEN + ] + valid_token_blacklist = None + elif cfg.target == "cell": + prefix = [self.vocab.token_to_id("[cell]")] + valid_token_whitelist = None + valid_token_blacklist = [ + self.vocab.token_to_id(i) for i in INVALID_CELL_TOKEN + ] + elif cfg.target == "bbox": + prefix = [self.vocab.token_to_id("[bbox]")] + valid_token_whitelist = [ + self.vocab.token_to_id(i) + for i in VALID_BBOX_TOKEN[: cfg.img_size[0]] + ] + valid_token_blacklist = None + else: + raise NotImplementedError + + pred_id = batch_autoregressive_decode( + device=self.device, + model=self.model, + batch_data=batch, + prefix=prefix, + max_decode_len=cfg.max_seq_len, + eos_id=self.vocab.token_to_id(""), + valid_token_whitelist=valid_token_whitelist, + valid_token_blacklist=valid_token_blacklist, + sampling=cfg.sampling, + ) + + if cfg.target == "html": + result = combine_filename_pred_gt( + filename=batch.name, + pred_id=pred_id, + gt_id=batch.html_tgt, + vocab=self.vocab, + type="html", + ) + elif cfg.target == "cell": + result = combine_filename_pred_gt( + filename=batch.name, + pred_id=pred_id, + gt_id=batch.cell_tgt, + vocab=self.vocab, + type="cell", + ) + elif cfg.target == "bbox": + result = combine_filename_pred_gt( + filename=batch.name, + pred_id=pred_id, + gt_id=batch.bbox_tgt, + vocab=self.vocab, + type="bbox", + ) + else: + raise NotImplementedError + + total_result.update(result) + + if i % 10 == 0: + self.log.info( + printer( + self.device, + f"Test: Step {i + 1}/{len(test_dataloader)}", + ) + ) 
+ + self.log.info( + printer( + self.device, + f"Converting {len(total_result)} samples to html tables ...", + ) + ) + + with open( + os.path.join(save_to, cfg.save_to_prefix + f"_{self.device}.json"), + "w", + encoding="utf-8", + ) as f: + json.dump(total_result, f, indent=4) + + return total_result + + def save_model(self, epoch: int): + filename = Path(self.exp_dir) / "model" / f"epoch{epoch}_model.pt" + torch.save(self.model.module.state_dict(), filename) + self.log.info(printer(self.device, f"Saving model to {filename}")) + filename = Path(self.exp_dir) / "model" / "best.pt" + torch.save(self.model.module.state_dict(), filename) + + def load_model(self, path: Union[str, Path]): + self.model.load_state_dict(torch.load(path, map_location="cpu")) + self.log.info(printer(self.device, f"Loading model from {path}")) + + def save_snapshot(self, epoch: int, best_loss: float): + state_info = { + "EPOCH": epoch + 1, + "STEP": self.global_step, + "OPTIMIZER": self.optimizer.state_dict(), + "LR_SCHEDULER": self.lr_scheduler.state_dict(), + "MODEL": self.model.module.state_dict(), + "LOSS": best_loss, + } + + snapshot_path = Path(self.exp_dir) / "snapshot" / f"epoch{epoch}_snapshot.pt" + torch.save(state_info, snapshot_path) + + self.log.info(printer(self.device, f"Saving snapshot to {snapshot_path}")) + + def load_snapshot(self, path: Path): + self.log.info(printer(self.device, f"Loading snapshot from {path}")) + snapshot = torch.load(path, map_location="cpu") + assert SNAPSHOT_KEYS.issubset(snapshot.keys()) + return snapshot + + def load_pretrained_beit(self, path: Path): + self.log.info(printer(self.device, f"Loading pretrained BEiT from {path}")) + beit = torch.load(path, map_location="cpu") + redundant_keys_in_beit = [ + "cls_token", + "mask_token", + "generator.weight", + "generator.bias", + ] + for key in redundant_keys_in_beit: + if key in beit: + del beit[key] + + # max_seq_len in finetuning may go beyond the length in pretraining + if ( + 
self.model.pos_embed.embedding.weight.shape[0] + != beit["pos_embed.embedding.weight"].shape[0] + ): + emb_shape = self.model.pos_embed.embedding.weight.shape + ckpt_emb = beit["pos_embed.embedding.weight"].clone() + assert emb_shape[1] == ckpt_emb.shape[1] + + ckpt_emb = ckpt_emb.unsqueeze(0).permute(0, 2, 1) + ckpt_emb = F.interpolate(ckpt_emb, emb_shape[0], mode="nearest") + beit["pos_embed.embedding.weight"] = ckpt_emb.permute(0, 2, 1).squeeze() + + out = self.model.load_state_dict(beit, strict=False) + + # ensure missing keys are just token_embed, decoder, and generator + missing_keys_prefix = ("token_embed", "decoder", "generator") + for key in out[0]: + assert key.startswith( + missing_keys_prefix + ), f"Key {key} should be loaded from BEiT, but missing in current state dict." + assert len(out[1]) == 0, f"Unexpected keys from BEiT: {out[1]}" diff --git a/unitable/src/trainer/train_vqvae.py b/unitable/src/trainer/train_vqvae.py new file mode 100644 index 0000000000000000000000000000000000000000..4ea682ed48ca2f7d58df66023df08ce3988400a4 --- /dev/null +++ b/unitable/src/trainer/train_vqvae.py @@ -0,0 +1,297 @@ +import math +import wandb +from pathlib import Path +from typing import Tuple, List, Union, Dict +from omegaconf import DictConfig +from hydra.utils import instantiate +import logging +import torch +import time +from functools import partial +from torch import nn, Tensor, autograd +from torch.utils.data import DataLoader +from torch.optim import Adam +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist +from torchvision.utils import make_grid + +from ..utils import printer, compute_grad_norm + +SNAPSHOT_KEYS = set(["EPOCH", "STEP", "OPTIMIZER", "LR_SCHEDULER", "MODEL", "LOSS"]) + + +class VqvaeTrainer: + def __init__( + self, + device: int, + model: nn.Module, + log: logging.Logger, + exp_dir: Path, + snapshot: Path = None, + model_weights: Path = None, # only for testing + ) -> None: + self.device = device + 
self.log = log + self.exp_dir = exp_dir + assert ( + snapshot is None or model_weights is None + ), "Snapshot and model weights cannot be set at the same time." + + self.model = model + if snapshot is not None and snapshot.is_file(): + self.snapshot = self.load_snapshot(snapshot) + self.model.load_state_dict(self.snapshot["MODEL"]) + self.start_epoch = self.snapshot["EPOCH"] + self.global_step = self.snapshot["STEP"] + elif model_weights is not None and model_weights.is_file(): + self.load_model(model_weights) + else: + self.snapshot = None + self.start_epoch = 0 + + self.model = self.model.to(device) + self.model = DDP(self.model, device_ids=[device]) + + # https://discuss.pytorch.org/t/extra-10gb-memory-on-gpu-0-in-ddp-tutorial/118113 + torch.cuda.set_device(device) # master gpu takes up extra memory + torch.cuda.empty_cache() + + def train_epoch( + self, + epoch: int, + starting_temp: float, + anneal_rate: float, + temp_min: float, + grad_clip: float = None, + ): + start = time.time() + total_loss = 0.0 + total_samples = 0 + + # load data from dataloader + for i, obj in enumerate(self.train_dataloader): + if isinstance(obj, Tensor): + img = obj.to(self.device) + elif isinstance(obj, (list, tuple)): + img = obj[0].to(self.device) + else: + raise ValueError(f"Unrecognized object type {type(obj)}") + + # temperature annealing + self.temp = max( + starting_temp * math.exp(-anneal_rate * self.global_step), temp_min + ) + + with autograd.detect_anomaly(): + loss, soft_recons = self.model( + img, return_loss=True, return_recons=True, temp=self.temp + ) + + self.optimizer.zero_grad() + loss.backward() + if grad_clip: + nn.utils.clip_grad_norm_( + self.model.parameters(), max_norm=grad_clip + ) + self.optimizer.step() + + loss = loss.detach().cpu().data + total_loss += loss * img.shape[0] + total_samples += img.shape[0] + + self.lr_scheduler.step() + self.global_step += 1 + + if i % 10 == 0: + grad_norm = compute_grad_norm(self.model) + lr = 
self.optimizer.param_groups[0]["lr"] + elapsed = time.time() - start + self.log.info( + printer( + self.device, + f"Epoch {epoch} Step {i + 1}/{len(self.train_dataloader)} | Loss {loss:.4f} ({total_loss / total_samples:.4f}) | Grad norm {grad_norm:.3f} | {total_samples / elapsed:4.1f} images/s | lr {lr:5.1e} | Temp {self.temp:.2e}", + ) + ) + + # visualize reconstruction images + if i % 100 == 0 and self.device == 0: + lr = self.optimizer.param_groups[0]["lr"] + k = 4 # num of images saved for visualization + codes = self.model.module.get_codebook_indices(img[:k]) + hard_recons = self.model.module.decode(codes) + + img = img[:k].detach().cpu() + soft_recons = soft_recons[:k].detach().cpu() + codes = codes.flatten(start_dim=1).detach().cpu() + hard_recons = hard_recons.detach().cpu() + + make_vis = partial(make_grid, nrow=int(math.sqrt(k)), normalize=True) + img, soft_recons, hard_recons = map( + make_vis, (img, soft_recons, hard_recons) + ) + + log_info = { + "epoch": epoch, + "train_loss": loss, + "temperature": self.temp, + "learning rate": lr, + "original images": wandb.Image( + img, caption=f"step: {self.global_step}" + ), + "soft reconstruction": wandb.Image( + soft_recons, caption=f"step: {self.global_step}" + ), + "hard reconstruction": wandb.Image( + hard_recons, caption=f"step: {self.global_step}" + ), + "codebook_indices": wandb.Histogram(codes), + } + + wandb.log( + log_info, + step=self.global_step, + ) + + return total_loss, total_samples + + def train( + self, + train_dataloader: DataLoader, + valid_dataloader: DataLoader, + train_cfg: DictConfig, + valid_cfg: DictConfig, + ): + self.train_dataloader = train_dataloader + self.valid_dataloader = valid_dataloader + self.optimizer = instantiate( + train_cfg.optimizer, params=self.model.parameters() + ) + + self.lr_scheduler = instantiate( + train_cfg.lr_scheduler, optimizer=self.optimizer + ) + + if self.snapshot is not None: + self.optimizer.load_state_dict(self.snapshot["OPTIMIZER"]) + 
def valid(self, cfg: DictConfig):
    """Run one pass over ``self.valid_dataloader`` and accumulate the loss.

    Each batch is either a tensor or a list/tuple whose first element is the
    image tensor.  The model is called with ``return_loss=True`` and the
    current ``self.temp`` under ``torch.no_grad()``.

    Args:
        cfg: validation config; it is not read by this method.

    Returns:
        ``(total_loss, total_samples)`` where ``total_loss`` is the sum of
        per-batch loss times batch size (not yet divided by the sample
        count) and ``total_samples`` is the number of images seen.

    Raises:
        ValueError: if a batch is neither a tensor nor a list/tuple.
    """
    total_samples = 0
    total_loss = 0.0

    # ``train`` only switches the model to train mode once, before the epoch
    # loop.  Remember the current mode and restore it afterwards so that the
    # epochs following a validation pass do not silently run in eval mode.
    was_training = self.model.training
    self.model.eval()
    try:
        for i, obj in enumerate(self.valid_dataloader):
            if isinstance(obj, Tensor):
                img = obj.to(self.device)
            elif isinstance(obj, (list, tuple)):
                img = obj[0].to(self.device)
            else:
                raise ValueError(f"Unrecognized object type {type(obj)}")

            with torch.no_grad():
                loss = self.model(
                    img, return_loss=True, return_recons=False, temp=self.temp
                )

            loss = loss.detach().cpu().data
            # weight by batch size so the caller can compute a per-sample mean
            total_loss += loss * img.shape[0]
            total_samples += img.shape[0]

            if i % 10 == 0:
                self.log.info(
                    printer(
                        self.device,
                        f"Valid: Step {i + 1}/{len(self.valid_dataloader)} | Loss {loss:.4f} ({total_loss / total_samples:.4f})",
                    )
                )
    finally:
        self.model.train(was_training)

    return total_loss, total_samples
BBOX_TOKENS, +) + + +VALID_HTML_TOKEN = [""] + HTML_TOKENS +INVALID_CELL_TOKEN = ( + ["", "", "", ""] + TASK_TOKENS + RESERVED_TOKENS +) +VALID_BBOX_TOKEN = [ + "" +] + BBOX_TOKENS # image size will be addressed after instantiation + + +class Batch: + """Wrap up a batch of training samples with different training targets. + The input is not torch tensor + Shape of the image (src): B, S, E + Shape of the text (tgt): B, N, S, E (M includes 1 table detection, 1 structure, 1 cell, and multiple bbox) + Reshape text to (B * N, S, E) and inflate the image to match the shape of the text + + Args: + ---- + device: gpu id + """ + + def __init__( + self, + device: torch.device, + target: str, + vocab: Vocab, + obj: List, + ) -> None: + self.device = device + self.image = obj[0].to(device) + self.name = obj[1]["filename"] + self.target = target + self.vocab = vocab + self.image_size = self.image.shape[-1] + + if "table" in target: + raise NotImplementedError + + if "html" in target: + self.valid_html_token = [vocab.token_to_id(i) for i in VALID_HTML_TOKEN] + ( + self.html_src, + self.html_tgt, + self.html_casual_mask, + self.html_padding_mask, + ) = self._prepare_transformer_input(obj[1]["html"]) + + if "cell" in target: + self.invalid_cell_token = [vocab.token_to_id(i) for i in INVALID_CELL_TOKEN] + ( + self.cell_src, + self.cell_tgt, + self.cell_casual_mask, + self.cell_padding_mask, + ) = self._prepare_transformer_input(obj[1]["cell"]) + + if "bbox" in target: + ( + self.bbox_src, + self.bbox_tgt, + self.bbox_casual_mask, + self.bbox_padding_mask, + ) = self._prepare_transformer_input(obj[1]["bbox"]) + + def _prepare_transformer_input( + self, seq: List[tk.Encoding] + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + tmp = [i.ids for i in seq] + tmp = torch.tensor(tmp, dtype=torch.int32) + src = tmp[:, :-1].to(self.device) + tgt = tmp[:, 1:].type(torch.LongTensor).to(self.device) + casual_mask = subsequent_mask(src.shape[-1]).to(self.device) + tmp = [i.attention_mask[:-1] for i 
in seq] # padding mask + tmp = torch.tensor(tmp, dtype=torch.bool) + padding_mask = (~tmp).to(self.device) + + return src, tgt, casual_mask, padding_mask + + def _inference_one_task( + self, model, memory, src, casual_mask, padding_mask, use_ddp + ): + if use_ddp: + out = model.module.decode(memory, src, casual_mask, padding_mask) + out = model.module.generator(out) + else: + out = model.decode(memory, src, casual_mask, padding_mask) + out = model.generator(out) + + return out + + def inference( + self, + model: nn.Module, + criterion: nn.Module, + criterion_bbox: nn.Module = None, + loss_weights: dict = None, + use_ddp: bool = True, + ) -> Tuple[Dict, Dict]: + pred = dict() + loss = dict(table=0, html=0, cell=0, bbox=0) + + if use_ddp: + memory = model.module.encode(self.image) + else: + memory = model.encode(self.image) + + # inference + suppress invalid logits + compute loss + if "html" in self.target: + out_html = self._inference_one_task( + model, + memory, + self.html_src, + self.html_casual_mask, + self.html_padding_mask, + use_ddp, + ) + + pred["html"] = pred_token_within_range( + out_html, white_list=self.valid_html_token + ).permute(0, 2, 1) + loss["html"] = criterion(pred["html"], self.html_tgt) + + if "cell" in self.target: + out_cell = self._inference_one_task( + model, + memory, + self.cell_src, + self.cell_casual_mask, + self.cell_padding_mask, + use_ddp, + ) + + pred["cell"] = pred_token_within_range( + out_cell, black_list=self.invalid_cell_token + ).permute(0, 2, 1) + loss["cell"] = criterion(pred["cell"], self.cell_tgt) + + if "bbox" in self.target: + assert criterion_bbox is not None + + out_bbox = self._inference_one_task( + model, + memory, + self.bbox_src, + self.bbox_casual_mask, + self.bbox_padding_mask, + use_ddp, + ) + pred["bbox"] = out_bbox.permute(0, 2, 1) + loss["bbox"] = criterion_bbox(pred["bbox"], self.bbox_tgt) + + total = 0.0 + for k, v in loss_weights.items(): + total += loss[k] * v + loss["total"] = total + + return loss, pred 
def configure_optimizer_weight_decay(
    model: nn.Module, weight_decay: float
) -> List[Dict]:
    """Split the model parameters into a decayed and a non-decayed group.

    A parameter gets no weight decay when its name ends with ``bias``, when
    it is the ``weight`` of a LayerNorm / BatchNorm2d / Embedding module, or
    when its name is listed by ``model.no_weight_decay()`` (an optional
    hook).  Everything else is decayed with ``weight_decay``.

    Args:
        model: module whose parameters are grouped.
        weight_decay: decay applied to the first returned group.

    Returns:
        Two optimizer param-group dicts: ``[decayed, not decayed]``, each
        with its parameters ordered by name.
    """
    weight_decay_blacklist = (nn.LayerNorm, nn.BatchNorm2d, nn.Embedding)

    # The hook is optional: without it there is simply nothing extra to skip
    # (previously ``skip_list`` stayed unbound and the lookup below crashed).
    skip_list = model.no_weight_decay() if hasattr(model, "no_weight_decay") else ()

    no_decay = set()
    for mn, m in model.named_modules():
        for pn, _ in m.named_parameters():
            fpn = f"{mn}.{pn}" if mn else pn  # full param name
            if pn.endswith("bias"):
                no_decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, weight_decay_blacklist):
                no_decay.add(fpn)
            elif pn in skip_list:
                no_decay.add(fpn)

    param_dict = dict(model.named_parameters())
    # ``model.named_parameters()`` de-duplicates shared parameters, so names
    # collected through sub-modules may be aliases it does not list.
    no_decay = {name for name in no_decay if name in param_dict}
    decay = param_dict.keys() - no_decay

    optim_groups = [
        {
            "params": [param_dict[pn] for pn in sorted(decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [param_dict[pn] for pn in sorted(no_decay)],
            "weight_decay": 0.0,
        },
    ]

    return optim_groups
def compute_coco_map(file, device=0):
    """Compute and print COCO-style mAP for predicted vs. ground-truth boxes.

    Args:
        file: mapping whose values are dicts with a ``"pred"`` and a
            ``"gt"`` list of boxes.  Every box is given label 0 and every
            prediction a fixed score of 0.999.
            NOTE(review): boxes are presumably [xmin, ymin, xmax, ymax], the
            metric's default format — confirm against the producer.
        device: device the tensors are created on.  Defaults to 0 (the first
            GPU) as before; pass ``"cpu"`` to run without a GPU.

    Returns:
        The dict produced by ``MeanAveragePrecision.compute()`` (it is also
        pretty-printed, as before).
    """
    coco_pred = list()
    coco_gt = list()
    for obj in file.values():
        pred_boxes, gt_boxes = obj["pred"], obj["gt"]
        coco_pred.append(
            {
                "boxes": torch.tensor(pred_boxes, device=device),
                "labels": torch.tensor([0] * len(pred_boxes), device=device),
                "scores": torch.tensor([0.999] * len(pred_boxes), device=device),
            }
        )
        coco_gt.append(
            {
                "boxes": torch.tensor(gt_boxes, device=device),
                "labels": torch.tensor([0] * len(gt_boxes), device=device),
            }
        )

    metric = MeanAveragePrecision(
        iou_type="bbox",
        max_detection_thresholds=[1, 10, 1000],
        backend="faster_coco_eval",
    )
    metric.update(coco_pred, coco_gt)
    result = metric.compute()
    pprint(result)
    return result
def subsequent_mask(size: int, pad: int = 0):
    """Return a ``(size, size)`` boolean mask that is True above the diagonal.

    Position ``[i, j]`` is True when ``j > i``.  When ``pad`` is a positive
    number, the first ``pad`` rows are reset to False.
    """
    ones = torch.ones((size, size))
    mask = torch.triu(ones, diagonal=1).to(torch.bool)
    if not pad or pad <= 0:
        return mask
    # clear the leading rows completely
    mask[:pad] = False
    return mask
def random_continuous_sequence(
    seq: List, N: int, length: int = 10
) -> List[Tuple[int, int]]:
    """Randomly sample a continuous sub-sequence from a sequence for N times.

    Args:
        seq: sequence to sample from; only its length is used.
        N: number of sub-sequences to draw.
        length: exclusive upper bound of the sampled sub-sequence length,
            i.e. lengths are drawn from ``1 .. length - 1``.

    Returns:
        ``N`` pairs ``(start, end)`` usable as ``seq[start:end]``; ``end`` is
        clipped to ``len(seq)``.  An empty ``seq`` yields an empty list.

    Raises:
        ValueError: from ``random.randrange`` when ``N > 0`` and
            ``length < 2`` (no valid sub-sequence length exists).
    """
    total = len(seq)
    if total == 0:
        # nothing to sample from; randrange(0) would raise
        return []

    # draw all start indices first, then all lengths, so seeded runs keep
    # producing the same spans as before
    start_idx = [random.randrange(total) for _ in range(N)]
    subseq_len = [random.randrange(1, length) for _ in range(N)]

    return [
        (start, min(start + span, total))
        for start, span in zip(start_idx, subseq_len)
    ]
def build_table_from_html_and_cell(
    structure: List[str], content: List[str] = None
) -> List[str]:
    """Build table from html and cell token list.

    Walks ``structure`` and, for every cell placeholder tag, substitutes the
    ``[]`` marker with the next entry of ``content``.  All other tags are
    copied unchanged.  Placeholder tags that appear after ``content`` has
    run out are dropped.

    Args:
        structure: structure tokens of the table; must not be None.
        content: cell contents in reading order.  ``None`` fills every
            placeholder with the string ``"placeholder"``.  The list is not
            modified.

    Returns:
        A new list of tokens with the cell contents filled in.
    """
    assert structure is not None
    html_code = list()

    # deal with empty table
    if content is None:
        content = ["placeholder"] * len(structure)

    # NOTE(review): these placeholder literals look like they lost their
    # surrounding markup — confirm against the structure vocabulary.
    cell_tags = ("[]", ">[]")

    # Consume the cells through an iterator: the caller's list stays intact
    # and each cell is fetched in O(1) instead of ``pop(0)``.
    cells = iter(content)
    exhausted = object()

    for tag in structure:
        if tag in cell_tags:
            cell = next(cells, exhausted)
            if cell is exhausted:
                continue
            html_code.append(tag.replace("[]", cell))
        else:
            html_code.append(tag)

    return html_code
def pred_token_within_range(
    pred: Tensor,
    white_list: List[int] = None,
    black_list: List[int] = None,
) -> Tensor:
    """Suppress disallowed token logits by setting them to ``-inf`` in place.

    Args:
        pred: logits whose last dimension indexes the vocabulary.  The tensor
            is modified in place and also returned.
        white_list: token ids that stay allowed; every other id is masked.
        black_list: token ids to mask.  At most one of the two lists may be
            given.

    Returns:
        ``pred`` itself.  When neither list is given (or ``black_list`` is
        empty) the logits are returned untouched.
    """
    assert white_list is None or black_list is None
    if white_list is not None:
        allowed = set(white_list)
        black_list = [i for i in range(pred.shape[-1]) if i not in allowed]

    # Indexing with ``None`` would insert a new axis and overwrite *every*
    # logit, so bail out explicitly when there is nothing to mask.
    if not black_list:
        return pred

    pred[..., black_list] = -float("inf")

    return pred
+ logits = model.module.generator(logits)[:, -1, :] + else: + logits = model.decode( + memory, context, tgt_mask=causal_mask, tgt_padding_mask=None + ) + logits = model.generator(logits)[:, -1, :] + + logits = pred_token_within_range( + logits.detach(), + white_list=valid_token_whitelist if valid_token_whitelist else None, + black_list=valid_token_blacklist if valid_token_blacklist else None, + ) + + if sampling == "greedy": + next_probs, next_tokens = greedy_sampling(logits) + else: + raise NotImplementedError + + context = torch.cat([context, next_tokens], dim=1) + + return context + + +def combine_filename_pred_gt( + filename: List[str], pred_id: Tensor, gt_id: Tensor, vocab: tk.Tokenizer, type: str +) -> dict: + out = dict() + + assert len(filename) == len(pred_id) + + pred_id = pred_id.detach().cpu().numpy() + gt_id = gt_id.detach().cpu().numpy() + + pred_token = vocab.decode_batch(pred_id, skip_special_tokens=False) + gt_token = vocab.decode_batch(gt_id, skip_special_tokens=False) + + for idx, name in enumerate(filename): + if type == "html": + pred_token_list = html_str_to_token_list(pred_token[idx]) + gt_token_list = html_str_to_token_list(gt_token[idx]) + elif type == "cell": + pred_token_list = cell_str_to_token_list(pred_token[idx]) + gt_token_list = cell_str_to_token_list(gt_token[idx]) + elif type == "bbox": + pred_token_list = bbox_str_to_token_list(pred_token[idx]) + gt_token_list = bbox_str_to_token_list(gt_token[idx]) + else: + raise ValueError( + f"The supported tasks are html, cell and bbox, while {type} is provided." 
def combine_all_json(file_dir: str) -> dict:
    """Merge every JSON result file found in ``file_dir`` into one dict.

    ``final.json`` (the merged output written next to the inputs) and
    anything that is not a regular file are skipped.  Files are merged in
    sorted name order, so when two files share a key the value from the
    later name wins.

    Args:
        file_dir: directory holding the per-device JSON files.

    Returns:
        The union of all loaded top-level objects.
    """
    total_result = dict()
    # os.listdir order is arbitrary; sort for a reproducible merge
    for file in sorted(os.listdir(file_dir)):
        if file == "final.json":
            continue
        path = os.path.join(file_dir, file)
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            total_result.update(json.load(f))

    print(f"Combined to a json with {len(total_result)} entries.")

    return total_result
+ result = combine_all_json(result_html_file) + assert len(result) == len(result_cell) + else: + # assert html and cell json files have the same length + raise NotImplementedError + + out = dict() + + if type == "bbox": + out = result + else: + for filename, obj in result.items(): + if type == "html": + pred_html = "".join(obj["pred"]) + gt_html = "".join(obj["gt"]) + + out[filename] = dict( + pred=html_table_template(pred_html), gt=html_table_template(gt_html) + ) + elif type == "html+cell": + pred_html_cell = build_table_from_html_and_cell( + obj["pred"], result_cell[filename]["pred"] + ) + gt_html_cell = build_table_from_html_and_cell( + obj["gt"], result_cell[filename]["gt"] + ) + out[filename] = dict( + pred=html_table_template(pred_html_cell), + gt=html_table_template(gt_html_cell), + ) + else: + raise NotImplementedError + + # write to file + with open(os.path.join(file_dir, f"final.json"), "w", encoding="utf-8") as f: + json.dump(out, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="postprecess") + + parser.add_argument( + "-f", "--file", help="path to all json files from difference devices" + ) + parser.add_argument("-t", "--type", help="html, html+cell") + args = parser.parse_args() + + json_to_final(args.file, args.type) diff --git a/unitable/src/utils/mask_generator.py b/unitable/src/utils/mask_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..91c819b11fc370e648fcd67b95e7f8c8c5b36061 --- /dev/null +++ b/unitable/src/utils/mask_generator.py @@ -0,0 +1,96 @@ +import random +import math +from typing import Any +import numpy as np + +""" +Code adapted from beit mask generator: https://github.com/microsoft/unilm/blob/ecff36188001e9b12a90b01bbbaf9058d2b8bda6/beit/masking_generator.py . 
class MaskGenerator:
    """Generate random block-wise patch masks on a ``height x width`` grid.

    Calling the instance returns an int32 array in which 1 marks a masked
    patch.  Rectangles with a random area and aspect ratio are added until
    ``num_mask_patches`` patches are masked or no further rectangle can be
    placed.
    """

    def __init__(
        self,
        input_size: int,
        num_mask_patches: int,
        min_num_patches: int = 4,
        max_num_patches: int = None,
        min_aspect: float = 0.3,
        max_aspect: float = None,
    ) -> None:
        """
        Args:
            input_size: grid size; a single int gives a square grid, a tuple
                is read as ``(height, width)``.
            num_mask_patches: upper bound on the number of masked patches.
            min_num_patches: smallest target area of one rectangle.
            max_num_patches: largest target area of one rectangle; defaults
                to ``num_mask_patches``.
            min_aspect: smallest rectangle aspect ratio.
            max_aspect: largest aspect ratio; defaults to ``1 / min_aspect``.
        """
        if not isinstance(input_size, tuple):
            input_size = (input_size,) * 2
        self.height, self.width = input_size

        self.num_patches = self.height * self.width

        self.num_mask_patches = num_mask_patches

        self.min_num_patches = min_num_patches
        self.max_num_patches = (
            num_mask_patches if max_num_patches is None else max_num_patches
        )

        max_aspect = max_aspect or 1 / min_aspect
        # aspect ratios are sampled uniformly in log space
        self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))

    def __repr__(self):
        repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
            self.height,
            self.width,
            self.min_num_patches,
            self.max_num_patches,
            self.num_mask_patches,
            self.log_aspect_ratio[0],
            self.log_aspect_ratio[1],
        )
        return repr_str

    def get_shape(self):
        """Return the grid size as ``(height, width)``."""
        return self.height, self.width

    def _mask(self, mask: np.ndarray, max_mask_patches: int) -> int:
        """Try up to 10 times to add one rectangle to ``mask`` in place.

        A rectangle is accepted when it masks at least one new patch and at
        most ``max_mask_patches`` new patches.

        Returns:
            The number of newly masked patches (0 if every attempt failed).
        """
        delta = 0
        for _ in range(10):
            target_area = random.uniform(self.min_num_patches, max_mask_patches)
            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
            h = int(round(math.sqrt(target_area * aspect_ratio)))
            w = int(round(math.sqrt(target_area / aspect_ratio)))
            if w < self.width and h < self.height:
                top = random.randint(0, self.height - h)
                left = random.randint(0, self.width - w)

                region = mask[top : top + h, left : left + w]
                num_masked = region.sum()
                if 0 < h * w - num_masked <= max_mask_patches:
                    # fill the still-unmasked cells in one vectorised step
                    # instead of visiting every cell in Python
                    unmasked = region == 0
                    delta += int(unmasked.sum())
                    region[unmasked] = 1

                if delta > 0:
                    break
        return delta

    def __call__(self) -> np.ndarray:
        """Return a fresh ``(height, width)`` int32 mask (1 = masked)."""
        mask = np.zeros((self.height, self.width), dtype=np.int32)
        mask_count = 0
        while mask_count < self.num_mask_patches:
            # never exceed the remaining budget nor the per-rectangle cap
            remaining = self.num_mask_patches - mask_count
            max_mask_patches = min(remaining, self.max_num_patches)

            delta = self._mask(mask, max_mask_patches)
            if delta == 0:
                break
            mask_count += delta

        return mask
def cosine_schedule_with_warmup(
    step: int,
    *,
    warmup: int,
    min_ratio: float,
    total_step: int,
    cycle: float = 0.5,
):
    """Return the learning-rate multiplier for ``step``.

    During the first ``warmup`` steps the multiplier grows linearly as
    ``step / warmup`` (step 0 is treated as step 1 so it is never zero).
    Afterwards it follows a cosine curve over the remaining steps up to
    ``total_step`` and is never allowed to fall below ``min_ratio``.
    """
    in_warmup = step < warmup
    if in_warmup:
        effective = step if step != 0 else 1
        return float(effective) / float(max(1, warmup))

    # steps past the end of the schedule behave like the last step
    clamped = min(step, total_step)
    span = float(max(1, total_step - warmup))
    progress = float(clamped - warmup) / span
    cosine = 0.5 * (1.0 + math.cos(math.pi * float(cycle) * 2.0 * progress))
    return max(min_ratio, cosine)
# code adapted from https://github.com/ibm-aur-nlp/PubTabNet/blob/master/src/metric.py
# tree edit distance video explanation: https://www.youtube.com/watch?v=6Ur8B35xCj8
import apted
import distance
from collections import deque
from lxml import etree, html
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Tuple


class TableTree(apted.helpers.Tree):
    """One HTML element of a table, stored in the tree form APTED consumes.

    ``colspan``, ``rowspan`` and ``content`` are only filled in for ``td``
    nodes; every other tag carries ``None`` for them.
    """

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        self.children = list(children)

    def bracket(self):
        """Show tree using brackets notation."""
        if self.tag == "td":
            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
                self.tag,
                self.colspan,
                self.rowspan,
                self.content,
            )
        else:
            result = '"tag": %s' % self.tag
        for child in self.children:
            result += child.bracket()
        return "{{{}}}".format(result)


class CustomConfig(apted.Config):
    """APTED cost model: structural mismatch costs 1, cell text costs its
    normalized Levenshtein distance."""

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value."""
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1."""
        longest = self.maximum(*sequences)
        if longest == 0:
            # All sequences are empty, hence identical.
            return 0.0
        return float(distance.levenshtein(*sequences)) / longest

    def rename(self, node1, node2):
        """Compares attributes of trees"""
        if (
            (node1.tag != node2.tag)
            or (node1.colspan != node2.colspan)
            or (node1.rowspan != node2.rowspan)
        ):
            return 1.0
        if node1.tag == "td":
            if node1.content or node2.content:
                return self.normalized_distance(node1.content, node2.content)
        return 0.0


class TEDS(object):
    """Tree Edit Distance based Similarity"""

    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
        """
        Args:
            structure_only: When true, cell text is ignored and only the
                tag/colspan/rowspan structure is compared.
            n_jobs: Number of worker processes used by ``batch_evaluate``.
            ignore_nodes: Optional tag names stripped from both tables
                before comparison (their content is kept).
        """
        assert isinstance(n_jobs, int) and (
            n_jobs >= 1
        ), "n_jobs must be an integer greater than or equal to 1"
        self.structure_only = structure_only
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes
        self.__tokens__ = []

    def tokenize(self, node):
        """Tokenizes table cells"""
        self.__tokens__.append("<%s>" % node.tag)
        if node.text is not None:
            self.__tokens__ += list(node.text)
        # Iterating an lxml element yields its children; ``getchildren()``
        # is deprecated.
        for child in node:
            self.tokenize(child)
        if node.tag != "unk":
            # Closing-tag token. ``load_html_tree`` relies on it: the
            # ``[1:-1]`` slice there drops the surrounding <td> ... </td>.
            self.__tokens__.append("</%s>" % node.tag)
        if node.tag != "td" and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def load_html_tree(self, node, parent=None):
        """Converts HTML tree to the format required by apted

        Returns the converted root when called without ``parent``; recursive
        calls attach their node to ``parent`` and return ``None``.
        """
        if node.tag == "td":
            if self.structure_only:
                cell = []
            else:
                self.__tokens__ = []
                self.tokenize(node)
                # Drop the opening and closing td tokens, keep the content.
                cell = self.__tokens__[1:-1].copy()
            new_node = TableTree(
                node.tag,
                int(node.attrib.get("colspan", "1")),
                int(node.attrib.get("rowspan", "1")),
                cell,
            )
        else:
            new_node = TableTree(node.tag, None, None, None)
        if parent is not None:
            parent.children.append(new_node)
        if node.tag != "td":
            # td content is already captured as tokens, so only descend
            # into structural nodes.
            for child in node:
                self.load_html_tree(child, new_node)
        if parent is None:
            return new_node

    def evaluate(self, pred, true):
        """Computes TEDS score between the prediction and the ground truth of a
        given sample

        Returns 0.0 when either input is empty or does not contain a
        ``body/table`` element, otherwise ``1 - edit_distance / n_nodes``.
        """
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
        pred_tables = html.fromstring(pred, parser=parser).xpath("body/table")
        true_tables = html.fromstring(true, parser=parser).xpath("body/table")
        if not (pred_tables and true_tables):
            return 0.0
        pred_table = pred_tables[0]
        true_table = true_tables[0]
        if self.ignore_nodes:
            etree.strip_tags(pred_table, *self.ignore_nodes)
            etree.strip_tags(true_table, *self.ignore_nodes)
        n_nodes = max(len(pred_table.xpath(".//*")), len(true_table.xpath(".//*")))
        if n_nodes == 0:
            # Both tables are bare <table> elements: identical trees, and
            # the normalisation below would divide by zero.
            return 1.0
        tree_pred = self.load_html_tree(pred_table)
        tree_true = self.load_html_tree(true_table)
        # Named ``edit_distance`` so the imported ``distance`` module is
        # not shadowed.
        edit_distance = apted.APTED(
            tree_pred, tree_true, CustomConfig()
        ).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)

    def batch_evaluate(self, results_json):
        """Computes TEDS score between the prediction and the ground truth of
        a batch of samples
        @params results_json: {'FILENAME': {'pred': 'HTML CODE', 'gt': 'HTML CODE'}, ...}
        @output: {'FILENAME': {'scores': TEDS SCORE, 'type': 'simple' | 'complex'}, ...}

        A sample is labelled "complex" when its ground-truth HTML contains
        the substring "span" (rowspan / colspan), otherwise "simple".
        """
        samples = results_json.keys()
        print(f"Total samples: {len(samples)}")
        if self.n_jobs == 1:
            scores = [
                self.evaluate(
                    results_json[filename]["pred"],
                    results_json[filename]["gt"],
                )
                for filename in tqdm(samples)
            ]
        else:
            inputs = [
                {
                    "pred": results_json[filename]["pred"],
                    "true": results_json[filename]["gt"],
                }
                for filename in samples
            ]
            scores = parallel_process(
                inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1
            )
        output = dict()
        for filename, score in zip(samples, scores):
            table_type = "complex" if "span" in results_json[filename]["gt"] else "simple"
            output[filename] = dict(scores=score, type=table_type)
        return output


def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0):
    """
    A parallel version of the map function with a progress bar.

    Args:
        array (array-like): An array to iterate over.
        function (function): A python function to apply to the elements of array
        n_jobs (int, default=16): The number of cores to use
        use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of
            keyword arguments to function
        front_num (int, default=0): The number of iterations to run serially before kicking off the parallel job.
            Useful for catching bugs
    Returns:
        [function(array[0]), function(array[1]), ...]
        NOTE: when a worker raises, the exception object is stored in the
        result list in place of the value (best effort, nothing is re-raised).
    """
    # We run the first few iterations serially to catch bugs
    if front_num > 0:
        front = [
            function(**a) if use_kwargs else function(a) for a in array[:front_num]
        ]
    else:
        front = []
    # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging.
    if n_jobs == 1:
        return front + [
            function(**a) if use_kwargs else function(a)
            for a in tqdm(array[front_num:])
        ]
    # Assemble the workers
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        # Pass the elements of array into function
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in array[front_num:]]
        else:
            futures = [pool.submit(function, a) for a in array[front_num:]]
        kwargs = {
            "total": len(futures),
            "unit": "it",
            "unit_scale": True,
            "leave": True,
        }
        # Print out the progress as tasks complete
        for _ in tqdm(as_completed(futures), **kwargs):
            pass
    out = []
    # Get the results from the futures, in submission order.
    for future in tqdm(futures):
        try:
            out.append(future.result())
        except Exception as e:
            out.append(e)
    return front + out


if __name__ == "__main__":
    import json
    import pprint
    import numpy as np
    import argparse

    parser = argparse.ArgumentParser(description="TEDS Computation")

    parser.add_argument(
        "-f",
        "--file",
        required=True,
        help="path to html table results in json file",
    )
    parser.add_argument("-t", "--type", help="html, html+cell")
    # type=int: TEDS asserts isinstance(n_jobs, int), so a value passed on the
    # command line must not stay a string.
    parser.add_argument(
        "-n", "--njob", type=int, default=200, help="number of jobs in parallel"
    )
    args = parser.parse_args()

    with open(args.file, "r", encoding="utf-8") as f:
        results_json = json.load(f)

    # "html" scores structure only; anything else also compares cell text.
    s_only = args.type == "html"
    teds = TEDS(structure_only=s_only, n_jobs=args.njob)
    scores = teds.batch_evaluate(results_json)
    pprint.pprint(scores)

    # compute teds for simple and complex tables
    total, simple, complex_scores = [], [], []
    for obj in scores.values():
        if obj["type"] == "simple":
            simple.append(obj["scores"])
        elif obj["type"] == "complex":
            complex_scores.append(obj["scores"])
        total.append(obj["scores"])

    def _mean(values):
        # An empty bucket reports nan without numpy's empty-slice warning.
        return np.mean(np.array(values)) if values else float("nan")

    print(
        f"Simple: {_mean(simple)} \nComplex: {_mean(complex_scores)} \nTotal: {_mean(total)}"
    )
{np.mean(total)}" + ) diff --git a/unitable/src/utils/visualization.py b/unitable/src/utils/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..89f4d8caaa25502edff00e7882da9efd808df9fe --- /dev/null +++ b/unitable/src/utils/visualization.py @@ -0,0 +1,13 @@ +from torchvision import transforms +import numpy as np + + +def normalize_image_for_visualization(mean: float, std: float): + invNormalization = transforms.Compose( + [ + transforms.Normalize(mean=[0.0] * 3, std=1.0 / np.array(std)), + transforms.Normalize(mean=-1.0 * np.array(mean), std=[1.0] * 3), + ] + ) + + return invNormalization diff --git a/unitable/src/vocab/__init__.py b/unitable/src/vocab/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0001f87373ec871dd042c2fce06f8e55706cfd77 --- /dev/null +++ b/unitable/src/vocab/__init__.py @@ -0,0 +1 @@ +from .constant import * diff --git a/unitable/src/vocab/constant.py b/unitable/src/vocab/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..25f82fb183adbd299c8b14b9bffc8a4f13014c54 --- /dev/null +++ b/unitable/src/vocab/constant.py @@ -0,0 +1,60 @@ +SPECIAL_TOKENS = ["", "", "", "", "", ""] +TASK_TOKENS = ["[table]", "[html]", "[cell]", "[bbox]", "[cell+bbox]"] +RESERVED_TOKENS = [ + f"reserved {i+1}" for i in range(20 - len(SPECIAL_TOKENS) - len(TASK_TOKENS)) +] +CELL_NUM_TOKENS = [f"{i+1}-cell(s)" for i in range(100)] +BBOX_TOKENS = [f"bbox-{i}" for i in range(880)] + +HTML_TOKENS = [ + "", + "[]", + "", + ">[]", + "", + "", + "", + "", + "", + "", + ' rowspan="2"', + ' rowspan="3"', + ' rowspan="4"', + ' rowspan="5"', + ' rowspan="6"', + ' rowspan="7"', + ' rowspan="8"', + ' rowspan="9"', + ' rowspan="10"', + ' rowspan="11"', + ' rowspan="12"', + ' rowspan="13"', + ' rowspan="14"', + ' rowspan="15"', + ' rowspan="16"', + ' rowspan="17"', + ' rowspan="18"', + ' rowspan="19"', + ' colspan="2"', + ' colspan="3"', + ' colspan="4"', + ' colspan="5"', + ' colspan="6"', + ' 
colspan="7"', + ' colspan="8"', + ' colspan="9"', + ' colspan="10"', + ' colspan="11"', + ' colspan="12"', + ' colspan="13"', + ' colspan="14"', + ' colspan="15"', + ' colspan="16"', + ' colspan="17"', + ' colspan="18"', + ' colspan="19"', + ' colspan="25"', +] + +CELL_SPECIAL = ["", "", "", "", "", "", "", ""] diff --git a/unitable/unitable_full.py b/unitable/unitable_full.py new file mode 100644 index 0000000000000000000000000000000000000000..69912dc145b372e759c50fdcaae40ecec56e3172 --- /dev/null +++ b/unitable/unitable_full.py @@ -0,0 +1,425 @@ +from typing import Tuple, List, Sequence, Optional, Union +from pathlib import Path +import re +import torch +import tokenizers as tk +from PIL import Image +from matplotlib import pyplot as plt +from matplotlib import patches +from torchvision import transforms +from torch import nn, Tensor +from functools import partial +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +import warnings +import time +import argparse +from bs4 import BeautifulSoup as bs + +from .src.model import EncoderDecoder, ImgLinearBackbone, Encoder, Decoder + +from .src.utils import subsequent_mask, pred_token_within_range, greedy_sampling, bbox_str_to_token_list, html_str_to_token_list,cell_str_to_token_list, build_table_from_html_and_cell, html_table_template +from .src.trainer.utils import VALID_HTML_TOKEN, VALID_BBOX_TOKEN, INVALID_CELL_TOKEN + +""" + ImgLinearBackbone, Encoder, Decoder are in components.py + EncoderDecoder is in encoderdecoder.py + +""" + +warnings.filterwarnings('ignore') + + +class UnitableFullPredictor(): + def __init__(self): + pass + + def load_vocab_and_model( + self, + backbone, + encoder, + decoder, + vocab_path: Union[str, Path], + max_seq_len: int, + model_weights: Union[str, Path], + ) -> Tuple[tk.Tokenizer, EncoderDecoder]: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + vocab = tk.Tokenizer.from_file(vocab_path) + d_model = 768 + dropout = 0.2 + model 
= EncoderDecoder( + backbone= backbone, + encoder= encoder, + decoder= decoder, + vocab_size= vocab.get_vocab_size(), + d_model= d_model, + padding_idx= vocab.token_to_id(""), + max_seq_len=max_seq_len, + dropout=dropout, + norm_layer=partial(nn.LayerNorm, eps=1e-6) + ) + # it loads weights onto the CPU first and then moves the model to the desired device + model.load_state_dict(torch.load(model_weights, map_location="cpu")) + model = model.to(device) + + return vocab, model + + + def autoregressive_decode( + self, + model: EncoderDecoder, + image: Tensor, + prefix: Sequence[int], + max_decode_len: int, + eos_id: int, + token_whitelist: Optional[Sequence[int]] = None, + token_blacklist: Optional[Sequence[int]] = None, + ) -> Tensor: + model.eval() + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + with torch.no_grad(): + """ + The encoder takes the input data (in this case, an image) and transforms it into a high-dimensional feature representation. + This feature representation, or memory tensor, captures the essential information from the input data needed to generate the output sequence. + """ + memory = model.encode(image) + """ + Creates a context tensor from the prefix and repeats it to match the batch size of the image, moving it to the appropriate device. 
+ """ + context = torch.tensor(prefix, dtype=torch.int32).repeat(image.shape[0], 1).to(device) + + for _ in range(max_decode_len): + eos_flag = [eos_id in k for k in context] + if all(eos_flag): + break + + with torch.no_grad(): + causal_mask = subsequent_mask(context.shape[1]).to(device) + logits = model.decode( + memory, context, tgt_mask=causal_mask, tgt_padding_mask=None + ) + logits = model.generator(logits)[:, -1, :] + + logits = pred_token_within_range( + logits.detach(), + white_list=token_whitelist, + black_list=token_blacklist, + ) + + next_probs, next_tokens = greedy_sampling(logits) + context = torch.cat([context, next_tokens], dim=1) + return context + + + @staticmethod + def image_to_tensor(image: Image, size: Tuple[int, int]) -> Tensor: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + T = transforms.Compose([ + transforms.Resize(size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.86597056,0.88463002,0.87491087], std = [0.20686628,0.18201602,0.18485524]) + ]) + image_tensor = T(image) + image_tensor = image_tensor.to(device).unsqueeze(0) + + return image_tensor + + def rescale_bbox( + self, + bbox: Sequence[Sequence[float]], + src: Tuple[int, int], + tgt: Tuple[int, int] + ) -> Sequence[Sequence[float]]: + assert len(src) == len(tgt) == 2 + ratio = [tgt[0] / src[0], tgt[1] / src[1]] * 2 + print(ratio) + bbox = [[int(round(i * j)) for i, j in zip(entry, ratio)] for entry in bbox] + return bbox + + + def predict(self, images:List[Image.Image],debugfolder_filename_page_name:str): + MODEL_FILE_NAME = ["unitable_large_structure.pt", "unitable_large_bbox.pt", "unitable_large_content.pt"] + MODEL_DIR = Path("./unitable/experiments/unitable_weights") + # UniTable large model + d_model = 768 + patch_size = 16 + nhead = 12 + dropout = 0.2 + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + backbone= ImgLinearBackbone(d_model=d_model, patch_size=patch_size) + encoder= Encoder( + d_model=d_model, + 
nhead=nhead, + dropout=dropout, + activation="gelu", + norm_first=True, + nlayer=12, + ff_ratio=4, + ) + decoder= Decoder( + d_model=d_model, + nhead=nhead, + dropout=dropout, + activation="gelu", + norm_first=True, + nlayer=4, + ff_ratio=4, + ) + + print("Running table transformer + Unitable Full Model") + """ + Step 1 Load Table Structure Model + """ + + start1 = time.time() + # Table structure extraction + vocabS, modelS = self.load_vocab_and_model( + backbone=backbone, + encoder=encoder, + decoder=decoder, + vocab_path="./unitable/vocab/vocab_html.json", + max_seq_len=784, + model_weights=MODEL_DIR / MODEL_FILE_NAME[0] + ) + + end1 = time.time() + print("time to load table structure model ",end1-start1,"seconds") + + """ + Step 2 prepare images to tensor + """ + image_tensors = [] + for i in range(len(images)): + image_size = images[i].size + # Image transformation + image_tensor = self.image_to_tensor(images[i], (448, 448)) + image_tensors.append(image_tensor) + print("Check if image_tensors is what i want it to be ") + print(type(image_tensors)) + + + # This will be list of arrays(pred_html), which is again list of array + pred_htmls = [] + for i in range(len(image_tensors)): + #print(image_tensor) + print("Processing table "+str(i)) + start2 = time.time() + # Inference + pred_html = self.autoregressive_decode( + model= modelS, + image= image_tensors[i], + prefix=[vocabS.token_to_id("[html]")], + max_decode_len=512, + eos_id=vocabS.token_to_id(""), + token_whitelist=[vocabS.token_to_id(i) for i in VALID_HTML_TOKEN], + token_blacklist = None + ) + + end2 = time.time() + + print("time for inference table structure ",end2-start2,"seconds") + pred_html = pred_html.detach().cpu().numpy()[0] + pred_html = vocabS.decode(pred_html, skip_special_tokens=False) + + pred_html = html_str_to_token_list(pred_html) + pred_htmls.append(pred_html) + print(pred_html) + """ + Step 3 Load Table Cell detection + """ + + + start3 = time.time() + # Table cell bbox detection + 
vocabB, modelB = self.load_vocab_and_model( + backbone=backbone, + encoder=encoder, + decoder=decoder, + vocab_path="./unitable/vocab/vocab_bbox.json", + max_seq_len=1024, + model_weights=MODEL_DIR / MODEL_FILE_NAME[1], + ) + end3 = time.time() + print("time to load cell bbox detection model ",end3-start3,"seconds") + """ + Step 4 do the pred_bboxes detection + """ + + pred_bboxs =[] + for i in range(len(image_tensors)): + start4 = time.time() + # Inference + pred_bbox = self.autoregressive_decode( + model=modelB, + image=image_tensors[i], + prefix=[vocabB.token_to_id("[bbox]")], + max_decode_len=1024, + eos_id=vocabB.token_to_id(""), + token_whitelist=[vocabB.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]], + token_blacklist = None + ) + end4 = time.time() + print("Processing table "+str(i)) + print("time to do inference for table cell bbox detection model ",end4-start4,"seconds") + # Convert token id to token text + pred_bbox = pred_bbox.detach().cpu().numpy()[0] + pred_bbox = vocabB.decode(pred_bbox, skip_special_tokens=False) + pred_bbox = bbox_str_to_token_list(pred_bbox) + pred_bbox = self.rescale_bbox(pred_bbox, src=(448, 448), tgt=images[i].size) + print(pred_bbox) + + print("Size of the image ") + #(1498, 971) + print(images[i].size) + print("Number of bounding boxes ") + print(len(pred_bbox)) + countcells = 0 + for elem in pred_htmls[i] : + if elem == '[]' or elem == '>[]': + countcells+=1 + + #275 + print("number of countcells") + print(countcells) + if countcells > 256: + #TODO Extra processing for big tables + + #Find the last incomplete row and its ymax coordinate + + # Last bbox's ymax gives us coordinate of where the cutted off row starts + #IMPORTANT : pred_bbox is xmin, ymin, xmax, ymax + cut_off = pred_bbox[-1][1] + + #This will be used to distinguish how many cells are already detected in that row. 
+ + last_cells_redudant = 0 + for cell in reversed(pred_bbox): + if cut_off-5 < cell[1] "), + token_whitelist=[vocabB.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]], + token_blacklist = None + ) + # Convert token id to token text + pred_bbox_extra = pred_bbox_extra.detach().cpu().numpy()[0] + pred_bbox_extra = vocabB.decode(pred_bbox_extra, skip_special_tokens=False) + pred_bbox_extra = bbox_str_to_token_list(pred_bbox_extra) + + pred_bbox_extra = pred_bbox_extra[last_cells_redudant-1:] + + pred_bbox_extra = self.rescale_bbox(pred_bbox_extra, src=(448, 448), tgt=cropped_image.size) + pred_bbox_extra = [[i[0], i[1]+cut_off, i[2], i[3]+cut_off] for i in pred_bbox_extra] + + pred_bbox = pred_bbox + pred_bbox_extra + + print("extra boxes:") + print(pred_bbox_extra) + print(len(pred_bbox_extra)) + + pred_bboxs.append(pred_bbox) + fig, ax = plt.subplots(figsize=(12, 10)) + for j in pred_bbox: + #i is xmin, ymin, xmax, ymax based on the function usage + rect = patches.Rectangle(j[:2], j[2] - j[0], j[3] - j[1], linewidth=1, edgecolor='r', facecolor='none') + ax.add_patch(rect) + ax.set_axis_off() + ax.imshow(images[i]) + fig.savefig(debugfolder_filename_page_name+str(i)+".png", bbox_inches='tight', dpi=300) + + + """ + Step 5 : Load table cell recognition contents + """ + + start4 = time.time() + # Table cell bbox detection + vocabC, modelC = self.load_vocab_and_model( + backbone=backbone, + encoder=encoder, + decoder=decoder, + vocab_path="./unitable/vocab/vocab_cell_6k.json", + max_seq_len=200, + model_weights=MODEL_DIR / MODEL_FILE_NAME[2], + ) + end4 = time.time() + print("time to load cell recognition model ",end4-start4,"seconds") + + pred_cells = [] + """ + Step 6 : Decode for all tables + """ + for i in range(len(images)): + + cell_image_tensors_for_img =[] + for bbox in pred_bboxs[i]: + cropped_img = images[i].crop(bbox) + if cropped_img.size[0] >0: + cell_image_tensors_for_img.append(self.image_to_tensor(cropped_img, size=(112, 448))) + + 
cell_image_tensors_for_img = torch.cat(cell_image_tensors_for_img, dim=0).to(device) + #print("size of tensor") + #print(image_tensor.size()) + + start4 = time.time() + # Inference + pred_cell = self.autoregressive_decode( + model=modelC, + image=cell_image_tensors_for_img, + prefix=[vocabC.token_to_id("[cell]")], + max_decode_len=200, + eos_id=vocabC.token_to_id(""), + token_whitelist=None, + token_blacklist = [vocabC.token_to_id(i) for i in INVALID_CELL_TOKEN] + ) + + # Convert token id to token text + pred_cell = pred_cell.detach().cpu().numpy() + pred_cell = vocabC.decode_batch(pred_cell, skip_special_tokens=False) + + end4 = time.time() + print("Processing table "+str(i)) + print("time to do cell recognition ",end4-start4,"seconds") + + pred_cell = [cell_str_to_token_list(i) for i in pred_cell] + #The code finds instances in each string of pred_cell where there is a digit followed by any character and then whitespace followed by another digit. + #It replaces these instances with the first digit, followed by a period, followed by the second digit, effectively removing the whitespace and any character between the digits and replacing it with a period. 
+ pred_cell = [re.sub(r'(\d).\s+(\d)', r'\1.\2', i) for i in pred_cell] + + print(pred_cell) + pred_cells.append(pred_cell) + print(type(pred_cells)) + + table_codes =[] + for pred_html, pred_cell in zip(pred_htmls, pred_cells): + # Combine the table structure and cell content + pred_code = build_table_from_html_and_cell(pred_html, pred_cell) + pred_code = "".join(pred_code) + pred_code = html_table_template(pred_code) + + # Display the HTML table + soup = bs(pred_code) + table_code = soup.prettify() + print(table_code) + table_codes.append(table_code) + return table_codes + + diff --git a/unitable/unitable_full_singleimage.py b/unitable/unitable_full_singleimage.py new file mode 100644 index 0000000000000000000000000000000000000000..e223f47709efb2a26c762eaec01fe40dae089b64 --- /dev/null +++ b/unitable/unitable_full_singleimage.py @@ -0,0 +1,555 @@ +from typing import Tuple, List, Sequence, Optional, Union +from pathlib import Path +import re +import torch +import tokenizers as tk +from PIL import Image +from matplotlib import pyplot as plt +from matplotlib import patches +from torchvision import transforms +from torch import nn, Tensor +from functools import partial +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +import warnings +import time +import argparse +from bs4 import BeautifulSoup as bs + +from .src.model import EncoderDecoder, ImgLinearBackbone, Encoder, Decoder +from .src.utils import subsequent_mask, pred_token_within_range, greedy_sampling, bbox_str_to_token_list, html_str_to_token_list,cell_str_to_token_list, build_table_from_html_and_cell, html_table_template +from .src.trainer.utils import VALID_HTML_TOKEN, VALID_BBOX_TOKEN, INVALID_CELL_TOKEN + +warnings.filterwarnings('ignore') + + +class UnitableFullSinglePredictor(): + def __init__(self): + MODEL_FILE_NAME = ["unitable_large_structure.pt", "unitable_large_bbox.pt", "unitable_large_content.pt"] + MODEL_DIR = Path("unitable/experiments/unitable_weights") + # 
UniTable large model + self.d_model = 768 + self.patch_size = 16 + self.nhead = 12 + self.dropout = 0.2 + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.backbone= ImgLinearBackbone(d_model=self.d_model, patch_size=self.patch_size) + self.encoder= Encoder( + d_model=self.d_model, + nhead=self.nhead, + dropout=self.dropout, + activation="gelu", + norm_first=True, + nlayer=12, + ff_ratio=4, + ) + self.decoder= Decoder( + d_model=self.d_model, + nhead=self.nhead, + dropout=self.dropout, + activation="gelu", + norm_first=True, + nlayer=4, + ff_ratio=4, + ) + """ + start1 = time.time() + # Table structure extraction + self.vocabS, self.modelS = self.load_vocab_and_model( + backbone= ImgLinearBackbone(d_model=self.d_model, patch_size=self.patch_size), + encoder= Encoder( + d_model=self.d_model, + nhead=self.nhead, + dropout=self.dropout, + activation="gelu", + norm_first=True, + nlayer=12, + ff_ratio=4, + ), + decoder= Decoder( + d_model=self.d_model, + nhead=self.nhead, + dropout=self.dropout, + activation="gelu", + norm_first=True, + nlayer=4, + ff_ratio=4, + ), + d_model= self.d_model, + dropout= self.dropout, + vocab_path="unitable/vocab/vocab_html.json", + max_seq_len=784, + model_weights=MODEL_DIR / MODEL_FILE_NAME[0] + ) + end1 = time.time() + print("time to load table structure model ",end1-start1,"seconds") + + start3 = time.time() + # Table cell bbox detection + self.vocabB, self.modelB = self.load_vocab_and_model( + backbone = ImgLinearBackbone(d_model=self.d_model, patch_size=self.patch_size), + encoder = Encoder( + d_model= self.d_model, + nhead= self.nhead, + dropout = self.dropout, + activation="gelu", + norm_first=True, + nlayer=12, + ff_ratio=4, + ), + decoder = Decoder( + d_model= self.d_model, + nhead= self.nhead, + dropout = self.dropout, + activation="gelu", + norm_first=True, + nlayer=4, + ff_ratio=4, + ), + d_model= self.d_model, + dropout= self.dropout, + vocab_path="unitable/vocab/vocab_bbox.json", + 
max_seq_len=1024, + model_weights=MODEL_DIR / MODEL_FILE_NAME[1], + ) + end3 = time.time() + print("time to load cell bbox detection model ",end3-start3,"seconds") + + start4 = time.time() + # Table cell bbox detection + self.vocabC, self.modelC = self.load_vocab_and_model( + backbone = ImgLinearBackbone(d_model=self.d_model, patch_size=self.patch_size), + encoder = Encoder( + d_model= self.d_model, + nhead= self.nhead, + dropout = self.dropout, + activation="gelu", + norm_first=True, + nlayer=12, + ff_ratio=4, + ), + decoder = Decoder( + d_model= self.d_model, + nhead= self.nhead, + dropout = self.dropout, + activation="gelu", + norm_first=True, + nlayer=4, + ff_ratio=4, + ), + d_model= self.d_model, + dropout= self.dropout, + vocab_path="unitable/vocab/vocab_cell_6k.json", + max_seq_len=200, + #Using the content recognition model i guess + model_weights=MODEL_DIR / MODEL_FILE_NAME[2], + ) + end4 = time.time() + print("time to load cell recognition model ",end4-start4,"seconds") + """ + + + def load_vocab_and_model( + self, + vocab_path: Union[str, Path], + max_seq_len: int, + model_weights: Union[str, Path], + ) -> Tuple[tk.Tokenizer, EncoderDecoder]: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + vocab = tk.Tokenizer.from_file(vocab_path) + + model = EncoderDecoder( + backbone= self.backbone, + encoder= self.encoder, + decoder= self.decoder, + vocab_size= vocab.get_vocab_size(), + d_model= self.d_model, + padding_idx= vocab.token_to_id(""), + max_seq_len=max_seq_len, + dropout=self.dropout, + norm_layer=partial(nn.LayerNorm, eps=1e-6) + ) + # it loads weights onto the CPU first and then moves the model to the desired device + model.load_state_dict(torch.load(model_weights, map_location="cpu")) + model = model.to(device) + + return vocab, model + + + def autoregressive_decode( + self, + model: EncoderDecoder, + image: Tensor, + prefix: Sequence[int], + max_decode_len: int, + eos_id: int, + token_whitelist: Optional[Sequence[int]] = 
None, + token_blacklist: Optional[Sequence[int]] = None, + ) -> Tensor: + model.eval() + with torch.no_grad(): + memory = model.encode(image) + context = torch.tensor(prefix, dtype=torch.int32).repeat(image.shape[0], 1).to(self.device) + + for _ in range(max_decode_len): + eos_flag = [eos_id in k for k in context] + if all(eos_flag): + break + + with torch.no_grad(): + causal_mask = subsequent_mask(context.shape[1]).to(self.device) + logits = model.decode( + memory, context, tgt_mask=causal_mask, tgt_padding_mask=None + ) + logits = model.generator(logits)[:, -1, :] + + logits = pred_token_within_range( + logits.detach(), + white_list=token_whitelist, + black_list=token_blacklist, + ) + + next_probs, next_tokens = greedy_sampling(logits) + context = torch.cat([context, next_tokens], dim=1) + return context + + + @staticmethod + def image_to_tensor(image: Image, size: Tuple[int, int]) -> Tensor: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # Resize the image with padding + #resized_image = UnitableFullPredictor.resize_with_padding(image, size) + T = transforms.Compose([ + transforms.Resize(size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.86597056,0.88463002,0.87491087], std = [0.20686628,0.18201602,0.18485524]) + ]) + image_tensor = T(image) + image_tensor = image_tensor.to(device).unsqueeze(0) + + return image_tensor + + """ + + @staticmethod + def resize_with_padding(image: Image, target_size: Tuple[int, int]) -> Image: + + #Resize the image to fit within the target size while preserving aspect ratio, + #then add padding to match the target size. 
+ + original_width, original_height = image.size + target_width, target_height = target_size + + # Calculate the new size preserving aspect ratio + aspect_ratio = original_width / original_height + if target_width / target_height > aspect_ratio: + new_height = target_height + new_width = int(new_height * aspect_ratio) + else: + new_width = target_width + new_height = int(new_width / aspect_ratio) + + # Resize the image to the new size + resized_image = image.resize((new_width, new_height),Image.LANCZOS) + + # Create a new image with white background + new_image = Image.new("RGB", (target_width, target_height), (255, 255, 255)) + + # Paste the resized image onto the white background + paste_position = ((target_width - new_width) // 2, (target_height - new_height) // 2) + new_image.paste(resized_image, paste_position) + new_image.save("../res/table_resize_with_padding.png") + + return new_image + """ + + def rescale_bbox( + self, + bbox: Sequence[Sequence[float]], + src: Tuple[int, int], + tgt: Tuple[int, int] + ) -> Sequence[Sequence[float]]: + assert len(src) == len(tgt) == 2 + ratio = [tgt[0] / src[0], tgt[1] / src[1]] * 2 + print(ratio) + bbox = [[int(round(i * j)) for i, j in zip(entry, ratio)] for entry in bbox] + return bbox + """ + @staticmethod + def rescale_bbox( + bbox: Sequence[Sequence[float]], + src: Tuple[int, int], + tgt: Tuple[int, int] + ) -> Sequence[Sequence[float]]: + + #Rescale bounding boxes according to the transformation applied in resize_with_padding. + + src_width, src_height = src + tgt_width, tgt_height = tgt + + # Calculate the new size preserving aspect ratio + aspect_ratio = src_width / src_height + if tgt_width / tgt_height > aspect_ratio: + new_height = tgt_height + new_width = int(new_height * aspect_ratio) + else: + new_width = tgt_width + new_height = int(new_width / aspect_ratio) + + # Calculate the scale factors + #THIS *2 factor was done in their code - why ? 
def predict(self, image:ImageType):
    """Run the three-stage table pipeline on one image and print the HTML.

    Stages, in order: (1) structure decoding into HTML tokens, (2) cell
    bounding-box decoding, with a second pass on a cropped image when the
    structure reports more cells than boxes were found, (3) per-cell content
    decoding. Intermediate results and timings are printed, two debug images
    are written under ``./res``, and the prettified HTML is printed at the
    end. Nothing is returned.

    Args:
        image: Input table image.
            NOTE(review): used like a PIL image (``.size``, ``.crop``) —
            confirm this matches the ``ImageType`` alias.
    """
    MODEL_FILE_NAME = ["unitable_large_structure.pt", "unitable_large_bbox.pt", "unitable_large_content.pt"]
    MODEL_DIR = Path("unitable/experiments/unitable_weights")
    # NOTE(review): image_size is not read again in this method.
    image_size = image.size

    print("RUNING SINGLE IMAGE UNITABLE FOR DEBUGGGING ")
    # Image transformation
    image_tensor = self.image_to_tensor(image, (448, 448))
    #print(image_tensor)

    """
    Step 1 Table Structure recognition
    """


    start1 = time.time()
    # Table structure extraction
    vocabS, modelS = self.load_vocab_and_model(
        vocab_path="unitable/vocab/vocab_html.json",
        max_seq_len=784,
        model_weights=MODEL_DIR / MODEL_FILE_NAME[0]
    )
    end1 = time.time()
    print("time to load table structure model ",end1-start1,"seconds")


    start2 = time.time()
    # Inference
    pred_html = self.autoregressive_decode(
        model= modelS,
        image= image_tensor,
        prefix=[vocabS.token_to_id("[html]")],
        max_decode_len=512,
        eos_id=vocabS.token_to_id(""),
        token_whitelist=[vocabS.token_to_id(i) for i in VALID_HTML_TOKEN],
        token_blacklist = None
    )
    end2 = time.time()
    print("time for inference table structure ",end2-start2,"seconds")

    # Convert token id to token text (first and only batch row)
    pred_html = pred_html.detach().cpu().numpy()[0]
    pred_html = vocabS.decode(pred_html, skip_special_tokens=False)
    #print(pred_html)
    pred_html = html_str_to_token_list(pred_html)

    print(pred_html)


    """
    Step 2 Table Cell detection
    """


    start3 = time.time()
    # Table cell bbox detection
    vocabB, modelB = self.load_vocab_and_model(
        vocab_path="unitable/vocab/vocab_bbox.json",
        max_seq_len=1024,
        model_weights=MODEL_DIR / MODEL_FILE_NAME[1],
    )
    end3 = time.time()
    print("time to load cell bbox detection model ",end3-start3,"seconds")


    start4 = time.time()
    # Inference
    pred_bbox = self.autoregressive_decode(
        model=modelB,
        image=image_tensor,
        prefix=[vocabB.token_to_id("[bbox]")],
        max_decode_len=1024,
        eos_id=vocabB.token_to_id(""),
        token_whitelist=[vocabB.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]],
        token_blacklist = None
    )
    end4 = time.time()
    print("time to do inference for table cell bbox detection model ",end4-start4,"seconds")

    # Convert token id to token text, then map the boxes from the 448x448
    # model input back to the original image size
    pred_bbox = pred_bbox.detach().cpu().numpy()[0]
    pred_bbox = vocabB.decode(pred_bbox, skip_special_tokens=False)
    pred_bbox = bbox_str_to_token_list(pred_bbox)
    pred_bbox = self.rescale_bbox(pred_bbox, src=(448, 448), tgt=image.size)



    print(pred_bbox)

    print("Size of the image ")
    #(1498, 971)
    print(image.size)
    print("Number of bounding boxes ")
    print(len(pred_bbox))


    # Count the structure tokens equal to '[]'; the count is compared with
    # the number of detected boxes below.
    countcells = 0
    #startBody = False
    #startFirstRow = True
    #numElemInRow = 0
    for elem in pred_html :
        #if elem == '':
        #    startBody = True
        #elif startBody ==True and elem == '':
        #    startFirstRow = True
        #elif startFirstRow == True and elem == '[]':
        #    numElemInRow +=1
        #elif startBody ==True and elem == '':
        #    startFirstRow = False
        #    startBody = False
        if elem == '[]':
            countcells+=1



    #275
    print(countcells)
    if countcells > len(pred_bbox):
        # Fewer boxes than cells: run bbox detection a second time on the
        # part of the image starting at the last detected box.
        #TODO Extra processing for big tables

        # Find the last incomplete row and its vertical coordinate.
        # NOTE(review): index 1 of a box is used as the top edge by the
        # crop and by the Rectangle drawing below — the older comments here
        # called it "ymax"; confirm the intended naming.
        cut_off = pred_bbox[-1][1]

        width = image.size[0]
        height = image.size[1]
        #bbox = (0, cut_off, width, height)
        # Crop box passed to image.crop: (0, cut_off, width, height)
        bbox = (0, cut_off, width, height)
        # Crop the image to the specified bounding box
        cropped_image = image.crop(bbox)
        cropped_image.save("./res/cropped_image_for_extra_bbox_det.png")
        image_tensor = self.image_to_tensor(cropped_image, (448, 448))
        pred_bbox_extra = self.autoregressive_decode(
            model=modelB,
            image=image_tensor,
            prefix=[vocabB.token_to_id("[bbox]")],
            max_decode_len=1024,
            eos_id=vocabB.token_to_id(""),
            token_whitelist=[vocabB.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]],
            token_blacklist = None
        )
        # Convert token id to token text
        pred_bbox_extra = pred_bbox_extra.detach().cpu().numpy()[0]
        pred_bbox_extra = vocabB.decode(pred_bbox_extra, skip_special_tokens=False)
        pred_bbox_extra = bbox_str_to_token_list(pred_bbox_extra)
        # Keep only as many trailing boxes as cells are still missing
        numberOrCellsToAdd = countcells-len(pred_bbox)
        pred_bbox_extra = pred_bbox_extra[-numberOrCellsToAdd:]
        pred_bbox_extra = self.rescale_bbox(pred_bbox_extra, src=(448, 448), tgt=cropped_image.size)
        #This resulted in table_bbox_test_extra_3.png
        #pred_bbox_extra = [[i[0], i[1]+cut_off, i[2], i[3]+cut_off] for i in pred_bbox_extra]
        # Shift the vertical coordinates back into the full image's frame
        pred_bbox_extra = [[i[0], i[1]+cut_off, i[2], i[3]+cut_off] for i in pred_bbox_extra]

        pred_bbox = pred_bbox + pred_bbox_extra

        #[[25, 63, 152, 86], [227, 63, 292, 86], [326, 63, 373, 86], [413, 63, 460, 86], [562, 63, 609, 86], [708, 63, 758, 86], [848, 63, 895, 86], [935, 63, 982, 86], [1025, 63, 1075, 86], [1119, 63, 1165, 86], [1280, 63, 1327, 86]]
        print(pred_bbox_extra)
        #11
        print(len(pred_bbox_extra))


    # Draw every box on top of the image and save the figure for inspection
    fig, ax = plt.subplots(figsize=(12, 10))
    for i in pred_bbox:
        #i is xmin, ymin, xmax, ymax based on the function usage
        rect = patches.Rectangle(i[:2], i[2] - i[0], i[3] - i[1], linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
    ax.set_axis_off()
    ax.imshow(image)
    fig.savefig('./res/table_debug3/singleimageres.png', bbox_inches='tight', dpi=300)

    """
    Step 3 : Table cell content recognition
    """

    # start4/end4 are reused from here on for the content-model timings
    start4 = time.time()
    # Table cell content model
    vocabC, modelC = self.load_vocab_and_model(
        vocab_path="unitable/vocab/vocab_cell_6k.json",
        max_seq_len=200,
        model_weights=MODEL_DIR / MODEL_FILE_NAME[2],
    )
    end4 = time.time()
    print("time to load cell recognition model ",end4-start4,"seconds")

    # Cell image cropping and transformation
    """
    images = [image.crop(bbox) for bbox in pred_bbox]
    for idx, img in enumerate(images):
        img.save("res/debug/cell_{}.png".format(idx))
    """
    #Cropping boundaries are fine
    # One tensor per cell crop, concatenated along dim 0 into a single batch
    image_tensor = [self.image_to_tensor(image.crop(bbox), size=(112, 448)) for bbox in pred_bbox]
    image_tensor = torch.cat(image_tensor, dim=0)
    #print("size of tensor")
    #print(image_tensor.size())

    start4 = time.time()
    # Inference
    pred_cell = self.autoregressive_decode(
        model=modelC,
        image=image_tensor,
        prefix=[vocabC.token_to_id("[cell]")],
        max_decode_len=200,
        eos_id=vocabC.token_to_id(""),
        token_whitelist=None,
        token_blacklist = [vocabC.token_to_id(i) for i in INVALID_CELL_TOKEN]
    )

    # Convert token id to token text
    pred_cell = pred_cell.detach().cpu().numpy()
    pred_cell = vocabC.decode_batch(pred_cell, skip_special_tokens=False)

    end4 = time.time()
    print("time to do cell recognition ",end4-start4,"seconds")

    pred_cell = [cell_str_to_token_list(i) for i in pred_cell]
    # Rejoin "digit, any char, whitespace, digit" as "digit.digit"
    pred_cell = [re.sub(r'(\d).\s+(\d)', r'\1.\2', i) for i in pred_cell]

    print(pred_cell)

    # Combine the table structure and cell content
    pred_code = build_table_from_html_and_cell(pred_html, pred_cell)
    pred_code = "".join(pred_code)
    pred_code = html_table_template(pred_code)

    # Display the HTML table
    soup = bs(pred_code)
    table_code = soup.prettify()
    print(table_code)
def load_vocab_and_model(
    self,
    backbone,
    encoder,
    decoder,
    vocab_path: Union[str, Path],
    max_seq_len: int,
    model_weights: Union[str, Path],
) -> Tuple[tk.Tokenizer, EncoderDecoder]:
    """Load a tokenizer and build an ``EncoderDecoder`` with weights from disk.

    Args:
        backbone: Backbone module handed to ``EncoderDecoder`` as-is.
        encoder: Encoder module handed to ``EncoderDecoder`` as-is.
        decoder: Decoder module handed to ``EncoderDecoder`` as-is.
        vocab_path: Path of the tokenizer JSON file (``str`` or ``Path``).
        max_seq_len: Maximum sequence length passed to ``EncoderDecoder``.
        model_weights: Checkpoint file given to ``torch.load``.

    Returns:
        ``(vocab, model)``; the model is on CUDA when available, else CPU.

    NOTE(review): the three modules are passed by reference. If a caller
    reuses the same instances for a second call, loading that checkpoint
    presumably overwrites the weights of the earlier model too — confirm
    whether ``EncoderDecoder`` copies them.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenizer.from_file expects a plain string path, while the signature
    # also allows pathlib.Path, so normalise before the call.
    vocab = tk.Tokenizer.from_file(str(vocab_path))
    d_model = 768
    dropout = 0.2
    model = EncoderDecoder(
        backbone=backbone,
        encoder=encoder,
        decoder=decoder,
        vocab_size=vocab.get_vocab_size(),
        d_model=d_model,
        padding_idx=vocab.token_to_id(""),
        max_seq_len=max_seq_len,
        dropout=dropout,
        norm_layer=partial(nn.LayerNorm, eps=1e-6)
    )
    # Load the weights onto the CPU first, then move the whole model to the
    # target device.
    state_dict = torch.load(model_weights, map_location="cpu")
    model.load_state_dict(state_dict)
    model = model.to(device)

    return vocab, model
def rescale_bbox(
    self,
    bbox: Sequence[Sequence[float]],
    src: Tuple[int, int],
    tgt: Tuple[int, int]
) -> Sequence[Sequence[float]]:
    """Scale bounding boxes from the ``src`` size to the ``tgt`` size.

    Coordinates at positions 0 and 2 of every box are multiplied by
    ``tgt[0] / src[0]`` and coordinates at positions 1 and 3 by
    ``tgt[1] / src[1]``; each result is rounded to the nearest integer.

    Args:
        bbox: Boxes of four coordinates each. Any coordinates beyond the
            fourth are dropped, because only four scale factors exist.
        src: Two-element size the boxes are currently expressed in.
        tgt: Two-element size the boxes should be mapped to.

    Returns:
        A new list of ``[int, int, int, int]`` boxes; ``bbox`` is not mutated.

    Raises:
        AssertionError: If ``src`` or ``tgt`` does not have two elements.
        ZeroDivisionError: If an element of ``src`` is zero.
    """
    assert len(src) == len(tgt) == 2
    x_scale = tgt[0] / src[0]
    y_scale = tgt[1] / src[1]
    # One factor per coordinate position: index 0/2 use x_scale, 1/3 use y_scale.
    ratio = (x_scale, y_scale, x_scale, y_scale)
    # The debug ``print(ratio)`` was removed: predict() calls this once per
    # table (and again for the cropped pass), so it only added stdout noise.
    return [
        [int(round(coord * scale)) for coord, scale in zip(entry, ratio)]
        for entry in bbox
    ]
+ prefix=[vocabS.token_to_id("[html]")], + max_decode_len=512, + eos_id=vocabS.token_to_id(""), + token_whitelist=[vocabS.token_to_id(i) for i in VALID_HTML_TOKEN], + token_blacklist = None + ) + + end2 = time.time() + + print("time for inference table structure ",end2-start2,"seconds") + pred_html = pred_html.detach().cpu().numpy()[0] + pred_html = vocabS.decode(pred_html, skip_special_tokens=False) + + pred_html = html_str_to_token_list(pred_html) + pred_htmls.append(pred_html) + print(pred_html) + """ + Step 3 Load Table Cell detection + """ + + + start3 = time.time() + # Table cell bbox detection + vocabB, modelB = self.load_vocab_and_model( + backbone=backbone, + encoder=encoder, + decoder=decoder, + vocab_path="./unitable/vocab/vocab_bbox.json", + max_seq_len=1024, + model_weights=MODEL_DIR / MODEL_FILE_NAME[1], + ) + end3 = time.time() + print("time to load cell bbox detection model ",end3-start3,"seconds") + """ + Step 4 do the pred_bboxes detection + """ + + pred_bboxs =[] + for i in range(len(image_tensors)): + start4 = time.time() + # Inference + pred_bbox = self.autoregressive_decode( + model=modelB, + image=image_tensors[i], + prefix=[vocabB.token_to_id("[bbox]")], + max_decode_len=1024, + eos_id=vocabB.token_to_id(""), + token_whitelist=[vocabB.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]], + token_blacklist = None + ) + end4 = time.time() + print("Processing table "+str(i)) + print("time to do inference for table cell bbox detection model ",end4-start4,"seconds") + # Convert token id to token text + pred_bbox = pred_bbox.detach().cpu().numpy()[0] + pred_bbox = vocabB.decode(pred_bbox, skip_special_tokens=False) + pred_bbox = bbox_str_to_token_list(pred_bbox) + pred_bbox = self.rescale_bbox(pred_bbox, src=(448, 448), tgt=images[i].size) + print(pred_bbox) + + print("Size of the image ") + #(1498, 971) + print(images[i].size) + print("Number of bounding boxes ") + print(len(pred_bbox)) + countcells = 0 + for elem in pred_htmls[i] : + if elem == '[]' 
or elem == '>[]': + countcells+=1 + + #275 + print("number of countcells") + print(countcells) + if countcells > 256: + #TODO Extra processing for big tables + + #Find the last incomplete row and its ymax coordinate + + # Last bbox's ymax gives us coordinate of where the cutted off row starts + #IMPORTANT : pred_bbox is xmin, ymin, xmax, ymax + cut_off = pred_bbox[-1][1] + + #This will be used to distinguish how many cells are already detected in that row. + + last_cells_redudant = 0 + for cell in reversed(pred_bbox): + if cut_off-5 < cell[1] "), + token_whitelist=[vocabB.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]], + token_blacklist = None + ) + # Convert token id to token text + pred_bbox_extra = pred_bbox_extra.detach().cpu().numpy()[0] + pred_bbox_extra = vocabB.decode(pred_bbox_extra, skip_special_tokens=False) + pred_bbox_extra = bbox_str_to_token_list(pred_bbox_extra) + + pred_bbox_extra = pred_bbox_extra[last_cells_redudant:] + pred_bbox_extra = self.rescale_bbox(pred_bbox_extra, src=(448, 448), tgt=cropped_image.size) + pred_bbox_extra = [[i[0], i[1]+cut_off, i[2], i[3]+cut_off] for i in pred_bbox_extra] + + pred_bbox = pred_bbox + pred_bbox_extra + + print("extra boxes:") + print(pred_bbox_extra) + print("length of extra boxes") + print(len(pred_bbox_extra)) + + pred_bboxs.append(pred_bbox) + fig, ax = plt.subplots(figsize=(12, 10)) + for j in pred_bbox: + #i is xmin, ymin, xmax, ymax based on the function usage + rect = patches.Rectangle(j[:2], j[2] - j[0], j[3] - j[1], linewidth=1, edgecolor='r', facecolor='none') + ax.add_patch(rect) + ax.set_axis_off() + ax.imshow(images[i]) + fig.savefig(debugfolder_filename_page_name+str(i)+".png", bbox_inches='tight', dpi=300) + + + return pred_htmls,pred_bboxs + diff --git a/unitable/unitable_run_double_check.py b/unitable/unitable_run_double_check.py new file mode 100644 index 0000000000000000000000000000000000000000..e63426272161b79920549f57c1b696e134b2133c --- /dev/null +++ 
b/unitable/unitable_run_double_check.py @@ -0,0 +1,287 @@ +# -*- coding: utf-8 -*- +"""Unitable_run_double_check.ipynb + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1oaXgLoIaNY8SJwUQB_vMyiXPNZGKOIpb +""" + + +from typing import Tuple, List, Sequence, Optional, Union +from pathlib import Path +import re +import torch +import tokenizers as tk +from PIL import Image +from matplotlib import pyplot as plt +from matplotlib import patches +from torchvision import transforms +from torch import nn, Tensor +from functools import partial +from bs4 import BeautifulSoup as bs +import warnings +import time +from src.model import EncoderDecoder, ImgLinearBackbone, Encoder, Decoder +from src.utils import subsequent_mask, pred_token_within_range, greedy_sampling, bbox_str_to_token_list, cell_str_to_token_list, html_str_to_token_list, build_table_from_html_and_cell, html_table_template +from src.trainer.utils import VALID_HTML_TOKEN, VALID_BBOX_TOKEN, INVALID_CELL_TOKEN + +warnings.filterwarnings('ignore') +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Check all model weights have been downloaded to experiments/unitable_weights +MODEL_FILE_NAME = ["unitable_large_structure.pt", "unitable_large_bbox.pt", "unitable_large_content.pt"] +MODEL_DIR = Path("./experiments/unitable_weights") + +assert all([(MODEL_DIR / name).is_file() for name in MODEL_FILE_NAME]), f"Please download model weights from HuggingFace: https://huggingface.co/poloclub/UniTable/tree/main" + +# Load tabular image + +image_path = "../TestingFilesImages/table_Test1.png" +image = Image.open(image_path).convert("RGB") +image_size = image.size + +fig, ax = plt.subplots(figsize=(12, 10)) +ax.imshow(image) + +# UniTable large model +d_model = 768 +patch_size = 16 +nhead = 12 +dropout = 0.2 + +start= time.time() +backbone = ImgLinearBackbone(d_model=d_model, patch_size=patch_size) +encoder = Encoder( + d_model=d_model, + 
def autoregressive_decode(
    model: EncoderDecoder,
    image: Tensor,
    prefix: Sequence[int],
    max_decode_len: int,
    eos_id: int,
    token_whitelist: Optional[Sequence[int]] = None,
    token_blacklist: Optional[Sequence[int]] = None,
) -> Tensor:
    """Extend ``prefix`` token by token until every row contains ``eos_id``.

    The image is encoded once. Each step decodes the current context, keeps
    the logits of the last position, filters them through
    ``pred_token_within_range`` and appends the tokens chosen by
    ``greedy_sampling``.

    Args:
        model: Model providing ``encode``, ``decode`` and ``generator``.
        image: Batched input; ``image.shape[0]`` rows of context are created.
        prefix: Token ids every row starts with.
        max_decode_len: Upper bound on the number of decoding steps.
        eos_id: Token id whose presence marks a row as finished.
        token_whitelist: Forwarded as ``white_list`` to the logit filter.
        token_blacklist: Forwarded as ``black_list`` to the logit filter.

    Returns:
        The context tensor: the prefix followed by the generated tokens.
    """
    model.eval()
    with torch.no_grad():
        memory = model.encode(image)
        seed = torch.tensor(prefix, dtype=torch.int32)
        context = seed.repeat(image.shape[0], 1).to(device)

    for _step in range(max_decode_len):
        # Stop as soon as every row already holds the end-of-sequence id.
        if all(eos_id in row for row in context):
            break

        with torch.no_grad():
            mask = subsequent_mask(context.shape[1]).to(device)
            decoded = model.decode(
                memory, context, tgt_mask=mask, tgt_padding_mask=None
            )
            # Only the last position's logits decide the next token.
            last_logits = model.generator(decoded)[:, -1, :]

        filtered = pred_token_within_range(
            last_logits.detach(),
            white_list=token_whitelist,
            black_list=token_blacklist,
        )

        _, next_tokens = greedy_sampling(filtered)
        context = torch.cat([context, next_tokens], dim=1)
    return context
def rescale_bbox(
    bbox: Sequence[Sequence[float]],
    src: Tuple[int, int],
    tgt: Tuple[int, int]
) -> Sequence[Sequence[float]]:
    """Map boxes expressed in the ``src`` size onto the ``tgt`` size.

    Positions 0 and 2 of each box are scaled by ``tgt[0] / src[0]``,
    positions 1 and 3 by ``tgt[1] / src[1]``, and every value is rounded to
    an ``int``. Coordinates past the fourth are dropped.
    """
    assert len(src) == len(tgt) == 2
    x_scale = tgt[0] / src[0]
    y_scale = tgt[1] / src[1]
    factors = (x_scale, y_scale, x_scale, y_scale)

    scaled = []
    for box in bbox:
        scaled.append(
            [int(round(value * factor)) for value, factor in zip(box, factors)]
        )
    return scaled
image_to_tensor(image, size=(448, 448)) + +# Inference +start= time.time() +pred_bbox = autoregressive_decode( + model=model, + image=image_tensor, + prefix=[vocab.token_to_id("[bbox]")], + max_decode_len=1024, + eos_id=vocab.token_to_id(""), + token_whitelist=[vocab.token_to_id(i) for i in VALID_BBOX_TOKEN[: 449]], + token_blacklist = None +) +end= time.time() +time1 = end-start +print("time to do cell bbox detection " + str(time1)) + +# Convert token id to token text +pred_bbox = pred_bbox.detach().cpu().numpy()[0] +pred_bbox = vocab.decode(pred_bbox, skip_special_tokens=False) + +# print(pred_bbox) + +# Visualize detected bbox +pred_bbox = bbox_str_to_token_list(pred_bbox) +pred_bbox = rescale_bbox(pred_bbox, src=(448, 448), tgt=image_size) + +fig, ax = plt.subplots(figsize=(12, 10)) +for i in pred_bbox: + rect = patches.Rectangle(i[:2], i[2] - i[0], i[3] - i[1], linewidth=1, edgecolor='r', facecolor='none') + ax.add_patch(rect) +ax.set_axis_off() +ax.imshow(image) + +# Table cell content recognition +start= time.time() +vocab, model = load_vocab_and_model( + vocab_path="./vocab/vocab_cell_6k.json", + max_seq_len=200, + model_weights=MODEL_DIR / MODEL_FILE_NAME[2], +) +end= time.time() +time1 = end-start +print("time to load cell content " + str(time1)) + +# Cell image cropping and transformation +image_tensor = [image_to_tensor(image.crop(bbox), size=(112, 448)) for bbox in pred_bbox] +image_tensor = torch.cat(image_tensor, dim=0) + +start= time.time() +# Inference +pred_cell = autoregressive_decode( + model=model, + image=image_tensor, + prefix=[vocab.token_to_id("[cell]")], + max_decode_len=200, + eos_id=vocab.token_to_id(""), + token_whitelist=None, + token_blacklist = [vocab.token_to_id(i) for i in INVALID_CELL_TOKEN] +) +end= time.time() +time1 = end-start +print("time to do cell content " + str(time1)) +# Convert token id to token text +pred_cell = pred_cell.detach().cpu().numpy() +pred_cell = vocab.decode_batch(pred_cell, skip_special_tokens=False) 
+pred_cell = [cell_str_to_token_list(i) for i in pred_cell] +pred_cell = [re.sub(r'(\d).\s+(\d)', r'\1.\2', i) for i in pred_cell] + +# print(pred_cell) + +# Combine the table structure and cell content +pred_code = build_table_from_html_and_cell(pred_html, pred_cell) +pred_code = "".join(pred_code) +pred_code = html_table_template(pred_code) + +# Display the HTML table +soup = bs(pred_code) +table_code = soup.prettify() + +# Raw HTML table code +print(table_code) \ No newline at end of file diff --git a/unitable/vocab/vocab_bbox.json b/unitable/vocab/vocab_bbox.json new file mode 100644 index 0000000000000000000000000000000000000000..186c6e6ea4c64dce8f0ce8d078a65b97f237a86d --- /dev/null +++ b/unitable/vocab/vocab_bbox.json @@ -0,0 +1,8960 @@ +{ + "version": "1.0", + "truncation": null, + "padding": { + "strategy": "BatchLongest", + "direction": "Right", + "pad_to_multiple_of": null, + "pad_id": 2, + "pad_type_id": 0, + "pad_token": "" + }, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "[table]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": 
"[html]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "[cell]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "[bbox]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": "[cell+bbox]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 11, + "content": "bbox-0", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 12, + "content": "bbox-1", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 13, + "content": "bbox-2", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 14, + "content": "bbox-3", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 15, + "content": "bbox-4", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 16, + "content": "bbox-5", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 17, + "content": "bbox-6", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 18, + "content": "bbox-7", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 19, + "content": "bbox-8", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 20, + "content": "bbox-9", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 21, + "content": "bbox-10", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 22, + "content": "bbox-11", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 23, + "content": "bbox-12", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 24, + "content": "bbox-13", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 25, + "content": "bbox-14", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 26, + "content": "bbox-15", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 27, + "content": "bbox-16", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 28, + "content": "bbox-17", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 29, + "content": "bbox-18", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 30, + "content": "bbox-19", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 31, + "content": "bbox-20", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 32, + "content": "bbox-21", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 33, + "content": "bbox-22", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 34, + "content": 
"bbox-23", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 35, + "content": "bbox-24", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 36, + "content": "bbox-25", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 37, + "content": "bbox-26", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 38, + "content": "bbox-27", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 39, + "content": "bbox-28", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 40, + "content": "bbox-29", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 41, + "content": "bbox-30", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 42, + "content": "bbox-31", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 43, + "content": "bbox-32", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 44, + "content": "bbox-33", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 45, + "content": "bbox-34", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 46, + "content": "bbox-35", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 47, + "content": "bbox-36", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 48, + "content": "bbox-37", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 49, + "content": "bbox-38", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 50, + "content": "bbox-39", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 51, + "content": "bbox-40", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 52, + "content": "bbox-41", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 53, + "content": "bbox-42", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 54, + "content": "bbox-43", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 55, + "content": "bbox-44", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 56, + "content": "bbox-45", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 57, + "content": "bbox-46", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 58, + "content": "bbox-47", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 59, + "content": "bbox-48", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 60, + "content": "bbox-49", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 61, + "content": 
"bbox-50", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 62, + "content": "bbox-51", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 63, + "content": "bbox-52", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 64, + "content": "bbox-53", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 65, + "content": "bbox-54", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 66, + "content": "bbox-55", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 67, + "content": "bbox-56", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 68, + "content": "bbox-57", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 69, + "content": "bbox-58", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 70, + "content": "bbox-59", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 71, + "content": "bbox-60", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 72, + "content": "bbox-61", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 73, + "content": "bbox-62", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 74, + "content": "bbox-63", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 75, + "content": "bbox-64", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 76, + "content": "bbox-65", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 77, + "content": "bbox-66", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 78, + "content": "bbox-67", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 79, + "content": "bbox-68", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 80, + "content": "bbox-69", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 81, + "content": "bbox-70", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 82, + "content": "bbox-71", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 83, + "content": "bbox-72", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 84, + "content": "bbox-73", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 85, + "content": "bbox-74", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 86, + "content": "bbox-75", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 87, + "content": "bbox-76", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 88, + "content": 
"bbox-77", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 89, + "content": "bbox-78", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 90, + "content": "bbox-79", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 91, + "content": "bbox-80", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 92, + "content": "bbox-81", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 93, + "content": "bbox-82", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 94, + "content": "bbox-83", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 95, + "content": "bbox-84", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 96, + "content": "bbox-85", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 97, + "content": "bbox-86", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 98, + "content": "bbox-87", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 99, + "content": "bbox-88", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 100, + "content": "bbox-89", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 101, + "content": "bbox-90", + "single_word": false, + "lstrip": false, + "rstrip": false, 
+ "normalized": false, + "special": true + }, + { + "id": 102, + "content": "bbox-91", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 103, + "content": "bbox-92", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 104, + "content": "bbox-93", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 105, + "content": "bbox-94", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 106, + "content": "bbox-95", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 107, + "content": "bbox-96", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 108, + "content": "bbox-97", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 109, + "content": "bbox-98", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 110, + "content": "bbox-99", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 111, + "content": "bbox-100", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 112, + "content": "bbox-101", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 113, + "content": "bbox-102", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 114, + "content": "bbox-103", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 
115, + "content": "bbox-104", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 116, + "content": "bbox-105", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 117, + "content": "bbox-106", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 118, + "content": "bbox-107", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 119, + "content": "bbox-108", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 120, + "content": "bbox-109", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 121, + "content": "bbox-110", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 122, + "content": "bbox-111", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 123, + "content": "bbox-112", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 124, + "content": "bbox-113", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 125, + "content": "bbox-114", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 126, + "content": "bbox-115", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 127, + "content": "bbox-116", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128, + "content": "bbox-117", + "single_word": 
false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 129, + "content": "bbox-118", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 130, + "content": "bbox-119", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 131, + "content": "bbox-120", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 132, + "content": "bbox-121", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 133, + "content": "bbox-122", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 134, + "content": "bbox-123", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 135, + "content": "bbox-124", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 136, + "content": "bbox-125", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 137, + "content": "bbox-126", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 138, + "content": "bbox-127", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 139, + "content": "bbox-128", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 140, + "content": "bbox-129", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 141, + "content": "bbox-130", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 142, + "content": "bbox-131", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 143, + "content": "bbox-132", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 144, + "content": "bbox-133", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 145, + "content": "bbox-134", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 146, + "content": "bbox-135", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 147, + "content": "bbox-136", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 148, + "content": "bbox-137", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 149, + "content": "bbox-138", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 150, + "content": "bbox-139", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 151, + "content": "bbox-140", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 152, + "content": "bbox-141", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 153, + "content": "bbox-142", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 154, + "content": "bbox-143", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 155, + "content": "bbox-144", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 156, + "content": "bbox-145", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 157, + "content": "bbox-146", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 158, + "content": "bbox-147", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 159, + "content": "bbox-148", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 160, + "content": "bbox-149", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 161, + "content": "bbox-150", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 162, + "content": "bbox-151", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 163, + "content": "bbox-152", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 164, + "content": "bbox-153", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 165, + "content": "bbox-154", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 166, + "content": "bbox-155", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 167, + "content": "bbox-156", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 168, + "content": "bbox-157", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 169, + "content": "bbox-158", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 170, + "content": "bbox-159", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 171, + "content": "bbox-160", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 172, + "content": "bbox-161", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 173, + "content": "bbox-162", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 174, + "content": "bbox-163", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 175, + "content": "bbox-164", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 176, + "content": "bbox-165", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 177, + "content": "bbox-166", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 178, + "content": "bbox-167", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 179, + "content": "bbox-168", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 180, + "content": "bbox-169", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 181, + "content": "bbox-170", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 182, + "content": "bbox-171", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 183, + "content": "bbox-172", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 184, + "content": "bbox-173", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 185, + "content": "bbox-174", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 186, + "content": "bbox-175", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 187, + "content": "bbox-176", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 188, + "content": "bbox-177", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 189, + "content": "bbox-178", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 190, + "content": "bbox-179", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 191, + "content": "bbox-180", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 192, + "content": "bbox-181", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 193, + "content": "bbox-182", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 194, + "content": "bbox-183", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 195, + "content": "bbox-184", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 196, + "content": "bbox-185", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 197, + "content": "bbox-186", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 198, + "content": "bbox-187", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 199, + "content": "bbox-188", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 200, + "content": "bbox-189", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 201, + "content": "bbox-190", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 202, + "content": "bbox-191", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 203, + "content": "bbox-192", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 204, + "content": "bbox-193", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 205, + "content": "bbox-194", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 206, + "content": "bbox-195", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 207, + "content": "bbox-196", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 208, + 
"content": "bbox-197", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 209, + "content": "bbox-198", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 210, + "content": "bbox-199", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 211, + "content": "bbox-200", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 212, + "content": "bbox-201", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 213, + "content": "bbox-202", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 214, + "content": "bbox-203", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 215, + "content": "bbox-204", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 216, + "content": "bbox-205", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 217, + "content": "bbox-206", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 218, + "content": "bbox-207", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 219, + "content": "bbox-208", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 220, + "content": "bbox-209", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 221, + "content": "bbox-210", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 222, + "content": "bbox-211", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 223, + "content": "bbox-212", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 224, + "content": "bbox-213", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 225, + "content": "bbox-214", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 226, + "content": "bbox-215", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 227, + "content": "bbox-216", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 228, + "content": "bbox-217", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 229, + "content": "bbox-218", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 230, + "content": "bbox-219", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 231, + "content": "bbox-220", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 232, + "content": "bbox-221", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 233, + "content": "bbox-222", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 234, + "content": "bbox-223", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 235, + "content": "bbox-224", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 236, + "content": "bbox-225", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 237, + "content": "bbox-226", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 238, + "content": "bbox-227", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 239, + "content": "bbox-228", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 240, + "content": "bbox-229", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 241, + "content": "bbox-230", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 242, + "content": "bbox-231", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 243, + "content": "bbox-232", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 244, + "content": "bbox-233", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 245, + "content": "bbox-234", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 246, + "content": "bbox-235", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 247, + "content": "bbox-236", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 248, + "content": "bbox-237", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 249, + "content": "bbox-238", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 250, + "content": "bbox-239", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 251, + "content": "bbox-240", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 252, + "content": "bbox-241", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 253, + "content": "bbox-242", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 254, + "content": "bbox-243", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 255, + "content": "bbox-244", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 256, + "content": "bbox-245", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 257, + "content": "bbox-246", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 258, + "content": "bbox-247", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 259, + "content": "bbox-248", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 260, + "content": "bbox-249", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 261, + "content": "bbox-250", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 262, + "content": "bbox-251", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 263, + "content": "bbox-252", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 264, + "content": "bbox-253", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 265, + "content": "bbox-254", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 266, + "content": "bbox-255", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 267, + "content": "bbox-256", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 268, + "content": "bbox-257", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 269, + "content": "bbox-258", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 270, + "content": "bbox-259", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 271, + "content": "bbox-260", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 272, + "content": "bbox-261", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 273, + "content": "bbox-262", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 274, + "content": "bbox-263", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 275, + "content": "bbox-264", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 276, + "content": "bbox-265", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 277, + "content": "bbox-266", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 278, + "content": "bbox-267", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 279, + "content": "bbox-268", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 280, + "content": "bbox-269", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 281, + "content": "bbox-270", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 282, + "content": "bbox-271", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 283, + "content": "bbox-272", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 284, + "content": "bbox-273", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 285, + "content": "bbox-274", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 286, + "content": "bbox-275", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 287, + "content": "bbox-276", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 288, + "content": "bbox-277", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 289, + "content": "bbox-278", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 290, + "content": "bbox-279", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 291, + "content": "bbox-280", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 292, + "content": "bbox-281", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 293, + "content": "bbox-282", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 294, + "content": "bbox-283", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 295, + "content": "bbox-284", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 296, + "content": "bbox-285", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 297, + "content": "bbox-286", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 298, + "content": "bbox-287", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 299, + "content": "bbox-288", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 300, + "content": "bbox-289", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 301, + 
"content": "bbox-290", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 302, + "content": "bbox-291", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 303, + "content": "bbox-292", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 304, + "content": "bbox-293", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 305, + "content": "bbox-294", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 306, + "content": "bbox-295", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 307, + "content": "bbox-296", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 308, + "content": "bbox-297", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 309, + "content": "bbox-298", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 310, + "content": "bbox-299", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 311, + "content": "bbox-300", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 312, + "content": "bbox-301", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 313, + "content": "bbox-302", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 314, + "content": "bbox-303", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 315, + "content": "bbox-304", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 316, + "content": "bbox-305", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 317, + "content": "bbox-306", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 318, + "content": "bbox-307", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 319, + "content": "bbox-308", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 320, + "content": "bbox-309", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 321, + "content": "bbox-310", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 322, + "content": "bbox-311", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 323, + "content": "bbox-312", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 324, + "content": "bbox-313", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 325, + "content": "bbox-314", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 326, + "content": "bbox-315", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 327, + "content": "bbox-316", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 328, + "content": "bbox-317", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 329, + "content": "bbox-318", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 330, + "content": "bbox-319", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 331, + "content": "bbox-320", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 332, + "content": "bbox-321", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 333, + "content": "bbox-322", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 334, + "content": "bbox-323", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 335, + "content": "bbox-324", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 336, + "content": "bbox-325", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 337, + "content": "bbox-326", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 338, + "content": "bbox-327", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 339, + "content": "bbox-328", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 340, + "content": "bbox-329", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 341, + "content": "bbox-330", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 342, + "content": "bbox-331", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 343, + "content": "bbox-332", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 344, + "content": "bbox-333", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 345, + "content": "bbox-334", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 346, + "content": "bbox-335", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 347, + "content": "bbox-336", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 348, + "content": "bbox-337", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 349, + "content": "bbox-338", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 350, + "content": "bbox-339", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 351, + "content": "bbox-340", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 352, + "content": "bbox-341", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 353, + "content": "bbox-342", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 354, + "content": "bbox-343", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 355, + "content": "bbox-344", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 356, + "content": "bbox-345", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 357, + "content": "bbox-346", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 358, + "content": "bbox-347", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 359, + "content": "bbox-348", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 360, + "content": "bbox-349", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 361, + "content": "bbox-350", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 362, + "content": "bbox-351", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 363, + "content": "bbox-352", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 364, + "content": "bbox-353", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 365, + "content": "bbox-354", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 366, + "content": "bbox-355", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 367, + "content": "bbox-356", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 368, + "content": "bbox-357", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 369, + "content": "bbox-358", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 370, + "content": "bbox-359", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 371, + "content": "bbox-360", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 372, + "content": "bbox-361", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 373, + "content": "bbox-362", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 374, + "content": "bbox-363", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 375, + "content": "bbox-364", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 376, + "content": "bbox-365", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 377, + "content": "bbox-366", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 378, + "content": "bbox-367", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 379, + "content": "bbox-368", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 380, + "content": "bbox-369", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 381, + "content": "bbox-370", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 382, + "content": "bbox-371", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 383, + "content": "bbox-372", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 384, + "content": "bbox-373", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 385, + "content": "bbox-374", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 386, + "content": "bbox-375", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 387, + "content": "bbox-376", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 388, + "content": "bbox-377", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 389, + "content": "bbox-378", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 390, + "content": "bbox-379", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 391, + "content": "bbox-380", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 392, + "content": "bbox-381", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 393, + "content": "bbox-382", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 394, + 
"content": "bbox-383", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 395, + "content": "bbox-384", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 396, + "content": "bbox-385", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 397, + "content": "bbox-386", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 398, + "content": "bbox-387", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 399, + "content": "bbox-388", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 400, + "content": "bbox-389", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 401, + "content": "bbox-390", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 402, + "content": "bbox-391", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 403, + "content": "bbox-392", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 404, + "content": "bbox-393", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 405, + "content": "bbox-394", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 406, + "content": "bbox-395", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 407, + "content": "bbox-396", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 408, + "content": "bbox-397", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 409, + "content": "bbox-398", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 410, + "content": "bbox-399", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 411, + "content": "bbox-400", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 412, + "content": "bbox-401", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 413, + "content": "bbox-402", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 414, + "content": "bbox-403", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 415, + "content": "bbox-404", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 416, + "content": "bbox-405", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 417, + "content": "bbox-406", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 418, + "content": "bbox-407", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 419, + "content": "bbox-408", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 420, + "content": "bbox-409", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 421, + "content": "bbox-410", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 422, + "content": "bbox-411", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 423, + "content": "bbox-412", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 424, + "content": "bbox-413", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 425, + "content": "bbox-414", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 426, + "content": "bbox-415", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 427, + "content": "bbox-416", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 428, + "content": "bbox-417", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 429, + "content": "bbox-418", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 430, + "content": "bbox-419", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 431, + "content": "bbox-420", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 432, + "content": "bbox-421", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 433, + "content": "bbox-422", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 434, + "content": "bbox-423", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 435, + "content": "bbox-424", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 436, + "content": "bbox-425", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 437, + "content": "bbox-426", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 438, + "content": "bbox-427", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 439, + "content": "bbox-428", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 440, + "content": "bbox-429", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 441, + "content": "bbox-430", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 442, + "content": "bbox-431", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 443, + "content": "bbox-432", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 444, + "content": "bbox-433", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 445, + "content": "bbox-434", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 446, + "content": "bbox-435", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 447, + "content": "bbox-436", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 448, + "content": "bbox-437", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 449, + "content": "bbox-438", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 450, + "content": "bbox-439", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 451, + "content": "bbox-440", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 452, + "content": "bbox-441", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 453, + "content": "bbox-442", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 454, + "content": "bbox-443", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 455, + "content": "bbox-444", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 456, + "content": "bbox-445", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 457, + "content": "bbox-446", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 458, + "content": "bbox-447", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 459, + "content": "bbox-448", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 460, + "content": "bbox-449", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 461, + "content": "bbox-450", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 462, + "content": "bbox-451", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 463, + "content": "bbox-452", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 464, + "content": "bbox-453", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 465, + "content": "bbox-454", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 466, + "content": "bbox-455", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 467, + "content": "bbox-456", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 468, + "content": "bbox-457", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 469, + "content": "bbox-458", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 470, + "content": "bbox-459", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 471, + "content": "bbox-460", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 472, + "content": "bbox-461", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 473, + "content": "bbox-462", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 474, + "content": "bbox-463", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 475, + "content": "bbox-464", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 476, + "content": "bbox-465", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 477, + "content": "bbox-466", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 478, + "content": "bbox-467", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 479, + "content": "bbox-468", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 480, + "content": "bbox-469", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 481, + "content": "bbox-470", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 482, + "content": "bbox-471", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 483, + "content": "bbox-472", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 484, + "content": "bbox-473", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 485, + "content": "bbox-474", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 486, + "content": "bbox-475", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 487, + 
"content": "bbox-476", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 488, + "content": "bbox-477", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 489, + "content": "bbox-478", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 490, + "content": "bbox-479", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 491, + "content": "bbox-480", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 492, + "content": "bbox-481", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 493, + "content": "bbox-482", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 494, + "content": "bbox-483", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 495, + "content": "bbox-484", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 496, + "content": "bbox-485", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 497, + "content": "bbox-486", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 498, + "content": "bbox-487", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 499, + "content": "bbox-488", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 500, + "content": "bbox-489", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 501, + "content": "bbox-490", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 502, + "content": "bbox-491", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 503, + "content": "bbox-492", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 504, + "content": "bbox-493", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 505, + "content": "bbox-494", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 506, + "content": "bbox-495", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 507, + "content": "bbox-496", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 508, + "content": "bbox-497", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 509, + "content": "bbox-498", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 510, + "content": "bbox-499", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 511, + "content": "bbox-500", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 512, + "content": "bbox-501", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 513, + "content": "bbox-502", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 514, + "content": "bbox-503", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 515, + "content": "bbox-504", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 516, + "content": "bbox-505", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 517, + "content": "bbox-506", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 518, + "content": "bbox-507", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 519, + "content": "bbox-508", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 520, + "content": "bbox-509", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 521, + "content": "bbox-510", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 522, + "content": "bbox-511", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 523, + "content": "bbox-512", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 524, + "content": "bbox-513", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 525, + "content": "bbox-514", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 526, + "content": "bbox-515", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 527, + "content": "bbox-516", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 528, + "content": "bbox-517", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 529, + "content": "bbox-518", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 530, + "content": "bbox-519", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 531, + "content": "bbox-520", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 532, + "content": "bbox-521", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 533, + "content": "bbox-522", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 534, + "content": "bbox-523", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 535, + "content": "bbox-524", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 536, + "content": "bbox-525", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 537, + "content": "bbox-526", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 538, + "content": "bbox-527", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 539, + "content": "bbox-528", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 540, + "content": "bbox-529", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 541, + "content": "bbox-530", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 542, + "content": "bbox-531", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 543, + "content": "bbox-532", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 544, + "content": "bbox-533", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 545, + "content": "bbox-534", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 546, + "content": "bbox-535", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 547, + "content": "bbox-536", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 548, + "content": "bbox-537", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 549, + "content": "bbox-538", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 550, + "content": "bbox-539", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 551, + "content": "bbox-540", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 552, + "content": "bbox-541", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 553, + "content": "bbox-542", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 554, + "content": "bbox-543", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 555, + "content": "bbox-544", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 556, + "content": "bbox-545", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 557, + "content": "bbox-546", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 558, + "content": "bbox-547", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 559, + "content": "bbox-548", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 560, + "content": "bbox-549", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 561, + "content": "bbox-550", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 562, + "content": "bbox-551", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 563, + "content": "bbox-552", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 564, + "content": "bbox-553", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 565, + "content": "bbox-554", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 566, + "content": "bbox-555", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 567, + "content": "bbox-556", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 568, + "content": "bbox-557", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 569, + "content": "bbox-558", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 570, + "content": "bbox-559", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 571, + "content": "bbox-560", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 572, + "content": "bbox-561", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 573, + "content": "bbox-562", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 574, + "content": "bbox-563", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 575, + "content": "bbox-564", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 576, + "content": "bbox-565", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 577, + "content": "bbox-566", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 578, + "content": "bbox-567", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 579, + "content": "bbox-568", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 580, + 
"content": "bbox-569", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 581, + "content": "bbox-570", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 582, + "content": "bbox-571", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 583, + "content": "bbox-572", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 584, + "content": "bbox-573", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 585, + "content": "bbox-574", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 586, + "content": "bbox-575", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 587, + "content": "bbox-576", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 588, + "content": "bbox-577", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 589, + "content": "bbox-578", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 590, + "content": "bbox-579", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 591, + "content": "bbox-580", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 592, + "content": "bbox-581", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 593, + "content": "bbox-582", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 594, + "content": "bbox-583", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 595, + "content": "bbox-584", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 596, + "content": "bbox-585", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 597, + "content": "bbox-586", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 598, + "content": "bbox-587", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 599, + "content": "bbox-588", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 600, + "content": "bbox-589", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 601, + "content": "bbox-590", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 602, + "content": "bbox-591", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 603, + "content": "bbox-592", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 604, + "content": "bbox-593", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 605, + "content": "bbox-594", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 606, + "content": "bbox-595", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 607, + "content": "bbox-596", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 608, + "content": "bbox-597", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 609, + "content": "bbox-598", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 610, + "content": "bbox-599", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 611, + "content": "bbox-600", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 612, + "content": "bbox-601", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 613, + "content": "bbox-602", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 614, + "content": "bbox-603", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 615, + "content": "bbox-604", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 616, + "content": "bbox-605", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 617, + "content": "bbox-606", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 618, + "content": "bbox-607", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 619, + "content": "bbox-608", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 620, + "content": "bbox-609", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 621, + "content": "bbox-610", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 622, + "content": "bbox-611", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 623, + "content": "bbox-612", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 624, + "content": "bbox-613", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 625, + "content": "bbox-614", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 626, + "content": "bbox-615", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 627, + "content": "bbox-616", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 628, + "content": "bbox-617", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 629, + "content": "bbox-618", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 630, + "content": "bbox-619", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 631, + "content": "bbox-620", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 632, + "content": "bbox-621", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 633, + "content": "bbox-622", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 634, + "content": "bbox-623", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 635, + "content": "bbox-624", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 636, + "content": "bbox-625", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 637, + "content": "bbox-626", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 638, + "content": "bbox-627", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 639, + "content": "bbox-628", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 640, + "content": "bbox-629", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 641, + "content": "bbox-630", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 642, + "content": "bbox-631", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 643, + "content": "bbox-632", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 644, + "content": "bbox-633", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 645, + "content": "bbox-634", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 646, + "content": "bbox-635", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 647, + "content": "bbox-636", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 648, + "content": "bbox-637", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 649, + "content": "bbox-638", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 650, + "content": "bbox-639", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 651, + "content": "bbox-640", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 652, + "content": "bbox-641", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 653, + "content": "bbox-642", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 654, + "content": "bbox-643", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 655, + "content": "bbox-644", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 656, + "content": "bbox-645", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 657, + "content": "bbox-646", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 658, + "content": "bbox-647", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 659, + "content": "bbox-648", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 660, + "content": "bbox-649", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 661, + "content": "bbox-650", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 662, + "content": "bbox-651", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 663, + "content": "bbox-652", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 664, + "content": "bbox-653", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 665, + "content": "bbox-654", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 666, + "content": "bbox-655", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 667, + "content": "bbox-656", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 668, + "content": "bbox-657", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 669, + "content": "bbox-658", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 670, + "content": "bbox-659", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 671, + "content": "bbox-660", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 672, + "content": "bbox-661", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 673, + 
"content": "bbox-662", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 674, + "content": "bbox-663", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 675, + "content": "bbox-664", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 676, + "content": "bbox-665", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 677, + "content": "bbox-666", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 678, + "content": "bbox-667", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 679, + "content": "bbox-668", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 680, + "content": "bbox-669", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 681, + "content": "bbox-670", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 682, + "content": "bbox-671", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 683, + "content": "bbox-672", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 684, + "content": "bbox-673", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 685, + "content": "bbox-674", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 686, + "content": "bbox-675", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 687, + "content": "bbox-676", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 688, + "content": "bbox-677", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 689, + "content": "bbox-678", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 690, + "content": "bbox-679", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 691, + "content": "bbox-680", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 692, + "content": "bbox-681", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 693, + "content": "bbox-682", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 694, + "content": "bbox-683", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 695, + "content": "bbox-684", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 696, + "content": "bbox-685", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 697, + "content": "bbox-686", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 698, + "content": "bbox-687", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 699, + "content": "bbox-688", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 700, + "content": "bbox-689", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 701, + "content": "bbox-690", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 702, + "content": "bbox-691", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 703, + "content": "bbox-692", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 704, + "content": "bbox-693", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 705, + "content": "bbox-694", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 706, + "content": "bbox-695", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 707, + "content": "bbox-696", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 708, + "content": "bbox-697", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 709, + "content": "bbox-698", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 710, + "content": "bbox-699", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 711, + "content": "bbox-700", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 712, + "content": "bbox-701", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 713, + "content": "bbox-702", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 714, + "content": "bbox-703", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 715, + "content": "bbox-704", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 716, + "content": "bbox-705", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 717, + "content": "bbox-706", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 718, + "content": "bbox-707", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 719, + "content": "bbox-708", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 720, + "content": "bbox-709", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 721, + "content": "bbox-710", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 722, + "content": "bbox-711", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 723, + "content": "bbox-712", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 724, + "content": "bbox-713", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 725, + "content": "bbox-714", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 726, + "content": "bbox-715", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 727, + "content": "bbox-716", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 728, + "content": "bbox-717", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 729, + "content": "bbox-718", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 730, + "content": "bbox-719", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 731, + "content": "bbox-720", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 732, + "content": "bbox-721", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 733, + "content": "bbox-722", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 734, + "content": "bbox-723", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 735, + "content": "bbox-724", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 736, + "content": "bbox-725", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 737, + "content": "bbox-726", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 738, + "content": "bbox-727", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 739, + "content": "bbox-728", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 740, + "content": "bbox-729", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 741, + "content": "bbox-730", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 742, + "content": "bbox-731", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 743, + "content": "bbox-732", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 744, + "content": "bbox-733", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 745, + "content": "bbox-734", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 746, + "content": "bbox-735", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 747, + "content": "bbox-736", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 748, + "content": "bbox-737", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 749, + "content": "bbox-738", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 750, + "content": "bbox-739", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 751, + "content": "bbox-740", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 752, + "content": "bbox-741", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 753, + "content": "bbox-742", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 754, + "content": "bbox-743", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 755, + "content": "bbox-744", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 756, + "content": "bbox-745", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 757, + "content": "bbox-746", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 758, + "content": "bbox-747", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 759, + "content": "bbox-748", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 760, + "content": "bbox-749", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 761, + "content": "bbox-750", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 762, + "content": "bbox-751", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 763, + "content": "bbox-752", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 764, + "content": "bbox-753", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 765, + "content": "bbox-754", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 766, + 
"content": "bbox-755", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 767, + "content": "bbox-756", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 768, + "content": "bbox-757", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 769, + "content": "bbox-758", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 770, + "content": "bbox-759", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 771, + "content": "bbox-760", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 772, + "content": "bbox-761", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 773, + "content": "bbox-762", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 774, + "content": "bbox-763", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 775, + "content": "bbox-764", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 776, + "content": "bbox-765", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 777, + "content": "bbox-766", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 778, + "content": "bbox-767", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 779, + "content": "bbox-768", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 780, + "content": "bbox-769", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 781, + "content": "bbox-770", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 782, + "content": "bbox-771", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 783, + "content": "bbox-772", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 784, + "content": "bbox-773", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 785, + "content": "bbox-774", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 786, + "content": "bbox-775", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 787, + "content": "bbox-776", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 788, + "content": "bbox-777", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 789, + "content": "bbox-778", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 790, + "content": "bbox-779", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 791, + "content": "bbox-780", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 792, + "content": "bbox-781", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 793, + "content": "bbox-782", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 794, + "content": "bbox-783", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 795, + "content": "bbox-784", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 796, + "content": "bbox-785", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 797, + "content": "bbox-786", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 798, + "content": "bbox-787", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 799, + "content": "bbox-788", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 800, + "content": "bbox-789", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 801, + "content": "bbox-790", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 802, + "content": "bbox-791", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 803, + "content": "bbox-792", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 804, + "content": "bbox-793", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 805, + "content": "bbox-794", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 806, + "content": "bbox-795", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 807, + "content": "bbox-796", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 808, + "content": "bbox-797", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 809, + "content": "bbox-798", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 810, + "content": "bbox-799", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 811, + "content": "bbox-800", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 812, + "content": "bbox-801", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 813, + "content": "bbox-802", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 814, + "content": "bbox-803", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 815, + "content": "bbox-804", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 816, + "content": "bbox-805", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 817, + "content": "bbox-806", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 818, + "content": "bbox-807", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 819, + "content": "bbox-808", + 
"single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 820, + "content": "bbox-809", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 821, + "content": "bbox-810", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 822, + "content": "bbox-811", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 823, + "content": "bbox-812", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 824, + "content": "bbox-813", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 825, + "content": "bbox-814", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 826, + "content": "bbox-815", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 827, + "content": "bbox-816", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 828, + "content": "bbox-817", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 829, + "content": "bbox-818", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 830, + "content": "bbox-819", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 831, + "content": "bbox-820", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 832, + "content": "bbox-821", + "single_word": false, + "lstrip": false, + 
"rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 833, + "content": "bbox-822", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 834, + "content": "bbox-823", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 835, + "content": "bbox-824", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 836, + "content": "bbox-825", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 837, + "content": "bbox-826", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 838, + "content": "bbox-827", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 839, + "content": "bbox-828", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 840, + "content": "bbox-829", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 841, + "content": "bbox-830", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 842, + "content": "bbox-831", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 843, + "content": "bbox-832", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 844, + "content": "bbox-833", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 845, + "content": "bbox-834", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + 
"special": true + }, + { + "id": 846, + "content": "bbox-835", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 847, + "content": "bbox-836", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 848, + "content": "bbox-837", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 849, + "content": "bbox-838", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 850, + "content": "bbox-839", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 851, + "content": "bbox-840", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 852, + "content": "bbox-841", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 853, + "content": "bbox-842", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 854, + "content": "bbox-843", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 855, + "content": "bbox-844", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 856, + "content": "bbox-845", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 857, + "content": "bbox-846", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 858, + "content": "bbox-847", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 859, + 
"content": "bbox-848", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 860, + "content": "bbox-849", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 861, + "content": "bbox-850", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 862, + "content": "bbox-851", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 863, + "content": "bbox-852", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 864, + "content": "bbox-853", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 865, + "content": "bbox-854", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 866, + "content": "bbox-855", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 867, + "content": "bbox-856", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 868, + "content": "bbox-857", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 869, + "content": "bbox-858", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 870, + "content": "bbox-859", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 871, + "content": "bbox-860", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 872, + "content": "bbox-861", + "single_word": false, + 
"lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 873, + "content": "bbox-862", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 874, + "content": "bbox-863", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 875, + "content": "bbox-864", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 876, + "content": "bbox-865", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 877, + "content": "bbox-866", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 878, + "content": "bbox-867", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 879, + "content": "bbox-868", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 880, + "content": "bbox-869", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 881, + "content": "bbox-870", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 882, + "content": "bbox-871", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 883, + "content": "bbox-872", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 884, + "content": "bbox-873", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 885, + "content": "bbox-874", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false, + "special": true + }, + { + "id": 886, + "content": "bbox-875", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 887, + "content": "bbox-876", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 888, + "content": "bbox-877", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 889, + "content": "bbox-878", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 890, + "content": "bbox-879", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFD" + }, + { + "type": "Lowercase" + }, + { + "type": "StripAccents" + }, + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": { + "type": "WordPiece", + "prefix": "##", + "cleanup": true + }, + "model": { + "type": "WordPiece", + "unk_token": "", + "continuing_subword_prefix": "##", + "max_input_chars_per_word": 100, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "[table]": 6, + "[html]": 7, + "[cell]": 8, + "[bbox]": 9, + "[cell+bbox]": 10, + "bbox-0": 11, + "bbox-1": 12, + "bbox-2": 13, + "bbox-3": 14, + "bbox-4": 15, + "bbox-5": 16, + "bbox-6": 17, + "bbox-7": 18, + "bbox-8": 19, + "bbox-9": 20, + "bbox-10": 21, + "bbox-11": 22, + "bbox-12": 23, + "bbox-13": 24, + "bbox-14": 25, + "bbox-15": 26, + "bbox-16": 27, + "bbox-17": 28, + "bbox-18": 29, + "bbox-19": 30, + "bbox-20": 31, + "bbox-21": 32, + "bbox-22": 33, + "bbox-23": 34, + "bbox-24": 35, + "bbox-25": 36, + "bbox-26": 37, + "bbox-27": 38, + "bbox-28": 39, + "bbox-29": 40, + "bbox-30": 41, + "bbox-31": 42, + 
"bbox-32": 43, + "bbox-33": 44, + "bbox-34": 45, + "bbox-35": 46, + "bbox-36": 47, + "bbox-37": 48, + "bbox-38": 49, + "bbox-39": 50, + "bbox-40": 51, + "bbox-41": 52, + "bbox-42": 53, + "bbox-43": 54, + "bbox-44": 55, + "bbox-45": 56, + "bbox-46": 57, + "bbox-47": 58, + "bbox-48": 59, + "bbox-49": 60, + "bbox-50": 61, + "bbox-51": 62, + "bbox-52": 63, + "bbox-53": 64, + "bbox-54": 65, + "bbox-55": 66, + "bbox-56": 67, + "bbox-57": 68, + "bbox-58": 69, + "bbox-59": 70, + "bbox-60": 71, + "bbox-61": 72, + "bbox-62": 73, + "bbox-63": 74, + "bbox-64": 75, + "bbox-65": 76, + "bbox-66": 77, + "bbox-67": 78, + "bbox-68": 79, + "bbox-69": 80, + "bbox-70": 81, + "bbox-71": 82, + "bbox-72": 83, + "bbox-73": 84, + "bbox-74": 85, + "bbox-75": 86, + "bbox-76": 87, + "bbox-77": 88, + "bbox-78": 89, + "bbox-79": 90, + "bbox-80": 91, + "bbox-81": 92, + "bbox-82": 93, + "bbox-83": 94, + "bbox-84": 95, + "bbox-85": 96, + "bbox-86": 97, + "bbox-87": 98, + "bbox-88": 99, + "bbox-89": 100, + "bbox-90": 101, + "bbox-91": 102, + "bbox-92": 103, + "bbox-93": 104, + "bbox-94": 105, + "bbox-95": 106, + "bbox-96": 107, + "bbox-97": 108, + "bbox-98": 109, + "bbox-99": 110, + "bbox-100": 111, + "bbox-101": 112, + "bbox-102": 113, + "bbox-103": 114, + "bbox-104": 115, + "bbox-105": 116, + "bbox-106": 117, + "bbox-107": 118, + "bbox-108": 119, + "bbox-109": 120, + "bbox-110": 121, + "bbox-111": 122, + "bbox-112": 123, + "bbox-113": 124, + "bbox-114": 125, + "bbox-115": 126, + "bbox-116": 127, + "bbox-117": 128, + "bbox-118": 129, + "bbox-119": 130, + "bbox-120": 131, + "bbox-121": 132, + "bbox-122": 133, + "bbox-123": 134, + "bbox-124": 135, + "bbox-125": 136, + "bbox-126": 137, + "bbox-127": 138, + "bbox-128": 139, + "bbox-129": 140, + "bbox-130": 141, + "bbox-131": 142, + "bbox-132": 143, + "bbox-133": 144, + "bbox-134": 145, + "bbox-135": 146, + "bbox-136": 147, + "bbox-137": 148, + "bbox-138": 149, + "bbox-139": 150, + "bbox-140": 151, + "bbox-141": 152, + "bbox-142": 153, + "bbox-143": 
154, + "bbox-144": 155, + "bbox-145": 156, + "bbox-146": 157, + "bbox-147": 158, + "bbox-148": 159, + "bbox-149": 160, + "bbox-150": 161, + "bbox-151": 162, + "bbox-152": 163, + "bbox-153": 164, + "bbox-154": 165, + "bbox-155": 166, + "bbox-156": 167, + "bbox-157": 168, + "bbox-158": 169, + "bbox-159": 170, + "bbox-160": 171, + "bbox-161": 172, + "bbox-162": 173, + "bbox-163": 174, + "bbox-164": 175, + "bbox-165": 176, + "bbox-166": 177, + "bbox-167": 178, + "bbox-168": 179, + "bbox-169": 180, + "bbox-170": 181, + "bbox-171": 182, + "bbox-172": 183, + "bbox-173": 184, + "bbox-174": 185, + "bbox-175": 186, + "bbox-176": 187, + "bbox-177": 188, + "bbox-178": 189, + "bbox-179": 190, + "bbox-180": 191, + "bbox-181": 192, + "bbox-182": 193, + "bbox-183": 194, + "bbox-184": 195, + "bbox-185": 196, + "bbox-186": 197, + "bbox-187": 198, + "bbox-188": 199, + "bbox-189": 200, + "bbox-190": 201, + "bbox-191": 202, + "bbox-192": 203, + "bbox-193": 204, + "bbox-194": 205, + "bbox-195": 206, + "bbox-196": 207, + "bbox-197": 208, + "bbox-198": 209, + "bbox-199": 210, + "bbox-200": 211, + "bbox-201": 212, + "bbox-202": 213, + "bbox-203": 214, + "bbox-204": 215, + "bbox-205": 216, + "bbox-206": 217, + "bbox-207": 218, + "bbox-208": 219, + "bbox-209": 220, + "bbox-210": 221, + "bbox-211": 222, + "bbox-212": 223, + "bbox-213": 224, + "bbox-214": 225, + "bbox-215": 226, + "bbox-216": 227, + "bbox-217": 228, + "bbox-218": 229, + "bbox-219": 230, + "bbox-220": 231, + "bbox-221": 232, + "bbox-222": 233, + "bbox-223": 234, + "bbox-224": 235, + "bbox-225": 236, + "bbox-226": 237, + "bbox-227": 238, + "bbox-228": 239, + "bbox-229": 240, + "bbox-230": 241, + "bbox-231": 242, + "bbox-232": 243, + "bbox-233": 244, + "bbox-234": 245, + "bbox-235": 246, + "bbox-236": 247, + "bbox-237": 248, + "bbox-238": 249, + "bbox-239": 250, + "bbox-240": 251, + "bbox-241": 252, + "bbox-242": 253, + "bbox-243": 254, + "bbox-244": 255, + "bbox-245": 256, + "bbox-246": 257, + "bbox-247": 258, + "bbox-248": 259, 
+ "bbox-249": 260, + "bbox-250": 261, + "bbox-251": 262, + "bbox-252": 263, + "bbox-253": 264, + "bbox-254": 265, + "bbox-255": 266, + "bbox-256": 267, + "bbox-257": 268, + "bbox-258": 269, + "bbox-259": 270, + "bbox-260": 271, + "bbox-261": 272, + "bbox-262": 273, + "bbox-263": 274, + "bbox-264": 275, + "bbox-265": 276, + "bbox-266": 277, + "bbox-267": 278, + "bbox-268": 279, + "bbox-269": 280, + "bbox-270": 281, + "bbox-271": 282, + "bbox-272": 283, + "bbox-273": 284, + "bbox-274": 285, + "bbox-275": 286, + "bbox-276": 287, + "bbox-277": 288, + "bbox-278": 289, + "bbox-279": 290, + "bbox-280": 291, + "bbox-281": 292, + "bbox-282": 293, + "bbox-283": 294, + "bbox-284": 295, + "bbox-285": 296, + "bbox-286": 297, + "bbox-287": 298, + "bbox-288": 299, + "bbox-289": 300, + "bbox-290": 301, + "bbox-291": 302, + "bbox-292": 303, + "bbox-293": 304, + "bbox-294": 305, + "bbox-295": 306, + "bbox-296": 307, + "bbox-297": 308, + "bbox-298": 309, + "bbox-299": 310, + "bbox-300": 311, + "bbox-301": 312, + "bbox-302": 313, + "bbox-303": 314, + "bbox-304": 315, + "bbox-305": 316, + "bbox-306": 317, + "bbox-307": 318, + "bbox-308": 319, + "bbox-309": 320, + "bbox-310": 321, + "bbox-311": 322, + "bbox-312": 323, + "bbox-313": 324, + "bbox-314": 325, + "bbox-315": 326, + "bbox-316": 327, + "bbox-317": 328, + "bbox-318": 329, + "bbox-319": 330, + "bbox-320": 331, + "bbox-321": 332, + "bbox-322": 333, + "bbox-323": 334, + "bbox-324": 335, + "bbox-325": 336, + "bbox-326": 337, + "bbox-327": 338, + "bbox-328": 339, + "bbox-329": 340, + "bbox-330": 341, + "bbox-331": 342, + "bbox-332": 343, + "bbox-333": 344, + "bbox-334": 345, + "bbox-335": 346, + "bbox-336": 347, + "bbox-337": 348, + "bbox-338": 349, + "bbox-339": 350, + "bbox-340": 351, + "bbox-341": 352, + "bbox-342": 353, + "bbox-343": 354, + "bbox-344": 355, + "bbox-345": 356, + "bbox-346": 357, + "bbox-347": 358, + "bbox-348": 359, + "bbox-349": 360, + "bbox-350": 361, + "bbox-351": 362, + "bbox-352": 363, + "bbox-353": 364, + 
"bbox-354": 365, + "bbox-355": 366, + "bbox-356": 367, + "bbox-357": 368, + "bbox-358": 369, + "bbox-359": 370, + "bbox-360": 371, + "bbox-361": 372, + "bbox-362": 373, + "bbox-363": 374, + "bbox-364": 375, + "bbox-365": 376, + "bbox-366": 377, + "bbox-367": 378, + "bbox-368": 379, + "bbox-369": 380, + "bbox-370": 381, + "bbox-371": 382, + "bbox-372": 383, + "bbox-373": 384, + "bbox-374": 385, + "bbox-375": 386, + "bbox-376": 387, + "bbox-377": 388, + "bbox-378": 389, + "bbox-379": 390, + "bbox-380": 391, + "bbox-381": 392, + "bbox-382": 393, + "bbox-383": 394, + "bbox-384": 395, + "bbox-385": 396, + "bbox-386": 397, + "bbox-387": 398, + "bbox-388": 399, + "bbox-389": 400, + "bbox-390": 401, + "bbox-391": 402, + "bbox-392": 403, + "bbox-393": 404, + "bbox-394": 405, + "bbox-395": 406, + "bbox-396": 407, + "bbox-397": 408, + "bbox-398": 409, + "bbox-399": 410, + "bbox-400": 411, + "bbox-401": 412, + "bbox-402": 413, + "bbox-403": 414, + "bbox-404": 415, + "bbox-405": 416, + "bbox-406": 417, + "bbox-407": 418, + "bbox-408": 419, + "bbox-409": 420, + "bbox-410": 421, + "bbox-411": 422, + "bbox-412": 423, + "bbox-413": 424, + "bbox-414": 425, + "bbox-415": 426, + "bbox-416": 427, + "bbox-417": 428, + "bbox-418": 429, + "bbox-419": 430, + "bbox-420": 431, + "bbox-421": 432, + "bbox-422": 433, + "bbox-423": 434, + "bbox-424": 435, + "bbox-425": 436, + "bbox-426": 437, + "bbox-427": 438, + "bbox-428": 439, + "bbox-429": 440, + "bbox-430": 441, + "bbox-431": 442, + "bbox-432": 443, + "bbox-433": 444, + "bbox-434": 445, + "bbox-435": 446, + "bbox-436": 447, + "bbox-437": 448, + "bbox-438": 449, + "bbox-439": 450, + "bbox-440": 451, + "bbox-441": 452, + "bbox-442": 453, + "bbox-443": 454, + "bbox-444": 455, + "bbox-445": 456, + "bbox-446": 457, + "bbox-447": 458, + "bbox-448": 459, + "bbox-449": 460, + "bbox-450": 461, + "bbox-451": 462, + "bbox-452": 463, + "bbox-453": 464, + "bbox-454": 465, + "bbox-455": 466, + "bbox-456": 467, + "bbox-457": 468, + "bbox-458": 469, + 
"bbox-459": 470, + "bbox-460": 471, + "bbox-461": 472, + "bbox-462": 473, + "bbox-463": 474, + "bbox-464": 475, + "bbox-465": 476, + "bbox-466": 477, + "bbox-467": 478, + "bbox-468": 479, + "bbox-469": 480, + "bbox-470": 481, + "bbox-471": 482, + "bbox-472": 483, + "bbox-473": 484, + "bbox-474": 485, + "bbox-475": 486, + "bbox-476": 487, + "bbox-477": 488, + "bbox-478": 489, + "bbox-479": 490, + "bbox-480": 491, + "bbox-481": 492, + "bbox-482": 493, + "bbox-483": 494, + "bbox-484": 495, + "bbox-485": 496, + "bbox-486": 497, + "bbox-487": 498, + "bbox-488": 499, + "bbox-489": 500, + "bbox-490": 501, + "bbox-491": 502, + "bbox-492": 503, + "bbox-493": 504, + "bbox-494": 505, + "bbox-495": 506, + "bbox-496": 507, + "bbox-497": 508, + "bbox-498": 509, + "bbox-499": 510, + "bbox-500": 511, + "bbox-501": 512, + "bbox-502": 513, + "bbox-503": 514, + "bbox-504": 515, + "bbox-505": 516, + "bbox-506": 517, + "bbox-507": 518, + "bbox-508": 519, + "bbox-509": 520, + "bbox-510": 521, + "bbox-511": 522, + "bbox-512": 523, + "bbox-513": 524, + "bbox-514": 525, + "bbox-515": 526, + "bbox-516": 527, + "bbox-517": 528, + "bbox-518": 529, + "bbox-519": 530, + "bbox-520": 531, + "bbox-521": 532, + "bbox-522": 533, + "bbox-523": 534, + "bbox-524": 535, + "bbox-525": 536, + "bbox-526": 537, + "bbox-527": 538, + "bbox-528": 539, + "bbox-529": 540, + "bbox-530": 541, + "bbox-531": 542, + "bbox-532": 543, + "bbox-533": 544, + "bbox-534": 545, + "bbox-535": 546, + "bbox-536": 547, + "bbox-537": 548, + "bbox-538": 549, + "bbox-539": 550, + "bbox-540": 551, + "bbox-541": 552, + "bbox-542": 553, + "bbox-543": 554, + "bbox-544": 555, + "bbox-545": 556, + "bbox-546": 557, + "bbox-547": 558, + "bbox-548": 559, + "bbox-549": 560, + "bbox-550": 561, + "bbox-551": 562, + "bbox-552": 563, + "bbox-553": 564, + "bbox-554": 565, + "bbox-555": 566, + "bbox-556": 567, + "bbox-557": 568, + "bbox-558": 569, + "bbox-559": 570, + "bbox-560": 571, + "bbox-561": 572, + "bbox-562": 573, + "bbox-563": 574, + 
"bbox-564": 575, + "bbox-565": 576, + "bbox-566": 577, + "bbox-567": 578, + "bbox-568": 579, + "bbox-569": 580, + "bbox-570": 581, + "bbox-571": 582, + "bbox-572": 583, + "bbox-573": 584, + "bbox-574": 585, + "bbox-575": 586, + "bbox-576": 587, + "bbox-577": 588, + "bbox-578": 589, + "bbox-579": 590, + "bbox-580": 591, + "bbox-581": 592, + "bbox-582": 593, + "bbox-583": 594, + "bbox-584": 595, + "bbox-585": 596, + "bbox-586": 597, + "bbox-587": 598, + "bbox-588": 599, + "bbox-589": 600, + "bbox-590": 601, + "bbox-591": 602, + "bbox-592": 603, + "bbox-593": 604, + "bbox-594": 605, + "bbox-595": 606, + "bbox-596": 607, + "bbox-597": 608, + "bbox-598": 609, + "bbox-599": 610, + "bbox-600": 611, + "bbox-601": 612, + "bbox-602": 613, + "bbox-603": 614, + "bbox-604": 615, + "bbox-605": 616, + "bbox-606": 617, + "bbox-607": 618, + "bbox-608": 619, + "bbox-609": 620, + "bbox-610": 621, + "bbox-611": 622, + "bbox-612": 623, + "bbox-613": 624, + "bbox-614": 625, + "bbox-615": 626, + "bbox-616": 627, + "bbox-617": 628, + "bbox-618": 629, + "bbox-619": 630, + "bbox-620": 631, + "bbox-621": 632, + "bbox-622": 633, + "bbox-623": 634, + "bbox-624": 635, + "bbox-625": 636, + "bbox-626": 637, + "bbox-627": 638, + "bbox-628": 639, + "bbox-629": 640, + "bbox-630": 641, + "bbox-631": 642, + "bbox-632": 643, + "bbox-633": 644, + "bbox-634": 645, + "bbox-635": 646, + "bbox-636": 647, + "bbox-637": 648, + "bbox-638": 649, + "bbox-639": 650, + "bbox-640": 651, + "bbox-641": 652, + "bbox-642": 653, + "bbox-643": 654, + "bbox-644": 655, + "bbox-645": 656, + "bbox-646": 657, + "bbox-647": 658, + "bbox-648": 659, + "bbox-649": 660, + "bbox-650": 661, + "bbox-651": 662, + "bbox-652": 663, + "bbox-653": 664, + "bbox-654": 665, + "bbox-655": 666, + "bbox-656": 667, + "bbox-657": 668, + "bbox-658": 669, + "bbox-659": 670, + "bbox-660": 671, + "bbox-661": 672, + "bbox-662": 673, + "bbox-663": 674, + "bbox-664": 675, + "bbox-665": 676, + "bbox-666": 677, + "bbox-667": 678, + "bbox-668": 679, + 
"bbox-669": 680, + "bbox-670": 681, + "bbox-671": 682, + "bbox-672": 683, + "bbox-673": 684, + "bbox-674": 685, + "bbox-675": 686, + "bbox-676": 687, + "bbox-677": 688, + "bbox-678": 689, + "bbox-679": 690, + "bbox-680": 691, + "bbox-681": 692, + "bbox-682": 693, + "bbox-683": 694, + "bbox-684": 695, + "bbox-685": 696, + "bbox-686": 697, + "bbox-687": 698, + "bbox-688": 699, + "bbox-689": 700, + "bbox-690": 701, + "bbox-691": 702, + "bbox-692": 703, + "bbox-693": 704, + "bbox-694": 705, + "bbox-695": 706, + "bbox-696": 707, + "bbox-697": 708, + "bbox-698": 709, + "bbox-699": 710, + "bbox-700": 711, + "bbox-701": 712, + "bbox-702": 713, + "bbox-703": 714, + "bbox-704": 715, + "bbox-705": 716, + "bbox-706": 717, + "bbox-707": 718, + "bbox-708": 719, + "bbox-709": 720, + "bbox-710": 721, + "bbox-711": 722, + "bbox-712": 723, + "bbox-713": 724, + "bbox-714": 725, + "bbox-715": 726, + "bbox-716": 727, + "bbox-717": 728, + "bbox-718": 729, + "bbox-719": 730, + "bbox-720": 731, + "bbox-721": 732, + "bbox-722": 733, + "bbox-723": 734, + "bbox-724": 735, + "bbox-725": 736, + "bbox-726": 737, + "bbox-727": 738, + "bbox-728": 739, + "bbox-729": 740, + "bbox-730": 741, + "bbox-731": 742, + "bbox-732": 743, + "bbox-733": 744, + "bbox-734": 745, + "bbox-735": 746, + "bbox-736": 747, + "bbox-737": 748, + "bbox-738": 749, + "bbox-739": 750, + "bbox-740": 751, + "bbox-741": 752, + "bbox-742": 753, + "bbox-743": 754, + "bbox-744": 755, + "bbox-745": 756, + "bbox-746": 757, + "bbox-747": 758, + "bbox-748": 759, + "bbox-749": 760, + "bbox-750": 761, + "bbox-751": 762, + "bbox-752": 763, + "bbox-753": 764, + "bbox-754": 765, + "bbox-755": 766, + "bbox-756": 767, + "bbox-757": 768, + "bbox-758": 769, + "bbox-759": 770, + "bbox-760": 771, + "bbox-761": 772, + "bbox-762": 773, + "bbox-763": 774, + "bbox-764": 775, + "bbox-765": 776, + "bbox-766": 777, + "bbox-767": 778, + "bbox-768": 779, + "bbox-769": 780, + "bbox-770": 781, + "bbox-771": 782, + "bbox-772": 783, + "bbox-773": 784, + 
"bbox-774": 785, + "bbox-775": 786, + "bbox-776": 787, + "bbox-777": 788, + "bbox-778": 789, + "bbox-779": 790, + "bbox-780": 791, + "bbox-781": 792, + "bbox-782": 793, + "bbox-783": 794, + "bbox-784": 795, + "bbox-785": 796, + "bbox-786": 797, + "bbox-787": 798, + "bbox-788": 799, + "bbox-789": 800, + "bbox-790": 801, + "bbox-791": 802, + "bbox-792": 803, + "bbox-793": 804, + "bbox-794": 805, + "bbox-795": 806, + "bbox-796": 807, + "bbox-797": 808, + "bbox-798": 809, + "bbox-799": 810, + "bbox-800": 811, + "bbox-801": 812, + "bbox-802": 813, + "bbox-803": 814, + "bbox-804": 815, + "bbox-805": 816, + "bbox-806": 817, + "bbox-807": 818, + "bbox-808": 819, + "bbox-809": 820, + "bbox-810": 821, + "bbox-811": 822, + "bbox-812": 823, + "bbox-813": 824, + "bbox-814": 825, + "bbox-815": 826, + "bbox-816": 827, + "bbox-817": 828, + "bbox-818": 829, + "bbox-819": 830, + "bbox-820": 831, + "bbox-821": 832, + "bbox-822": 833, + "bbox-823": 834, + "bbox-824": 835, + "bbox-825": 836, + "bbox-826": 837, + "bbox-827": 838, + "bbox-828": 839, + "bbox-829": 840, + "bbox-830": 841, + "bbox-831": 842, + "bbox-832": 843, + "bbox-833": 844, + "bbox-834": 845, + "bbox-835": 846, + "bbox-836": 847, + "bbox-837": 848, + "bbox-838": 849, + "bbox-839": 850, + "bbox-840": 851, + "bbox-841": 852, + "bbox-842": 853, + "bbox-843": 854, + "bbox-844": 855, + "bbox-845": 856, + "bbox-846": 857, + "bbox-847": 858, + "bbox-848": 859, + "bbox-849": 860, + "bbox-850": 861, + "bbox-851": 862, + "bbox-852": 863, + "bbox-853": 864, + "bbox-854": 865, + "bbox-855": 866, + "bbox-856": 867, + "bbox-857": 868, + "bbox-858": 869, + "bbox-859": 870, + "bbox-860": 871, + "bbox-861": 872, + "bbox-862": 873, + "bbox-863": 874, + "bbox-864": 875, + "bbox-865": 876, + "bbox-866": 877, + "bbox-867": 878, + "bbox-868": 879, + "bbox-869": 880, + "bbox-870": 881, + "bbox-871": 882, + "bbox-872": 883, + "bbox-873": 884, + "bbox-874": 885, + "bbox-875": 886, + "bbox-876": 887, + "bbox-877": 888, + "bbox-878": 889, + 
"bbox-879": 890 + } + } +} \ No newline at end of file diff --git a/unitable/vocab/vocab_cell_6k.json b/unitable/vocab/vocab_cell_6k.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4c33781fa027286187a1c5ea1b38035ad8728f --- /dev/null +++ b/unitable/vocab/vocab_cell_6k.json @@ -0,0 +1,5590 @@ +{ + "version": "1.0", + "truncation": null, + "padding": { + "strategy": "BatchLongest", + "direction": "Right", + "pad_to_multiple_of": null, + "pad_id": 2, + "pad_type_id": 0, + "pad_token": "" + }, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "[table]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "[html]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "[cell]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "[bbox]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": 
"[cell+bbox]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 11, + "content": "reserved 1", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 12, + "content": "reserved 2", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 13, + "content": "reserved 3", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 14, + "content": "reserved 4", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 15, + "content": "reserved 5", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 16, + "content": "reserved 6", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 17, + "content": "reserved 7", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 18, + "content": "reserved 8", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 19, + "content": "reserved 9", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFD" + }, + { + "type": "StripAccents" + }, + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": { + "type": "WordPiece", + "prefix": "##", + "cleanup": true + }, + "model": { + "type": "WordPiece", + "unk_token": "", + "continuing_subword_prefix": "##", + "max_input_chars_per_word": 100, + "vocab": { + "": 0, + "": 1, + 
"": 2, + "": 3, + "": 4, + "": 5, + "[table]": 6, + "[html]": 7, + "[cell]": 8, + "[bbox]": 9, + "[cell+bbox]": 10, + "reserved 1": 11, + "reserved 2": 12, + "reserved 3": 13, + "reserved 4": 14, + "reserved 5": 15, + "reserved 6": 16, + "reserved 7": 17, + "reserved 8": 18, + "reserved 9": 19, + "\u0001": 20, + "\u0003": 21, + "\u0007": 22, + "\b": 23, + "\u0013": 24, + "\u0014": 25, + "\u0016": 26, + "\u0018": 27, + "!": 28, + "\"": 29, + "#": 30, + "$": 31, + "%": 32, + "&": 33, + "'": 34, + "(": 35, + ")": 36, + "*": 37, + "+": 38, + ",": 39, + "-": 40, + ".": 41, + "/": 42, + "0": 43, + "1": 44, + "2": 45, + "3": 46, + "4": 47, + "5": 48, + "6": 49, + "7": 50, + "8": 51, + "9": 52, + ":": 53, + ";": 54, + "<": 55, + "=": 56, + ">": 57, + "?": 58, + "@": 59, + "A": 60, + "B": 61, + "C": 62, + "D": 63, + "E": 64, + "F": 65, + "G": 66, + "H": 67, + "I": 68, + "J": 69, + "K": 70, + "L": 71, + "M": 72, + "N": 73, + "O": 74, + "P": 75, + "Q": 76, + "R": 77, + "S": 78, + "T": 79, + "U": 80, + "V": 81, + "W": 82, + "X": 83, + "Y": 84, + "Z": 85, + "[": 86, + "\\": 87, + "]": 88, + "^": 89, + "_": 90, + "`": 91, + "a": 92, + "b": 93, + "c": 94, + "d": 95, + "e": 96, + "f": 97, + "g": 98, + "h": 99, + "i": 100, + "j": 101, + "k": 102, + "l": 103, + "m": 104, + "n": 105, + "o": 106, + "p": 107, + "q": 108, + "r": 109, + "s": 110, + "t": 111, + "u": 112, + "v": 113, + "w": 114, + "x": 115, + "y": 116, + "z": 117, + "{": 118, + "|": 119, + "}": 120, + "~": 121, + "†": 122, + "‡": 123, + "Œ": 124, + "": 125, + "Ž": 126, + "": 127, + "’": 128, + "¡": 129, + "¢": 130, + "£": 131, + "¤": 132, + "¥": 133, + "¦": 134, + "§": 135, + "¨": 136, + "©": 137, + "ª": 138, + "«": 139, + "¬": 140, + "­": 141, + "®": 142, + "¯": 143, + "°": 144, + "±": 145, + "²": 146, + "³": 147, + "´": 148, + "µ": 149, + "¶": 150, + "·": 151, + "¸": 152, + "¹": 153, + "º": 154, + "»": 155, + "¼": 156, + "½": 157, + "¾": 158, + "¿": 159, + "Æ": 160, + "Ð": 161, + "×": 162, + "Ø": 163, + "Þ": 164, + 
"ß": 165, + "æ": 166, + "ð": 167, + "÷": 168, + "ø": 169, + "þ": 170, + "Đ": 171, + "đ": 172, + "ħ": 173, + "ı": 174, + "ĸ": 175, + "Ł": 176, + "ł": 177, + "Ŋ": 178, + "ŋ": 179, + "Œ": 180, + "œ": 181, + "Ŧ": 182, + "ŧ": 183, + "Ɔ": 184, + "ƍ": 185, + "Ǝ": 186, + "Ɛ": 187, + "Ƒ": 188, + "ƒ": 189, + "Ɣ": 190, + "Ɩ": 191, + "Ɨ": 192, + "Ƙ": 193, + "ƙ": 194, + "ƚ": 195, + "ƛ": 196, + "Ɯ": 197, + "ƞ": 198, + "Ɵ": 199, + "Ƥ": 200, + "ƥ": 201, + "Ʃ": 202, + "ƪ": 203, + "Ƭ": 204, + "Ʊ": 205, + "Ƴ": 206, + "ƴ": 207, + "Ʒ": 208, + "ǀ": 209, + "ǁ": 210, + "ǂ": 211, + "Ƞ": 212, + "ȣ": 213, + "ȹ": 214, + "ȼ": 215, + "ɀ": 216, + "ɑ": 217, + "ɒ": 218, + "ɓ": 219, + "ɔ": 220, + "ə": 221, + "ɛ": 222, + "ɜ": 223, + "ɠ": 224, + "ɡ": 225, + "ɣ": 226, + "ɤ": 227, + "ɥ": 228, + "ɨ": 229, + "ɪ": 230, + "ɫ": 231, + "ɯ": 232, + "ɱ": 233, + "ɳ": 234, + "ɷ": 235, + "ɸ": 236, + "ɹ": 237, + "ɾ": 238, + "ʁ": 239, + "ʃ": 240, + "ʈ": 241, + "ʉ": 242, + "ʊ": 243, + "ʋ": 244, + "ʌ": 245, + "ʎ": 246, + "ʏ": 247, + "ʒ": 248, + "ʓ": 249, + "ʔ": 250, + "ʗ": 251, + "ʘ": 252, + "ʝ": 253, + "ʞ": 254, + "ʟ": 255, + "ʠ": 256, + "ʡ": 257, + "ʤ": 258, + "ʧ": 259, + "ʰ": 260, + "ʵ": 261, + "ʷ": 262, + "ʹ": 263, + "ʺ": 264, + "ʻ": 265, + "ʼ": 266, + "ʾ": 267, + "˂": 268, + "˃": 269, + "˄": 270, + "˅": 271, + "ˆ": 272, + "ˇ": 273, + "ˈ": 274, + "ˉ": 275, + "ˊ": 276, + "ː": 277, + "ˑ": 278, + "˗": 279, + "˘": 280, + "˙": 281, + "˚": 282, + "˛": 283, + "˜": 284, + "˝": 285, + "˟": 286, + "ˠ": 287, + "ˣ": 288, + "ˤ": 289, + "˦": 290, + "˩": 291, + "ˮ": 292, + "˷": 293, + "ͻ": 294, + "΄": 295, + "Α": 296, + "Β": 297, + "Γ": 298, + "Δ": 299, + "Ε": 300, + "Ζ": 301, + "Η": 302, + "Θ": 303, + "Ι": 304, + "Κ": 305, + "Λ": 306, + "Μ": 307, + "Ν": 308, + "Ξ": 309, + "Ο": 310, + "Π": 311, + "Ρ": 312, + "Σ": 313, + "Τ": 314, + "Υ": 315, + "Φ": 316, + "Χ": 317, + "Ψ": 318, + "Ω": 319, + "α": 320, + "β": 321, + "γ": 322, + "δ": 323, + "ε": 324, + "ζ": 325, + "η": 326, + "θ": 327, + "ι": 328, + "κ": 329, + "λ": 330, + "μ": 
331, + "ν": 332, + "ξ": 333, + "ο": 334, + "π": 335, + "ρ": 336, + "ς": 337, + "σ": 338, + "τ": 339, + "υ": 340, + "φ": 341, + "χ": 342, + "ψ": 343, + "ω": 344, + "ϐ": 345, + "ϑ": 346, + "ϒ": 347, + "ϕ": 348, + "ϖ": 349, + "Ϟ": 350, + "Ϫ": 351, + "Ϭ": 352, + "ϭ": 353, + "Ϯ": 354, + "ϯ": 355, + "ϰ": 356, + "ϱ": 357, + "ϴ": 358, + "ϵ": 359, + "Є": 360, + "Ѕ": 361, + "І": 362, + "Ј": 363, + "Џ": 364, + "А": 365, + "Б": 366, + "В": 367, + "Г": 368, + "Д": 369, + "Е": 370, + "Ж": 371, + "З": 372, + "И": 373, + "К": 374, + "Л": 375, + "М": 376, + "Н": 377, + "О": 378, + "П": 379, + "Р": 380, + "С": 381, + "Т": 382, + "Ф": 383, + "Х": 384, + "Ш": 385, + "Я": 386, + "а": 387, + "б": 388, + "в": 389, + "г": 390, + "д": 391, + "е": 392, + "ж": 393, + "з": 394, + "и": 395, + "к": 396, + "л": 397, + "м": 398, + "н": 399, + "о": 400, + "п": 401, + "р": 402, + "с": 403, + "т": 404, + "у": 405, + "ф": 406, + "х": 407, + "ц": 408, + "ч": 409, + "ш": 410, + "щ": 411, + "ы": 412, + "ь": 413, + "ю": 414, + "я": 415, + "і": 416, + "Ѱ": 417, + "ѱ": 418, + "Ѳ": 419, + "҂": 420, + "қ": 421, + "ҝ": 422, + "ҡ": 423, + "Ҩ": 424, + "Ұ": 425, + "Ҳ": 426, + "ә": 427, + "Ө": 428, + "ө": 429, + "Ӽ": 430, + "ӽ": 431, + "Ԑ": 432, + "ԑ": 433, + "ղ": 434, + "ր": 435, + "ց": 436, + "ւ": 437, + "׀": 438, + "ג": 439, + "ד": 440, + "ז": 441, + "׳": 442, + "و": 443, + "٠": 444, + "١": 445, + "٢": 446, + "٣": 447, + "٤": 448, + "٥": 449, + "٦": 450, + "٧": 451, + "٨": 452, + "٩": 453, + "٭": 454, + "۞": 455, + "۰": 456, + "۵": 457, + "۹": 458, + "࣓": 459, + "৹": 460, + "ก": 461, + "ข": 462, + "ค": 463, + "ง": 464, + "จ": 465, + "ช": 466, + "ญ": 467, + "ด": 468, + "ต": 469, + "ท": 470, + "น": 471, + "บ": 472, + "ป": 473, + "ผ": 474, + "พ": 475, + "ภ": 476, + "ม": 477, + "ย": 478, + "ร": 479, + "ล": 480, + "ว": 481, + "ส": 482, + "ห": 483, + "อ": 484, + "ะ": 485, + "า": 486, + "ำ": 487, + "เ": 488, + "แ": 489, + "โ": 490, + "ใ": 491, + "ไ": 492, + "๖": 493, + "ᄀ": 494, + "ᄁ": 495, + "ᄂ": 496, + "ᄃ": 497, + 
"ᄄ": 498, + "ᄅ": 499, + "ᄆ": 500, + "ᄇ": 501, + "ᄈ": 502, + "ᄉ": 503, + "ᄋ": 504, + "ᄌ": 505, + "ᄍ": 506, + "ᄎ": 507, + "ᄏ": 508, + "ᄐ": 509, + "ᄑ": 510, + "ᄒ": 511, + "ᅟ": 512, + "ᅡ": 513, + "ᅢ": 514, + "ᅣ": 515, + "ᅥ": 516, + "ᅦ": 517, + "ᅧ": 518, + "ᅨ": 519, + "ᅩ": 520, + "ᅪ": 521, + "ᅫ": 522, + "ᅬ": 523, + "ᅮ": 524, + "ᅯ": 525, + "ᅰ": 526, + "ᅱ": 527, + "ᅲ": 528, + "ᅳ": 529, + "ᅵ": 530, + "ᆨ": 531, + "ᆫ": 532, + "ᆭ": 533, + "ᆯ": 534, + "ᆲ": 535, + "ᆵ": 536, + "ᆷ": 537, + "ᆸ": 538, + "ᆺ": 539, + "ᆼ": 540, + "ᆾ": 541, + "ᇁ": 542, + "ᇂ": 543, + "፠": 544, + "ᛎ": 545, + "ᛏ": 546, + "ᴅ": 547, + "ᴋ": 548, + "ᴥ": 549, + "ᴬ": 550, + "ᴮ": 551, + "ᴻ": 552, + "ᵃ": 553, + "ᵇ": 554, + "ᵈ": 555, + "ᵉ": 556, + "ᵏ": 557, + "ᵐ": 558, + "ᵑ": 559, + "ᵓ": 560, + "ᵖ": 561, + "ᵗ": 562, + "ᵘ": 563, + "ᵛ": 564, + "ᵝ": 565, + "ᵟ": 566, + "ᵡ": 567, + "ᵧ": 568, + "ᵵ": 569, + "ᵷ": 570, + "ᵻ": 571, + "ᵼ": 572, + "ᶋ": 573, + "ᶜ": 574, + "ᶟ": 575, + "ᶠ": 576, + "ᶢ": 577, + "ᶤ": 578, + "ᶧ": 579, + "ᶬ": 580, + "ᶲ": 581, + "ᶴ": 582, + "ᶷ": 583, + "ᶽ": 584, + "ᶿ": 585, + "ẞ": 586, + "ẟ": 587, + "῾": 588, + "​": 589, + "‎": 590, + "‏": 591, + "‐": 592, + "‑": 593, + "‒": 594, + "–": 595, + "—": 596, + "―": 597, + "‖": 598, + "‗": 599, + "‘": 600, + "’": 601, + "‚": 602, + "‛": 603, + "“": 604, + "”": 605, + "„": 606, + "‟": 607, + "†": 608, + "‡": 609, + "•": 610, + "․": 611, + "‥": 612, + "…": 613, + "‧": 614, + "‰": 615, + "‱": 616, + "′": 617, + "″": 618, + "‴": 619, + "‵": 620, + "‹": 621, + "›": 622, + "※": 623, + "‾": 624, + "⁄": 625, + "⁎": 626, + "⁑": 627, + "⁡": 628, + "": 629, + "": 630, + "": 631, + "⁰": 632, + "ⁱ": 633, + "⁴": 634, + "⁵": 635, + "⁶": 636, + "⁸": 637, + "⁹": 638, + "⁺": 639, + "⁻": 640, + "₀": 641, + "₁": 642, + "₂": 643, + "₃": 644, + "₄": 645, + "₊": 646, + "₌": 647, + "ₓ": 648, + "₡": 649, + "₣": 650, + "₤": 651, + "₦": 652, + "₩": 653, + "₫": 654, + "€": 655, + "₮": 656, + "₱": 657, + "₸": 658, + "₹": 659, + "℃": 660, + "℅": 661, + "ℊ": 662, + "ℎ": 663, + "ℏ": 664, 
+ "ℑ": 665, + "ℓ": 666, + "№": 667, + "℗": 668, + "ℜ": 669, + "ℝ": 670, + "™": 671, + "℮": 672, + "ℳ": 673, + "ℵ": 674, + "⅓": 675, + "⅔": 676, + "⅛": 677, + "⅝": 678, + "Ⅰ": 679, + "Ⅱ": 680, + "Ⅲ": 681, + "Ⅳ": 682, + "Ⅴ": 683, + "Ⅵ": 684, + "Ⅶ": 685, + "Ⅷ": 686, + "Ⅸ": 687, + "Ⅹ": 688, + "←": 689, + "↑": 690, + "→": 691, + "↓": 692, + "↔": 693, + "↕": 694, + "↗": 695, + "↘": 696, + "↙": 697, + "↨": 698, + "↩": 699, + "↱": 700, + "↳": 701, + "↺": 702, + "⇀": 703, + "⇄": 704, + "⇆": 705, + "⇈": 706, + "⇋": 707, + "⇌": 708, + "⇐": 709, + "⇑": 710, + "⇒": 711, + "⇓": 712, + "⇔": 713, + "⇞": 714, + "⇡": 715, + "⇣": 716, + "⇧": 717, + "⇨": 718, + "⇩": 719, + "∀": 720, + "∂": 721, + "∃": 722, + "∅": 723, + "∆": 724, + "∇": 725, + "∈": 726, + "∊": 727, + "∎": 728, + "∏": 729, + "∑": 730, + "−": 731, + "∓": 732, + "∕": 733, + "∖": 734, + "∗": 735, + "∘": 736, + "∙": 737, + "√": 738, + "∝": 739, + "∞": 740, + "∠": 741, + "∢": 742, + "∣": 743, + "∥": 744, + "∧": 745, + "∨": 746, + "∩": 747, + "∪": 748, + "∫": 749, + "∬": 750, + "∮": 751, + "∴": 752, + "∶": 753, + "∷": 754, + "∼": 755, + "∽": 756, + "≃": 757, + "≅": 758, + "≈": 759, + "≏": 760, + "≑": 761, + "≒": 762, + "≔": 763, + "≙": 764, + "≡": 765, + "≤": 766, + "≥": 767, + "≦": 768, + "≧": 769, + "≨": 770, + "≪": 771, + "≫": 772, + "≲": 773, + "≳": 774, + "≺": 775, + "≻": 776, + "⊂": 777, + "⊃": 778, + "⊆": 779, + "⊑": 780, + "⊓": 781, + "⊕": 782, + "⊖": 783, + "⊗": 784, + "⊘": 785, + "⊙": 786, + "⊚": 787, + "⊛": 788, + "⊝": 789, + "⊞": 790, + "⊠": 791, + "⊢": 792, + "⊣": 793, + "⊤": 794, + "⊥": 795, + "⊲": 796, + "⊿": 797, + "⋀": 798, + "⋃": 799, + "⋄": 800, + "⋅": 801, + "⋆": 802, + "⋇": 803, + "⋍": 804, + "⋕": 805, + "⋙": 806, + "⋮": 807, + "⋯": 808, + "⌀": 809, + "⌂": 810, + "⌃": 811, + "⌄": 812, + "⌈": 813, + "⌉": 814, + "⌊": 815, + "⌋": 816, + "⌐": 817, + "⌘": 818, + "⌠": 819, + "⌴": 820, + "⍛": 821, + "⍶": 822, + "⍺": 823, + "⎕": 824, + "⎛": 825, + "⎝": 826, + "⎞": 827, + "⎠": 828, + "⎡": 829, + "⎣": 830, + "⎤": 
831, + "⎥": 832, + "⎦": 833, + "⎩": 834, + "⎪": 835, + "⎯": 836, + "␣": 837, + "①": 838, + "②": 839, + "③": 840, + "④": 841, + "⑤": 842, + "⑥": 843, + "⑦": 844, + "⑧": 845, + "⑨": 846, + "⑩": 847, + "⑪": 848, + "⑫": 849, + "⑬": 850, + "⑭": 851, + "⑮": 852, + "⑯": 853, + "⑴": 854, + "⑵": 855, + "⑶": 856, + "⑷": 857, + "⑸": 858, + "⑹": 859, + "⑺": 860, + "⑻": 861, + "⑼": 862, + "⑽": 863, + "⑾": 864, + "⑿": 865, + "⒀": 866, + "⒁": 867, + "⒂": 868, + "⒃": 869, + "⒄": 870, + "⒅": 871, + "⒆": 872, + "⒇": 873, + "Ⓡ": 874, + "ⓞ": 875, + "⓫": 876, + "⓬": 877, + "─": 878, + "━": 879, + "│": 880, + "┃": 881, + "┊": 882, + "┐": 883, + "└": 884, + "┘": 885, + "├": 886, + "┤": 887, + "┬": 888, + "┴": 889, + "┼": 890, + "┿": 891, + "╋": 892, + "═": 893, + "║": 894, + "╚": 895, + "╞": 896, + "╟": 897, + "╤": 898, + "╥": 899, + "╦": 900, + "╪": 901, + "╫": 902, + "╬": 903, + "╳": 904, + "▀": 905, + "▏": 906, + "░": 907, + "▒": 908, + "▓": 909, + "■": 910, + "□": 911, + "▣": 912, + "▪": 913, + "▫": 914, + "▬": 915, + "▯": 916, + "▲": 917, + "△": 918, + "▴": 919, + "▵": 920, + "▶": 921, + "▷": 922, + "▸": 923, + "►": 924, + "▼": 925, + "▽": 926, + "▾": 927, + "◁": 928, + "◄": 929, + "◆": 930, + "◇": 931, + "◈": 932, + "◉": 933, + "◊": 934, + "○": 935, + "◎": 936, + "●": 937, + "◐": 938, + "◑": 939, + "◖": 940, + "◘": 941, + "◙": 942, + "◦": 943, + "◩": 944, + "◭": 945, + "◮": 946, + "◯": 947, + "◻": 948, + "◽": 949, + "★": 950, + "☆": 951, + "☎": 952, + "☐": 953, + "☑": 954, + "☒": 955, + "☓": 956, + "☞": 957, + "☥": 958, + "☨": 959, + "☯": 960, + "☹": 961, + "☺": 962, + "☼": 963, + "☿": 964, + "♀": 965, + "♂": 966, + "♋": 967, + "♠": 968, + "♢": 969, + "♣": 970, + "♤": 971, + "♥": 972, + "♦": 973, + "♪": 974, + "♭": 975, + "♮": 976, + "♯": 977, + "⚝": 978, + "⚬": 979, + "✉": 980, + "✍": 981, + "✓": 982, + "✔": 983, + "✕": 984, + "✖": 985, + "✗": 986, + "✘": 987, + "✚": 988, + "✜": 989, + "✝": 990, + "✞": 991, + "✠": 992, + "✡": 993, + "✤": 994, + "✦": 995, + "✧": 996, + "✩": 997, + 
"✪": 998, + "✰": 999, + "✳": 1000, + "✴": 1001, + "✵": 1002, + "✶": 1003, + "✻": 1004, + "✽": 1005, + "❈": 1006, + "❋": 1007, + "❍": 1008, + "❑": 1009, + "❒": 1010, + "❖": 1011, + "❚": 1012, + "❤": 1013, + "❧": 1014, + "❶": 1015, + "❷": 1016, + "❸": 1017, + "❹": 1018, + "❺": 1019, + "❻": 1020, + "❼": 1021, + "❽": 1022, + "❾": 1023, + "❿": 1024, + "➀": 1025, + "➁": 1026, + "➂": 1027, + "➃": 1028, + "➌": 1029, + "➍": 1030, + "➎": 1031, + "➏": 1032, + "➑": 1033, + "➔": 1034, + "➙": 1035, + "➜": 1036, + "➝": 1037, + "➞": 1038, + "➢": 1039, + "➣": 1040, + "➤": 1041, + "⟂": 1042, + "⟡": 1043, + "⟨": 1044, + "⟩": 1045, + "⟳": 1046, + "⟶": 1047, + "⤈": 1048, + "⤉": 1049, + "⤓": 1050, + "⤺": 1051, + "⦸": 1052, + "⨁": 1053, + "⨉": 1054, + "⨕": 1055, + "⩒": 1056, + "⩽": 1057, + "⩾": 1058, + "⪓": 1059, + "⫧": 1060, + "⫮": 1061, + "⬆": 1062, + "⬇": 1063, + "ⱡ": 1064, + "Ⱶ": 1065, + "、": 1066, + "。": 1067, + "々": 1068, + "〇": 1069, + "〈": 1070, + "〉": 1071, + "《": 1072, + "》": 1073, + "『": 1074, + "』": 1075, + "【": 1076, + "】": 1077, + "〔": 1078, + "〕": 1079, + "あ": 1080, + "い": 1081, + "う": 1082, + "え": 1083, + "お": 1084, + "か": 1085, + "き": 1086, + "く": 1087, + "け": 1088, + "こ": 1089, + "さ": 1090, + "し": 1091, + "す": 1092, + "せ": 1093, + "そ": 1094, + "た": 1095, + "ち": 1096, + "っ": 1097, + "つ": 1098, + "て": 1099, + "と": 1100, + "な": 1101, + "に": 1102, + "ぬ": 1103, + "ね": 1104, + "の": 1105, + "は": 1106, + "ひ": 1107, + "ふ": 1108, + "へ": 1109, + "ほ": 1110, + "ま": 1111, + "み": 1112, + "む": 1113, + "め": 1114, + "も": 1115, + "や": 1116, + "ゅ": 1117, + "ゆ": 1118, + "ょ": 1119, + "よ": 1120, + "ら": 1121, + "り": 1122, + "る": 1123, + "れ": 1124, + "ろ": 1125, + "わ": 1126, + "を": 1127, + "ん": 1128, + "ア": 1129, + "イ": 1130, + "ウ": 1131, + "ェ": 1132, + "エ": 1133, + "オ": 1134, + "カ": 1135, + "キ": 1136, + "ク": 1137, + "ケ": 1138, + "コ": 1139, + "サ": 1140, + "シ": 1141, + "ス": 1142, + "セ": 1143, + "ソ": 1144, + "タ": 1145, + "チ": 1146, + "ッ": 1147, + "テ": 1148, + "ト": 1149, + "ナ": 1150, + "ニ": 1151, + 
"ハ": 1152, + "ヒ": 1153, + "フ": 1154, + "ヘ": 1155, + "ホ": 1156, + "マ": 1157, + "ミ": 1158, + "メ": 1159, + "ュ": 1160, + "ョ": 1161, + "ラ": 1162, + "リ": 1163, + "ル": 1164, + "レ": 1165, + "ロ": 1166, + "ン": 1167, + "・": 1168, + "ー": 1169, + "ㄱ": 1170, + "ㄲ": 1171, + "ㄴ": 1172, + "ㄷ": 1173, + "ㄸ": 1174, + "ㄹ": 1175, + "ㅁ": 1176, + "ㅂ": 1177, + "ㅃ": 1178, + "ㅅ": 1179, + "ㅇ": 1180, + "ㅈ": 1181, + "ㅉ": 1182, + "ㅊ": 1183, + "ㅋ": 1184, + "ㅌ": 1185, + "ㅍ": 1186, + "ㅎ": 1187, + "ㅏ": 1188, + "ㅐ": 1189, + "ㅑ": 1190, + "ㅒ": 1191, + "ㅓ": 1192, + "ㅔ": 1193, + "ㅕ": 1194, + "ㅖ": 1195, + "ㅗ": 1196, + "ㅛ": 1197, + "ㅜ": 1198, + "ㅠ": 1199, + "ㅡ": 1200, + "ㅣ": 1201, + "㎍": 1202, + "㎕": 1203, + "㎖": 1204, + "㎗": 1205, + "㎛": 1206, + "㎝": 1207, + "㎠": 1208, + "㎡": 1209, + "㎣": 1210, + "㎶": 1211, + "一": 1212, + "七": 1213, + "万": 1214, + "三": 1215, + "上": 1216, + "下": 1217, + "不": 1218, + "与": 1219, + "且": 1220, + "世": 1221, + "丘": 1222, + "业": 1223, + "东": 1224, + "丝": 1225, + "両": 1226, + "丢": 1227, + "两": 1228, + "严": 1229, + "並": 1230, + "个": 1231, + "中": 1232, + "丰": 1233, + "临": 1234, + "丸": 1235, + "丹": 1236, + "为": 1237, + "主": 1238, + "久": 1239, + "么": 1240, + "之": 1241, + "乐": 1242, + "九": 1243, + "习": 1244, + "书": 1245, + "乱": 1246, + "乳": 1247, + "了": 1248, + "争": 1249, + "事": 1250, + "二": 1251, + "于": 1252, + "云": 1253, + "五": 1254, + "井": 1255, + "些": 1256, + "亞": 1257, + "交": 1258, + "产": 1259, + "享": 1260, + "京": 1261, + "亮": 1262, + "人": 1263, + "亿": 1264, + "什": 1265, + "仁": 1266, + "介": 1267, + "仍": 1268, + "从": 1269, + "仔": 1270, + "仕": 1271, + "他": 1272, + "付": 1273, + "代": 1274, + "令": 1275, + "以": 1276, + "们": 1277, + "仮": 1278, + "仲": 1279, + "件": 1280, + "任": 1281, + "份": 1282, + "企": 1283, + "伐": 1284, + "休": 1285, + "优": 1286, + "会": 1287, + "传": 1288, + "伤": 1289, + "伯": 1290, + "伸": 1291, + "但": 1292, + "位": 1293, + "低": 1294, + "住": 1295, + "体": 1296, + "何": 1297, + "余": 1298, + "作": 1299, + "你": 1300, + "併": 1301, + "使": 1302, + "來": 1303, + "例": 1304, + "供": 1305, 
+ "侣": 1306, + "侧": 1307, + "便": 1308, + "保": 1309, + "俞": 1310, + "信": 1311, + "個": 1312, + "倒": 1313, + "候": 1314, + "借": 1315, + "値": 1316, + "倦": 1317, + "值": 1318, + "假": 1319, + "做": 1320, + "停": 1321, + "健": 1322, + "偽": 1323, + "傘": 1324, + "備": 1325, + "催": 1326, + "傷": 1327, + "像": 1328, + "僚": 1329, + "僧": 1330, + "億": 1331, + "儿": 1332, + "元": 1333, + "充": 1334, + "光": 1335, + "免": 1336, + "児": 1337, + "兒": 1338, + "党": 1339, + "入": 1340, + "內": 1341, + "全": 1342, + "兩": 1343, + "兪": 1344, + "八": 1345, + "公": 1346, + "六": 1347, + "兰": 1348, + "共": 1349, + "关": 1350, + "兴": 1351, + "其": 1352, + "具": 1353, + "典": 1354, + "内": 1355, + "再": 1356, + "写": 1357, + "冤": 1358, + "冬": 1359, + "冰": 1360, + "决": 1361, + "冷": 1362, + "凍": 1363, + "减": 1364, + "凑": 1365, + "凝": 1366, + "几": 1367, + "凤": 1368, + "凰": 1369, + "凹": 1370, + "出": 1371, + "刀": 1372, + "分": 1373, + "切": 1374, + "列": 1375, + "刘": 1376, + "则": 1377, + "刚": 1378, + "创": 1379, + "判": 1380, + "利": 1381, + "别": 1382, + "到": 1383, + "制": 1384, + "刷": 1385, + "刺": 1386, + "刻": 1387, + "剂": 1388, + "削": 1389, + "前": 1390, + "剤": 1391, + "剥": 1392, + "剩": 1393, + "剪": 1394, + "創": 1395, + "劉": 1396, + "力": 1397, + "劝": 1398, + "功": 1399, + "加": 1400, + "务": 1401, + "动": 1402, + "助": 1403, + "努": 1404, + "励": 1405, + "劲": 1406, + "劳": 1407, + "势": 1408, + "動": 1409, + "務": 1410, + "募": 1411, + "包": 1412, + "化": 1413, + "北": 1414, + "匙": 1415, + "匹": 1416, + "区": 1417, + "医": 1418, + "十": 1419, + "千": 1420, + "半": 1421, + "华": 1422, + "协": 1423, + "卑": 1424, + "卒": 1425, + "单": 1426, + "南": 1427, + "博": 1428, + "卜": 1429, + "卫": 1430, + "危": 1431, + "却": 1432, + "厂": 1433, + "压": 1434, + "厌": 1435, + "厚": 1436, + "原": 1437, + "厭": 1438, + "去": 1439, + "参": 1440, + "參": 1441, + "叉": 1442, + "及": 1443, + "友": 1444, + "双": 1445, + "反": 1446, + "収": 1447, + "发": 1448, + "取": 1449, + "受": 1450, + "变": 1451, + "叠": 1452, + "口": 1453, + "古": 1454, + "只": 1455, + "叫": 1456, + "可": 1457, + "台": 1458, + "史": 
1459, + "右": 1460, + "叶": 1461, + "号": 1462, + "司": 1463, + "叹": 1464, + "吃": 1465, + "合": 1466, + "吉": 1467, + "同": 1468, + "名": 1469, + "后": 1470, + "向": 1471, + "吓": 1472, + "君": 1473, + "吞": 1474, + "否": 1475, + "含": 1476, + "吳": 1477, + "吴": 1478, + "吵": 1479, + "吸": 1480, + "吹": 1481, + "吾": 1482, + "呆": 1483, + "呈": 1484, + "告": 1485, + "员": 1486, + "周": 1487, + "味": 1488, + "呵": 1489, + "呼": 1490, + "和": 1491, + "咒": 1492, + "咖": 1493, + "咳": 1494, + "咽": 1495, + "哀": 1496, + "品": 1497, + "員": 1498, + "哪": 1499, + "哭": 1500, + "唇": 1501, + "唤": 1502, + "商": 1503, + "問": 1504, + "啡": 1505, + "啤": 1506, + "善": 1507, + "喜": 1508, + "喝": 1509, + "営": 1510, + "喷": 1511, + "喻": 1512, + "嗽": 1513, + "嘘": 1514, + "嘱": 1515, + "器": 1516, + "嚢": 1517, + "囊": 1518, + "四": 1519, + "回": 1520, + "因": 1521, + "团": 1522, + "団": 1523, + "困": 1524, + "围": 1525, + "固": 1526, + "国": 1527, + "图": 1528, + "國": 1529, + "圓": 1530, + "土": 1531, + "在": 1532, + "地": 1533, + "场": 1534, + "圾": 1535, + "坂": 1536, + "均": 1537, + "坑": 1538, + "块": 1539, + "坚": 1540, + "垂": 1541, + "垃": 1542, + "型": 1543, + "垫": 1544, + "埃": 1545, + "執": 1546, + "基": 1547, + "堂": 1548, + "報": 1549, + "塑": 1550, + "塘": 1551, + "塞": 1552, + "填": 1553, + "增": 1554, + "壁": 1555, + "士": 1556, + "壮": 1557, + "声": 1558, + "売": 1559, + "変": 1560, + "复": 1561, + "夏": 1562, + "外": 1563, + "多": 1564, + "夜": 1565, + "够": 1566, + "大": 1567, + "天": 1568, + "太": 1569, + "夫": 1570, + "失": 1571, + "头": 1572, + "夹": 1573, + "奇": 1574, + "奈": 1575, + "奖": 1576, + "女": 1577, + "她": 1578, + "好": 1579, + "如": 1580, + "妈": 1581, + "妙": 1582, + "妻": 1583, + "姓": 1584, + "委": 1585, + "姜": 1586, + "姫": 1587, + "姿": 1588, + "威": 1589, + "娜": 1590, + "婦": 1591, + "媒": 1592, + "嫌": 1593, + "子": 1594, + "孔": 1595, + "字": 1596, + "存": 1597, + "孚": 1598, + "孟": 1599, + "孢": 1600, + "孤": 1601, + "学": 1602, + "孫": 1603, + "學": 1604, + "宁": 1605, + "它": 1606, + "宅": 1607, + "宇": 1608, + "守": 1609, + "安": 1610, + "宋": 1611, + "完": 1612, + 
"宏": 1613, + "宙": 1614, + "定": 1615, + "宜": 1616, + "宝": 1617, + "实": 1618, + "実": 1619, + "客": 1620, + "室": 1621, + "宰": 1622, + "害": 1623, + "家": 1624, + "容": 1625, + "寄": 1626, + "密": 1627, + "富": 1628, + "寒": 1629, + "察": 1630, + "寶": 1631, + "对": 1632, + "寺": 1633, + "导": 1634, + "対": 1635, + "寿": 1636, + "封": 1637, + "専": 1638, + "射": 1639, + "将": 1640, + "專": 1641, + "尊": 1642, + "對": 1643, + "小": 1644, + "少": 1645, + "尔": 1646, + "尘": 1647, + "尚": 1648, + "尝": 1649, + "就": 1650, + "尺": 1651, + "局": 1652, + "居": 1653, + "屈": 1654, + "屉": 1655, + "屋": 1656, + "屑": 1657, + "展": 1658, + "属": 1659, + "山": 1660, + "岁": 1661, + "岌": 1662, + "岗": 1663, + "岳": 1664, + "峰": 1665, + "川": 1666, + "州": 1667, + "工": 1668, + "左": 1669, + "巨": 1670, + "差": 1671, + "己": 1672, + "已": 1673, + "巴": 1674, + "巾": 1675, + "市": 1676, + "布": 1677, + "师": 1678, + "希": 1679, + "帘": 1680, + "帜": 1681, + "带": 1682, + "師": 1683, + "帯": 1684, + "帰": 1685, + "常": 1686, + "帽": 1687, + "幫": 1688, + "干": 1689, + "平": 1690, + "年": 1691, + "并": 1692, + "幸": 1693, + "幻": 1694, + "幼": 1695, + "广": 1696, + "庆": 1697, + "床": 1698, + "序": 1699, + "应": 1700, + "底": 1701, + "店": 1702, + "庙": 1703, + "度": 1704, + "座": 1705, + "庫": 1706, + "庭": 1707, + "康": 1708, + "庸": 1709, + "廊": 1710, + "延": 1711, + "建": 1712, + "开": 1713, + "弄": 1714, + "式": 1715, + "引": 1716, + "张": 1717, + "張": 1718, + "强": 1719, + "归": 1720, + "当": 1721, + "形": 1722, + "彭": 1723, + "影": 1724, + "待": 1725, + "很": 1726, + "律": 1727, + "後": 1728, + "徐": 1729, + "徒": 1730, + "従": 1731, + "得": 1732, + "從": 1733, + "御": 1734, + "徴": 1735, + "徽": 1736, + "心": 1737, + "必": 1738, + "忆": 1739, + "志": 1740, + "忘": 1741, + "応": 1742, + "忧": 1743, + "快": 1744, + "忽": 1745, + "态": 1746, + "怎": 1747, + "怒": 1748, + "怜": 1749, + "思": 1750, + "急": 1751, + "性": 1752, + "怨": 1753, + "怯": 1754, + "总": 1755, + "恋": 1756, + "恍": 1757, + "恐": 1758, + "息": 1759, + "恼": 1760, + "悟": 1761, + "悠": 1762, + "患": 1763, + "您": 1764, + "悪": 1765, + "情": 1766, 
+ "惊": 1767, + "惧": 1768, + "想": 1769, + "愉": 1770, + "意": 1771, + "感": 1772, + "愤": 1773, + "愧": 1774, + "態": 1775, + "懷": 1776, + "懼": 1777, + "戈": 1778, + "成": 1779, + "我": 1780, + "或": 1781, + "戟": 1782, + "戦": 1783, + "戳": 1784, + "戴": 1785, + "户": 1786, + "戻": 1787, + "房": 1788, + "所": 1789, + "扁": 1790, + "手": 1791, + "才": 1792, + "打": 1793, + "扣": 1794, + "扫": 1795, + "扰": 1796, + "扶": 1797, + "找": 1798, + "技": 1799, + "把": 1800, + "抑": 1801, + "抗": 1802, + "折": 1803, + "护": 1804, + "报": 1805, + "抱": 1806, + "抹": 1807, + "抽": 1808, + "担": 1809, + "拆": 1810, + "拇": 1811, + "拉": 1812, + "拌": 1813, + "拍": 1814, + "拖": 1815, + "拧": 1816, + "拨": 1817, + "拭": 1818, + "拳": 1819, + "拿": 1820, + "持": 1821, + "挂": 1822, + "指": 1823, + "按": 1824, + "挑": 1825, + "挛": 1826, + "挤": 1827, + "挫": 1828, + "振": 1829, + "捕": 1830, + "捞": 1831, + "捡": 1832, + "换": 1833, + "捣": 1834, + "掉": 1835, + "排": 1836, + "探": 1837, + "控": 1838, + "掩": 1839, + "掰": 1840, + "揉": 1841, + "描": 1842, + "提": 1843, + "插": 1844, + "握": 1845, + "搅": 1846, + "搓": 1847, + "摇": 1848, + "摘": 1849, + "撒": 1850, + "撕": 1851, + "播": 1852, + "擦": 1853, + "擾": 1854, + "攝": 1855, + "收": 1856, + "改": 1857, + "放": 1858, + "政": 1859, + "效": 1860, + "救": 1861, + "敗": 1862, + "教": 1863, + "散": 1864, + "数": 1865, + "整": 1866, + "敷": 1867, + "文": 1868, + "斑": 1869, + "斗": 1870, + "料": 1871, + "斛": 1872, + "斤": 1873, + "斥": 1874, + "断": 1875, + "新": 1876, + "方": 1877, + "於": 1878, + "族": 1879, + "旗": 1880, + "无": 1881, + "日": 1882, + "旦": 1883, + "早": 1884, + "旬": 1885, + "旱": 1886, + "时": 1887, + "旺": 1888, + "昆": 1889, + "昇": 1890, + "昌": 1891, + "明": 1892, + "易": 1893, + "星": 1894, + "是": 1895, + "時": 1896, + "晃": 1897, + "普": 1898, + "景": 1899, + "晰": 1900, + "智": 1901, + "晾": 1902, + "暂": 1903, + "暇": 1904, + "暖": 1905, + "曰": 1906, + "曲": 1907, + "更": 1908, + "書": 1909, + "曹": 1910, + "最": 1911, + "月": 1912, + "有": 1913, + "朋": 1914, + "服": 1915, + "朗": 1916, + "望": 1917, + "朝": 1918, + "期": 1919, + "木": 
1920, + "未": 1921, + "本": 1922, + "朮": 1923, + "术": 1924, + "朱": 1925, + "朴": 1926, + "机": 1927, + "李": 1928, + "杏": 1929, + "材": 1930, + "村": 1931, + "杜": 1932, + "杞": 1933, + "条": 1934, + "来": 1935, + "杯": 1936, + "杰": 1937, + "松": 1938, + "板": 1939, + "极": 1940, + "构": 1941, + "析": 1942, + "林": 1943, + "果": 1944, + "枝": 1945, + "枠": 1946, + "枣": 1947, + "枯": 1948, + "枳": 1949, + "架": 1950, + "枸": 1951, + "枹": 1952, + "柃": 1953, + "柏": 1954, + "染": 1955, + "柔": 1956, + "柜": 1957, + "查": 1958, + "柯": 1959, + "柱": 1960, + "柳": 1961, + "柴": 1962, + "査": 1963, + "柿": 1964, + "标": 1965, + "树": 1966, + "栗": 1967, + "校": 1968, + "株": 1969, + "样": 1970, + "核": 1971, + "根": 1972, + "桂": 1973, + "桃": 1974, + "框": 1975, + "案": 1976, + "桌": 1977, + "桐": 1978, + "桔": 1979, + "档": 1980, + "桦": 1981, + "梅": 1982, + "梗": 1983, + "條": 1984, + "梯": 1985, + "械": 1986, + "检": 1987, + "棗": 1988, + "森": 1989, + "椅": 1990, + "検": 1991, + "椿": 1992, + "楂": 1993, + "楝": 1994, + "楠": 1995, + "業": 1996, + "極": 1997, + "楼": 1998, + "楽": 1999, + "概": 2000, + "榔": 2001, + "榕": 2002, + "榴": 2003, + "様": 2004, + "標": 2005, + "樟": 2006, + "模": 2007, + "権": 2008, + "横": 2009, + "樱": 2010, + "樸": 2011, + "橘": 2012, + "機": 2013, + "橡": 2014, + "檳": 2015, + "櫻": 2016, + "欠": 2017, + "次": 2018, + "欢": 2019, + "欧": 2020, + "歡": 2021, + "止": 2022, + "正": 2023, + "此": 2024, + "武": 2025, + "歧": 2026, + "歩": 2027, + "歴": 2028, + "歸": 2029, + "死": 2030, + "残": 2031, + "段": 2032, + "毆": 2033, + "母": 2034, + "毎": 2035, + "每": 2036, + "毒": 2037, + "比": 2038, + "毛": 2039, + "氏": 2040, + "民": 2041, + "氓": 2042, + "气": 2043, + "気": 2044, + "氣": 2045, + "水": 2046, + "氷": 2047, + "汁": 2048, + "求": 2049, + "汉": 2050, + "汗": 2051, + "江": 2052, + "池": 2053, + "汤": 2054, + "決": 2055, + "沈": 2056, + "沌": 2057, + "沖": 2058, + "沟": 2059, + "没": 2060, + "沢": 2061, + "沫": 2062, + "河": 2063, + "沸": 2064, + "油": 2065, + "治": 2066, + "沿": 2067, + "泄": 2068, + "泉": 2069, + "泊": 2070, + "法": 2071, + "泡": 2072, + "波": 2073, + 
"泥": 2074, + "注": 2075, + "泪": 2076, + "泰": 2077, + "泳": 2078, + "泻": 2079, + "泽": 2080, + "洒": 2081, + "洗": 2082, + "津": 2083, + "洪": 2084, + "洲": 2085, + "活": 2086, + "流": 2087, + "浅": 2088, + "浆": 2089, + "济": 2090, + "浙": 2091, + "浪": 2092, + "浴": 2093, + "海": 2094, + "消": 2095, + "涝": 2096, + "涤": 2097, + "涮": 2098, + "液": 2099, + "淘": 2100, + "淡": 2101, + "淫": 2102, + "深": 2103, + "清": 2104, + "減": 2105, + "游": 2106, + "湖": 2107, + "湯": 2108, + "源": 2109, + "準": 2110, + "溪": 2111, + "溶": 2112, + "滇": 2113, + "滋": 2114, + "滎": 2115, + "满": 2116, + "滴": 2117, + "漓": 2118, + "演": 2119, + "漠": 2120, + "潘": 2121, + "潮": 2122, + "澳": 2123, + "激": 2124, + "濕": 2125, + "瀉": 2126, + "火": 2127, + "灯": 2128, + "灰": 2129, + "灵": 2130, + "灼": 2131, + "灾": 2132, + "炉": 2133, + "炎": 2134, + "炒": 2135, + "炙": 2136, + "炭": 2137, + "炮": 2138, + "点": 2139, + "為": 2140, + "烏": 2141, + "烛": 2142, + "烟": 2143, + "烦": 2144, + "無": 2145, + "焦": 2146, + "然": 2147, + "焼": 2148, + "煅": 2149, + "煤": 2150, + "照": 2151, + "煨": 2152, + "煮": 2153, + "熟": 2154, + "熨": 2155, + "熱": 2156, + "營": 2157, + "爱": 2158, + "父": 2159, + "片": 2160, + "版": 2161, + "牌": 2162, + "牙": 2163, + "牛": 2164, + "牡": 2165, + "物": 2166, + "特": 2167, + "犬": 2168, + "状": 2169, + "狀": 2170, + "独": 2171, + "猪": 2172, + "猫": 2173, + "獨": 2174, + "玄": 2175, + "率": 2176, + "王": 2177, + "玩": 2178, + "现": 2179, + "玻": 2180, + "珍": 2181, + "珠": 2182, + "班": 2183, + "現": 2184, + "球": 2185, + "理": 2186, + "琐": 2187, + "琴": 2188, + "瑞": 2189, + "璃": 2190, + "環": 2191, + "瓜": 2192, + "甘": 2193, + "甙": 2194, + "甚": 2195, + "甜": 2196, + "生": 2197, + "産": 2198, + "用": 2199, + "田": 2200, + "由": 2201, + "甲": 2202, + "电": 2203, + "男": 2204, + "町": 2205, + "画": 2206, + "畏": 2207, + "略": 2208, + "番": 2209, + "畫": 2210, + "異": 2211, + "當": 2212, + "疆": 2213, + "疊": 2214, + "疗": 2215, + "疲": 2216, + "疼": 2217, + "疾": 2218, + "病": 2219, + "症": 2220, + "痛": 2221, + "痩": 2222, + "痰": 2223, + "瘀": 2224, + "療": 2225, + "癌": 2226, + "発": 2227, 
+ "發": 2228, + "白": 2229, + "百": 2230, + "皂": 2231, + "的": 2232, + "皮": 2233, + "益": 2234, + "盔": 2235, + "盖": 2236, + "盘": 2237, + "盛": 2238, + "目": 2239, + "直": 2240, + "相": 2241, + "省": 2242, + "看": 2243, + "真": 2244, + "眠": 2245, + "眨": 2246, + "眼": 2247, + "眾": 2248, + "着": 2249, + "睁": 2250, + "睑": 2251, + "睛": 2252, + "睡": 2253, + "瞒": 2254, + "瞬": 2255, + "瞳": 2256, + "知": 2257, + "石": 2258, + "矽": 2259, + "砂": 2260, + "研": 2261, + "硝": 2262, + "确": 2263, + "碗": 2264, + "磁": 2265, + "社": 2266, + "神": 2267, + "票": 2268, + "祭": 2269, + "福": 2270, + "离": 2271, + "秀": 2272, + "科": 2273, + "秒": 2274, + "积": 2275, + "称": 2276, + "移": 2277, + "程": 2278, + "稚": 2279, + "種": 2280, + "稿": 2281, + "積": 2282, + "穗": 2283, + "究": 2284, + "空": 2285, + "穿": 2286, + "突": 2287, + "窗": 2288, + "立": 2289, + "竖": 2290, + "端": 2291, + "竹": 2292, + "笑": 2293, + "笔": 2294, + "符": 2295, + "第": 2296, + "筆": 2297, + "等": 2298, + "筋": 2299, + "筑": 2300, + "答": 2301, + "策": 2302, + "筝": 2303, + "筷": 2304, + "算": 2305, + "管": 2306, + "箱": 2307, + "篮": 2308, + "米": 2309, + "类": 2310, + "籽": 2311, + "粉": 2312, + "粒": 2313, + "粟": 2314, + "精": 2315, + "糕": 2316, + "糖": 2317, + "糟": 2318, + "系": 2319, + "紅": 2320, + "紊": 2321, + "紙": 2322, + "素": 2323, + "索": 2324, + "紫": 2325, + "細": 2326, + "終": 2327, + "組": 2328, + "絡": 2329, + "給": 2330, + "絲": 2331, + "絶": 2332, + "經": 2333, + "綱": 2334, + "網": 2335, + "緑": 2336, + "縄": 2337, + "縦": 2338, + "红": 2339, + "纤": 2340, + "约": 2341, + "纸": 2342, + "纹": 2343, + "线": 2344, + "练": 2345, + "组": 2346, + "细": 2347, + "终": 2348, + "经": 2349, + "结": 2350, + "络": 2351, + "绝": 2352, + "绩": 2353, + "绪": 2354, + "续": 2355, + "绯": 2356, + "维": 2357, + "综": 2358, + "编": 2359, + "网": 2360, + "罪": 2361, + "羊": 2362, + "美": 2363, + "群": 2364, + "義": 2365, + "翹": 2366, + "考": 2367, + "者": 2368, + "耆": 2369, + "而": 2370, + "耐": 2371, + "耽": 2372, + "职": 2373, + "联": 2374, + "聞": 2375, + "聯": 2376, + "職": 2377, + "肃": 2378, + "肉": 2379, + "肌": 2380, + "肝": 
2381, + "肢": 2382, + "肥": 2383, + "肱": 2384, + "育": 2385, + "肺": 2386, + "胃": 2387, + "胆": 2388, + "背": 2389, + "胡": 2390, + "胱": 2391, + "胶": 2392, + "能": 2393, + "脂": 2394, + "脑": 2395, + "脸": 2396, + "脾": 2397, + "腎": 2398, + "腑": 2399, + "腔": 2400, + "腥": 2401, + "腰": 2402, + "腸": 2403, + "腹": 2404, + "腾": 2405, + "腿": 2406, + "膀": 2407, + "膏": 2408, + "膜": 2409, + "膝": 2410, + "膽": 2411, + "臀": 2412, + "臟": 2413, + "臣": 2414, + "臨": 2415, + "自": 2416, + "至": 2417, + "臺": 2418, + "舀": 2419, + "興": 2420, + "舎": 2421, + "航": 2422, + "般": 2423, + "艘": 2424, + "色": 2425, + "艾": 2426, + "节": 2427, + "芋": 2428, + "芍": 2429, + "芎": 2430, + "芒": 2431, + "芩": 2432, + "芪": 2433, + "芭": 2434, + "花": 2435, + "芷": 2436, + "芸": 2437, + "芹": 2438, + "苁": 2439, + "苈": 2440, + "苏": 2441, + "苓": 2442, + "苦": 2443, + "英": 2444, + "苹": 2445, + "茅": 2446, + "茯": 2447, + "茱": 2448, + "茵": 2449, + "茶": 2450, + "草": 2451, + "荔": 2452, + "荣": 2453, + "荧": 2454, + "药": 2455, + "荷": 2456, + "荽": 2457, + "莪": 2458, + "莫": 2459, + "获": 2460, + "莹": 2461, + "菊": 2462, + "菌": 2463, + "菍": 2464, + "菔": 2465, + "菖": 2466, + "菜": 2467, + "菟": 2468, + "華": 2469, + "萄": 2470, + "萝": 2471, + "萧": 2472, + "萸": 2473, + "落": 2474, + "葉": 2475, + "葛": 2476, + "葡": 2477, + "葯": 2478, + "葶": 2479, + "蒋": 2480, + "蒙": 2481, + "蒜": 2482, + "蒲": 2483, + "蒸": 2484, + "蓉": 2485, + "蓝": 2486, + "蓮": 2487, + "蔵": 2488, + "蔻": 2489, + "蕉": 2490, + "薈": 2491, + "薑": 2492, + "薬": 2493, + "藏": 2494, + "藤": 2495, + "藥": 2496, + "藿": 2497, + "蘆": 2498, + "蘇": 2499, + "蘋": 2500, + "蘭": 2501, + "虎": 2502, + "虑": 2503, + "處": 2504, + "虚": 2505, + "虛": 2506, + "虫": 2507, + "蚣": 2508, + "蚪": 2509, + "蚱": 2510, + "蛋": 2511, + "蛎": 2512, + "蛙": 2513, + "蛭": 2514, + "蛸": 2515, + "蜈": 2516, + "蜗": 2517, + "蜜": 2518, + "蜡": 2519, + "蜢": 2520, + "蜥": 2521, + "蜴": 2522, + "蝌": 2523, + "蝥": 2524, + "螃": 2525, + "融": 2526, + "螵": 2527, + "蟹": 2528, + "血": 2529, + "行": 2530, + "衍": 2531, + "術": 2532, + "衛": 2533, + "衣": 2534, + 
"表": 2535, + "袋": 2536, + "袖": 2537, + "袜": 2538, + "被": 2539, + "装": 2540, + "補": 2541, + "裤": 2542, + "裹": 2543, + "西": 2544, + "要": 2545, + "見": 2546, + "規": 2547, + "視": 2548, + "覚": 2549, + "親": 2550, + "覺": 2551, + "见": 2552, + "观": 2553, + "规": 2554, + "视": 2555, + "觉": 2556, + "角": 2557, + "解": 2558, + "言": 2559, + "訂": 2560, + "計": 2561, + "討": 2562, + "記": 2563, + "許": 2564, + "診": 2565, + "誉": 2566, + "誌": 2567, + "認": 2568, + "誘": 2569, + "語": 2570, + "誤": 2571, + "読": 2572, + "誰": 2573, + "調": 2574, + "談": 2575, + "論": 2576, + "證": 2577, + "議": 2578, + "讀": 2579, + "讓": 2580, + "计": 2581, + "认": 2582, + "讨": 2583, + "训": 2584, + "记": 2585, + "论": 2586, + "评": 2587, + "诅": 2588, + "诊": 2589, + "词": 2590, + "试": 2591, + "诗": 2592, + "话": 2593, + "该": 2594, + "误": 2595, + "诱": 2596, + "说": 2597, + "请": 2598, + "诺": 2599, + "课": 2600, + "调": 2601, + "谈": 2602, + "谢": 2603, + "谨": 2604, + "豁": 2605, + "豆": 2606, + "象": 2607, + "貝": 2608, + "貞": 2609, + "負": 2610, + "責": 2611, + "費": 2612, + "賓": 2613, + "質": 2614, + "贝": 2615, + "贞": 2616, + "贡": 2617, + "责": 2618, + "败": 2619, + "货": 2620, + "贴": 2621, + "贵": 2622, + "费": 2623, + "赔": 2624, + "赢": 2625, + "赤": 2626, + "赭": 2627, + "起": 2628, + "超": 2629, + "趣": 2630, + "足": 2631, + "跟": 2632, + "路": 2633, + "跳": 2634, + "踩": 2635, + "蹲": 2636, + "躍": 2637, + "身": 2638, + "車": 2639, + "転": 2640, + "軽": 2641, + "车": 2642, + "轨": 2643, + "转": 2644, + "轮": 2645, + "辑": 2646, + "输": 2647, + "辛": 2648, + "辨": 2649, + "辯": 2650, + "边": 2651, + "辺": 2652, + "辽": 2653, + "迁": 2654, + "过": 2655, + "运": 2656, + "返": 2657, + "这": 2658, + "进": 2659, + "连": 2660, + "述": 2661, + "追": 2662, + "退": 2663, + "逃": 2664, + "逆": 2665, + "选": 2666, + "透": 2667, + "逐": 2668, + "通": 2669, + "造": 2670, + "連": 2671, + "進": 2672, + "逻": 2673, + "遂": 2674, + "遍": 2675, + "過": 2676, + "道": 2677, + "達": 2678, + "違": 2679, + "遠": 2680, + "適": 2681, + "遮": 2682, + "避": 2683, + "邈": 2684, + "邕": 2685, + "那": 2686, + "邪": 2687, + "邮": 2688, 
+ "郁": 2689, + "郊": 2690, + "郑": 2691, + "部": 2692, + "郭": 2693, + "都": 2694, + "配": 2695, + "酒": 2696, + "酸": 2697, + "醇": 2698, + "醋": 2699, + "醒": 2700, + "醫": 2701, + "里": 2702, + "重": 2703, + "野": 2704, + "量": 2705, + "金": 2706, + "錄": 2707, + "錦": 2708, + "録": 2709, + "鐸": 2710, + "钟": 2711, + "钢": 2712, + "钥": 2713, + "钱": 2714, + "铅": 2715, + "铐": 2716, + "银": 2717, + "链": 2718, + "锁": 2719, + "锥": 2720, + "键": 2721, + "镜": 2722, + "镰": 2723, + "長": 2724, + "长": 2725, + "門": 2726, + "閃": 2727, + "間": 2728, + "閩": 2729, + "閾": 2730, + "關": 2731, + "门": 2732, + "闭": 2733, + "问": 2734, + "闲": 2735, + "间": 2736, + "闷": 2737, + "闸": 2738, + "闻": 2739, + "阈": 2740, + "队": 2741, + "防": 2742, + "阳": 2743, + "阵": 2744, + "阻": 2745, + "附": 2746, + "际": 2747, + "陈": 2748, + "限": 2749, + "陕": 2750, + "陜": 2751, + "院": 2752, + "除": 2753, + "险": 2754, + "陰": 2755, + "陳": 2756, + "陽": 2757, + "階": 2758, + "随": 2759, + "隐": 2760, + "隨": 2761, + "隻": 2762, + "难": 2763, + "雄": 2764, + "集": 2765, + "雙": 2766, + "雜": 2767, + "雨": 2768, + "雪": 2769, + "雲": 2770, + "零": 2771, + "雷": 2772, + "電": 2773, + "需": 2774, + "震": 2775, + "青": 2776, + "静": 2777, + "非": 2778, + "面": 2779, + "鞋": 2780, + "韓": 2781, + "韦": 2782, + "韩": 2783, + "音": 2784, + "項": 2785, + "順": 2786, + "顆": 2787, + "題": 2788, + "願": 2789, + "類": 2790, + "项": 2791, + "须": 2792, + "预": 2793, + "领": 2794, + "频": 2795, + "颗": 2796, + "题": 2797, + "颜": 2798, + "颠": 2799, + "颤": 2800, + "風": 2801, + "风": 2802, + "飛": 2803, + "食": 2804, + "飮": 2805, + "飯": 2806, + "飲": 2807, + "饺": 2808, + "馆": 2809, + "馒": 2810, + "首": 2811, + "香": 2812, + "验": 2813, + "骗": 2814, + "骨": 2815, + "髎": 2816, + "髓": 2817, + "體": 2818, + "高": 2819, + "鬚": 2820, + "鬱": 2821, + "魔": 2822, + "魚": 2823, + "鴻": 2824, + "鸡": 2825, + "鹽": 2826, + "麥": 2827, + "麦": 2828, + "麩": 2829, + "麻": 2830, + "黃": 2831, + "黄": 2832, + "黑": 2833, + "點": 2834, + "鼠": 2835, + "龙": 2836, + "ꜣ": 2837, + "ꜥ": 2838, + "ꞌ": 2839, + "": 2840, + "": 2841, + "": 
2842, + "": 2843, + "": 2844, + "": 2845, + "": 2846, + "": 2847, + "": 2848, + "": 2849, + "": 2850, + "": 2851, + "": 2852, + "": 2853, + "": 2854, + "": 2855, + "": 2856, + "": 2857, + "": 2858, + "": 2859, + "": 2860, + "": 2861, + "": 2862, + "": 2863, + "": 2864, + "": 2865, + "": 2866, + "": 2867, + "": 2868, + "": 2869, + "": 2870, + "": 2871, + "": 2872, + "": 2873, + "": 2874, + "": 2875, + "": 2876, + "": 2877, + "": 2878, + "": 2879, + "": 2880, + "": 2881, + "": 2882, + "": 2883, + "": 2884, + "": 2885, + "": 2886, + "": 2887, + "": 2888, + "": 2889, + "": 2890, + "": 2891, + "": 2892, + "": 2893, + "": 2894, + "": 2895, + "": 2896, + "": 2897, + "": 2898, + "": 2899, + "": 2900, + "": 2901, + "": 2902, + "": 2903, + "": 2904, + "": 2905, + "": 2906, + "": 2907, + "": 2908, + "": 2909, + "": 2910, + "": 2911, + "": 2912, + "": 2913, + "": 2914, + "": 2915, + "": 2916, + "": 2917, + "": 2918, + "": 2919, + "": 2920, + "": 2921, + "": 2922, + "": 2923, + "": 2924, + "": 2925, + "": 2926, + "": 2927, + "": 2928, + "": 2929, + "": 2930, + "": 2931, + "": 2932, + "": 2933, + "": 2934, + "": 2935, + "": 2936, + "": 2937, + "": 2938, + "": 2939, + "": 2940, + "": 2941, + "": 2942, + "": 2943, + "": 2944, + "": 2945, + "": 2946, + "": 2947, + "": 2948, + "": 2949, + "": 2950, + "": 2951, + "": 2952, + "": 2953, + "": 2954, + "": 2955, + "": 2956, + "": 2957, + "": 2958, + "": 2959, + "": 2960, + "": 2961, + "": 2962, + "": 2963, + "": 2964, + "": 2965, + "": 2966, + "": 2967, + "": 2968, + "": 2969, + "": 2970, + "": 2971, + "": 2972, + "": 2973, + "": 2974, + "": 2975, + "": 2976, + "": 2977, + "": 2978, + "": 2979, + "": 2980, + "": 2981, + "": 2982, + "": 2983, + "": 2984, + "": 2985, + "": 2986, + "": 2987, + "": 2988, + "": 2989, + "": 2990, + "": 2991, + "": 2992, + "": 2993, + "": 2994, + "": 2995, + 
"": 2996, + "": 2997, + "": 2998, + "": 2999, + "": 3000, + "": 3001, + "": 3002, + "": 3003, + "ff": 3004, + "fi": 3005, + "fl": 3006, + "ffi": 3007, + "ffl": 3008, + "﹟": 3009, + "﹠": 3010, + "﹡": 3011, + "﹤": 3012, + "﹥": 3013, + "﹪": 3014, + "": 3015, + "%": 3016, + "&": 3017, + "(": 3018, + ")": 3019, + "*": 3020, + "+": 3021, + ",": 3022, + "-": 3023, + ".": 3024, + "/": 3025, + "3": 3026, + ":": 3027, + ";": 3028, + "<": 3029, + ">": 3030, + "?": 3031, + "F": 3032, + "G": 3033, + "H": 3034, + "M": 3035, + "R": 3036, + "~": 3037, + "ア": 3038, + "£": 3039, + "→": 3040, + "�": 3041, + "𝐜": 3042, + "𝐦": 3043, + "𝐩": 3044, + "𝐱": 3045, + "𝐴": 3046, + "𝐵": 3047, + "𝐶": 3048, + "𝐷": 3049, + "𝐸": 3050, + "𝐹": 3051, + "𝐺": 3052, + "𝐻": 3053, + "𝐼": 3054, + "𝐽": 3055, + "𝐾": 3056, + "𝐿": 3057, + "𝑀": 3058, + "𝑁": 3059, + "𝑂": 3060, + "𝑃": 3061, + "𝑄": 3062, + "𝑅": 3063, + "𝑆": 3064, + "𝑇": 3065, + "𝑈": 3066, + "𝑉": 3067, + "𝑊": 3068, + "𝑋": 3069, + "𝑌": 3070, + "𝑎": 3071, + "𝑏": 3072, + "𝑐": 3073, + "𝑑": 3074, + "𝑒": 3075, + "𝑓": 3076, + "𝑔": 3077, + "𝑖": 3078, + "𝑗": 3079, + "𝑘": 3080, + "𝑙": 3081, + "𝑚": 3082, + "𝑛": 3083, + "𝑜": 3084, + "𝑝": 3085, + "𝑞": 3086, + "𝑟": 3087, + "𝑠": 3088, + "𝑡": 3089, + "𝑢": 3090, + "𝑤": 3091, + "𝑥": 3092, + "𝑦": 3093, + "𝑧": 3094, + "𝛼": 3095, + "𝛽": 3096, + "𝛾": 3097, + "𝛿": 3098, + "𝜀": 3099, + "𝜂": 3100, + "𝜃": 3101, + "𝜅": 3102, + "𝜆": 3103, + "𝜇": 3104, + "𝜉": 3105, + "𝜋": 3106, + "𝜌": 3107, + "𝜎": 3108, + "𝜏": 3109, + "𝜑": 3110, + "𝜒": 3111, + "𝜓": 3112, + "𝜔": 3113, + "𝜙": 3114, + "𝜷": 3115, + "𝜼": 3116, + "𝝁": 3117, + "𝝨": 3118, + "𝟎": 3119, + "𝟏": 3120, + "󳨀": 3121, + "󳰀": 3122, + "󸀠": 3123, + "##1": 3124, + "##7": 3125, + "##0": 3126, + "##5": 3127, + "##8": 3128, + "##e": 3129, + "##c": 3130, + "##p": 3131, + "##t": 3132, + "##i": 3133, + "##v": 3134, + "##n": 3135, + "##s": 3136, + "##3": 3137, + "##2": 3138, + "##m": 3139, + "##d": 3140, + "##a": 3141, + "##B": 3142, + "##4": 3143, + "##9": 3144, + "##T": 3145, + 
"##H": 3146, + "##E": 3147, + "##R": 3148, + "##M": 3149, + "##_": 3150, + "##N": 3151, + "##A": 3152, + "##b": 3153, + "##g": 3154, + "##l": 3155, + "##P": 3156, + "##u": 3157, + "##h": 3158, + "##o": 3159, + "##r": 3160, + "##C": 3161, + "##G": 3162, + "##6": 3163, + "##S": 3164, + "##W": 3165, + "##D": 3166, + "##U": 3167, + "##O": 3168, + "##f": 3169, + "##K": 3170, + "##Q": 3171, + "##F": 3172, + "##L": 3173, + "##y": 3174, + "##I": 3175, + "##x": 3176, + "##w": 3177, + "##k": 3178, + "##Y": 3179, + "##V": 3180, + "##ǂ": 3181, + "##Φ": 3182, + "##j": 3183, + "##X": 3184, + "##q": 3185, + "##J": 3186, + "##z": 3187, + "##龙": 3188, + "##江": 3189, + "##省": 3190, + "##Z": 3191, + "##α": 3192, + "##%": 3193, + "##)": 3194, + "##<": 3195, + "##/": 3196, + "##*": 3197, + "##,": 3198, + "##§": 3199, + "##β": 3200, + "##▲": 3201, + "##■": 3202, + "##δ": 3203, + "##þ": 3204, + "##Δ": 3205, + "##π": 3206, + "###": 3207, + "##†": 3208, + "##ƒ": 3209, + "##+": 3210, + "##‘": 3211, + "##]": 3212, + "##.": 3213, + "##>": 3214, + "##=": 3215, + "##ª": 3216, + "##fl": 3217, + "##□": 3218, + "##○": 3219, + "##ɑ": 3220, + "##[": 3221, + "##‚": 3222, + "##●": 3223, + "##:": 3224, + "##…": 3225, + "##ϕ": 3226, + "##-": 3227, + "##fi": 3228, + "##$": 3229, + "##ˆ": 3230, + "##£": 3231, + "##(": 3232, + "##Β": 3233, + "##!": 3234, + "##ν": 3235, + "##”": 3236, + "##γ": 3237, + "##士": 3238, + "##✞": 3239, + "##♦": 3240, + "##“": 3241, + "##}": 3242, + "##ł": 3243, + "##ı": 3244, + "##ff": 3245, + "##个": 3246, + "##挑": 3247, + "##尔": 3248, + "##達": 3249, + "##電": 3250, + "##¶": 3251, + "##−": 3252, + "##ʹ": 3253, + "##ψ": 3254, + "##•": 3255, + "##≥": 3256, + "##Ο": 3257, + "##給": 3258, + "##𝑐": 3259, + "##ø": 3260, + "##‡": 3261, + "##—": 3262, + "##{": 3263, + "##–": 3264, + "##&": 3265, + "##∗": 3266, + "##В": 3267, + "##С": 3268, + "##ร": 3269, + "##ะ": 3270, + "##ท": 3271, + "##อ": 3272, + "##ม": 3273, + "##|": 3274, + "##ρ": 3275, + "##ο": 3276, + "##σ": 3277, + "##ε": 3278, + 
"##μ": 3279, + "##η": 3280, + "##'": 3281, + "##;": 3282, + "##√": 3283, + "##\"": 3284, + "##œ": 3285, + "##น": 3286, + "##ก": 3287, + "##ญ": 3288, + "##ช": 3289, + "##า": 3290, + "##θ": 3291, + "##ᶜ": 3292, + "##µ": 3293, + "##˗": 3294, + "##∞": 3295, + "##Þ": 3296, + "##?": 3297, + "##ɛ": 3298, + "##·": 3299, + "##手": 3300, + "##④": 3301, + "##≤": 3302, + "##χ": 3303, + "##え": 3304, + "##е": 3305, + "##О": 3306, + "##Н": 3307, + "##て": 3308, + "##あ": 3309, + "##る": 3310, + "##�": 3311, + "##ω": 3312, + "##º": 3313, + "##’": 3314, + "##°": 3315, + "##λ": 3316, + "##€": 3317, + "##": 3318, + "##⋅": 3319, + "##→": 3320, + "##`": 3321, + "##Θ": 3322, + "##φ": 3323, + "##\\": 3324, + "##↓": 3325, + "##◦": 3326, + "##Ž": 3327, + "##": 3328, + "##⬆": 3329, + "##ffi": 3330, + "##ß": 3331, + "##^": 3332, + "##~": 3333, + "##さ": 3334, + "##し": 3335, + "##く": 3336, + "##な": 3337, + "##い": 3338, + "##ˇ": 3339, + "##ζ": 3340, + "##˚": 3341, + "##▼": 3342, + "##ᵟ": 3343, + "##³": 3344, + "##ƛ": 3345, + "##⑩": 3346, + "##安": 3347, + "##行": 3348, + "##政": 3349, + "##∆": 3350, + "##Æ": 3351, + "##ξ": 3352, + "##ι": 3353, + "##ς": 3354, + "##⑤": 3355, + "##Ф": 3356, + "##×": 3357, + "##♯": 3358, + "##∑": 3359, + "##҂": 3360, + "##υ": 3361, + "##τ": 3362, + "##κ": 3363, + "##Α": 3364, + "##高": 3365, + "##兴": 3366, + "##或": 3367, + "##焦": 3368, + "##◊": 3369, + "##": 3370, + "##∫": 3371, + "##ᶷ": 3372, + "##Ω": 3373, + "##Ψ": 3374, + "##我": 3375, + "##在": 3376, + "##做": 3377, + "##事": 3378, + "##情": 3379, + "##時": 3380, + "##ɓ": 3381, + "##ð": 3382, + "##ꞌ": 3383, + "##↑": 3384, + "##ᵃ": 3385, + "##⇒": 3386, + "##ffl": 3387, + "##𝑃": 3388, + "##‑": 3389, + "##Ϯ": 3390, + "##±": 3391, + "##ᅦ": 3392, + "##ᆸ": 3393, + "##ᄐ": 3394, + "##ᅩ": 3395, + "##ᄉ": 3396, + "##ᅳ": 3397, + "##ᄑ": 3398, + "##ᅵ": 3399, + "##ᄅ": 3400, + "##ᅡ": 3401, + "##ᄌ": 3402, + "##ᆼ": 3403, + "##恋": 3404, + "##令": 3405, + "##片": 3406, + "##昌": 3407, + "##¥": 3408, + "##★": 3409, + "##„": 3410, + "##Ι": 3411, + 
"##♀": 3412, + "##了": 3413, + "##玩": 3414, + "##得": 3415, + "##愉": 3416, + "##快": 3417, + "##麩": 3418, + "##态": 3419, + "##@": 3420, + "##Ⅲ": 3421, + "##こ": 3422, + "##う": 3423, + "##о": 3424, + "##з": 3425, + "##в": 3426, + "##а": 3427, + "##я": 3428, + "##ᄎ": 3429, + "##ᆷ": 3430, + "##花": 3431, + "##枝": 3432, + "##)": 3433, + "##ᅮ": 3434, + "##Γ": 3435, + "##♂": 3436, + "##∧": 3437, + "##经": 3438, + "##常": 3439, + "##会": 3440, + "##因": 3441, + "##为": 3442, + "##上": 3443, + "##网": 3444, + "##到": 3445, + "##半": 3446, + "##夜": 3447, + "##而": 3448, + "##失": 3449, + "##眠": 3450, + "##么": 3451, + "##↔": 3452, + "##Е": 3453, + "##𝜂": 3454, + "##˜": 3455, + "##地": 3456, + "##黃": 3457, + "##ꜥ": 3458, + "##˘": 3459, + "##鬱": 3460, + "##化": 3461, + "##火": 3462, + "##𝑢": 3463, + "##‐": 3464, + "##胃": 3465, + "##不": 3466, + "##和": 3467, + "##患": 3468, + "##ᶲ": 3469, + "##〕": 3470, + "##𝑁": 3471, + "##邪": 3472, + "##东": 3473, + "##½": 3474, + "##药": 3475, + "##单": 3476, + "##ɸ": 3477, + "##⊝": 3478, + "##д": 3479, + "##р": 3480, + "##и": 3481, + "##南": 3482, + "##‖": 3483, + "##ɒ": 3484, + "##𝜑": 3485, + "##ᄀ": 3486, + "##ᅥ": 3487, + "##ᄂ": 3488, + "##∼": 3489, + "##Ø": 3490, + "##æ": 3491, + "##※": 3492, + "##ᄋ": 3493, + "##′": 3494, + "##Χ": 3495, + "##⑦": 3496, + "##⑧": 3497, + "##⑨": 3498, + "##⑪": 3499, + "##⑯": 3500, + "##": 3501, + "##≧": 3502, + "##ŧ": 3503, + "##画": 3504, + "##∬": 3505, + "##𝜋": 3506, + "##体": 3507, + "##‰": 3508, + "##ᄇ": 3509, + "##ᅱ": 3510, + "##»": 3511, + "##Ⅱ": 3512, + "##ส": 3513, + "##ย": 3514, + "##เ": 3515, + "##ข": 3516, + "##ว": 3517, + "##¼": 3518, + "##Μ": 3519, + "##✰": 3520, + "##々": 3521, + "##に": 3522, + "##前": 3523, + "##進": 3524, + "##ん": 3525, + "##と": 3526, + "##特": 3527, + "##徴": 3528, + "##か": 3529, + "##บ": 3530, + "##แ": 3531, + "##ด": 3532, + "##ง": 3533, + "##ᶿ": 3534, + "##融": 3535, + "##桌": 3536, + "##子": 3537, + "##水": 3538, + "##―": 3539, + "##擾": 3540, + "##+": 3541, + "##△": 3542, + "##≈": 3543, + "##▾": 3544, + 
"##«": 3545, + "##图": 3546, + "##Σ": 3547, + "##新": 3548, + "##一": 3549, + "##轮": 3550, + "##优": 3551, + "##秀": 3552, + "##人": 3553, + "##评": 3554, + "##比": 3555, + "##活": 3556, + "##动": 3557, + "##中": 3558, + "##Ɨ": 3559, + "##【": 3560, + "##": 3561, + "##Τ": 3562, + "##Ϟ": 3563, + "##ˊ": 3564, + "##¹": 3565, + "##ไ": 3566, + "##祭": 3567, + "##り": 3568, + "##Ⅰ": 3569, + "##开": 3570, + "##钱": 3571, + "##包": 3572, + "##必": 3573, + "##须": 3574, + "##所": 3575, + "##有": 3576, + "##的": 3577, + "##认": 3578, + "##可": 3579, + "##介": 3580, + "##賓": 3581, + "##逻": 3582, + "##辑": 3583, + "##结": 3584, + "##构": 3585, + "##非": 3586, + "##严": 3587, + "##谨": 3588, + "##清": 3589, + "##晰": 3590, + "##經": 3591, + "##素": 3592, + "##問": 3593, + "##積": 3594, + "##腰": 3595, + "##痛": 3596, + "##": 3597, + "##呼": 3598, + "##吸": 3599, + "##综": 3600, + "##合": 3601, + "##症": 3602, + "##息": 3603, + "##全": 3604, + "##血": 3605, + "##藤": 3606, + "##⟨": 3607, + "##布": 3608, + "##覺": 3609, + "##想": 3610, + "##去": 3611, + "##美": 3612, + "##好": 3613, + "##ʻ": 3614, + "##➔": 3615, + "##⩾": 3616, + "##": 3617, + "##\u0013": 3618, + "##疼": 3619, + "##哭": 3620, + "##斑": 3621, + "##蝥": 3622, + "##山": 3623, + "##具": 3624, + "##面": 3625, + "##条": 3626, + "##睑": 3627, + "##张": 3628, + "##被": 3629, + "##智": 3630, + "##属": 3631, + "##于": 3632, + "##以": 3633, + "##下": 3634, + "##哪": 3635, + "##类": 3636, + "##别": 3637, + "##⋇": 3638, + "##⑵": 3639, + "##⑷": 3640, + "##⑸": 3641, + "##⑹": 3642, + "##⑺": 3643, + "##⑻": 3644, + "##⑼": 3645, + "##⑿": 3646, + "##⒆": 3647, + "##®": 3648, + "##¢": 3649, + "##ʼ": 3650, + "##②": 3651, + "##③": 3652, + "##ɪ": 3653, + "##ℓ": 3654, + "##很": 3655, + "##專": 3656, + "##心": 3657, + "##信": 3658, + "##𝑛": 3659, + "##⑶": 3660, + "##⑽": 3661, + "##⑾": 3662, + "##⒀": 3663, + "##⒁": 3664, + "##⒂": 3665, + "##⒃": 3666, + "##𝐿": 3667, + "##Η": 3668, + "##": 3669, + "##险": 3670, + "##沟": 3671, + "##通": 3672, + "##魔": 3673, + "##烦": 3674, + "##с": 3675, + "##л": 3676, + "##т": 3677, + 
"##∠": 3678, + "##⊤": 3679, + "##ϰ": 3680, + "##˦": 3681, + "##­": 3682, + "##柏": 3683, + "##象": 3684, + "##體": 3685, + "##質": 3686, + "##辨": 3687, + "##證": 3688, + "##苹": 3689, + "##果": 3690, + "##ג": 3691, + "##未": 3692, + "##熟": 3693, + "##児": 3694, + "##網": 3695, + "##膜": 3696, + "##黄": 3697, + "##‒": 3698, + "##①": 3699, + "##碗": 3700, + "##刺": 3701, + "##激": 3702, + "##时": 3703, + "##无": 3704, + "##反": 3705, + "##应": 3706, + "##身": 3707, + "##呈": 3708, + "##肌": 3709, + "##阵": 3710, + "##ə": 3711, + "##道": 3712, + "##у": 3713, + "##穗": 3714, + "##金": 3715, + "##粟": 3716, + "##兰": 3717, + "##ϐ": 3718, + "##ˠ": 3719, + "##見": 3720, + "##⑥": 3721, + "##⊕": 3722, + "##仍": 3723, + "##保": 3724, + "##持": 3725, + "##眼": 3726, + "##闭": 3727, + "##胡": 3728, + "##谈": 3729, + "##𝛽": 3730, + "##藏": 3731, + "##自": 3732, + "##治": 3733, + "##区": 3734, + "##": 3735, + "##": 3736, + "##": 3737, + "##": 3738, + "##": 3739, + "##线": 3740, + "##候": 3741, + "##总": 3742, + "##着": 3743, + "##疆": 3744, + "##维": 3745, + "##吾": 3746, + "##ꜣ": 3747, + "##Π": 3748, + "##ᆯ": 3749, + "##⊣": 3750, + "##∙": 3751, + "##骗": 3752, + "##岁": 3753, + "##急": 3754, + "##ь": 3755, + "##к": 3756, + "##目": 3757, + "##大": 3758, + "##⇧": 3759, + "##◆": 3760, + "##健": 3761, + "##の": 3762, + "##専": 3763, + "##門": 3764, + "##職": 3765, + "##者": 3766, + "##同": 3767, + "##様": 3768, + "##Ŧ": 3769, + "##ᄒ": 3770, + "##ᅧ": 3771, + "##视": 3772, + "##力": 3773, + "##н": 3774, + "##ю": 3775, + "##х": 3776, + "##報": 3777, + "##𝑒": 3778, + "##イ": 3779, + "##ン": 3780, + "##コ": 3781, + "##ー": 3782, + "##ト": 3783, + "##𝑖": 3784, + "##ɨ": 3785, + "##´": 3786, + "##颠": 3787, + "##ϯ": 3788, + "##ч": 3789, + "##": 3790, + "##*": 3791, + "##": 3792, + "##▪": 3793, + "##细": 3794, + "##¡": 3795, + "##ᅟ": 3796, + "##⊥": 3797, + "##▀": 3798, + "##但": 3799, + "##睁": 3800, + "##←": 3801, + "##雪": 3802, + "##™": 3803, + "##箱": 3804, + "##房": 3805, + "##颤": 3806, + "##散": 3807, + "##☆": 3808, + "##∂": 3809, + "##梅": 3810, + 
"##◇": 3811, + "##ϵ": 3812, + "##⇑": 3813, + "##皮": 3814, + "##Ε": 3815, + "##〉": 3816, + "##生": 3817, + "##能": 3818, + "##Ԑ": 3819, + "##┼": 3820, + "##饺": 3821, + "##西": 3822, + "##普": 3823, + "##ห": 3824, + "##›": 3825, + "##‎": 3826, + "##": 3827, + "##ᅲ": 3828, + "##ᅢ": 3829, + "##炮": 3830, + "##叶": 3831, + "##榕": 3832, + "##ˉ": 3833, + "##定": 3834, + "##位": 3835, + "##": 3836, + "##𝐶": 3837, + "##争": 3838, + "##吵": 3839, + "##ょ": 3840, + "##拌": 3841, + "##咖": 3842, + "##啡": 3843, + "##钥": 3844, + "##匙": 3845, + "##‥": 3846, + "##筑": 3847, + "##⋆": 3848, + "##脑": 3849, + "##⁹": 3850, + "##𝑥": 3851, + "##消": 3852, + "##防": 3853, + "##员": 3854, + "##小": 3855, + "##王": 3856, + "##看": 3857, + "##星": 3858, + "##出": 3859, + "##现": 3860, + "##那": 3861, + "##瞬": 3862, + "##间": 3863, + "##▽": 3864, + "##ʌ": 3865, + "##М": 3866, + "##\u0018": 3867, + "##法": 3868, + "##由": 3869, + "##对": 3870, + "##宇": 3871, + "##宙": 3872, + "##壮": 3873, + "##惊": 3874, + "##叹": 3875, + "##已": 3876, + "##∝": 3877, + "##А": 3878, + "##性": 3879, + "##˝": 3880, + "##问": 3881, + "##": 3882, + "##": 3883, + "##": 3884, + "##菔": 3885, + "##附": 3886, + "##伯": 3887, + "##雄": 3888, + "##∘": 3889, + "##灵": 3890, + "##胶": 3891, + "##囊": 3892, + "##┤": 3893, + "##ゅ": 3894, + "##完": 3895, + "##п": 3896, + "##分": 3897, + "##間": 3898, + "##を": 3899, + "##理": 3900, + "##解": 3901, + "##す": 3902, + "##は": 3903, + "##Λ": 3904, + "##∓": 3905, + "##Ѱ": 3906, + "##∥": 3907, + "##制": 3908, + "##薑": 3909, + "##三": 3910, + "##角": 3911, + "##形": 3912, + "##怎": 3913, + "##样": 3914, + "##志": 3915, + "##″": 3916, + "##⨁": 3917, + "##ᵓ": 3918, + "##わ": 3919, + "##店": 3920, + "##多": 3921, + "##几": 3922, + "##❍": 3923, + "##飛": 3924, + "##І": 3925, + "##蝌": 3926, + "##蚪": 3927, + "##长": 3928, + "##之": 3929, + "##后": 3930, + "##变": 3931, + "##成": 3932, + "##只": 3933, + "##芋": 3934, + "##来": 3935, + "##𝑇": 3936, + "##𝑅": 3937, + "##¤": 3938, + "##☨": 3939, + "##工": 3940, + "##与": 3941, + "##社": 3942, + "##色": 3943, + 
"##、": 3944, + "##Ν": 3945, + "##博": 3946, + "##拉": 3947, + "##✓": 3948, + "##​": 3949, + "##纸": 3950, + "##ᛏ": 3951, + "##产": 3952, + "##洲": 3953, + "##氣": 3954, + "##營": 3955, + "##队": 3956, + "##ᅪ": 3957, + "##ᆨ": 3958, + "##ᆫ": 3959, + "##觉": 3960, + "##诱": 3961, + "##发": 3962, + "##电": 3963, + "##і": 3964, + "##ː": 3965, + "##ж": 3966, + "##仁": 3967, + "##ป": 3968, + "##ล": 3969, + "##台": 3970, + "##谢": 3971, + "##组": 3972, + "##学": 3973, + "##检": 3974, + "##查": 3975, + "##¨": 3976, + "##ǁ": 3977, + "##щ": 3978, + "##ʧ": 3979, + "##✔": 3980, + "##Υ": 3981, + "##ƙ": 3982, + "##Р": 3983, + "##斤": 3984, + "##导": 3985, + "##决": 3986, + "##策": 3987, + "##物": 3988, + "##乱": 3989, + "##用": 3990, + "##療": 3991, + "##受": 3992, + "##け": 3993, + "##国": 3994, + "##痩": 3995, + "##号": 3996, + "##¾": 3997, + "##ᵇ": 3998, + "##極": 3999, + "##ƚ": 4000, + "##": 4001, + "##⑭": 4002, + "##ˤ": 4003, + "##ʊ": 4004, + "##𝐹": 4005, + "##К": 4006, + "##基": 4007, + "##ᆺ": 4008, + "##": 4009, + "##┴": 4010, + "##₮": 4011, + "##б": 4012, + "##▫": 4013, + "##竖": 4014, + "##起": 4015, + "##拇": 4016, + "##指": 4017, + "##肢": 4018, + "##伸": 4019, + "##直": 4020, + "##◯": 4021, + "##𝑝": 4022, + "##": 4023, + "##需": 4024, + "##要": 4025, + "##努": 4026, + "##才": 4027, + "##₹": 4028, + "##蛭": 4029, + "##⩽": 4030, + "##讓": 4031, + "##從": 4032, + "##許": 4033, + "##フ": 4034, + "##˅": 4035, + "##桦": 4036, + "##": 4037, + "##𝑑": 4038, + "##科": 4039, + "##技": 4040, + "##展": 4041, + "##限": 4042, + "##公": 4043, + "##司": 4044, + "##进": 4045, + "##✩": 4046, + "##相": 4047, + "##班": 4048, + "##悠": 4049, + "##⟩": 4050, + "##。": 4051, + "##稚": 4052, + "##骨": 4053, + "##程": 4054, + "##师": 4055, + "##说": 4056, + "##这": 4057, + "##框": 4058, + "##架": 4059, + "##▴": 4060, + "##煨": 4061, + "##肉": 4062, + "##豆": 4063, + "##蔻": 4064, + "##⤉": 4065, + "##ƪ": 4066, + "##": 4067, + "##医": 4068, + "##院": 4069, + "##就": 4070, + "##诊": 4071, + "##珠": 4072, + "##ϑ": 4073, + "##ш": 4074, + "##ำ": 4075, + "##唇": 4076, + 
"##膏": 4077, + "##五": 4078, + "##ы": 4079, + "##毛": 4080, + "##巾": 4081, + "##ᶟ": 4082, + "##光": 4083, + "##凝": 4084, + "##术": 4085, + "##⁄": 4086, + "##∀": 4087, + "##│": 4088, + "##𝑦": 4089, + "##𝑙": 4090, + "##∇": 4091, + "##打": 4092, + "##明": 4093, + "##炭": 4094, + "##干": 4095, + "##射": 4096, + "##≡": 4097, + "##": 4098, + "##木": 4099, + "##宁": 4100, + "##病": 4101, + "##方": 4102, + "##論": 4103, + "##,": 4104, + "##∮": 4105, + "##ᶤ": 4106, + "##徒": 4107, + "##Т": 4108, + "##ˣ": 4109, + "##˃": 4110, + "##◉": 4111, + "##𝑗": 4112, + "##²": 4113, + "##思": 4114, + "##≦": 4115, + "##𝛼": 4116, + "##違": 4117, + "##シ": 4118, + "##ア": 4119, + "##黑": 4120, + "##板": 4121, + "##ʞ": 4122, + "##育": 4123, + "##抗": 4124, + "##☹": 4125, + "##ต": 4126, + "##併": 4127, + "##責": 4128, + "##任": 4129, + "##左": 4130, + "##感": 4131, + "##带": 4132, + "##ら": 4133, + "##止": 4134, + "##湯": 4135, + "##柃": 4136, + "##𝑚": 4137, + "##丹": 4138, + "##溪": 4139, + "##货": 4140, + "##ᛎ": 4141, + "##∅": 4142, + "##海": 4143, + "##市": 4144, + "##飮": 4145, + "##": 4146, + "##薈": 4147, + "##リ": 4148, + "##カ": 4149, + "##ハ": 4150, + "##や": 4151, + "##ᵈ": 4152, + "##║": 4153, + "##ɔ": 4154, + "##鞋": 4155, + "##垫": 4156, + "##セ": 4157, + "##芷": 4158, + "##连": 4159, + "##忘": 4160, + "##返": 4161, + "##參": 4162, + "##愧": 4163, + "##状": 4164, + "##軽": 4165, + "##減": 4166, + "##ɫ": 4167, + "##笔": 4168, + "##腿": 4169, + "##": 4170, + "##天": 4171, + "##芩": 4172, + "##対": 4173, + "##零": 4174, + "##度": 4175, + "##飲": 4176, + "##料": 4177, + "##康": 4178, + "##危": 4179, + "##害": 4180, + "##對": 4181, + "##個": 4182, + "##含": 4183, + "##是": 4184, + "##-": 4185, + "##÷": 4186, + "##認": 4187, + "##為": 4188, + "##м": 4189, + "##ʃ": 4190, + "##杞": 4191, + "##ɤ": 4192, + "##寒": 4193, + "##條": 4194, + "##傷": 4195, + "##神": 4196, + "##気": 4197, + "##ル": 4198, + "##╪": 4199, + "##戦": 4200, + "##⇞": 4201, + "##𝜇": 4202, + "##テ": 4203, + "##Ɛ": 4204, + "##ˈ": 4205, + "##牙": 4206, + "##草": 4207, + "##雷": 4208, + "##菌": 4209, + 
"##口": 4210, + "##服": 4211, + "##溶": 4212, + "##液": 4213, + "##参": 4214, + "##量": 4215, + "##等": 4216, + "##内": 4217, + "##ʔ": 4218, + "##ᅬ": 4219, + "##ᆭ": 4220, + "##◎": 4221, + "##─": 4222, + "##牡": 4223, + "##蛎": 4224, + "##ʎ": 4225, + "##⇓": 4226, + "##醒": 4227, + "##阳": 4228, + "##蜴": 4229, + "##⁑": 4230, + "##Ⅴ": 4231, + "##": 4232, + "##部": 4233, + "##校": 4234, + "##Ш": 4235, + "##肥": 4236, + "##沫": 4237, + "##欢": 4238, + "##罪": 4239, + "##ね": 4240, + "##敗": 4241, + "##落": 4242, + "##胆": 4243, + "##よ": 4244, + "##守": 4245, + "##ᅣ": 4246, + "##年": 4247, + "##葯": 4248, + "##集": 4249, + "##団": 4250, + "##☑": 4251, + "##橘": 4252, + "##": 4253, + "##〈": 4254, + "##": 4255, + "##実": 4256, + "##現": 4257, + "##誌": 4258, + "##业": 4259, + "##余": 4260, + "##爱": 4261, + "##休": 4262, + "##闲": 4263, + "##˙": 4264, + "##ѱ": 4265, + "##⁎": 4266, + "##∈": 4267, + "##ᅨ": 4268, + "##∖": 4269, + "##ᶽ": 4270, + "##咽": 4271, + "##困": 4272, + "##难": 4273, + "##ⱡ": 4274, + "##ᅯ": 4275, + "##ᆵ": 4276, + "##ま": 4277, + "##茱": 4278, + "##萸": 4279, + "##底": 4280, + "##オ": 4281, + "##レ": 4282, + "##造": 4283, + "##影": 4284, + "##℅": 4285, + "##𝜔": 4286, + "##Ɵ": 4287, + "##⋯": 4288, + "##愤": 4289, + "##怒": 4290, + "##绪": 4291, + "##牛": 4292, + "##尘": 4293, + "##": 4294, + "##✝": 4295, + "##⊢": 4296, + "##琴": 4297, + "##ใ": 4298, + "##宜": 4299, + "##膝": 4300, + "##睛": 4301, + "##侧": 4302, + "##瞳": 4303, + "##孔": 4304, + "##放": 4305, + "##固": 4306, + "##˂": 4307, + "##ʈ": 4308, + "##球": 4309, + "##善": 4310, + "##减": 4311, + "##压": 4312, + "##": 4313, + "##紊": 4314, + "##周": 4315, + "##辺": 4316, + "##ᅰ": 4317, + "##ᆲ": 4318, + "##们": 4319, + "##糟": 4320, + "##糕": 4321, + "##拭": 4322, + "##熱": 4323, + "##该": 4324, + "##楝": 4325, + "##机": 4326, + "##控": 4327, + "##⨕": 4328, + "##˩": 4329, + "##脾": 4330, + "##气": 4331, + "##跟": 4332, + "##随": 4333, + "##書": 4334, + "##极": 4335, + "##幸": 4336, + "##福": 4337, + "##決": 4338, + "##⌃": 4339, + "##つ": 4340, + "##き": 4341, + "##➢": 4342, + 
"##": 4343, + "##׀": 4344, + "##▯": 4345, + "##邮": 4346, + "##费": 4347, + "##変": 4348, + "##✪": 4349, + "##翹": 4350, + "##": 4351, + "##Ҩ": 4352, + "##屈": 4353, + "##曲": 4354, + "##ス": 4355, + "##均": 4356, + "##存": 4357, + "##華": 4358, + "##威": 4359, + "##兩": 4360, + "##虛": 4361, + "##件": 4362, + "##𝜃": 4363, + "##ᅫ": 4364, + "##ᆾ": 4365, + "##苁": 4366, + "##蓉": 4367, + "##段": 4368, + "##⇩": 4369, + "##戟": 4370, + "##传": 4371, + "##播": 4372, + "##记": 4373, + "##忆": 4374, + "##题": 4375, + "##ᵖ": 4376, + "##": 4377, + "##": 4378, + "##": 4379, + "##✦": 4380, + "##式": 4381, + "##<": 4382, + "##十": 4383, + "##荷": 4384, + "##刀": 4385, + "##写": 4386, + "##真": 4387, + "##:": 4388, + "##И": 4389, + "##チ": 4390, + "##正": 4391, + "##Ⓡ": 4392, + "##菜": 4393, + "##母": 4394, + "##た": 4395, + "##ĸ": 4396, + "##Ξ": 4397, + "##ク": 4398, + "##マ": 4399, + "##作": 4400, + "##容": 4401, + "##易": 4402, + "##疲": 4403, + "##倦": 4404, + "##眾": 4405, + "##討": 4406, + "##厭": 4407, + "##疗": 4408, + "##¸": 4409, + "##律": 4410, + "##桂": 4411, + "##♭": 4412, + "##姿": 4413, + "##郁": 4414, + "##夏": 4415, + "##回": 4416, + "##族": 4417, + "##♠": 4418, + "##闻": 4419, + "##联": 4420, + "##胱": 4421, + "##俞": 4422, + "##脸": 4423, + "##救": 4424, + "##¬": 4425, + "##厚": 4426, + "##樸": 4427, + "##𝐻": 4428, + "##⁸": 4429, + "##ᵡ": 4430, + "##計": 4431, + "##畫": 4432, + "##屉": 4433, + "##值": 4434, + "##腑": 4435, + "##▶": 4436, + "##П": 4437, + "##帯": 4438, + "##蓮": 4439, + "##毒": 4440, + "##家": 4441, + "##效": 4442, + "##𝛾": 4443, + "##": 4444, + "##典": 4445, + "##ɣ": 4446, + "##喜": 4447, + "##⎥": 4448, + "##其": 4449, + "##⊖": 4450, + "##ᄍ": 4451, + "##ᄆ": 4452, + "##売": 4453, + "##鼠": 4454, + "##♣": 4455, + "##浴": 4456, + "##元": 4457, + "##白": 4458, + "##话": 4459, + "##โ": 4460, + "##亞": 4461, + "##诺": 4462, + "##贝": 4463, + "##奖": 4464, + "##最": 4465, + "##栗": 4466, + "##磁": 4467, + "##炉": 4468, + "##ソ": 4469, + "##": 4470, + "##お": 4471, + "##⑮": 4472, + "##ふ": 4473, + "##": 4474, + "##⤈": 4475, + 
"##耽": 4476, + "##误": 4477, + "##ᶴ": 4478, + "##膽": 4479, + "##怯": 4480, + "##▵": 4481, + "##偽": 4482, + "##型": 4483, + "##肺": 4484, + "##炎": 4485, + "##遍": 4486, + "##¿": 4487, + "##岳": 4488, + "##功": 4489, + "##御": 4490, + "##遮": 4491, + "##掩": 4492, + "##𝑘": 4493, + "##共": 4494, + "##Ⅸ": 4495, + "##津": 4496, + "##ʉ": 4497, + "##國": 4498, + "##彭": 4499, + "##泰": 4500, + "##镜": 4501, + "##椿": 4502, + "##ェ": 4503, + "##メ": 4504, + "##声": 4505, + "##课": 4506, + "##本": 4507, + "##项": 4508, + "##链": 4509, + "##排": 4510, + "##斥": 4511, + "##ค": 4512, + "##刻": 4513, + "##池": 4514, + "##塘": 4515, + "##里": 4516, + "##蹲": 4517, + "##文": 4518, + "##": 4519, + "##⁰": 4520, + "##۹": 4521, + "##➍": 4522, + "##藥": 4523, + "##车": 4524, + "##₌": 4525, + "##郭": 4526, + "##月": 4527, + "##∣": 4528, + "##拳": 4529, + "##字": 4530, + "##势": 4531, + "##菖": 4532, + "##蒲": 4533, + "##些": 4534, + "##趣": 4535, + "##Ρ": 4536, + "##振": 4537, + "##获": 4538, + "##绝": 4539, + "##吃": 4540, + "##後": 4541, + "##": 4542, + "##": 4543, + "##ʺ": 4544, + "##✶": 4545, + "##土": 4546, + "##ϒ": 4547, + "##醫": 4548, + "##案": 4549, + "##封": 4550, + "##𝐷": 4551, + "##规": 4552, + "##则": 4553, + "##㎖": 4554, + "##ʤ": 4555, + "##𝐸": 4556, + "##𝐱": 4557, + "##肃": 4558, + "##根": 4559, + "##洗": 4560, + "##衣": 4561, + "##粉": 4562, + "##室": 4563, + "##G": 4564, + "##❷": 4565, + "##枯": 4566, + "##¦": 4567, + "##も": 4568, + "##⑫": 4569, + "##đ": 4570, + "##ᶧ": 4571, + "##首": 4572, + "##烏": 4573, + "##˄": 4574, + "##": 4575, + "##峰": 4576, + "##▓": 4577, + "##二": 4578, + "##纤": 4579, + "##右": 4580, + "##務": 4581, + "##竹": 4582, + "##冬": 4583, + "##叫": 4584, + "##ᶢ": 4585, + "##更": 4586, + "##转": 4587, + "##移": 4588, + "##注": 4589, + "##意": 4590, + "##塑": 4591, + "##述": 4592, + "##虫": 4593, + "##⊂": 4594, + "##朮": 4595, + "##㎣": 4596, + "##": 4597, + "##": 4598, + "##如": 4599, + "##催": 4600, + "##萄": 4601, + "##み": 4602, + "##𝑡": 4603, + "##馒": 4604, + "##头": 4605, + "##พ": 4606, + "##₀": 4607, + "##蒜": 4608, + 
"##ᄃ": 4609, + "##籽": 4610, + "##野": 4611, + "##桐": 4612, + "##使": 4613, + "##络": 4614, + "##它": 4615, + "##创": 4616, + "##旺": 4617, + "##侣": 4618, + "##∩": 4619, + "##芎": 4620, + "##題": 4621, + "##引": 4622, + "##陽": 4623, + "##動": 4624, + "##態": 4625, + "##除": 4626, + "##外": 4627, + "##他": 4628, + "##原": 4629, + "##杯": 4630, + "##朝": 4631, + "##": 4632, + "##油": 4633, + "##门": 4634, + "##按": 4635, + "##照": 4636, + "##嘱": 4637, + "##れ": 4638, + "##㎍": 4639, + "##": 4640, + "##欠": 4641, + "##☐": 4642, + "##緑": 4643, + "##脂": 4644, + "##嚢": 4645, + "##管": 4646, + "##己": 4647, + "##路": 4648, + "##❖": 4649, + "##⬇": 4650, + "##ө": 4651, + "##寄": 4652, + "##丝": 4653, + "##锥": 4654, + "##轨": 4655, + "##處": 4656, + "##狀": 4657, + "##ф": 4658, + "##↕": 4659, + "##Κ": 4660, + "##窗": 4661, + "##帘": 4662, + "##芹": 4663, + "##⌘": 4664, + "##背": 4665, + "##书": 4666, + "##棗": 4667, + "##風": 4668, + "##": 4669, + "##ʷ": 4670, + "##學": 4671, + "##柱": 4672, + "##妈": 4673, + "##錄": 4674, + "##点": 4675, + "##𝐦": 4676, + "##紙": 4677, + "##助": 4678, + "##₊": 4679, + "##✳": 4680, + "##ŋ": 4681, + "##階": 4682, + "##適": 4683, + "##交": 4684, + "##苏": 4685, + "##华": 4686, + "##企": 4687, + "##纹": 4688, + "##ち": 4689, + "##把": 4690, + "##弄": 4691, + "##泻": 4692, + "##螵": 4693, + "##蛸": 4694, + "##": 4695, + "##论": 4696, + "##¯": 4697, + "##𝑜": 4698, + "##©": 4699, + "##ϭ": 4700, + "##蒙": 4701, + "##古": 4702, + "##ヒ": 4703, + "##林": 4704, + "##户": 4705, + "##ϱ": 4706, + "##拖": 4707, + "##肝": 4708, + "##宝": 4709, + "##仲": 4710, + "##Œ": 4711, + "##✘": 4712, + "##Ɯ": 4713, + "##≅": 4714, + "##凰": 4715, + "##": 4716, + "##呆": 4717, + "##你": 4718, + "##预": 4719, + "##期": 4720, + "##富": 4721, + "##商": 4722, + "##李": 4723, + "##楠": 4724, + "##": 4725, + "##徽": 4726, + "##㎠": 4727, + "##": 4728, + "##米": 4729, + "##◘": 4730, + "##浆": 4731, + "##發": 4732, + "##静": 4733, + "##糖": 4734, + "##品": 4735, + "##逃": 4736, + "##避": 4737, + "##实": 4738, + "##Ө": 4739, + "##貞": 4740, + "##酸": 4741, + 
"##枣": 4742, + "##扰": 4743, + "##呵": 4744, + "##誉": 4745, + "##₤": 4746, + "##ɾ": 4747, + "##精": 4748, + "##Ⅳ": 4749, + "##场": 4750, + "##研": 4751, + "##究": 4752, + "##樱": 4753, + "##桃": 4754, + "##𝜅": 4755, + "##断": 4756, + "##免": 4757, + "##孤": 4758, + "##独": 4759, + "##再": 4760, + "##耐": 4761, + "##北": 4762, + "##甘": 4763, + "##そ": 4764, + "##辛": 4765, + "##∏": 4766, + "##苓": 4767, + "##►": 4768, + "##媒": 4769, + "##率": 4770, + "##": 4771, + "##考": 4772, + "##透": 4773, + "##г": 4774, + "##ц": 4775, + "##査": 4776, + "##ͻ": 4777, + "##卫": 4778, + "##材": 4779, + "##興": 4780, + "##𝑠": 4781, + "##伐": 4782, + "##帜": 4783, + "##差": 4784, + "##析": 4785, + "##✗": 4786, + "##﹡": 4787, + "##𝜆": 4788, + "##戈": 4789, + "##": 4790, + "##": 4791, + "##察": 4792, + "##ʾ": 4793, + "##吓": 4794, + "##劲": 4795, + "##当": 4796, + "##选": 4797, + "##劳": 4798, + "##模": 4799, + "##牌": 4800, + "##っ": 4801, + "##𝑈": 4802, + "##へ": 4803, + "##第": 4804, + "##尊": 4805, + "##重": 4806, + "##⋕": 4807, + "##腹": 4808, + "##師": 4809, + "##陜": 4810, + "##∪": 4811, + "##死": 4812, + "##𝑂": 4813, + "##染": 4814, + "##疾": 4815, + "##及": 4816, + "##眨": 4817, + "##日": 4818, + "##⌠": 4819, + "##サ": 4820, + "##隨": 4821, + "##筆": 4822, + "##ᄏ": 4823, + "##╦": 4824, + "##励": 4825, + "##莹": 4826, + "##般": 4827, + "##屑": 4828, + "##约": 4829, + "##卜": 4830, + "##ナ": 4831, + "##硝": 4832, + "##入": 4833, + "##ラ": 4834, + "##腎": 4835, + "##芪": 4836, + "##ϴ": 4837, + "##待": 4838, + "##围": 4839, + "##抱": 4840, + "##怨": 4841, + "##您": 4842, + "##复": 4843, + "##𝟎": 4844, + "##𝟏": 4845, + "##名": 4846, + "##妙": 4847, + "##序": 4848, + "##列": 4849, + "##無": 4850, + "##﹟": 4851, + "##": 4852, + "##住": 4853, + "##髓": 4854, + "##袜": 4855, + "##音": 4856, + "##波": 4857, + "##検": 4858, + "##\u0003": 4859, + "##𝑏": 4860, + "##苈": 4861, + "##灾": 4862, + "##楂": 4863, + "##ԑ": 4864, + "##掉": 4865, + "##姜": 4866, + "##ȹ": 4867, + "##味": 4868, + "##亮": 4869, + "##點": 4870, + "##め": 4871, + "##ほ": 4872, + "##蚣": 4873, + "##": 4874, + 
"##居": 4875, + "##然": 4876, + "##都": 4877, + "##贵": 4878, + "##Х": 4879, + "##嗽": 4880, + "##立": 4881, + "##Ζ": 4882, + "##⎦": 4883, + "##焼": 4884, + "##取": 4885, + "##組": 4886, + "##む": 4887, + "##誰": 4888, + "##孢": 4889, + "##挂": 4890, + "##➞": 4891, + "##璃": 4892, + "##腔": 4893, + "##絲": 4894, + "##银": 4895, + "##柴": 4896, + "##厌": 4897, + "##报": 4898, + "##数": 4899, + "##𝑓": 4900, + "##⁡": 4901, + "##盖": 4902, + "##虑": 4903, + "##☼": 4904, + "##‹": 4905, + "##辯": 4906, + "##少": 4907, + "##言": 4908, + "##Đ": 4909, + "##艘": 4910, + "##儿": 4911, + "##員": 4912, + "##何": 4913, + "##󸀠": 4914, + "##炒": 4915, + "##ケ": 4916, + "##算": 4917, + "##』": 4918, + "##领": 4919, + "##﹪": 4920, + "##济": 4921, + "##民": 4922, + "##扶": 4923, + "##伤": 4924, + "##陰": 4925, + "##河": 4926, + "##車": 4927, + "##ᵐ": 4928, + "##髎": 4929, + "##☥": 4930, + "##晃": 4931, + "##汁": 4932, + "##供": 4933, + "##ᵷ": 4934, + "##℃": 4935, + "##倒": 4936, + "##树": 4937, + "##": 4938, + "##望": 4939, + "##贴": 4940, + "##代": 4941, + "##流": 4942, + "##㎕": 4943, + "##收": 4944, + "##啤": 4945, + "##酒": 4946, + "##逐": 4947, + "##瘀": 4948, + "##汤": 4949, + "##℗": 4950, + "##ʏ": 4951, + "##➤": 4952, + "##关": 4953, + "##": 4954, + "##旦": 4955, + "##烟": 4956, + "##云": 4957, + "##Ѕ": 4958, + "##块": 4959, + "##⒅": 4960, + "##ろ": 4961, + "##见": 4962, + "##": 4963, + "##": 4964, + "##": 4965, + "##": 4966, + "##暇": 4967, + "##楽": 4968, + "##閃": 4969, + "##ᇂ": 4970, + "##𝑎": 4971, + "##業": 4972, + "##提": 4973, + "##臀": 4974, + "##利": 4975, + "##腾": 4976, + "##⊛": 4977, + "##闷": 4978, + "##Ұ": 4979, + "##†": 4980, + "##談": 4981, + "##香": 4982, + "##皂": 4983, + "##☞": 4984, + "##编": 4985, + "##畏": 4986, + "##懼": 4987, + "##芸": 4988, + "##ʠ": 4989, + "##票": 4990, + "##→": 4991, + "##ュ": 4992, + "##қ": 4993, + "##↘": 4994, + "##㎡": 4995, + "##ʵ": 4996, + "##執": 4997, + "##否": 4998, + "##♋": 4999, + "##├": 5000, + "##躍": 5001, + "##⎣": 5002, + "##源": 5003, + "##蟹": 5004, + "##寶": 5005, + "##斗": 5006, + "##歡": 5007, + 
"##長": 5008, + "##僚": 5009, + "##尝": 5010, + "##试": 5011, + "##隐": 5012, + "##瞒": 5013, + "##际": 5014, + "##筝": 5015, + "##孚": 5016, + "##恼": 5017, + "##责": 5018, + "##足": 5019, + "##找": 5020, + "##梗": 5021, + "##稿": 5022, + "##盔": 5023, + "##≨": 5024, + "##↗": 5025, + "##航": 5026, + "##蜢": 5027, + "##石": 5028, + "##松": 5029, + "##ז": 5030, + "##Л": 5031, + "##せ": 5032, + "##泳": 5033, + "##残": 5034, + "##却": 5035, + "##败": 5036, + "##告": 5037, + "##烛": 5038, + "##闸": 5039, + "##準": 5040, + "##備": 5041, + "##兪": 5042, + "##菍": 5043, + "##▬": 5044, + "##⑬": 5045, + "##验": 5046, + "##乐": 5047, + "##歸": 5048, + "##鬚": 5049, + "##君": 5050, + "##寿": 5051, + "##堂": 5052, + "##计": 5053, + "##ₓ": 5054, + "##屋": 5055, + "##涤": 5056, + "##氏": 5057, + "##训": 5058, + "##希": 5059, + "##調": 5060, + "##整": 5061, + "##榴": 5062, + "##項": 5063, + "##劝": 5064, + "##朋": 5065, + "##友": 5066, + "##郊": 5067, + "##游": 5068, + "##♥": 5069, + "##床": 5070, + "##✵": 5071, + "##\u0007": 5072, + "##耆": 5073, + "##裹": 5074, + "##氓": 5075, + "##⊿": 5076, + "##红": 5077, + "##州": 5078, + "##◽": 5079, + "##泉": 5080, + "##╤": 5081, + "##益": 5082, + "##节": 5083, + "##ᵉ": 5084, + "##次": 5085, + "##≒": 5086, + "##‾": 5087, + "##℮": 5088, + "##": 5089, + "##聞": 5090, + "##像": 5091, + "##榔": 5092, + "##": 5093, + "##": 5094, + "##景": 5095, + "##㎝": 5096, + "##๖": 5097, + "##∴": 5098, + "##器": 5099, + "##術": 5100, + "##負": 5101, + "##˛": 5102, + "##ロ": 5103, + "##符": 5104, + "##座": 5105, + "##╬": 5106, + "##抑": 5107, + "##悟": 5108, + "##过": 5109, + "##%": 5110, + "##鸡": 5111, + "##蛋": 5112, + "##": 5113, + "##团": 5114, + "##欧": 5115, + "##ᴥ": 5116, + "##圾": 5117, + "##腸": 5118, + "##ˑ": 5119, + "##朴": 5120, + "##钟": 5121, + "##ɳ": 5122, + "##ᇁ": 5123, + "##停": 5124, + "##泊": 5125, + "##❶": 5126, + "##乳": 5127, + "##癌": 5128, + "##Ɣ": 5129, + "##早": 5130, + "##順": 5131, + "##沿": 5132, + "##荽": 5133, + "##颜": 5134, + "##笑": 5135, + "##ᶠ": 5136, + "##◻": 5137, + "##㎶": 5138, + "##権": 5139, + "##縄": 5140, + 
"##ℎ": 5141, + "##𝑀": 5142, + "##姫": 5143, + "##": 5144, + "##垃": 5145, + "##٭": 5146, + "##ⓞ": 5147, + "##ᵑ": 5148, + "##菊": 5149, + "##甙": 5150, + "##終": 5151, + "##恍": 5152, + "##綱": 5153, + "##∕": 5154, + "##确": 5155, + "##匹": 5156, + "##配": 5157, + "##Ʒ": 5158, + "##芍": 5159, + "##ᴬ": 5160, + "##ᴮ": 5161, + "##庆": 5162, + "##调": 5163, + "##邈": 5164, + "##追": 5165, + "##求": 5166, + "##切": 5167, + "##𝑌": 5168, + "##建": 5169, + "##奈": 5170, + "##𝜏": 5171, + "##毆": 5172, + "##茶": 5173, + "##∽": 5174, + "##袖": 5175, + "##於": 5176, + "##逆": 5177, + "##転": 5178, + "##ʘ": 5179, + "##版": 5180, + "##語": 5181, + "##尺": 5182, + "##𝜓": 5183, + "##記": 5184, + "##録": 5185, + "##充": 5186, + "##Ł": 5187, + "##ョ": 5188, + "##標": 5189, + "##΄": 5190, + "##𝑤": 5191, + "##𝑟": 5192, + "##盘": 5193, + "##樟": 5194, + "##≏": 5195, + "##京": 5196, + "##局": 5197, + "##将": 5198, + "##临": 5199, + "##恐": 5200, + "##惧": 5201, + "##岌": 5202, + "##₦": 5203, + "##雙": 5204, + "##ผ": 5205, + "##铐": 5206, + "##鐸": 5207, + "##增": 5208, + "##󳰀": 5209, + "##⊙": 5210, + "##震": 5211, + "##帽": 5212, + "##尚": 5213, + "##ᵘ": 5214, + "##深": 5215, + "##応": 5216, + "##群": 5217, + "##厂": 5218, + "##ッ": 5219, + "##唤": 5220, + "##舎": 5221, + "##沌": 5222, + "##捕": 5223, + "##食": 5224, + "##昆": 5225, + "##筋": 5226, + "##ᶬ": 5227, + "##Я": 5228, + "##蕉": 5229, + "##ʁ": 5230, + "##ぬ": 5231, + "##铅": 5232, + "##并": 5233, + "##‱": 5234, + "##従": 5235, + "##": 5236, + "##": 5237, + "##频": 5238, + "##械": 5239, + "##內": 5240, + "##": 5241, + "##⚝": 5242, + "##ᄁ": 5243, + "##ᄈ": 5244, + "##忽": 5245, + "##略": 5246, + "##务": 5247, + "##宅": 5248, + "##且": 5249, + "##視": 5250, + "##醇": 5251, + "##剩": 5252, + "##義": 5253, + "##茅": 5254, + "##虚": 5255, + "##此": 5256, + "##剂": 5257, + "##超": 5258, + "##】": 5259, + "##赢": 5260, + "##份": 5261, + "##跳": 5262, + "##絡": 5263, + "##描": 5264, + "##贞": 5265, + "##萝": 5266, + "##蜜": 5267, + "##莪": 5268, + "##ℵ": 5269, + "##戻": 5270, + "##百": 5271, + "##阻": 5272, + "##塞": 5273, + 
"##赭": 5274, + "##㎗": 5275, + "##臣": 5276, + "##≃": 5277, + "##衍": 5278, + "##知": 5279, + "##◖": 5280, + "##": 5281, + "##坚": 5282, + "##砂": 5283, + "##瓜": 5284, + "##⎪": 5285, + "##歴": 5286, + "##史": 5287, + "##刷": 5288, + "##จ": 5289, + "##ニ": 5290, + "##ƥ": 5291, + "##够": 5292, + "##値": 5293, + "##卑": 5294, + "##❸": 5295, + "##宏": 5296, + "##タ": 5297, + "##ホ": 5298, + "##ミ": 5299, + "##♮": 5300, + "##什": 5301, + "##从": 5302, + "##ℊ": 5303, + "##索": 5304, + "##担": 5305, + "##忧": 5306, + "##蛙": 5307, + "##筷": 5308, + "##蔵": 5309, + "##庫": 5310, + "##斛": 5311, + "##颗": 5312, + "##粒": 5313, + "##來": 5314, + "##蘇": 5315, + "##档": 5316, + "##肱": 5317, + "##ℳ": 5318, + "##满": 5319, + "##": 5320, + "##ゆ": 5321, + "##┃": 5322, + "##朗": 5323, + "##羊": 5324, + "##藿": 5325, + "##機": 5326, + "##腥": 5327, + "##昇": 5328, + "##四": 5329, + "##☺": 5330, + "##覚": 5331, + "##誘": 5332, + "##発": 5333, + "##边": 5334, + "##向": 5335, + "##客": 5336, + "##ʰ": 5337, + "##扣": 5338, + "##川": 5339, + "##": 5340, + "##离": 5341, + "##煤": 5342, + "##加": 5343, + "##▷": 5344, + "##Џ": 5345, + "##荧": 5346, + "##漠": 5347, + "##咒": 5348, + "##曹": 5349, + "##柿": 5350, + "##杰": 5351, + "##戳": 5352, + "##汉": 5353, + "##协": 5354, + "##\u0001": 5355, + "##表": 5356, + "##插": 5357, + "##庙": 5358, + "##键": 5359, + "##词": 5360, + "##➝": 5361, + "##娜": 5362 + } + } +} \ No newline at end of file diff --git a/unitable/vocab/vocab_html.json b/unitable/vocab/vocab_html.json new file mode 100644 index 0000000000000000000000000000000000000000..c107e946260984cffb2ced31bfa1d992a24eb2ae --- /dev/null +++ b/unitable/vocab/vocab_html.json @@ -0,0 +1,640 @@ +{ + "version": "1.0", + "truncation": null, + "padding": { + "strategy": "BatchLongest", + "direction": "Right", + "pad_to_multiple_of": null, + "pad_id": 2, + "pad_type_id": 0, + "pad_token": "" + }, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "[table]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "[html]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "[cell]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 9, + "content": "[bbox]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 10, + "content": "[cell+bbox]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 11, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 12, + "content": "[]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 13, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 15, + "content": ">[]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, 
+ { + "id": 16, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 17, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 18, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 19, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 20, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 21, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 22, + "content": " rowspan=\"2\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 23, + "content": " rowspan=\"3\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 24, + "content": " rowspan=\"4\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 25, + "content": " rowspan=\"5\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 26, + "content": " rowspan=\"6\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 27, + "content": " rowspan=\"7\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 28, + "content": " rowspan=\"8\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 29, + "content": " rowspan=\"9\"", + "single_word": 
false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 30, + "content": " rowspan=\"10\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 31, + "content": " rowspan=\"11\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 32, + "content": " rowspan=\"12\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 33, + "content": " rowspan=\"13\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 34, + "content": " rowspan=\"14\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 35, + "content": " rowspan=\"15\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 36, + "content": " rowspan=\"16\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 37, + "content": " rowspan=\"17\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 38, + "content": " rowspan=\"18\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 39, + "content": " rowspan=\"19\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 40, + "content": " colspan=\"2\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 41, + "content": " colspan=\"3\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 42, + "content": " 
colspan=\"4\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 43, + "content": " colspan=\"5\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 44, + "content": " colspan=\"6\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 45, + "content": " colspan=\"7\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 46, + "content": " colspan=\"8\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 47, + "content": " colspan=\"9\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 48, + "content": " colspan=\"10\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 49, + "content": " colspan=\"11\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 50, + "content": " colspan=\"12\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 51, + "content": " colspan=\"13\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 52, + "content": " colspan=\"14\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 53, + "content": " colspan=\"15\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 54, + "content": " colspan=\"16\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + 
"id": 55, + "content": " colspan=\"17\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 56, + "content": " colspan=\"18\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 57, + "content": " colspan=\"19\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 58, + "content": " colspan=\"25\"", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "NFD" + }, + { + "type": "Lowercase" + }, + { + "type": "StripAccents" + }, + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": { + "type": "WordPiece", + "prefix": "##", + "cleanup": true + }, + "model": { + "type": "WordPiece", + "unk_token": "", + "continuing_subword_prefix": "##", + "max_input_chars_per_word": 100, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "[table]": 6, + "[html]": 7, + "[cell]": 8, + "[bbox]": 9, + "[cell+bbox]": 10, + "": 11, + "[]": 12, + "": 14, + ">[]": 15, + "": 16, + "": 17, + "": 18, + "": 19, + "": 20, + "": 21, + " rowspan=\"2\"": 22, + " rowspan=\"3\"": 23, + " rowspan=\"4\"": 24, + " rowspan=\"5\"": 25, + " rowspan=\"6\"": 26, + " rowspan=\"7\"": 27, + " rowspan=\"8\"": 28, + " rowspan=\"9\"": 29, + " rowspan=\"10\"": 30, + " rowspan=\"11\"": 31, + " rowspan=\"12\"": 32, + " rowspan=\"13\"": 33, + " rowspan=\"14\"": 34, + " rowspan=\"15\"": 35, + " rowspan=\"16\"": 36, + " rowspan=\"17\"": 37, + " rowspan=\"18\"": 38, + " rowspan=\"19\"": 39, + " colspan=\"2\"": 40, + " colspan=\"3\"": 41, + " colspan=\"4\"": 42, + " colspan=\"5\"": 43, + " colspan=\"6\"": 44, + " colspan=\"7\"": 45, + " colspan=\"8\"": 46, + " 
colspan=\"9\"": 47, + " colspan=\"10\"": 48, + " colspan=\"11\"": 49, + " colspan=\"12\"": 50, + " colspan=\"13\"": 51, + " colspan=\"14\"": 52, + " colspan=\"15\"": 53, + " colspan=\"16\"": 54, + " colspan=\"17\"": 55, + " colspan=\"18\"": 56, + " colspan=\"19\"": 57, + " colspan=\"25\"": 58 + } + } +} \ No newline at end of file diff --git a/unitable/website/unitable-demo.gif b/unitable/website/unitable-demo.gif new file mode 100644 index 0000000000000000000000000000000000000000..a2cd043f465363d177c5c76327398bc5a49f638f --- /dev/null +++ b/unitable/website/unitable-demo.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c9c54e876fd3dae24df51d69937bab1916bc3b8d3f89b2c2531dbf8a0586cb4 +size 5219577 diff --git a/unitable/website/unitable-demo.mp4 b/unitable/website/unitable-demo.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..10e0527569b071887ff7408825def512ec4f895d --- /dev/null +++ b/unitable/website/unitable-demo.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229c8b0ed13b1f40f442d0f1e10db620d54cd0028e1364cca118b99ea0e1aaae +size 14337695 diff --git a/unitable/website/wandb_screenshot.png b/unitable/website/wandb_screenshot.png new file mode 100644 index 0000000000000000000000000000000000000000..9174d5426b14fe791231faccf94e7e267f8093c8 --- /dev/null +++ b/unitable/website/wandb_screenshot.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a67622be780272632d80437e810c0d7ca9bf23503da29fc3821b7724782df4 +size 2038109 diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d8e4a79e03839f0c598fc0054253b7d7486dd1da --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,24 @@ + + +from .cropping_boxes_to_images import get_rotate_crop_image ,cropImages,cropImageExtraMargin,crop_an_Image,get_new_coord +from .viz import draw_box, draw_only_box, draw_box_with_text, get_color_map_list +from .annotation import 
LineAnnotation,WordAnnotation,Annotation +from .logger import getlogger +from .preprocessing import denoisingAndSharpening +from .sorting import group_words_into_lines + +__all__ = [ "cropImageExtraMargin", + "getlogger", + "Annotation", + "LineAnnotation", + "WordAnnotation", + "get_rotate_crop_image", + "cropImages", + "draw_box", + "draw_only_box", + "draw_box_with_text", + "get_color_map_list", + "crop_an_Image", + "get_new_coord", + "group_words_into_lines"] + diff --git a/utils/annotation.py b/utils/annotation.py new file mode 100644 index 0000000000000000000000000000000000000000..df39ddfd375c2ceff1747bb019e2d7592dc9fce2 --- /dev/null +++ b/utils/annotation.py @@ -0,0 +1,51 @@ +from typing import Optional, List +import uuid +from abc import ABC, abstractmethod + +class Annotation(ABC): + box: Optional[List[float]] = None + index: uuid.UUID = None + + def __init__(self, box: Optional[List[float]] = None): + self.box = box + self.index = uuid.uuid4() + +class WordAnnotation(Annotation): + + # Class attributes since all classes should have it + box:Optional[List[float]] =None, + score:Optional[float]=None, + text:str = None, + index: uuid.UUID =None + + def __init__(self, + box:Optional[List[float]], + text:str=None): + self.box =box if box is not None else None + self.text = text if text is not None else None + self.index = uuid.uuid4() + +class LineAnnotation(Annotation): + """ + Detection results of all OCR Components + `pdf_id` : id or name of pdf so that it can be get from database + `page`: pdf page number + `box`: [xmin, ymin, xmax, ymax] + + `index`: same as index for bounding boxes, just the results is wrapped around in this class + `score`: prediction score + `line` : Parent line + `text`: text string. 
+ """ + # Class attributes since all classes should have it + box:Optional[List[float]] =None, + words:Optional[List[WordAnnotation]] =None, + index: uuid.UUID =None + + def __init__(self, + box:Optional[List[float]], + words:List[WordAnnotation]=None): + self.box = box if box is not None else None + self.words = words if words is not None else [] + self.index = uuid.uuid4() + diff --git a/utils/cropping_boxes_to_images.py b/utils/cropping_boxes_to_images.py new file mode 100644 index 0000000000000000000000000000000000000000..0eaeca9535442a05aed93d948da6c66c17b6c314 --- /dev/null +++ b/utils/cropping_boxes_to_images.py @@ -0,0 +1,221 @@ +import numpy as np +from typing import List +from PIL import Image +import cv2 +import numpy.typing as npt +from numpy import uint8 +ImageType = npt.NDArray[uint8] +from numpy.typing import NDArray + +# not used actually +def get_rotate_crop_image(img: ImageType, points:NDArray[np.float32])-> ImageType: + + """ + Points should be ordered in this order :left_lower, right_lower, right_upper, left_upper + each point has 2 coordinate + So entire thing is np array of size 4 times 2 with float32 numbers + takes an image and a set of four points defining a quadrilateral region within the image. + It extracts and crops this region, corrects its orientation using a perspective transform, + and rotates it if necessary. 
+ """ + + assert len(points) == 4 + # Check the shape and dtype of points + assert points.shape == (4, 2), f"Points array must be of shape (4, 2), but got {points.shape}" + assert points.dtype == np.float32, f"Points array must be of dtype float32, but got {points.dtype}" + + # Calculating Crop Dimensions + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + + + #A set of standard points pts_std is defined to represent the corners of the cropped image in a straightened, upright rectangle. + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + # perspective transformation matrix M that maps the four points to the standard rectangle. + M = cv2.getPerspectiveTransform(points, pts_std) + #applies the perspective transformation to the image, using the transformation matrix M + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + + if dst_img_height * 1.0 / dst_img_width >= 1.5: + #rotating counter clock wise + dst_img = np.rot90(dst_img) + #correct would be k=3 + #st_img = np.rot90(dst_img,k=3) + + return dst_img + + +def get_crop_image(img: ImageType, points:NDArray[np.float32],straight=False)-> ImageType: + + """ + Points should be ordered in this order :left_lower, right_lower, right_upper, left_upper + each point has 2 coordinate + So entire thing is np array of size 4 times 2 with float32 numbers + takes an image and a set of four points defining a quadrilateral region within the image. + It extracts and crops this region. 
No perspective transformation is applied + """ + + assert len(points) == 4 # xmin, ymin, xmax, ymax + # Check the shape and dtype of points + assert points.shape == (4, 2), f"Points array must be of shape (4, 2), but got {points.shape}" + assert points.dtype == np.float32, f"Points array must be of dtype float32, but got {points.dtype}" + + if not straight : + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + # bottom left corner + xmin = int(points[0][0]) + ymin = int(points[0][1]) + + # Ensure the crop area is within the bounds of the image + xmax = min(xmin + img_crop_width, img.shape[1]) + ymax = min(ymin + img_crop_height, img.shape[0]) + else: + xmin = int(points[0][0]) + ymin = int(points[0][1]) + xmax = int(points[2][0]) + ymax = int(points[2][1]) + + # Crop the image + dst_img = img[ymin:ymax, xmin:xmax] + + + return dst_img + + + +def cropImages(bxs:List[NDArray[np.float32]], img:Image.Image,straight=False) -> List[ImageType] : + images_to_recognizer = [] + for bnum in range(len(bxs)): + left_lower, right_lower, right_upper, left_upper = bxs[bnum] + box = np.array([left_lower, right_lower, right_upper, left_upper ]) + cropped_img = get_crop_image(np.array(img), box, straight) + images_to_recognizer.append(cropped_img) + # return list of np arrays + return images_to_recognizer + +def crop_an_Image(box:NDArray[np.float32], img:Image.Image) -> ImageType : + #box should be 4x2 array + left_lower, right_lower, right_upper, left_upper = box + b = np.array([left_lower, right_lower, right_upper, left_upper ]) + cropped_img = get_crop_image(np.array(img), b) + return cropped_img + +def get_new_coord(maxx:int,maxy:int,points:NDArray[np.float32]) -> list[int]: + #points = 4x2 array + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - 
points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + # bottom left corner + bottom_left_x = int(points[0][0]) + bottom_left_y = int(points[0][1]) + + # Ensure the crop area is within the bounds of the image + top_right_x = min(bottom_left_x + img_crop_width, maxx) + top_right_y = min(bottom_left_y + img_crop_height, maxy) + + # Crop the image + # 4x1 array of xmin, ymin, xmax, ymax + return [bottom_left_x, bottom_left_y, top_right_x, top_right_y] + +MARGIN_FACTOR = 1.4 +def get_crop_image_with_extra_margin(img: ImageType, points:NDArray[np.float32],straight=False, marginfactor = MARGIN_FACTOR)-> ImageType: + + """ + Points should be ordered in this order :left_lower, right_lower, right_upper, left_upper + each point has 2 coordinate + So entire thing is np array of size 4 times 2 with float32 numbers + takes an image and a set of four points defining a quadrilateral region within the image. + It extracts and crops this region, corrects its orientation using a perspective transform, + and rotates it if necessary. 
+ """ + + + assert len(points) == 4 + # Calculating Crop Dimensions + if not straight : + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + # bottom left corner + xmin = int(points[0][0]) + ymin = int(points[0][1]) + + # Ensure the crop area is within the bounds of the image + xmax = min(xmin + img_crop_width, img.shape[1]) + ymax = min(ymin + img_crop_height, img.shape[0]) + else: + xmin = int(points[0][0]) + ymin = int(points[0][1]) + xmax = int(points[2][0]) + ymax = int(points[2][1]) + #print("points are "+str(points)) + #print("xmin, ymin, xmax,ymax are "+ str(xmin)+" "+ str(ymin)+" "+ str(xmax)+" "+str(ymax)) + # Crop the image + dst_img = img[ymin:ymax, xmin:xmax] + + #print(dst_img.shape[:2]) + height, width = dst_img.shape[:2] + + if width/height<1.6: + bigger = max(height,width) + new_height = int(bigger *3) + new_width = int(bigger*3) + else: + bigger = max(height,width) + new_height = int(bigger *MARGIN_FACTOR) + new_width = int(bigger*MARGIN_FACTOR) + + # Create a new image with a white background + new_img = np.full((new_height, new_width, 3), fill_value=255, dtype=np.uint8) # RGB white background + # Calculate the position to center the image on the new white background + y_offset = (new_height - height) // 2 + x_offset = (new_width - width) // 2 + #print("offsets are " + str(x_offset)+" " +str(y_offset)) + + # Place the warped image on the new white background + new_img[y_offset:y_offset + height, x_offset:x_offset+width] = dst_img + + return new_img + + +def cropImageExtraMargin(bxs:List[NDArray[np.float32]], img:Image.Image,straight=False, margin = MARGIN_FACTOR ) -> List[ImageType] : + images_to_recognizer = [] + for bnum in range(len(bxs)): + left_lower, right_lower, right_upper, left_upper = bxs[bnum] + box = np.array([left_lower, right_lower, right_upper, left_upper 
]) + #print("newbox is") + #print(box) + cropped_img = get_crop_image_with_extra_margin(np.array(img), box,straight,margin) + images_to_recognizer.append(cropped_img) + # return list of np arrays + return images_to_recognizer \ No newline at end of file diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9d2b7b8044e64dff0d15c1ca6fd87d01431119 --- /dev/null +++ b/utils/logger.py @@ -0,0 +1,22 @@ + +import logging + +def getlogger(name:str): + # create logger + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + + # create console handler and set level to debug + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + + # create formatter + #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + # add formatter to ch + #ch.setFormatter(formatter) + + # add ch to logger + logger.propagate = False + logger.addHandler(ch) + return logger diff --git a/utils/preprocessing.py b/utils/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..37459d879333dfe0844056844ee805760a9b9d90 --- /dev/null +++ b/utils/preprocessing.py @@ -0,0 +1,36 @@ +import numpy.typing as npt +from numpy import uint8 +import numpy as np +ImageType = npt.NDArray[uint8] +from typing import Tuple, List, Sequence, Optional, Union +import cv2 +from PIL import Image +from PIL import ImageEnhance + +def denoisingAndSharpening(images:List[ImageType]): + + new_images = [] + for img in images: + # Apply fastNlMeansDenoisingColored + # Parameters: + # - img: The input 8-bit 3-channel image. + # - None: The output image (in-place if None is passed). + # - h: Parameter regulating filter strength for luminance component. Higher h value removes noise better but also removes image details (10 is a good default value). + # - hForColorComponents: The same as h but for color images only. For most images, 10 will be a good value. 
+ # - templateWindowSize: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value is 7. + # - searchWindowSize: Size in pixels of the window that is used to compute a weighted average for the given pixel. Should be odd. Recommended value is 21. + h = 10 + hForColorComponents = 10 + templateWindowSize = 7 + searchWindowSize = 21 + + img = cv2.fastNlMeansDenoisingColored(np.array(img), None, h, hForColorComponents, templateWindowSize, searchWindowSize) + + + #cv2.imwrite(debug_folder+"denoisedAndHigherContrast.png",img) + img = Image.fromarray(img).convert('RGB') + enhancer = ImageEnhance.Contrast(img) + img = enhancer.enhance(2) # Increase contrast by a factor of 2 + + new_images.append(img) + return new_images \ No newline at end of file diff --git a/utils/sorting.py b/utils/sorting.py new file mode 100644 index 0000000000000000000000000000000000000000..803358e58148a0e4b07ee7de475516f05161824e --- /dev/null +++ b/utils/sorting.py @@ -0,0 +1,68 @@ + +from typing import Any, List, Literal, Mapping, Optional, Tuple, Union, Dict, Type, Sequence + +@staticmethod +#Based on deepdoctection +def group_words_into_lines( + word_boxes: List[List[int]], image_id: Optional[str] = None +) -> List[Tuple[int, int, str]]: + + """ + Arranging words into horizontal text lines and sorting text lines vertically in order to give + an enumeration of words that is used for establishing the reading order. + Using this reading order arragement + + Input:is numpy array of shape (n,5) where n is number of words and 5 is size of each element(array) with coordinate(xmin,ymin,xmax,ymax) + score + """ + """ + reading_lines: List to store tuples of the form (row_index, bbox). + rows: List to store dictionaries representing rows, with keys "upper" and "lower" representing the y-coordinates of the upper and lower bounds of the rows. 
+ """ + reading_lines = [] + rows: List[Dict[str,float]] = [] + for bbox in word_boxes: + #For each word annotation, get the bounding box using word.get_bounding_box(image_id). + row_found = False + for row_idx, row in enumerate(rows): + row_cy = (row["upper"] + row["lower"]) / 2 + ymin =bbox[1] + ymax =bbox[3] + bbox_cy = (ymin+ ymax) / 2 + # word belongs to row if center lies within the upper and lower bounds of the row or if the center y + # coordinate lies within the upper and lower bounds of the word bounding boxes. + + #if (row["upper"] < bounding_box.cy < row["lower"]) or (bounding_box.uly < row_cy < bounding_box.lry): + if (row["upper"] < bbox_cy < row["lower"]) or (ymin < row_cy < ymax): + reading_lines.append((row_idx,bbox)) + row_found = True + break + + # If word belongs to bound we do not update any row bounds. Thus, row bound are determined by the + # first word that defines the row + if not row_found: + rows.append({"upper": bbox[1] , "lower": bbox[3]}) + reading_lines.append((len(rows) - 1, bbox)) + + """ + Create a dictionary rows_dict where keys are row indices and values are the original row indices, sorted by the upper bound of the rows. + Reassign row indices in reading_lines according to the vertical sort order defined in rows_dict. + """ + rows_dict = {k: rows[k] for k in range(len(rows))} + rows_dict = { + idx: key[0] # type:ignore + for idx, key in enumerate(sorted(rows_dict.items(), key=lambda it: it[1]["upper"])) + } + """ + Sort reading_lines by the row index (mapped through rows_dict) and then by the word’s xmin coordinate. 
+ """ + reading_lines.sort(key=lambda x: (rows_dict[x[0]], x[1][0])) + + number_rows = len(rows_dict) + #print("group_words_into_lines : number of rows : " + str(number_rows)) + #print("group_words_into_lines : reading lines" + str(reading_lines)) + #print("group_words_into_lines : rows_dict" + str(rows_dict)) + + onlywords = [ aTuple[1] for aTuple in reading_lines] + + return onlywords + diff --git a/utils/viz.py b/utils/viz.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf0a07efca19f7ae3f4c71354064971799af91b --- /dev/null +++ b/utils/viz.py @@ -0,0 +1,119 @@ + +import os +from typing import List +import logging +import PIL +from PIL import Image +from PIL import ImageDraw +from .annotation import Annotation + + +def draw_box(im:Image.Image, result, lables, threshold=0.5): + im = im.copy() + draw_thickness = min(im.size) // 320 + draw = ImageDraw.Draw(im) + color_list = get_color_map_list(len(lables)) + clsid2color = {n.lower():color_list[i] for i,n in enumerate(lables)} + result = [r for r in result if r["score"] >= threshold] + + for dt in result: + color = tuple(clsid2color[dt["type"]]) + xmin, ymin, xmax, ymax = dt["bbox"] + + #Draws a line forming a rectangle around the detected object. + + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=draw_thickness, + fill=color) + + #Prepares the text for the label (class type and score). + + + # draw label + text = "{} {:.4f}".format(dt["type"], dt["score"]) + #Computes the size of the text using imagedraw_textsize_c. + tw, th = imagedraw_textsize_c(draw, text) + #Draws a filled rectangle for the text background. + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + #Draws the text on top of the rectangle. 
+ draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + return im + +def draw_only_box(im:Image.Image, result): + im = im.copy() + draw_thickness = min(im.size) // 400 + draw = ImageDraw.Draw(im) + result = [r for r in result] + + for dt in result: + xmin, ymin, xmax, ymax = dt + xmin = int(xmin) + ymin = int(ymin) + xmax = int(xmax) + ymax = int(ymax) + + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=draw_thickness, + fill="red") + + return im + +def draw_box_with_text(im:Image.Image, result:List[Annotation], threshold=0.5): + im = im.copy() + draw_thickness = min(im.size) // 320 + draw = ImageDraw.Draw(im) + + result = [r for r in result if r.score >= threshold] + + for dt in result: + + xmin, ymin, xmax, ymax = dt.box + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=draw_thickness, + fill="red") + + # draw label + text = "{:.4f}".format(dt.score) + tw, th = imagedraw_textsize_c(draw, text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill="green") + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + return im + +def get_color_map_list(num_classes): + """ + Args: + num_classes (int): number of class + Returns: + color_map (list): RGB color list + """ + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + return color_map + + +def imagedraw_textsize_c(draw, text): + if int(PIL.__version__.split('.')[0]) < 10: + tw, th = draw.textsize(text) + else: + left, top, right, bottom = draw.textbbox((0, 0), text) + tw, th = right - left, bottom - top + + return tw, th