init
- .gitignore +2 -0
- CrossEncoder/cross_encoder.py +122 -0
- CrossEncoder/cross_encoder_env.yml +53 -0
- DiT_Extractor/base_utils.py +378 -0
- DiT_Extractor/dit_object_detection/README.md +120 -0
- DiT_Extractor/dit_object_detection/ditod/__init__.py +11 -0
- DiT_Extractor/dit_object_detection/ditod/backbone.py +156 -0
- DiT_Extractor/dit_object_detection/ditod/beit.py +671 -0
- DiT_Extractor/dit_object_detection/ditod/config.py +32 -0
- DiT_Extractor/dit_object_detection/ditod/deit.py +476 -0
- DiT_Extractor/dit_object_detection/publaynet_configs/Base-RCNN-FPN.yaml +69 -0
- DiT_Extractor/dit_object_detection/publaynet_configs/cascade/cascade_dit_base.yaml +20 -0
- DiT_Extractor/dit_object_detection/publaynet_configs/cascade/cascade_dit_large.yaml +28 -0
- DiT_Extractor/dit_object_detection/publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml +15 -0
- DiT_Extractor/dit_object_detection/publaynet_configs/maskrcnn/maskrcnn_dit_large.yaml +22 -0
- DiT_Extractor/dit_runner.py +158 -0
- DiT_Extractor/sentence_extractor.py +136 -0
- LICENSE +207 -0
- NOTICE +21 -0
- README.md +14 -4
- UnifiedQA/demo_QA.py +180 -0
- app.py +120 -0
- env_setup.sh +32 -0
- examples/1810.04805.pdf +0 -0
- examples/1909.00694.pdf +0 -0
- examples/2105.03011.pdf +0 -0
- ms-marco-electra-base/CEBinaryClassificationEvaluator_MS-Marco_results.csv +43 -0
- ms-marco-electra-base/README.md +64 -0
- ms-marco-electra-base/config.json +31 -0
- ms-marco-electra-base/pytorch_model.bin +3 -0
- ms-marco-electra-base/special_tokens_map.json +1 -0
- ms-marco-electra-base/tokenizer_config.json +1 -0
- ms-marco-electra-base/vocab.txt +0 -0
- packages.txt +1 -0
- requirements.txt +13 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
.ipynb_checkpoints
__pycache__
CrossEncoder/cross_encoder.py
ADDED
@@ -0,0 +1,122 @@
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

from sentence_transformers.cross_encoder import CrossEncoder as CE
import numpy as np
from typing import List, Dict, Tuple
import json
from collections import defaultdict
import os


class CrossEncoder:
    def __init__(self,
                 model_path: str = None,
                 max_length: int = None,
                 **kwargs):

        if max_length != None:
            self.model = CE(model_path, max_length = max_length, **kwargs)
        else:
            self.model = CE(model_path, **kwargs)

    def predict(self,
                sentences: List[Tuple[str, str]],
                batch_size: int = 32,
                show_progress_bar: bool = False) -> List[float]:

        return self.model.predict(sentences = sentences,
                                  batch_size = batch_size,
                                  show_progress_bar = show_progress_bar)


class CERank:

    def __init__(self, model, batch_size: int = 128, **kwargs):
        self.cross_encoder = model
        self.batch_size = batch_size

    def flatten_examples(self, contexts: Dict[str, Dict], question: str):

        text_pairs, pair_ids = [], []
        for context_id, context in contexts.items():
            pair_ids.append(['question_0', context_id])
            text_pairs.append([question, context['text']])

        return text_pairs, pair_ids

    def group_questionrank(self, pair_ids, rank_scores):

        unsorted = defaultdict(list)
        for pair, score in zip(pair_ids, rank_scores):
            query_id, paragraph_id = pair[0], pair[1]
            unsorted[query_id].append((paragraph_id, score))

        return unsorted

    def get_rankings(self, pair_ids, rank_scores, text_pairs):

        unsorted_ranks = self.group_questionrank(pair_ids, rank_scores)
        rankings = defaultdict(dict)

        for idx, (query_id, ranks) in enumerate(unsorted_ranks.items()):
            sort_ranks = sorted(ranks, key = lambda item: item[1], reverse = True)
            sorted_ranks, scores = list(zip(*sort_ranks))
            rankings[query_id]['text'] = text_pairs[idx][0]
            rankings[query_id]['scores'] = list(scores)
            rankings[query_id]['ranks'] = list(sorted_ranks)

        return rankings

    def rank(self,
             contexts: Dict[str, Dict],
             question: str):

        text_pairs, pair_ids = self.flatten_examples(contexts, question)
        rank_scores = [float(score) for score in self.cross_encoder.predict(text_pairs, batch_size = self.batch_size)]
        full_results = self.get_rankings(pair_ids, rank_scores, text_pairs)

        return full_results


def get_ranked_contexts(context_json, question):

    dirname = 'examples'
    model_path = '/data/actici/pretrained_weights/ms-marco-electra-base'
    max_length = 512

    # Can't use use_fast (fast tokenizers) while gradio is running; it conflicts with tokenizer multiprocessing/parallelism.
    cross_encoder = CrossEncoder(model_path, max_length, tokenizer_args={'use_fast': False})
    ranker = CERank(cross_encoder)

    with open(context_json, 'r') as fin:
        contexts = json.load(fin)

    rankings = ranker.rank(contexts, question)

    with open('ranked_{0}.json'.format(context_json[:-5]), 'w') as fout:
        json.dump(rankings, fout)

def get_ranked_contexts_in_memory(contexts, question):

    dirname = 'examples'
    model_path = '/data/actici/pretrained_weights/ms-marco-electra-base'
    max_length = 512

    # Can't use use_fast (fast tokenizers) while gradio is running; it conflicts with tokenizer multiprocessing/parallelism.
    cross_encoder = CrossEncoder(model_path, max_length, tokenizer_args={'use_fast': False})
    ranker = CERank(cross_encoder)

    rankings = ranker.rank(contexts, question)

    return rankings
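As a quick orientation to the module above, here is a minimal usage sketch for the `CrossEncoder`/`CERank` pair. The model name, context dictionary, and question are illustrative placeholders (the commit's own entry points are `get_ranked_contexts` and `get_ranked_contexts_in_memory`, which point at a local copy of `ms-marco-electra-base`).

```python
# Hypothetical usage sketch; model name and data are placeholders.
from CrossEncoder.cross_encoder import CrossEncoder, CERank

# contexts maps a context id to a dict with at least a 'text' field,
# matching what CERank.flatten_examples() expects.
contexts = {
    "context_0": {"text": "BERT is pre-trained with a masked language modeling objective."},
    "context_1": {"text": "The encoder stacks twelve transformer layers."},
}

# Any sentence-transformers cross-encoder checkpoint should work here.
encoder = CrossEncoder("cross-encoder/ms-marco-electra-base", max_length=512)
ranker = CERank(encoder, batch_size=32)

rankings = ranker.rank(contexts, "How is BERT pre-trained?")
# rankings['question_0']['ranks'] lists context ids from most to least relevant,
# with matching scores in rankings['question_0']['scores'].
print(rankings["question_0"]["ranks"])
```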
CrossEncoder/cross_encoder_env.yml
ADDED
@@ -0,0 +1,53 @@
name: cross_encoder_env
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - ca-certificates=2022.4.26=h06a4308_0
  - certifi=2022.6.15=py39h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.3=he6710b0_2
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - ncurses=6.3=h7f8727e_2
  - openssl=1.1.1o=h7f8727e_0
  - pip=21.2.4=py39h06a4308_0
  - python=3.9.12=h12debd9_1
  - readline=8.1.2=h7f8727e_1
  - setuptools=61.2.0=py39h06a4308_0
  - sqlite=3.38.5=hc218d9a_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2022a=hda174b7_0
  - wheel=0.37.1=pyhd3eb1b0_0
  - xz=5.2.5=h7f8727e_1
  - zlib=1.2.12=h7f8727e_2
  - pip:
    - charset-normalizer==2.0.12
    - click==8.1.3
    - filelock==3.7.1
    - huggingface-hub==0.8.1
    - idna==3.3
    - joblib==1.1.0
    - nltk==3.7
    - numpy==1.23.0
    - packaging==21.3
    - pillow==9.1.1
    - pyparsing==3.0.9
    - pyyaml==6.0
    - regex==2022.6.2
    - requests==2.28.0
    - scikit-learn==1.1.1
    - scipy==1.8.1
    - sentence-transformers==2.2.2
    - sentencepiece==0.1.96
    - threadpoolctl==3.1.0
    - tokenizers==0.12.1
    - torch==1.11.0
    - torchvision==0.12.0
    - tqdm==4.64.0
    - transformers==4.20.1
    - typing-extensions==4.2.0
    - urllib3==1.26.9
prefix: /home/ordonez2/miniconda3/envs/cross_encoder
DiT_Extractor/base_utils.py
ADDED
@@ -0,0 +1,378 @@
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

from pdfminer.pdfpage import PDFParser
from pdfminer.pdfpage import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTChar
from pdfminer.layout import LAParams
from pdfminer.layout import LTRect
from pdfminer.layout import LTFigure

from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer import pdfinterp

from collections.abc import Iterable
from collections import Counter
from collections import OrderedDict

import os

# This is used for highlighting in PDFs
from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    FloatObject,
    NameObject,
    TextStringObject,
    ArrayObject
)

# Used to extract pages
from PyPDF2 import PdfFileReader, PdfFileWriter

def get_page_sizes(document):
    parser = PDFParser(open(document, 'rb'))
    doc = PDFDocument(parser)
    pageSizesList = []
    for page in PDFPage.create_pages(doc):
        # the media box, i.e. the page size as a list of 4 integers x0 y0 x1 y1
        pageSizesList.append(page.mediabox)  # <- appending
    return pageSizesList

def get_page_count(document):
    # Is there a better way of getting the page count than doing this?
    parser = PDFParser(document)
    tmpdoc = PDFDocument(parser)
    page_count = pdfinterp.resolve1(tmpdoc.catalog['Pages'])['Count']
    return page_count

def get_pdf_page_count(filename):
    with open(filename, 'rb') as document:
        return get_page_count(document)

def get_pages(document, page_numbers = None):
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_count = get_page_count(document)

    if page_numbers is None:
        page_numbers = range(page_count)

    for page, page_number in zip(PDFPage.get_pages(document, page_numbers), page_numbers):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        #print("Yield page:", page_number)
        yield layout, page_number

def partial_overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.
    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    # a1 x1 a2 x2
    # <------------------>
    x_intersects = (other[0] < box[0] and other[2] > box[0]) or (
        other[0] < box[2] and other[2] > box[2])
    y_intersects = (other[1] < box[1] and other[3] > box[1]) or (
        other[1] < box[3] and other[3] > box[3])

    intersects = x_intersects or y_intersects
    # TODO: Simplify?
    return intersects and overlaps(box, other)
    #return intersects

def overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.
    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    x_intersects = box[0] > other[2] or box[2] < other[0]
    y_intersects = box[1] > other[3] or box[3] < other[1]

    intersects = not (x_intersects or y_intersects)
    return intersects

def union(src, other):
    """
    Expand src by union of other bbox

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)

    returns union of src and other
    """
    xmin = min(src[0], other[0])
    ymin = min(src[1], other[1])
    xmax = max(src[2], other[2])
    ymax = max(src[3], other[3])

    return [xmin, ymin, xmax, ymax]


# See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    newHighlight = DictionaryObject()

    newHighlight.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),

        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),

        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject([
            FloatObject(x1),
            FloatObject(y1),
            FloatObject(x2),
            FloatObject(y2)
        ]),
        NameObject("/QuadPoints"): ArrayObject([
            FloatObject(x1),
            FloatObject(y2),
            FloatObject(x2),
            FloatObject(y2),
            FloatObject(x1),
            FloatObject(y1),
            FloatObject(x2),
            FloatObject(y1)
        ]),
    })

    return newHighlight

def addHighlightToPage(highlight, page, output):
    highlight_ref = output._addObject(highlight)

    if "/Annots" in page:
        page[NameObject("/Annots")].append(highlight_ref)
    else:
        page[NameObject("/Annots")] = ArrayObject([highlight_ref])

def get_pdf_words(document, page_numbers=None):
    """
    Get all words from LTChar or LTTextLineHorizontal objects from the document.

    :param document: string path of the PDF file to process
    :returns: A map of page #'s containing lists of coordinates and PDFMiner
              objects. Ex.: {page_number: [[x1, y1, x2, y2, <LTTextLineHorizontal>],]}
    """
    pdf_doc = open(document, 'rb')

    bboxes = {}
    for layout, page in get_pages(pdf_doc, page_numbers):
        #print(element.get_text())
        bboxes[page] = []
        for element in layout:
            if not isinstance(element, Iterable):
                continue  # not iterable
            for subElement in element:
                #print('Subelement type:', type(subElement))
                if isinstance(subElement, LTChar):
                    if (subElement.get_text() == ' '):
                        pass  # TODO: Handle word delimiter
                    # Print the character in this class
                    # print(subElement.get_text(), end='')
                    item = list(subElement.bbox)
                    item.append(subElement)
                    bboxes[page].append(item)
                elif isinstance(subElement, LTTextLineHorizontal):
                    #print(subElement.bbox)
                    item = list(subElement.bbox)
                    item.append(subElement)
                    bboxes[page].append(item)
                else:
                    pass
    return bboxes

def get_paragraphs(words):
    paragraph_tolerance = 0.1
    max_height_diff = 1
    paragraphs = []

    for page, elements in words.items():
        # Find nominal font size
        # Round to int
        freq = Counter()
        for element in elements:
            height = int(element[3] - element[1])
            #print(height, end=' ')
            freq[height] += 1

        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)

        print("Page:", page)
        x_offset_prev_line = None
        prev_x_offset = None
        prev_y_offset = None
        paragraph_content = ""
        #print("Element count:", len(elements))
        first_line = False
        processed_first_line = False

        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()

            if x_offset_prev_line != None:
                large_x_offset = (abs(x_offset_prev_line - x_offset) > paragraph_tolerance)

            # Font size mismatch?
            if abs(height - nominal_font) > max_height_diff:
                if len(paragraph_content) > 0:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue

            print("ELEMENT:", element[0:4], text[0:15])
            if prev_y_offset is not None and len(paragraph_content) > 0:
                if y_offset < prev_y_offset - height * 1.5:
                    print("Content append:", len(paragraph_content))
                    if len(paragraph_content) > 0:
                        paragraphs.append(paragraph_content)
                    paragraph_content = text
                    prev_y_offset = None
                    continue

                prev_y_offset = y_offset

            prev_y_offset = y_offset
            #print("element:", element)
            if not isinstance(element[4], LTTextLineHorizontal):
                continue

            #print("Running text:", text)
            #print(f"x_offset_prev_line , x_offset]: {x_offset_prev_line, x_offset}")

            # Find first paragraph
            if x_offset_prev_line is None:
                #print("x_offset_prev is none")
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                #print("Continue due to x_offset_prev_line is none")
                continue

            # Check case if first line was indented
            if x_offset_prev_line > x_offset and first_line:
                #print("x_offset < element[0]")
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                #print("Continue due to x_offset_prev_line > x_offset and first_line")
                continue

            # is this indented?
            # and ignore small changes
            if x_offset_prev_line < x_offset and large_x_offset:
                #print(f"x_offset_prev_line > x_offset: {x_offset_prev_line, x_offset}")
                if height == nominal_font and len(paragraph_content) > 0:
                    paragraphs.append(paragraph_content)

                paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                #print("Continue due to x_offset_prev_line < x_offset and large_x_offset")
                continue

            #print(element[0:4])
            if height == nominal_font:
                paragraph_content += text
        #print("End of loop")

        # TODO: Remove redundant space
        if paragraph_content != "":
            paragraphs.append(paragraph_content)

    # Find paragraph indexes
    c = 0
    indexes = []
    for p in paragraphs:
        c += len(p)
        indexes.append(c)

    return paragraphs, indexes

def get_pdf_elements(document, element_type, page_numbers=None):
    pdf_doc = open(document, 'rb')

    items = {}
    for layout, page in get_pages(pdf_doc, page_numbers):
        #print(element.get_text())
        items[page] = []
        for element in layout:
            if isinstance(element, element_type):
                item = list(element.bbox)
                if hasattr(element, 'non_stroking_color'):
                    item.append(element.non_stroking_color)
                items[page].append(item)
    print(items)
    return items

def get_large_colored_background_rectangles(document, page_numbers=None):
    # Only include rectangles that are at least 4" x 1" in size
    min_size = (288.0, 72.0)

    elements = get_pdf_elements(document, LTRect, page_numbers)
    rects_out = {}
    for page, rects in elements.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if (width > min_size[0] and
                    height > min_size[1]):
                if not page in rects_out:
                    rects_out[page] = []
                rects_out[page].append(rect)
    return rects_out

def extract_pages(document, output, page_numbers=None):
    pdf = PdfFileReader(document)

    pdf_writer = PdfFileWriter()
    for page in page_numbers:
        current_page = pdf.getPage(page)
        pdf_writer.addPage(current_page)

    with open(output, "wb") as out:
        pdf_writer.write(out)
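A minimal sketch of how these helpers could be combined to highlight every extracted text line of a PDF. The input file is one of the `examples/` PDFs added in this commit, the output name is a placeholder, and the import path assumes the repo root is on `PYTHONPATH`; the `PdfFileReader`/`PdfFileWriter` calls follow the PyPDF2 1.x API that this module already uses.

```python
# Hypothetical end-to-end sketch: read text-line boxes with pdfminer, then
# write highlight annotations back out with PyPDF2 (output name is a placeholder).
from PyPDF2 import PdfFileReader, PdfFileWriter
from DiT_Extractor.base_utils import get_pdf_words, createHighlight, addHighlightToPage

source = "examples/1810.04805.pdf"
words = get_pdf_words(source)          # {page: [[x1, y1, x2, y2, obj], ...]}

reader = PdfFileReader(source)
writer = PdfFileWriter()

for page_number in range(reader.getNumPages()):
    page = reader.getPage(page_number)
    for x1, y1, x2, y2, _obj in words.get(page_number, []):
        highlight = createHighlight(x1, y1, x2, y2,
                                    {"author": "demo", "contents": "extracted line"})
        addHighlightToPage(highlight, page, writer)
    writer.addPage(page)

with open("highlighted.pdf", "wb") as out:
    writer.write(out)
```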
DiT_Extractor/dit_object_detection/README.md
ADDED
@@ -0,0 +1,120 @@
# DiT for Object Detection

This folder contains Mask R-CNN and Cascade Mask R-CNN running instructions on top of [Detectron2](https://github.com/facebookresearch/detectron2) for PubLayNet and ICDAR 2019 cTDaR.

## Usage

### Inference

The quickest way to try out DiT for document layout analysis is the web demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/nielsr/dit-document-layout-analysis).

One can run inference using the `inference.py` script. It can be run as follows (from the root of the unilm repository):

```
python ./dit/object_detection/inference.py \
--image_path ./dit/object_detection/publaynet_example.jpeg \
--output_file_name output.jpg \
--config ./dit/object_detection/publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml \
--opts MODEL.WEIGHTS https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_mrcnn.pth \
```

Make sure that the configuration file (YAML) and PyTorch checkpoint match. The example above uses DiT-base with the Mask R-CNN framework fine-tuned on PubLayNet.

### Data Preparation

**PubLayNet**

Download the data from this [link](https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz?_ga=2.218138265.1825957955.1646384196-1495010506.1633610665) (~96GB). Then extract it to `PATH-to-PubLayNet`.

A soft link needs to be created to make the data accessible for the program: `ln -s PATH-to-PubLayNet publaynet_data`.

**ICDAR 2019 cTDaR**

Download the data from this [link](https://github.com/cndplab-founder/ICDAR2019_cTDaR) (~4GB). Assume the path to this repository is `PATH-to-ICDARrepo`.

Then run `python convert_to_coco_format.py --root_dir=PATH-to-ICDARrepo --target_dir=PATH-to-ICDAR`. The path to the processed data is now `PATH-to-ICDAR`.

Run the following commands to get the adaptively binarized images for the archival subset.

```
cp -r PATH-to-ICDAR/trackA_archival PATH-to-ICDAR/at_trackA_archival
python adaptive_binarize.py --root_dir PATH-to-ICDAR/at_trackA_archival
```

The binarized archival subset will be in `PATH-to-ICDAR/at_trackA_archival`.

Depending on the subset you want to evaluate/fine-tune, create a soft link: `ln -s PATH-to-ICDAR/trackA_modern data` or `ln -s PATH-to-ICDAR/at_trackA_archival data`.

### Evaluation

The following commands provide two examples to evaluate the fine-tuned checkpoints.

The config files can be found in `icdar19_configs` and `publaynet_configs`.

1) Evaluate the fine-tuned checkpoint of DiT-Base with Mask R-CNN on PubLayNet:
```bash
python train_net.py --config-file publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml --eval-only --num-gpus 8 MODEL.WEIGHTS <finetuned_checkpoint_file_path or link> OUTPUT_DIR <your_output_dir>
```

2) Evaluate the fine-tuned checkpoint of DiT-Large with Cascade Mask R-CNN on the ICDAR 2019 cTDaR archival subset (make sure you have created a soft link from `PATH-to-ICDAR/at_trackA_archival` to `data`):
```bash
python train_net.py --config-file icdar19_configs/cascade/cascade_dit_large.yaml --eval-only --num-gpus 8 MODEL.WEIGHTS <finetuned_checkpoint_file_path or link> OUTPUT_DIR <your_output_dir>
```

**Note**: We have fixed the **bug** in the [ICDAR2019 measurement tool](https://github.com/cndplab-founder/ctdar_measurement_tool) while integrating the tool into our code. If you use the tool to get the evaluation score, please modify the [code](https://github.com/cndplab-founder/ctdar_measurement_tool/blob/738456d3164a838ffaeefe7d1b5e64f3a4368a0e/evaluate.py#L146) as follows:
```bash
...
# print(each_file)

# for file in gt_file_lst:
#     if file.split(".") != "xml":
#         gt_file_lst.remove(file)
# # print(gt_file_lst)

# Comment the code above and add the code below
for i in range(len(gt_file_lst) - 1, -1, -1):
    if gt_file_lst[i].split(".")[-1] != "xml":
        del gt_file_lst[i]

if len(gt_file_lst) > 0:
    ...
```

### Training
The following commands provide two examples to train the Mask R-CNN/Cascade Mask R-CNN with the DiT backbone on 8 32GB Nvidia V100 GPUs.

1) Fine-tune DiT-Base with Cascade Mask R-CNN on PubLayNet:
```bash
python train_net.py --config-file publaynet_configs/cascade/cascade_dit_base.yaml --num-gpus 8 MODEL.WEIGHTS <DiT-Base_file_path or link> OUTPUT_DIR <your_output_dir>
```

2) Fine-tune DiT-Large with Mask R-CNN on ICDAR 2019 cTDaR modern:
```bash
python train_net.py --config-file icdar19_configs/markrcnn/maskrcnn_dit_large.yaml --num-gpus 8 MODEL.WEIGHTS <DiT-Large_file_path or link> OUTPUT_DIR <your_output_dir>
```

[Detectron2's documentation](https://detectron2.readthedocs.io/en/latest/tutorials/getting_started.html) may help you with more details.

## Citation

If you find this repository useful, please consider citing our work:
```
@misc{li2022dit,
    title={DiT: Self-supervised Pre-training for Document Image Transformer},
    author={Junlong Li and Yiheng Xu and Tengchao Lv and Lei Cui and Cha Zhang and Furu Wei},
    year={2022},
    eprint={2203.02378},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
```

## Acknowledgment
Thanks to [Detectron2](https://github.com/facebookresearch/detectron2) for the Mask R-CNN and Cascade Mask R-CNN implementations.
DiT_Extractor/dit_object_detection/ditod/__init__.py
ADDED
@@ -0,0 +1,11 @@
# --------------------------------------------------------------------------------
# MPViT: Multi-Path Vision Transformer for Dense Prediction
# Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI).
# All Rights Reserved.
# Written by Youngwan Lee
# This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------------------------------

from .config import add_vit_config
from .backbone import build_vit_fpn_backbone
DiT_Extractor/dit_object_detection/ditod/backbone.py
ADDED
@@ -0,0 +1,156 @@
# --------------------------------------------------------------------------------
# VIT: Multi-Path Vision Transformer for Dense Prediction
# Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI).
# All Rights Reserved.
# Written by Youngwan Lee
# This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------------------------------
# References:
# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
# CoaT: https://github.com/mlpc-ucsd/CoaT
# --------------------------------------------------------------------------------


import torch

from detectron2.layers import (
    ShapeSpec,
)
from detectron2.modeling import Backbone, BACKBONE_REGISTRY, FPN
from detectron2.modeling.backbone.fpn import LastLevelP6P7, LastLevelMaxPool

from .beit import beit_base_patch16, dit_base_patch16, dit_large_patch16, beit_large_patch16
from .deit import deit_base_patch16, mae_base_patch16

__all__ = [
    "build_vit_fpn_backbone",
]


class VIT_Backbone(Backbone):
    """
    Implement VIT backbone.
    """

    def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwargs):
        super().__init__()
        self._out_features = out_features
        if 'base' in name:
            self._out_feature_strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32}
        else:
            self._out_feature_strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32}

        if name == 'beit_base_patch16':
            model_func = beit_base_patch16
            self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768}
        elif name == 'dit_base_patch16':
            model_func = dit_base_patch16
            self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768}
        elif name == "deit_base_patch16":
            model_func = deit_base_patch16
            self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768}
        elif name == "mae_base_patch16":
            model_func = mae_base_patch16
            self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768}
        elif name == "dit_large_patch16":
            model_func = dit_large_patch16
            self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024}
        elif name == "beit_large_patch16":
            model_func = beit_large_patch16
            self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024}
        else:
            raise ValueError("Unsupported VIT name yet.")

        if 'beit' in name or 'dit' in name:
            if pos_type == "abs":
                self.backbone = model_func(img_size=img_size,
                                           out_features=out_features,
                                           drop_path_rate=drop_path,
                                           use_abs_pos_emb=True,
                                           **model_kwargs)
            elif pos_type == "shared_rel":
                self.backbone = model_func(img_size=img_size,
                                           out_features=out_features,
                                           drop_path_rate=drop_path,
                                           use_shared_rel_pos_bias=True,
                                           **model_kwargs)
            elif pos_type == "rel":
                self.backbone = model_func(img_size=img_size,
                                           out_features=out_features,
                                           drop_path_rate=drop_path,
                                           use_rel_pos_bias=True,
                                           **model_kwargs)
            else:
                raise ValueError()
        else:
            self.backbone = model_func(img_size=img_size,
                                       out_features=out_features,
                                       drop_path_rate=drop_path,
                                       **model_kwargs)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        return self.backbone.forward_features(x)

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }


def build_VIT_backbone(cfg):
    """
    Create a VIT instance from config.

    Args:
        cfg: a detectron2 CfgNode

    Returns:
        A VIT backbone instance.
    """
    # fmt: off
    name = cfg.MODEL.VIT.NAME
    out_features = cfg.MODEL.VIT.OUT_FEATURES
    drop_path = cfg.MODEL.VIT.DROP_PATH
    img_size = cfg.MODEL.VIT.IMG_SIZE
    pos_type = cfg.MODEL.VIT.POS_TYPE

    model_kwargs = eval(str(cfg.MODEL.VIT.MODEL_KWARGS).replace("`", ""))

    return VIT_Backbone(name, out_features, drop_path, img_size, pos_type, model_kwargs)


@BACKBONE_REGISTRY.register()
def build_vit_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Create a VIT w/ FPN backbone.

    Args:
        cfg: a detectron2 CfgNode

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_VIT_backbone(cfg)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
DiT_Extractor/dit_object_detection/ditod/beit.py
ADDED
@@ -0,0 +1,671 @@
""" Vision Transformer (ViT) in PyTorch

A PyTorch implementation of Vision Transformers as described in
'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - https://arxiv.org/abs/2010.11929

The official jax code is released and available at https://github.com/google-research/vision_transformer

Status/TODO:
* Models updated to be compatible with official impl. Args added to support backward compat for old PyTorch weights.
* Weights ported from official jax impl for 384x384 base and small models, 16x16 and 32x32 patches.
* Trained (supervised on ImageNet-1k) my custom 'small' patch model to 77.9, 'base' to 79.4 top-1 with this code.
* Hopefully find time and GPUs for SSL or unsupervised pretraining on OpenImages w/ ImageNet fine-tune in future.

Acknowledgments:
* The paper authors for releasing code and weights, thanks!
* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... check it out
for some einops/einsum fun
* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
* Bert reference code checks against Huggingface Transformers and Tensorflow Bert

Hacked together by / Copyright 2020 Ross Wightman
"""
import warnings
import math
import torch
from functools import partial
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        # x = self.drop(x)
        # comment this out for the original BERT implementation
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., window_size=None, attn_head_dim=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            self.window_size = window_size
            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(window_size[0])
            coords_w = torch.arange(window_size[1])
            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            relative_position_index = \
                torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index", relative_position_index)

            # trunc_normal_(self.relative_position_bias_table, std=.0)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None, training_window_size=None):
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        if self.relative_position_bias_table is not None:
            if training_window_size == self.window_size:
                relative_position_bias = \
                    self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                        self.window_size[0] * self.window_size[1] + 1,
                        self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0)
            else:
                training_window_size = tuple(training_window_size.tolist())
                new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3
                # new_num_relative_distance covers all possible relative positions, including cls-cls, tok-cls, and cls-tok
                new_relative_position_bias_table = F.interpolate(
                    self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads,
                                                                                 2 * self.window_size[0] - 1,
                                                                                 2 * self.window_size[1] - 1),
                    size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic',
                    align_corners=False)
                new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads,
                                                                                          new_num_relative_distance - 3).permute(
                    1, 0)
                new_relative_position_bias_table = torch.cat(
                    [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0)

                # get pair-wise relative position index for each token inside the window
                coords_h = torch.arange(training_window_size[0])
                coords_w = torch.arange(training_window_size[1])
                coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
                coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
                relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
                relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
                relative_coords[:, :, 0] += training_window_size[0] - 1  # shift to start from 0
                relative_coords[:, :, 1] += training_window_size[1] - 1
                relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1
                relative_position_index = \
                    torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2,
                                dtype=relative_coords.dtype)
                relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
                relative_position_index[0, 0:] = new_num_relative_distance - 3
                relative_position_index[0:, 0] = new_num_relative_distance - 2
                relative_position_index[0, 0] = new_num_relative_distance - 1

                relative_position_bias = \
                    new_relative_position_bias_table[relative_position_index.view(-1)].view(
                        training_window_size[0] * training_window_size[1] + 1,
                        training_window_size[0] * training_window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0)

        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 window_size=None, attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        if init_values is not None:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x, rel_pos_bias=None, training_window_size=None):
        if self.gamma_1 is None:
            x = x + self.drop_path(
                self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, training_window_size=training_window_size))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias,
                                                            training_window_size=training_window_size))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches_w = self.patch_shape[0]
        self.num_patches_h = self.patch_shape[1]
        # the so-called patch_shape is the patch shape during pre-training
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x, position_embedding=None, **kwargs):
        # FIXME look at relaxing size constraints
        # assert H == self.img_size[0] and W == self.img_size[1], \
        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        Hp, Wp = x.shape[2], x.shape[3]

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3,
                                                                                                                   1, 2)
            position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
            x = x + position_embedding

        x = x.flatten(2).transpose(1, 2)
        return x, (Hp, Wp)


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=[224, 224], feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


class RelativePositionBias(nn.Module):

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_heads = num_heads
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(window_size[0])
        coords_w = torch.arange(window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = \
            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", relative_position_index)

        # trunc_normal_(self.relative_position_bias_table, std=.02)

    def forward(self, training_window_size):
        if training_window_size == self.window_size:
            relative_position_bias = \
                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        else:
            training_window_size = tuple(training_window_size.tolist())
            new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3
            # new_num_relative_distance covers all possible relative positions, including cls-cls, tok-cls, and cls-tok
            new_relative_position_bias_table = F.interpolate(
                self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads,
                                                                             2 * self.window_size[0] - 1,
                                                                             2 * self.window_size[1] - 1),
                size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic',
                align_corners=False)
            new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads,
                                                                                      new_num_relative_distance - 3).permute(
                1, 0)
            new_relative_position_bias_table = torch.cat(
                [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0)

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(training_window_size[0])
+
coords_w = torch.arange(training_window_size[1])
|
374 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
375 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
376 |
+
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
377 |
+
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
378 |
+
relative_coords[:, :, 0] += training_window_size[0] - 1 # shift to start from 0
|
379 |
+
relative_coords[:, :, 1] += training_window_size[1] - 1
|
380 |
+
relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1
|
381 |
+
relative_position_index = \
|
382 |
+
torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2,
|
383 |
+
dtype=relative_coords.dtype)
|
384 |
+
relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
385 |
+
relative_position_index[0, 0:] = new_num_relative_distance - 3
|
386 |
+
relative_position_index[0:, 0] = new_num_relative_distance - 2
|
387 |
+
relative_position_index[0, 0] = new_num_relative_distance - 1
|
388 |
+
|
389 |
+
relative_position_bias = \
|
390 |
+
new_relative_position_bias_table[relative_position_index.view(-1)].view(
|
391 |
+
training_window_size[0] * training_window_size[1] + 1,
|
392 |
+
training_window_size[0] * training_window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
|
393 |
+
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
|
394 |
+
|
395 |
+
return relative_position_bias
|
396 |
+
|
397 |
+
|
398 |
+
class BEiT(nn.Module):
|
399 |
+
""" Vision Transformer with support for patch or hybrid CNN input stage
|
400 |
+
"""
|
401 |
+
|
402 |
+
def __init__(self,
|
403 |
+
img_size=[224, 224],
|
404 |
+
patch_size=16,
|
405 |
+
in_chans=3,
|
406 |
+
num_classes=80,
|
407 |
+
embed_dim=768,
|
408 |
+
depth=12,
|
409 |
+
num_heads=12,
|
410 |
+
mlp_ratio=4.,
|
411 |
+
qkv_bias=False,
|
412 |
+
qk_scale=None,
|
413 |
+
drop_rate=0.,
|
414 |
+
attn_drop_rate=0.,
|
415 |
+
drop_path_rate=0.,
|
416 |
+
hybrid_backbone=None,
|
417 |
+
norm_layer=None,
|
418 |
+
init_values=None,
|
419 |
+
use_abs_pos_emb=False,
|
420 |
+
use_rel_pos_bias=False,
|
421 |
+
use_shared_rel_pos_bias=False,
|
422 |
+
use_checkpoint=True,
|
423 |
+
pretrained=None,
|
424 |
+
out_features=None,
|
425 |
+
):
|
426 |
+
|
427 |
+
super(BEiT, self).__init__()
|
428 |
+
|
429 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
430 |
+
self.num_classes = num_classes
|
431 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
432 |
+
self.use_checkpoint = use_checkpoint
|
433 |
+
|
434 |
+
if hybrid_backbone is not None:
|
435 |
+
self.patch_embed = HybridEmbed(
|
436 |
+
hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
|
437 |
+
else:
|
438 |
+
self.patch_embed = PatchEmbed(
|
439 |
+
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
|
440 |
+
num_patches = self.patch_embed.num_patches
|
441 |
+
self.out_features = out_features
|
442 |
+
self.out_indices = [int(name[5:]) for name in out_features]
|
443 |
+
|
444 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
445 |
+
# self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
446 |
+
if use_abs_pos_emb:
|
447 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
|
448 |
+
else:
|
449 |
+
self.pos_embed = None
|
450 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
451 |
+
|
452 |
+
self.use_shared_rel_pos_bias = use_shared_rel_pos_bias
|
453 |
+
if use_shared_rel_pos_bias:
|
454 |
+
self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
|
455 |
+
else:
|
456 |
+
self.rel_pos_bias = None
|
457 |
+
|
458 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
|
459 |
+
self.use_rel_pos_bias = use_rel_pos_bias
|
460 |
+
self.blocks = nn.ModuleList([
|
461 |
+
Block(
|
462 |
+
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
|
463 |
+
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
|
464 |
+
init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
|
465 |
+
for i in range(depth)])
|
466 |
+
|
467 |
+
# trunc_normal_(self.mask_token, std=.02)
|
468 |
+
|
469 |
+
if patch_size == 16:
|
470 |
+
self.fpn1 = nn.Sequential(
|
471 |
+
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
|
472 |
+
# nn.SyncBatchNorm(embed_dim),
|
473 |
+
nn.BatchNorm2d(embed_dim),
|
474 |
+
nn.GELU(),
|
475 |
+
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
|
476 |
+
)
|
477 |
+
|
478 |
+
self.fpn2 = nn.Sequential(
|
479 |
+
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
|
480 |
+
)
|
481 |
+
|
482 |
+
self.fpn3 = nn.Identity()
|
483 |
+
|
484 |
+
self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
|
485 |
+
elif patch_size == 8:
|
486 |
+
self.fpn1 = nn.Sequential(
|
487 |
+
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
|
488 |
+
)
|
489 |
+
|
490 |
+
self.fpn2 = nn.Identity()
|
491 |
+
|
492 |
+
self.fpn3 = nn.Sequential(
|
493 |
+
nn.MaxPool2d(kernel_size=2, stride=2),
|
494 |
+
)
|
495 |
+
|
496 |
+
self.fpn4 = nn.Sequential(
|
497 |
+
nn.MaxPool2d(kernel_size=4, stride=4),
|
498 |
+
)
|
499 |
+
|
500 |
+
if self.pos_embed is not None:
|
501 |
+
trunc_normal_(self.pos_embed, std=.02)
|
502 |
+
trunc_normal_(self.cls_token, std=.02)
|
503 |
+
self.apply(self._init_weights)
|
504 |
+
self.fix_init_weight()
|
505 |
+
|
506 |
+
def fix_init_weight(self):
|
507 |
+
def rescale(param, layer_id):
|
508 |
+
param.div_(math.sqrt(2.0 * layer_id))
|
509 |
+
|
510 |
+
for layer_id, layer in enumerate(self.blocks):
|
511 |
+
rescale(layer.attn.proj.weight.data, layer_id + 1)
|
512 |
+
rescale(layer.mlp.fc2.weight.data, layer_id + 1)
|
513 |
+
|
514 |
+
def _init_weights(self, m):
|
515 |
+
if isinstance(m, nn.Linear):
|
516 |
+
trunc_normal_(m.weight, std=.02)
|
517 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
518 |
+
nn.init.constant_(m.bias, 0)
|
519 |
+
elif isinstance(m, nn.LayerNorm):
|
520 |
+
nn.init.constant_(m.bias, 0)
|
521 |
+
nn.init.constant_(m.weight, 1.0)
|
522 |
+
|
523 |
+
'''
|
524 |
+
def init_weights(self):
|
525 |
+
"""Initialize the weights in backbone.
|
526 |
+
|
527 |
+
Args:
|
528 |
+
pretrained (str, optional): Path to pre-trained weights.
|
529 |
+
Defaults to None.
|
530 |
+
"""
|
531 |
+
logger = get_root_logger()
|
532 |
+
|
533 |
+
if self.pos_embed is not None:
|
534 |
+
trunc_normal_(self.pos_embed, std=.02)
|
535 |
+
trunc_normal_(self.cls_token, std=.02)
|
536 |
+
self.apply(self._init_weights)
|
537 |
+
self.fix_init_weight()
|
538 |
+
|
539 |
+
if self.init_cfg is None:
|
540 |
+
logger.warn(f'No pre-trained weights for '
|
541 |
+
f'{self.__class__.__name__}, '
|
542 |
+
f'training start from scratch')
|
543 |
+
else:
|
544 |
+
assert 'checkpoint' in self.init_cfg, f'Only support ' \
|
545 |
+
f'specify `Pretrained` in ' \
|
546 |
+
f'`init_cfg` in ' \
|
547 |
+
f'{self.__class__.__name__} '
|
548 |
+
logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}")
|
549 |
+
load_checkpoint(self,
|
550 |
+
filename=self.init_cfg['checkpoint'],
|
551 |
+
strict=False,
|
552 |
+
logger=logger,
|
553 |
+
beit_spec_expand_rel_pos = self.use_rel_pos_bias,
|
554 |
+
)
|
555 |
+
'''
|
556 |
+
|
557 |
+
def get_num_layers(self):
|
558 |
+
return len(self.blocks)
|
559 |
+
|
560 |
+
@torch.jit.ignore
|
561 |
+
def no_weight_decay(self):
|
562 |
+
return {'pos_embed', 'cls_token'}
|
563 |
+
|
564 |
+
def forward_features(self, x):
|
565 |
+
B, C, H, W = x.shape
|
566 |
+
x, (Hp, Wp) = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
|
567 |
+
# Hp, Wp are HW for patches
|
568 |
+
batch_size, seq_len, _ = x.size()
|
569 |
+
|
570 |
+
cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
|
571 |
+
if self.pos_embed is not None:
|
572 |
+
cls_tokens = cls_tokens + self.pos_embed[:, :1, :]
|
573 |
+
x = torch.cat((cls_tokens, x), dim=1)
|
574 |
+
x = self.pos_drop(x)
|
575 |
+
|
576 |
+
features = []
|
577 |
+
training_window_size = torch.tensor([Hp, Wp])
|
578 |
+
|
579 |
+
rel_pos_bias = self.rel_pos_bias(training_window_size) if self.rel_pos_bias is not None else None
|
580 |
+
|
581 |
+
for i, blk in enumerate(self.blocks):
|
582 |
+
if self.use_checkpoint:
|
583 |
+
x = checkpoint.checkpoint(blk, x, rel_pos_bias, training_window_size)
|
584 |
+
else:
|
585 |
+
x = blk(x, rel_pos_bias=rel_pos_bias, training_window_size=training_window_size)
|
586 |
+
if i in self.out_indices:
|
587 |
+
xp = x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
|
588 |
+
features.append(xp.contiguous())
|
589 |
+
|
590 |
+
ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
|
591 |
+
for i in range(len(features)):
|
592 |
+
features[i] = ops[i](features[i])
|
593 |
+
|
594 |
+
feat_out = {}
|
595 |
+
|
596 |
+
for name, value in zip(self.out_features, features):
|
597 |
+
feat_out[name] = value
|
598 |
+
|
599 |
+
return feat_out
|
600 |
+
|
601 |
+
def forward(self, x):
|
602 |
+
x = self.forward_features(x)
|
603 |
+
return x
|
604 |
+
|
605 |
+
|
606 |
+
def beit_base_patch16(pretrained=False, **kwargs):
|
607 |
+
model = BEiT(
|
608 |
+
patch_size=16,
|
609 |
+
embed_dim=768,
|
610 |
+
depth=12,
|
611 |
+
num_heads=12,
|
612 |
+
mlp_ratio=4,
|
613 |
+
qkv_bias=True,
|
614 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
615 |
+
init_values=None,
|
616 |
+
**kwargs)
|
617 |
+
model.default_cfg = _cfg()
|
618 |
+
return model
|
619 |
+
|
620 |
+
def beit_large_patch16(pretrained=False, **kwargs):
|
621 |
+
model = BEiT(
|
622 |
+
patch_size=16,
|
623 |
+
embed_dim=1024,
|
624 |
+
depth=24,
|
625 |
+
num_heads=16,
|
626 |
+
mlp_ratio=4,
|
627 |
+
qkv_bias=True,
|
628 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
629 |
+
init_values=None,
|
630 |
+
**kwargs)
|
631 |
+
model.default_cfg = _cfg()
|
632 |
+
return model
|
633 |
+
|
634 |
+
def dit_base_patch16(pretrained=False, **kwargs):
|
635 |
+
model = BEiT(
|
636 |
+
patch_size=16,
|
637 |
+
embed_dim=768,
|
638 |
+
depth=12,
|
639 |
+
num_heads=12,
|
640 |
+
mlp_ratio=4,
|
641 |
+
qkv_bias=True,
|
642 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
643 |
+
init_values=0.1,
|
644 |
+
**kwargs)
|
645 |
+
model.default_cfg = _cfg()
|
646 |
+
return model
|
647 |
+
|
648 |
+
def dit_large_patch16(pretrained=False, **kwargs):
|
649 |
+
model = BEiT(
|
650 |
+
patch_size=16,
|
651 |
+
embed_dim=1024,
|
652 |
+
depth=24,
|
653 |
+
num_heads=16,
|
654 |
+
mlp_ratio=4,
|
655 |
+
qkv_bias=True,
|
656 |
+
norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
657 |
+
init_values=1e-5,
|
658 |
+
**kwargs)
|
659 |
+
model.default_cfg = _cfg()
|
660 |
+
return model
|
661 |
+
|
662 |
+
if __name__ == '__main__':
|
663 |
+
model = BEiT(use_checkpoint=True, use_shared_rel_pos_bias=True)
|
664 |
+
model = model.to("cuda:0")
|
665 |
+
input1 = torch.rand(2, 3, 512, 762).to("cuda:0")
|
666 |
+
input2 = torch.rand(2, 3, 800, 1200).to("cuda:0")
|
667 |
+
input3 = torch.rand(2, 3, 720, 1000).to("cuda:0")
|
668 |
+
output1 = model(input1)
|
669 |
+
output2 = model(input2)
|
670 |
+
output3 = model(input3)
|
671 |
+
print("all done")
|
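The factory functions above differ only in depth/width and in the layer-scale `init_values`; the `out_features` names select transformer blocks via `int(name[5:])`, and each selected block's patch tokens are reshaped into a 2D map and passed through `fpn1`-`fpn4`. A minimal usage sketch (illustrative, not a file in this commit), assuming the definitions in this module are importable; the keyword choices here are the sketch's own:

import torch

# "layer3" -> block index 3, etc.; gradient checkpointing is unnecessary for a quick forward pass.
backbone = dit_base_patch16(
    out_features=["layer3", "layer5", "layer7", "layer11"],
    use_checkpoint=False,
)
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))  # 224 / 16 = 14 patches per side

# fpn1/fpn2/fpn3/fpn4 produce 56x56, 28x28, 14x14 and 7x7 maps respectively.
for name, t in feats.items():
    print(name, tuple(t.shape))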
DiT_Extractor/dit_object_detection/ditod/config.py
ADDED
@@ -0,0 +1,32 @@
from detectron2.config import CfgNode as CN


def add_vit_config(cfg):
    """
    Add config for VIT.
    """
    _C = cfg

    _C.MODEL.VIT = CN()

    # CoaT model name.
    _C.MODEL.VIT.NAME = ""

    # Output features from CoaT backbone.
    _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"]

    _C.MODEL.VIT.IMG_SIZE = [224, 224]

    _C.MODEL.VIT.POS_TYPE = "shared_rel"

    _C.MODEL.VIT.DROP_PATH = 0.

    _C.MODEL.VIT.MODEL_KWARGS = "{}"

    _C.SOLVER.OPTIMIZER = "ADAMW"

    _C.SOLVER.BACKBONE_MULTIPLIER = 1.0

    _C.AUG = CN()

    _C.AUG.DETR = False
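A small sketch (not part of this commit) of how these keys land on a fresh detectron2 config; only `get_cfg` and `add_vit_config` are real names here, and the comments simply restate the defaults set above:

from detectron2.config import get_cfg

# add_vit_config mutates the CfgNode in place, registering MODEL.VIT.* and AUG.*
cfg = get_cfg()
add_vit_config(cfg)

print(cfg.MODEL.VIT.NAME)          # "" until a concrete backbone (e.g. "dit_base_patch16") is merged in
print(cfg.MODEL.VIT.OUT_FEATURES)  # ['layer3', 'layer5', 'layer7', 'layer11']
print(cfg.MODEL.VIT.POS_TYPE)      # "shared_rel"
print(cfg.AUG.DETR)                # False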
DiT_Extractor/dit_object_detection/ditod/deit.py
ADDED
@@ -0,0 +1,476 @@
"""
Mostly copy-paste from DINO and timm library:
https://github.com/facebookresearch/dino
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
import warnings

import math
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, drop_path, to_2tuple
from functools import partial


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                      C // self.num_heads).permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.window_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])

        self.num_patches_w, self.num_patches_h = self.window_size

        self.num_patches = self.window_size[0] * self.window_size[1]
        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2d(in_chans, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)
        return x


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(
                    1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


class ViT(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage
    """

    def __init__(self,
                 model_name='vit_base_patch16_224',
                 img_size=384,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=1024,
                 depth=24,
                 num_heads=16,
                 num_classes=19,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.1,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 hybrid_backbone=None,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 norm_cfg=None,
                 pos_embed_interp=False,
                 random_init=False,
                 align_corners=False,
                 use_checkpoint=False,
                 num_extra_tokens=1,
                 out_features=None,
                 **kwargs,
                 ):

        super(ViT, self).__init__()
        self.model_name = model_name
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.num_classes = num_classes
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.qk_scale = qk_scale
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.drop_path_rate = drop_path_rate
        self.hybrid_backbone = hybrid_backbone
        self.norm_layer = norm_layer
        self.norm_cfg = norm_cfg
        self.pos_embed_interp = pos_embed_interp
        self.random_init = random_init
        self.align_corners = align_corners
        self.use_checkpoint = use_checkpoint
        self.num_extra_tokens = num_extra_tokens
        self.out_features = out_features
        self.out_indices = [int(name[5:]) for name in out_features]

        # self.num_stages = self.depth
        # self.out_indices = tuple(range(self.num_stages))

        if self.hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                self.hybrid_backbone, img_size=self.img_size, in_chans=self.in_chans, embed_dim=self.embed_dim)
        else:
            self.patch_embed = PatchEmbed(
                img_size=self.img_size, patch_size=self.patch_size, in_chans=self.in_chans, embed_dim=self.embed_dim)
        self.num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))

        if self.num_extra_tokens == 2:
            self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))

        self.pos_embed = nn.Parameter(torch.zeros(
            1, self.num_patches + self.num_extra_tokens, self.embed_dim))
        self.pos_drop = nn.Dropout(p=self.drop_rate)

        # self.num_extra_tokens = self.pos_embed.shape[-2] - self.num_patches
        dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate,
                                                self.depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=self.embed_dim, num_heads=self.num_heads, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias,
                qk_scale=self.qk_scale,
                drop=self.drop_rate, attn_drop=self.attn_drop_rate, drop_path=dpr[i], norm_layer=self.norm_layer)
            for i in range(self.depth)])

        # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here
        # self.repr = nn.Linear(embed_dim, representation_size)
        # self.repr_act = nn.Tanh()

        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.SyncBatchNorm(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.Identity()

            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Identity()

            self.fpn3 = nn.Sequential(
                nn.MaxPool2d(kernel_size=2, stride=2),
            )

            self.fpn4 = nn.Sequential(
                nn.MaxPool2d(kernel_size=4, stride=4),
            )

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        if self.num_extra_tokens == 2:
            trunc_normal_(self.dist_token, std=0.2)
        self.apply(self._init_weights)
        # self.fix_init_weight()

    def fix_init_weight(self):
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    '''
    def init_weights(self):
        logger = get_root_logger()

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

        if self.init_cfg is None:
            logger.warn(f'No pre-trained weights for '
                        f'{self.__class__.__name__}, '
                        f'training start from scratch')
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                                  f'specify `Pretrained` in ' \
                                                  f'`init_cfg` in ' \
                                                  f'{self.__class__.__name__} '
            logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}")
            load_checkpoint(self, filename=self.init_cfg['checkpoint'], strict=False, logger=logger)
    '''

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def _conv_filter(self, state_dict, patch_size=16):
        """ convert patch embedding weight from manual patchify + linear proj to conv"""
        out_dict = {}
        for k, v in state_dict.items():
            if 'patch_embed.proj.weight' in k:
                v = v.reshape((v.shape[0], 3, patch_size, patch_size))
            out_dict[k] = v
        return out_dict

    def to_2D(self, x):
        n, hw, c = x.shape
        h = w = int(math.sqrt(hw))
        x = x.transpose(1, 2).reshape(n, c, h, w)
        return x

    def to_1D(self, x):
        n, c, h, w = x.shape
        x = x.reshape(n, c, -1).transpose(1, 2)
        return x

    def interpolate_pos_encoding(self, x, w, h):
        npatch = x.shape[1] - self.num_extra_tokens
        N = self.pos_embed.shape[1] - self.num_extra_tokens
        if npatch == N and w == h:
            return self.pos_embed

        class_ORdist_pos_embed = self.pos_embed[:, 0:self.num_extra_tokens]

        patch_pos_embed = self.pos_embed[:, self.num_extra_tokens:]

        dim = x.shape[-1]
        w0 = w // self.patch_embed.patch_size[0]
        h0 = h // self.patch_embed.patch_size[1]
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_ORdist_pos_embed, patch_pos_embed), dim=1)

    def prepare_tokens(self, x, mask=None):
        B, nc, w, h = x.shape
        # patch linear embedding
        x = self.patch_embed(x)

        # mask image modeling
        if mask is not None:
            x = self.mask_model(x, mask)
        x = x.flatten(2).transpose(1, 2)

        # add the [CLS] token to the embed patch tokens
        all_tokens = [self.cls_token.expand(B, -1, -1)]

        if self.num_extra_tokens == 2:
            dist_tokens = self.dist_token.expand(B, -1, -1)
            all_tokens.append(dist_tokens)
        all_tokens.append(x)

        x = torch.cat(all_tokens, dim=1)

        # add positional encoding to each token
        x = x + self.interpolate_pos_encoding(x, w, h)

        return self.pos_drop(x)

    def forward_features(self, x):
        # print(f"==========shape of x is {x.shape}==========")
        B, _, H, W = x.shape
        Hp, Wp = H // self.patch_size, W // self.patch_size
        x = self.prepare_tokens(x)

        features = []
        for i, blk in enumerate(self.blocks):
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
            if i in self.out_indices:
                xp = x[:, self.num_extra_tokens:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
                features.append(xp.contiguous())

        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        feat_out = {}

        for name, value in zip(self.out_features, features):
            feat_out[name] = value

        return feat_out

    def forward(self, x):
        x = self.forward_features(x)
        return x


def deit_base_patch16(pretrained=False, **kwargs):
    model = ViT(
        patch_size=16,
        drop_rate=0.,
        embed_dim=768,
        depth=12,
        num_heads=12,
        num_classes=1000,
        mlp_ratio=4.,
        qkv_bias=True,
        use_checkpoint=True,
        num_extra_tokens=2,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def mae_base_patch16(pretrained=False, **kwargs):
    model = ViT(
        patch_size=16,
        drop_rate=0.,
        embed_dim=768,
        depth=12,
        num_heads=12,
        num_classes=1000,
        mlp_ratio=4.,
        qkv_bias=True,
        use_checkpoint=True,
        num_extra_tokens=1,
        **kwargs)
    model.default_cfg = _cfg()
    return model
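The only difference between the two factories above, `deit_base_patch16` and `mae_base_patch16`, is `num_extra_tokens`: DeiT keeps a distillation token next to [CLS], so its position embedding has one extra row. A tiny sketch of the resulting sequence lengths (plain arithmetic, not part of this commit):

# Sequence length seen by the transformer blocks for a 224x224 input, patch size 16.
img_size, patch_size = 224, 16
num_patches = (img_size // patch_size) ** 2   # 14 * 14 = 196

deit_tokens = num_patches + 2   # [CLS] + distillation token (num_extra_tokens=2)
mae_tokens = num_patches + 1    # [CLS] only (num_extra_tokens=1)
print(deit_tokens, mae_tokens)  # 198 197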
DiT_Extractor/dit_object_detection/publaynet_configs/Base-RCNN-FPN.yaml
ADDED
@@ -0,0 +1,69 @@
MODEL:
  MASK_ON: True
  META_ARCHITECTURE: "GeneralizedRCNN"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  BACKBONE:
    NAME: "build_vit_fpn_backbone"
  VIT:
    OUT_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
    DROP_PATH: 0.1
    IMG_SIZE: [224,224]
    POS_TYPE: "abs"
  FPN:
    IN_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 5
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("publaynet_train",)
  TEST: ("publaynet_val",)
SOLVER:
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  AMP:
    ENABLED: True
  OPTIMIZER: "ADAMW"
  BACKBONE_MULTIPLIER: 1.0
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
  WARMUP_FACTOR: 0.01
  BASE_LR: 0.0004
  WEIGHT_DECAY: 0.05
  IMS_PER_BATCH: 32
INPUT:
  CROP:
    ENABLED: True
    TYPE: "absolute_range"
    SIZE: (384, 600)
  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
  FORMAT: "RGB"
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: False
VERSION: 2
AUG:
  DETR: True
SEED: 42
DiT_Extractor/dit_object_detection/publaynet_configs/cascade/cascade_dit_base.yaml
ADDED
@@ -0,0 +1,20 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
  PIXEL_STD: [ 127.5, 127.5, 127.5 ]
  WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
  VIT:
    NAME: "dit_base_patch16"
  ROI_HEADS:
    NAME: CascadeROIHeads
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  WARMUP_ITERS: 1000
  IMS_PER_BATCH: 16
  MAX_ITER: 60000
  CHECKPOINT_PERIOD: 2000
TEST:
  EVAL_PERIOD: 2000
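This cascade config is the one the extractor loads at runtime (see DiT_Extractor/dit_runner.py later in this commit). A condensed sketch of that loading path, assuming the repo root is the working directory and DiT_Extractor is on the Python path:

import torch
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from dit_object_detection.ditod import add_vit_config

cfg = get_cfg()
add_vit_config(cfg)   # register MODEL.VIT.* before merging, otherwise merge_from_file rejects the keys
cfg.merge_from_file("DiT_Extractor/dit_object_detection/publaynet_configs/cascade/cascade_dit_base.yaml")
cfg.MODEL.WEIGHTS = "https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth"
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

predictor = DefaultPredictor(cfg)   # ready for predictor(numpy_image)["instances"]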
DiT_Extractor/dit_object_detection/publaynet_configs/cascade/cascade_dit_large.yaml
ADDED
@@ -0,0 +1,28 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
  PIXEL_STD: [ 127.5, 127.5, 127.5 ]
  WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-large-224-p16-500k-d7a2fb.pth"
  VIT:
    NAME: "dit_large_patch16"
    OUT_FEATURES: [ "layer7", "layer11", "layer15", "layer23" ]
    DROP_PATH: 0.2
  FPN:
    IN_FEATURES: [ "layer7", "layer11", "layer15", "layer23" ]
  ROI_HEADS:
    NAME: CascadeROIHeads
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  WARMUP_ITERS: 1000
  IMS_PER_BATCH: 16
  MAX_ITER: 60000
  CHECKPOINT_PERIOD: 2000
  BASE_LR: 0.0001
  STEPS: (40000, 53333)
  AMP:
    ENABLED: False
TEST:
  EVAL_PERIOD: 2000
DiT_Extractor/dit_object_detection/publaynet_configs/maskrcnn/maskrcnn_dit_base.yaml
ADDED
@@ -0,0 +1,15 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
  PIXEL_STD: [ 127.5, 127.5, 127.5 ]
  WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
  VIT:
    NAME: "dit_base_patch16"
SOLVER:
  WARMUP_ITERS: 1000
  IMS_PER_BATCH: 16
  MAX_ITER: 60000
  CHECKPOINT_PERIOD: 2000
TEST:
  EVAL_PERIOD: 2000
OUTPUT_DIR: $AMLT_OUTPUT_DIR
DiT_Extractor/dit_object_detection/publaynet_configs/maskrcnn/maskrcnn_dit_large.yaml
ADDED
@@ -0,0 +1,22 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
  PIXEL_STD: [ 127.5, 127.5, 127.5 ]
  WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-large-224-p16-500k-d7a2fb.pth"
  VIT:
    NAME: "dit_large_patch16"
    OUT_FEATURES: [ "layer7", "layer11", "layer15", "layer23" ]
    DROP_PATH: 0.2
  FPN:
    IN_FEATURES: [ "layer7", "layer11", "layer15", "layer23" ]
SOLVER:
  WARMUP_ITERS: 1000
  IMS_PER_BATCH: 16
  MAX_ITER: 60000
  CHECKPOINT_PERIOD: 2000
  BASE_LR: 0.0001
  AMP:
    ENABLED: False
TEST:
  EVAL_PERIOD: 2000
OUTPUT_DIR: "output/publaynet/mask_rcnn/dit_base_multistep_3x_ms"
DiT_Extractor/dit_runner.py
ADDED
@@ -0,0 +1,158 @@
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

import cv2
from pathlib import Path
import torch
import json

from detectron2.config import CfgNode as CN
from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor

from pdf2image import convert_from_path

from PIL import Image
import numpy as np

from dit_object_detection.ditod import add_vit_config
import base_utils
from pdfminer.layout import LTTextLineHorizontal, LTTextBoxHorizontal, LTAnno, LTChar

from tokenizers.pre_tokenizers import Whitespace

import warnings
warnings.filterwarnings("ignore")

dit_path = Path('DiT_Extractor/dit_object_detection')

cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file(dit_path / "publaynet_configs/cascade/cascade_dit_base.yaml")

cfg.MODEL.WEIGHTS = "https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth"
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

predictor = DefaultPredictor(cfg)

thing_classes = ["text", "title", "list", "table", "figure"]
thing_map = dict(map(reversed, enumerate(thing_classes)))
md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
md.set(thing_classes=thing_classes)


def get_pdf_image(pdf_file, page):
    image = convert_from_path(pdf_file, dpi=200, first_page=page, last_page=page)
    return image


def get_characters(subelement):
    all_chars = []
    if isinstance(subelement, LTTextLineHorizontal):
        for char in subelement:
            if isinstance(char, LTChar):
                all_chars.append((char.bbox, char.get_text()))
            if isinstance(char, LTAnno):
                # No bbox, just a space, so make a thin slice after previous text
                bbox = all_chars[-1][0]
                bbox = (bbox[2], bbox[1], bbox[2], bbox[3])
                all_chars.append((bbox, char.get_text()))
    return all_chars


def get_dit_preds(pdf, score_threshold=0.5):

    page_count = base_utils.get_pdf_page_count(pdf)

    # Input is numpy array of PIL image
    page_sizes = base_utils.get_page_sizes(pdf)

    sections = {}
    viz_images = []
    page_words = base_utils.get_pdf_words(pdf)
    for page in range(1, page_count+1):  # range(2, page_count + 1):
        image = get_pdf_image(pdf, page)
        image = np.array(image[0])
        # Get prediction
        output = predictor(image)["instances"]
        output = output.to('cpu')

        # Visualize predictions
        v = Visualizer(image[:, :, ::-1],
                       md,
                       scale=1.0,
                       instance_mode=ColorMode.SEGMENTATION)
        result = v.draw_instance_predictions(output)
        result_image = result.get_image()[:, :, ::-1]
        viz_img = Image.fromarray(result_image)
        viz_images.append(viz_img)

        words = page_words[page-1]

        # Convert from image_size to page size
        pdf_dimensions = page_sizes[page-1][2:]
        # Swap height/width
        pdf_image_size = (output.image_size[1], output.image_size[0])

        scale = np.array(pdf_dimensions) / np.array(pdf_image_size)
        scale_box = np.hstack((scale, scale))
        # Words are in page coordinates

        id = 0
        sections[page-1] = []
        draw = image.copy()
        for box_t, clazz, score in zip(output.get('pred_boxes'), output.get('pred_classes'), output.get('scores')):

            if score < score_threshold:
                continue

            box = box_t.numpy()
            # Flip along Y axis
            box[1] = pdf_image_size[1] - box[1]
            box[3] = pdf_image_size[1] - box[3]
            # Scale
            scaled = box * scale_box
            # This is the correct order
            scaled = [scaled[0], scaled[3], scaled[2], scaled[1]]
            if clazz != thing_map['text']:
                continue

            start = box[0:2].tolist()
            end = box[2:4].tolist()
            start = [int(x) for x in start]
            end = [int(x) for x in end]

            out = {}

            for word in words.copy():
                if base_utils.partial_overlaps(word[0:4], scaled):
                    if out == {}:
                        id += 1
                        out['coord'] = word[0:4]
                        out['subelements'] = []
                        out['type'] = 'content_block'
                        out['id'] = id
                        out['text'] = ''

                    out['coord'] = base_utils.union(out['coord'], word[0:4])
                    out['text'] = out['text'] + word[4].get_text()

                    characters = get_characters(word[4])
                    out['subelements'].append(characters)
                    words.remove(word)

            if len(out) != 0:
                sections[page-1].append(out)

    # Write final annotation

    out_name = Path(pdf).name[:-4] + ".json"
    with open(out_name, 'w', encoding='utf8') as json_out:
        json.dump(sections, json_out, ensure_ascii=False, indent=4)

    return viz_images
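A usage sketch for the module above (illustrative, not a file in this commit), assuming dit_runner is importable; note the predictor is built at import time, so the DiT checkpoint is fetched then. The example PDF is one of the files bundled under examples/ in this commit.

# Run layout detection on one of the bundled example papers.
viz_pages = get_dit_preds("examples/2105.03011.pdf", score_threshold=0.5)

for i, page_img in enumerate(viz_pages, start=1):
    page_img.save(f"page_{i}_layout.png")   # DiT boxes drawn over the rendered page

# The extracted text blocks are written to 2105.03011.json, keyed by zero-based page index.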
DiT_Extractor/sentence_extractor.py
ADDED
@@ -0,0 +1,136 @@
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

import json
from tokenizers.pre_tokenizers import Whitespace
import base_utils
import spacy


def guess_sentences(tokens, text):
    sentence_delems = ('.', '?', ').', '!')
    sentences = []
    sentence = []
    maybe_delem = None
    for token in tokens:
        # check next token to see if there is space after prev delem
        if maybe_delem != None:
            if maybe_delem[1][1] < token[1][0]:
                sentences.append(sentence)
                sentence = []
            maybe_delem = None

        sentence.append(token)
        if token[0] in sentence_delems:
            maybe_delem = token
    if sentence != []:
        sentences.append(sentence)
    return sentences


def spacey_sentences(text):
    nlp = spacy.blank('en')
    nlp.add_pipe('sentencizer')
    sentences = [s.text for s in nlp(text).sents]
    return sentences


def add_coords(sentences, all_coords):
    sentences_out = []
    for sentence in sentences:
        new_sentence = []
        for token in sentence:
            indexes = token[1]
            bbox = all_coords[indexes[0]]
            for i in range(indexes[0]+1, indexes[1]):
                bbox = base_utils.union(bbox, all_coords[i])
            new_sentence.append((token[0], token[1], bbox))
        sentences_out.append(new_sentence)
    return sentences_out


def sentence_extract(document):
    """
    Convert the extracted PDF result (.json) into token sections with a max length of 384 tokens,
    separated on sentence delimiter boundaries such as .!?
    """
    max_tokens = 384
    document_tree = json.load(open(document, 'r'))
    sections_per_page = {}
    for page_num, page in document_tree.items():
        # Tokenize per section (rectangular block that was detected by DIT)
        word_sections = []
        text_sections = []
        for section in page:
            text_sections.append(section['text'])
            all_text = ''
            all_coord = []
            if 'subelements' not in section:
                continue
            for subelement in section['subelements']:
                for char in subelement:
                    all_text += char[1]
                    all_coord.append(char[0])
                    # check for weird characters, e.g. "(cid:206)", "ff", "fi", etc
                    # if string isn't just 1 character, it's an irregular LTChar (character) from pdfminer.
                    # instead of skipping them, we can just create extra duplicate coordinates for the additional characters.
                    if len(char[1]) > 1:
                        bad_char_len = len(char[1])
                        dupe_coord_amt = (bad_char_len - 1)
                        for dupe_i in range(dupe_coord_amt):
                            all_coord.append(char[0])

            pre_tokenizer = Whitespace()

            sentences_pre_tok = spacey_sentences(all_text)
            sentences = []
            for sentence in sentences_pre_tok:
                tokenized = pre_tokenizer.pre_tokenize_str(sentence)
                sentences.append(tokenized)

            sentences = add_coords(sentences, all_coord)

            word_section = []
            t = 0
            for sentence in sentences:
                t += len(sentence)
                if t <= max_tokens:
                    word_section += sentence
                else:
                    word_sections.append(word_section)
                    word_section = sentence
                    t = len(sentence)
            word_sections.append(word_section)
        sections = {'text_sections': text_sections, 'word_sections': word_sections}
        sections_per_page[page_num] = sections
    return sections_per_page


def format_output_contexts(sections_per_page):

    all_contexts = {}

    for page_idx in sections_per_page.keys():

        text_sections = sections_per_page[page_idx]['text_sections']
        word_sections = sections_per_page[page_idx]['word_sections']

        for text_section, word_section in zip(text_sections, word_sections):
            whitespaced_text = ' '.join([word[0] for word in word_section])
            words_info = []
            for word in word_section:
                words_info.append({'word_text:': word[0], 'char_indices': word[1], 'word_bbox': word[2]})

            context_row = {'text': text_section, 'whitespaced_text': whitespaced_text, 'page_idx': int(page_idx), 'words_info': words_info}
            context_id = 'context_{0}'.format(len(all_contexts))
            all_contexts[context_id] = context_row

    return all_contexts


def get_contexts(json_input):
    json_output = 'contexts_{0}'.format(json_input)
    sections_per_page = sentence_extract(json_input)

    all_contexts = format_output_contexts(sections_per_page)

    with open(json_output, 'w', encoding='utf8') as json_out:
        json.dump(all_contexts, json_out, ensure_ascii=False, indent=4)
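A short sketch (not part of this commit) of chaining the two extractor stages: dit_runner writes `<pdf stem>.json`, and `get_contexts` turns that into `contexts_<stem>.json` with one entry per roughly 384-token block. The file names follow directly from the code above; everything else is illustrative.

import json

# 1) dit_runner.get_dit_preds("examples/2105.03011.pdf") has already written 2105.03011.json
# 2) Chunk its text blocks into retrievable contexts on sentence boundaries
get_contexts("2105.03011.json")            # writes contexts_2105.03011.json

contexts = json.load(open("contexts_2105.03011.json"))
first = contexts["context_0"]
print(first["page_idx"], first["whitespaced_text"][:80])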
LICENSE
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
Version 2.0, August 2022
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2018, Lawrence Livermore National Security, LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

---- LLVM Exceptions to the Apache 2.0 License ----

As an exception, if, as a result of your compiling your source code, portions of this Software are embedded into an Object form of such source code, you may redistribute such embedded portions in such Object form without complying with the conditions of Sections 4(a), 4(b) and 4(d) of the License.

In addition, if you combine or link compiled forms of this Software with software that is licensed under the GPLv2 ("Combined Software") and if a court of competent jurisdiction determines that the patent provision (Section 3), the indemnity provision (Section 9) or other Section of the License conflicts with the conditions of the GPLv2, you may retroactively and prospectively choose to deem waived or otherwise exclude such Section(s) of the License, but only in their entirety and only with respect to the Combined Software.
NOTICE
ADDED
@@ -0,0 +1,21 @@
This work was produced under the auspices of the U.S. Department of
Energy by Lawrence Livermore National Laboratory under Contract
DE-AC52-07NA27344.

This work was prepared as an account of work sponsored by an agency of
the United States Government. Neither the United States Government nor
Lawrence Livermore National Security, LLC, nor any of their employees
makes any warranty, expressed or implied, or assumes any legal liability
or responsibility for the accuracy, completeness, or usefulness of any
information, apparatus, product, or process disclosed, or represents that
its use would not infringe privately owned rights.

Reference herein to any specific commercial product, process, or service
by trade name, trademark, manufacturer, or otherwise does not necessarily
constitute or imply its endorsement, recommendation, or favoring by the
United States Government or Lawrence Livermore National Security, LLC.

The views and opinions of authors expressed herein do not necessarily
state or reflect those of the United States Government or Lawrence
Livermore National Security, LLC, and shall not be used for advertising
or product endorsement purposes.
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
 title: Detect Retrieve Comprehend
-emoji:
-colorFrom:
-colorTo:
+emoji: 📚
+colorFrom: green
+colorTo: pink
 sdk: gradio
 sdk_version: 3.1.7
 app_file: app.py
@@ -10,4 +10,14 @@ pinned: false
 license: apache-2.0
 ---
 
-
+# Release
+
+---
+
+**Detect, Retrieve, Comprehend** is distributed under the terms of Apache 2.0 license with LLVM exception.
+
+See [LICENSE]() and [NOTICE]() for details.
+
+SPDX-License-Identifier: Apache-2.0-with-LLVM-exception
+
+LLNL-CODE-838964
UnifiedQA/demo_QA.py
ADDED
@@ -0,0 +1,180 @@
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

import sys
import json
from math import ceil

import torch
import numpy as np
from torch import tensor
from torch.nn.functional import log_softmax
from torch.distributions.categorical import Categorical
from transformers import T5Tokenizer, T5ForConditionalGeneration

# load UnifiedQA onto device
model_name = "allenai/unifiedqa-v2-t5-large-1363200"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


def get_inputs(contexts_json, ranked_contexts_json):
    with open(contexts_json, 'rt') as fp:
        contexts = json.load(fp)

    with open(ranked_contexts_json, 'rt') as fp:
        ranked_contexts = json.load(fp)

    question_id = list(ranked_contexts.keys())[0]
    # assert len(questions) == 1, f'JSON should only have 1 question but found {len(questions)}: {questions}'
    question = ranked_contexts[question_id]['text']
    context_ids_sorted = ranked_contexts[question_id]['ranks']
    context_scores = ranked_contexts[question_id]['scores']
    contexts = [contexts[context_id]['text'] for context_id in context_ids_sorted]

    # returns the question (str) and its contexts (sequence)
    return question, contexts, context_scores


def get_tokens(text, tokenizer, max_tokens):
    return tokenizer.encode_plus(text, return_tensors='pt', max_length=max_tokens, padding='max_length', truncation=True)['input_ids']


def prepare_inputs(tokenizer, max_tokens, context, question):
    input_str = f'{question} \\n {context}'
    inputs = get_tokens(input_str, tokenizer, max_tokens)
    return inputs


def get_outputs(model, tokenizer, input_tokens, max_tokens):
    output_dict = model.generate(input_tokens, output_scores=True, return_dict_in_generate=True, **{'max_length': max_tokens})
    pred_tokens = output_dict['sequences'].squeeze().tolist()

    # initialize metrics
    logit_entropy = []
    sentence_probs = []

    # accumulate metrics over logit_sequence
    logit_sequence = output_dict['scores'][:-1]  # discard end token
    for logit in logit_sequence:
        log_probs = log_softmax(logit, dim=-1)

        # update metrics
        logit_entropy.append(Categorical(log_probs.exp()).entropy())
        sentence_probs.append(log_probs.max())

    # finish metrics calculation
    logit_entropy = tensor(logit_entropy)
    sentence_probs = tensor(sentence_probs)
    entropy = logit_entropy.mean()
    sentence_std = 0 if len(logit_sequence) == 1 else sentence_probs.std(unbiased=True).exp()

    # use entropy * sentence_std as uncertainty
    uncertainty = (entropy * sentence_std).item()

    # convert answer tokens to str
    pred_str = tokenizer.decode(pred_tokens, skip_special_tokens=True).lower()

    return pred_str, uncertainty


# k_percent: percentage of contexts to use, cannot be less than min_k or greater than max_k
# min_k: minimum number of contexts to use, if possible. Setting this too small reduces recall
# max_k: maximum number of contexts to use. Setting this too big reduces precision
# recommended uncertainty thresholds are 2, 3, 4, and 5. The lower the threshold, the more aggressive the filtering
def run_model(model, tokenizer, device, question, contexts, context_scores, k_percent=0.1, min_k=10, max_k=25, uncertainty_thresh=3):
    k = min(max(ceil(k_percent * len(contexts)), min_k), max_k)
    contexts = contexts[:k]
    context_scores = context_scores[:k]

    # iterate through top-k contexts
    answers = []
    uncertainty = []
    for context in contexts:
        input_tokens = prepare_inputs(tokenizer, 512, context, question).to(device)
        pred_str, uncertainty_1 = get_outputs(model, tokenizer, input_tokens, 512)
        answers.append(pred_str)
        uncertainty.append(uncertainty_1)

    # contexts = np.array(contexts)
    # answers = np.array(answers)
    # uncertainty = np.array(uncertainty)

    # sort by uncertainty, ascending order
    # order = np.argsort(uncertainty)
    # contexts = contexts[order]
    # answers = answers[order]
    # uncertainty = uncertainty[order]

    # init lists for threshed answers
    # weak_contexts = []
    # weak_answers = []
    # weak_uncertainty = []

    # filter by uncertainty
    # if len(answers) > min_k:
    #     weak = np.argwhere(uncertainty > uncertainty_thresh)  # exceeds threshold
    #     weak_contexts = contexts[weak].tolist()
    #     weak_answers = answers[weak].tolist()
    #     weak_uncertainty = uncertainty[weak].tolist()

    #     strong = np.argwhere(uncertainty <= uncertainty_thresh)  # within threshold
    #     contexts = contexts[strong]
    #     answers = answers[strong]
    #     uncertainty = uncertainty[strong]

    # contexts = contexts.tolist()
    # answers = answers.tolist()
    # uncertainty = uncertainty.tolist()

    # return {'contexts': contexts, 'answers': answers, 'uncertainty': uncertainty}, \
    #        {'contexts': weak_contexts, 'answers': weak_answers, 'uncertainty': weak_uncertainty}

    return {'contexts': contexts, 'answers': answers, 'context_scores': context_scores, 'uncertainty': uncertainty}


def get_qa_results(contexts_json, ranked_contexts_json, topk):

    # extract question and contexts from json
    question, contexts, context_scores = get_inputs(contexts_json, ranked_contexts_json)

    # infer answers
    with torch.inference_mode(True):
        # strong_answers, weak_answers = run_model(model, tokenizer, device, question, contexts, k_percent=k_percent)
        qa_results = run_model(model, tokenizer, device, question, contexts, context_scores, k_percent=1.0, min_k=1, max_k=topk)

    return qa_results


def get_qa_results_in_memory(contexts, ranked_contexts, topk):

    question_id = list(ranked_contexts.keys())[0]
    # assert len(questions) == 1, f'JSON should only have 1 question but found {len(questions)}: {questions}'
    question = ranked_contexts[question_id]['text']
    context_ids_sorted = ranked_contexts[question_id]['ranks']
    context_scores = ranked_contexts[question_id]['scores']
    contexts = [contexts[context_id]['text'] for context_id in context_ids_sorted]

    # infer answers
    with torch.inference_mode(True):
        # strong_answers, weak_answers = run_model(model, tokenizer, device, question, contexts, k_percent=k_percent)
        qa_results = run_model(model, tokenizer, device, question, contexts, context_scores, k_percent=1.0, min_k=1, max_k=topk)

    return qa_results


def load_custom_model(finetuned_model_path):
    global tokenizer
    global model

    # load UnifiedQA onto device
    tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path)
    model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)
    model.to(device)


def get_qa_results_in_memory_finetuned_unifiedqa(question, context_scores, contexts, topk):

    # infer answers
    with torch.inference_mode(True):
        # strong_answers, weak_answers = run_model(model, tokenizer, device, question, contexts, k_percent=k_percent)
        qa_results = run_model(model, tokenizer, device, question, contexts, context_scores, k_percent=1.0, min_k=1, max_k=topk)

    return qa_results
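A minimal usage sketch for the in-memory entry point above, with inputs shaped like the JSON produced by the extractor and cross-encoder stages. The dictionary layout follows the code in this file; the question, context texts, and scores are made-up illustrations:

```python
# Hypothetical inputs mirroring contexts_*.json and ranked_*.json.
contexts = {
    'context_0': {'text': 'The base model has 110M parameters and the large model has 340M.'},
    'context_1': {'text': 'We pre-train on BooksCorpus and English Wikipedia.'},
}
ranked_contexts = {
    'question_0': {
        'text': 'What is the model size?',
        'ranks': ['context_0', 'context_1'],  # context ids sorted by retrieval score
        'scores': [9.1, 2.3],
    }
}

results = get_qa_results_in_memory(contexts, ranked_contexts, topk=5)
for answer, score, unc in zip(results['answers'], results['context_scores'], results['uncertainty']):
    print(f'{answer}  (retrieval score {score:.2f}, uncertainty {unc:.2f})')
```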
app.py
ADDED
@@ -0,0 +1,120 @@
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

import torch
import gradio as gr
from pathlib import Path

from torchvision.transforms import ToPILImage, ToTensor
tensor_to_image = ToPILImage()
image_to_tensor = ToTensor()

import sys
sys.path.append('DiT_Extractor/')
sys.path.append('CrossEncoder/')
sys.path.append('UnifiedQA/')

import dit_runner
import sentence_extractor
import cross_encoder
import demo_QA

from torchvision.transforms import ToPILImage
tensor_to_image = ToPILImage()


def run_fn(pdf_file_obj, question_text, input_topk):

    pdf = pdf_file_obj.name
    viz_images = dit_runner.get_dit_preds(pdf, score_threshold=0.5)
    entity_json = '{0}.json'.format(Path(pdf).name[:-4])

    sentence_extractor.get_contexts(entity_json)

    contexts_json = 'contexts_{0}'.format(entity_json)
    # contexts_json = 'contexts_2105u2iwiwxh.03011.json'

    cross_encoder.get_ranked_contexts(contexts_json, question_text)

    ranked_contexts_json = 'ranked_{0}'.format(contexts_json)
    # ranked_contexts_json = 'ranked_contexts_2105u2iwiwxh.03011.json'

    input_topk = int(input_topk)

    # viz_images = [tensor_to_image(x) for x in torch.randn(4, 3, 256, 256)]

    qa_results = demo_QA.get_qa_results(contexts_json, ranked_contexts_json, input_topk)

    history = [('<<< [Retrieval Score: {0:.02f}] >>> {1}'.format(s, c), a) for c, s, a in zip(qa_results['contexts'], qa_results['context_scores'], qa_results['answers'])]

    # Show in ascending order of score, since results box is already scrolled down.
    history = history[::-1]

    return viz_images, contexts_json, ranked_contexts_json, history


demo = gr.Blocks()

with demo:

    gr.Markdown("<h1><center>Document-based Question Answering</center></h1>")
    gr.Markdown("<center>This is a supplemental demo for our publication, [Document-based Question Answering](https://www.google.com). In this system, our input is a PDF file with a specific question of interest. The output is a set of most probable answers. There are 4 main components in our deployed pipeline: (1) DiT Layout Analysis (2) Context Extraction (3) Cross-Encoder Retrieval (4) UnifiedQA. See below for example uses with further explanation.</center>")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                input_pdf_file = gr.File(file_count='single', label='PDF File')
            with gr.Row():
                input_question_text = gr.Textbox(label='Question')
            with gr.Row():
                input_k_percent = gr.Slider(minimum=1, maximum=24, step=1, value=8, label='Top K')
            with gr.Row():
                button_run = gr.Button('Run QA on Document')

            gr.Markdown("<h3><center>Summary</center></h3>")
            with gr.Row():
                gr.Markdown('''
                - <u>**DiT - Document Image Transformer**</u>: PDF -> converted into a list of images -> each image receives Entity Predictions
                - Note that using this computer vision approach allows us to ignore things like *page numbers, footnotes, references*, etc
                - <u>**Paragraph-based Text Extraction**</u>: DiT Bounding Boxes -> Convert into PDF-Space Coordinates -> Text Extraction using PDFMiner6 -> Tokenize & Sentence Split if tokenizer max length is exceeded
                - <u>**CrossEncoder Context Retrieval**</u>: All Contexts + Question -> Top K Relevant Contexts best suited for answering question
                - <u>**UnifiedQA**</u>: Most Relevant Contexts + Supplied Question -> Predict Set of Probable Answers
                ''')

        with gr.Column():
            with gr.Row():
                output_gallery = gr.Gallery(label='DiT Predicted Entities')
            with gr.Row():
                gr.Markdown('''
                - The `DiT predicted Entities` output box is scrollable! Scroll to see different page predictions. Note that predictions with confidence scores < 0.5 are not passed forward for text extraction.
                - If an image is clicked, the output box will switch to a gallery view. To view these outputs in much higher resolution, right-click and choose "open image in new tab"
                ''')
            with gr.Row():
                output_contexts = gr.File(label='Detected Contexts', interactive=False)
                output_ranked_contexts = gr.File(label='CrossEncoder Ranked Contexts', interactive=False)
            with gr.Row():
                output_qa_results = gr.Chatbot(color_map=['blue', 'green'], label='UnifiedQA Results').style()

    gr.Markdown("<h3><center>Related Work & Code</center></h3>")
    gr.Markdown("<center>DiT (Document Image Transformer) - <a href=https://arxiv.org/abs/2203.02378>Arxiv Page</a> | <a href=https://github.com/microsoft/unilm/tree/master/dit>Github Repo</a></center>")
    gr.Markdown("<center>CrossEncoder - <a href=https://arxiv.org/abs/2203.02378>Arxiv Page</a> | <a href=https://github.com/microsoft/unilm/tree/master/dit>Github Repo</a></center>")
    gr.Markdown("<center>UnifiedQA - <a href=https://arxiv.org/abs/2005.00700>Arxiv Page</a> | <a href=https://github.com/allenai/unifiedqa>Github Repo</a></center>")

    button_run.click(fn=run_fn, inputs=[input_pdf_file, input_question_text, input_k_percent], outputs=[output_gallery, output_contexts, output_ranked_contexts, output_qa_results])

    examples = [
        ['examples/1909.00694.pdf', 'What is the seed lexicon?', 5],
        ['examples/1909.00694.pdf', 'How big is seed lexicon used for training?', 5],
        ['examples/1810.04805.pdf', 'What is this paper about?', 5],
        ['examples/1810.04805.pdf', 'What is the model size?', 5],
        ['examples/2105.03011.pdf', 'How many questions are in this dataset?', 5],
        ['examples/1909.00694.pdf', 'How are relations used to propagate polarity?', 5],
    ]
    gr.Examples(examples=examples,
                inputs=[input_pdf_file, input_question_text, input_k_percent])

    # examples = gr.Dataset(components=[input_pdf_file, input_question_text], samples=[[open('examples/1810.04805.pdf', mode='rb'), 'How many parameters are in the model?']])

demo.launch(enable_queue=True)
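For reference, the same pipeline that `run_fn` wires into the Gradio UI above can be sketched headlessly. This is only a sketch: it assumes the imports and path setup at the top of app.py, and the PDF path and question are placeholders taken from the bundled examples:

```python
# Headless sketch of the pipeline run_fn drives from the UI.
from pathlib import Path

pdf = 'examples/1810.04805.pdf'
question = 'What is the model size?'

dit_runner.get_dit_preds(pdf, score_threshold=0.5)            # 1. DiT layout analysis
entity_json = '{0}.json'.format(Path(pdf).name[:-4])
sentence_extractor.get_contexts(entity_json)                  # 2. paragraph/context extraction
contexts_json = 'contexts_{0}'.format(entity_json)
cross_encoder.get_ranked_contexts(contexts_json, question)    # 3. cross-encoder retrieval
ranked_contexts_json = 'ranked_{0}'.format(contexts_json)
qa_results = demo_QA.get_qa_results(contexts_json, ranked_contexts_json, 5)  # 4. UnifiedQA
print(qa_results['answers'])
```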
env_setup.sh
ADDED
@@ -0,0 +1,32 @@
conda create --name llnl_actici_env python=3.9
conda activate llnl_actici_env

conda install pytorch=1.10 torchvision torchaudio cudatoolkit=11.3 -c pytorch

# For DiT
python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html


# For DiT
pip install opencv-python
pip install timm
pip install pdfminer.six
conda install -c conda-forge poppler
pip install pdf2image
pip install pypdf2
pip install spacy
# pytesseract, in case we need in future
pip install pytesseract

# For Retrieval & QA
pip install transformers==4.20
pip install sentence-transformers

# For Demo
pip install gradio

# If Jupyter is allowed
pip install jupyter

# (Optional, adding this custom env to the base environment's jupyter)
python -m ipykernel install --user --name llnl_actici_env --display-name "Python (llnl_actici_env)"
examples/1810.04805.pdf
ADDED
Binary file (775 kB).
examples/1909.00694.pdf
ADDED
Binary file (540 kB).
examples/2105.03011.pdf
ADDED
Binary file (507 kB).
ms-marco-electra-base/CEBinaryClassificationEvaluator_MS-Marco_results.csv
ADDED
@@ -0,0 +1,43 @@
epoch,steps,Accuracy,Accuracy_Threshold,F1,F1_Threshold,Precision,Recall,Average_Precision
0,5000,0.9297070292970703,0.25256121158599854,0.8307839388145314,0.19771124422550201,0.7957875457875457,0.869,0.8904110467492587
0,10000,0.939006099390061,0.5306986570358276,0.8460807600950118,0.28808051347732544,0.8058823529411765,0.8905,0.910544278892506
0,15000,0.9393060693930607,0.5750397443771362,0.8566081871345029,0.48249387741088867,0.8048351648351648,0.9155,0.9132147986720082
0,20000,0.9405059494050595,0.591253936290741,0.8546298558514537,0.570050835609436,0.8356426182513139,0.8745,0.9073685536522613
0,25000,0.9436056394360564,0.5074090957641602,0.8603960396039605,0.5057582855224609,0.8519607843137255,0.869,0.9167379821993755
0,30000,0.9396060393960604,0.8262588381767273,0.8542471042471043,0.7406325340270996,0.8255597014925373,0.885,0.8979176130668384
0,35000,0.9425057494250575,0.46686679124832153,0.8596070915189268,0.28302955627441406,0.8252069917203312,0.897,0.9163289965092976
0,40000,0.9417058294170583,0.6763133406639099,0.8575602629656682,0.6603987216949463,0.8357854769814903,0.8805,0.9173776247925393
0,45000,0.9426057394260574,0.4643915295600891,0.8605042016806723,0.29147765040397644,0.8277136258660508,0.896,0.9120726077810245
0,50000,0.945005499450055,0.5493776798248291,0.8624535315985131,0.4713650643825531,0.855036855036855,0.87,0.9209400105864155
0,55000,0.9454054594540546,0.6156725287437439,0.864585893339887,0.5604670643806458,0.8501691638472693,0.8795,0.9206262233464874
0,60000,0.9421057894210579,0.39554399251937866,0.8605827112930412,0.3811936378479004,0.8300046446818393,0.8935,0.9193948306076224
0,65000,0.9428057194280572,0.5363738536834717,0.8629682313892841,0.32784485816955566,0.8205590622182146,0.91,0.9227492855045069
0,70000,0.9438056194380562,0.38333064317703247,0.8628501827040195,0.3524332344532013,0.8413301662707838,0.8855,0.9236299441431376
0,75000,0.9468053194680532,0.48936331272125244,0.8696717295443409,0.48936331272125244,0.8525456292026897,0.8875,0.9254413650794524
0,80000,0.9454054594540546,0.3127445578575134,0.8651851851851852,0.3127445578575134,0.8546341463414634,0.876,0.9213706944185774
0,85000,0.9443055694430557,0.31547677516937256,0.8655280250180418,0.21403872966766357,0.8340287436254057,0.8995,0.9237103419372517
0,90000,0.9465053494650535,0.3857932686805725,0.8702401164200824,0.3761560022830963,0.8450306170513424,0.897,0.9258501989030058
0,95000,0.9453054694530547,0.3604514002799988,0.8669713735867213,0.29048818349838257,0.8354195642095503,0.901,0.9226658871253511
0,100000,0.9453054694530547,0.6748594045639038,0.8686288585786074,0.4552273154258728,0.8329508949059201,0.9075,0.9252677323330876
0,105000,0.9435056494350565,0.40062007308006287,0.8639551192145862,0.1210024282336235,0.8112379280070237,0.924,0.9237990563267019
0,110000,0.944905509449055,0.4197750985622406,0.8656429942418427,0.27975988388061523,0.8321033210332104,0.902,0.9247201058651281
0,115000,0.9464053594640536,0.4172205924987793,0.8698167791706846,0.2961992919445038,0.839851024208566,0.902,0.927117403879296
0,120000,0.9474052594740526,0.44686269760131836,0.8712047012732614,0.4383932948112488,0.8536468330134357,0.8895,0.9279628711835812
0,125000,0.945005499450055,0.4358792304992676,0.8655339805825243,0.28539055585861206,0.8410377358490566,0.8915,0.9268525722856882
0,130000,0.9462053794620537,0.21194982528686523,0.8703747911195989,0.16292141377925873,0.8328003654636821,0.9115,0.925512309638313
0,135000,0.9454054594540546,0.2292814701795578,0.8678621991505427,0.11477036774158478,0.82171581769437,0.9195,0.9268551457216524
0,140000,0.9482051794820517,0.31556186079978943,0.8758076094759513,0.26744428277015686,0.8398347865993575,0.915,0.9275073681003255
0,145000,0.9478052194780522,0.3485147953033447,0.8719556305763203,0.12995882332324982,0.8421052631578947,0.904,0.9278250006342896
0,150000,0.9483051694830517,0.32228657603263855,0.8726037369570493,0.21710461378097534,0.8477133427628477,0.899,0.9259328370035781
0,155000,0.9474052594740526,0.1903868019580841,0.8731307284129282,0.18298938870429993,0.8434296365330848,0.905,0.9261096325445609
0,160000,0.9473052694730527,0.5740681886672974,0.872194660996929,0.17134147882461548,0.8266905508284819,0.923,0.927973529121574
0,165000,0.9495050494950505,0.38968273997306824,0.87591956841589,0.34622055292129517,0.8594802694898941,0.893,0.9241440163389828
0,170000,0.9459054094590541,0.47478723526000977,0.8706669854171647,0.11328981816768646,0.8341731562070546,0.9105,0.9289979858500923
0,175000,0.9473052694730527,0.5903739929199219,0.8703747911195989,0.15506823360919952,0.8328003654636821,0.9115,0.9305074303915251
0,180000,0.9463053694630537,0.23235449194908142,0.8702585165498912,0.23235449194908142,0.841982234689107,0.9005,0.9291547676197442
0,185000,0.9478052194780522,0.174373060464859,0.8734852157052836,0.171615868806839,0.8476011288805269,0.901,0.9280170204346545
0,190000,0.949005099490051,0.5715193748474121,0.8747241971071341,0.5108739137649536,0.8581048581048581,0.892,0.9271410745170057
0,195000,0.9461053894610539,0.5194154977798462,0.8679334916864608,0.170893132686615,0.8266968325791855,0.9135,0.9271023702066649
0,200000,0.9468053194680532,0.3094758987426758,0.8707931277947754,0.11578939855098724,0.82258781680747,0.925,0.9290083868621436
0,205000,0.9461053894610539,0.6028298139572144,0.8679067577113257,0.13052904605865479,0.8202047174009791,0.9215,0.9276186176796931
0,210000,0.9459054094590541,0.49049288034439087,0.8694616484040019,0.16249723732471466,0.8303002729754322,0.9125,0.9285170114050436
ms-marco-electra-base/README.md
ADDED
@@ -0,0 +1,64 @@
---
license: apache-2.0
---
# Cross-Encoder for MS Marco

This model was trained on the [MS Marco Passage Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) task.

The model can be used for Information Retrieval: Given a query, encode the query with all possible passages (e.g. retrieved with ElasticSearch). Then sort the passages in decreasing order. See [SBERT.net Retrieve & Re-rank](https://www.sbert.net/examples/applications/retrieve_rerank/README.html) for more details. The training code is available here: [SBERT.net Training MS Marco](https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/ms_marco)


## Usage with Transformers

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('model_name')
tokenizer = AutoTokenizer.from_pretrained('model_name')

features = tokenizer(['How many people live in Berlin?', 'How many people live in Berlin?'], ['Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.', 'New York City is famous for the Metropolitan Museum of Art.'], padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    print(scores)
```


## Usage with SentenceTransformers

The usage becomes easier when you have [SentenceTransformers](https://www.sbert.net/) installed. Then, you can use the pre-trained models like this:
```python
from sentence_transformers import CrossEncoder
model = CrossEncoder('model_name', max_length=512)
scores = model.predict([('Query', 'Paragraph1'), ('Query', 'Paragraph2'), ('Query', 'Paragraph3')])
```


## Performance
In the following table, we provide various pre-trained Cross-Encoders together with their performance on the [TREC Deep Learning 2019](https://microsoft.github.io/TREC-2019-Deep-Learning/) and the [MS Marco Passage Reranking](https://github.com/microsoft/MSMARCO-Passage-Ranking/) dataset.


| Model-Name | NDCG@10 (TREC DL 19) | MRR@10 (MS Marco Dev) | Docs / Sec |
| ------------- |:-------------| -----| --- |
| **Version 2 models** | | |
| cross-encoder/ms-marco-TinyBERT-L-2-v2 | 69.84 | 32.56 | 9000
| cross-encoder/ms-marco-MiniLM-L-2-v2 | 71.01 | 34.85 | 4100
| cross-encoder/ms-marco-MiniLM-L-4-v2 | 73.04 | 37.70 | 2500
| cross-encoder/ms-marco-MiniLM-L-6-v2 | 74.30 | 39.01 | 1800
| cross-encoder/ms-marco-MiniLM-L-12-v2 | 74.31 | 39.02 | 960
| **Version 1 models** | | |
| cross-encoder/ms-marco-TinyBERT-L-2 | 67.43 | 30.15 | 9000
| cross-encoder/ms-marco-TinyBERT-L-4 | 68.09 | 34.50 | 2900
| cross-encoder/ms-marco-TinyBERT-L-6 | 69.57 | 36.13 | 680
| cross-encoder/ms-marco-electra-base | 71.99 | 36.41 | 340
| **Other models** | | |
| nboost/pt-tinybert-msmarco | 63.63 | 28.80 | 2900
| nboost/pt-bert-base-uncased-msmarco | 70.94 | 34.75 | 340
| nboost/pt-bert-large-msmarco | 73.36 | 36.48 | 100
| Capreolus/electra-base-msmarco | 71.23 | 36.89 | 340
| amberoad/bert-multilingual-passage-reranking-msmarco | 68.40 | 35.54 | 330
| sebastian-hofstaetter/distilbert-cat-margin_mse-T2-msmarco | 72.82 | 37.88 | 720

Note: Runtime was computed on a V100 GPU.
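As a small follow-up to the SentenceTransformers snippet in the model card above, a sketch of how the raw scores can be turned into a passage ranking (the query, passages, and local model path are placeholders):

```python
import numpy as np
from sentence_transformers import CrossEncoder

# Path to this model directory (or any cross-encoder checkpoint).
model = CrossEncoder('ms-marco-electra-base', max_length=512)

query = 'How many people live in Berlin?'
passages = [
    'Berlin has a population of 3,520,031 registered inhabitants.',
    'New York City is famous for the Metropolitan Museum of Art.',
]

scores = model.predict([(query, passage) for passage in passages])
for idx in np.argsort(scores)[::-1]:  # highest score first
    print(f'{scores[idx]:.3f}  {passages[idx]}')
```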
ms-marco-electra-base/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
ms-marco-electra-base/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c554473d61458bf2969566b1bb464eb280ef7de9cacb6ec787b4fe7f0a9a80d9
size 438022601
ms-marco-electra-base/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
ms-marco-electra-base/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "google/electra-base-discriminator"}
ms-marco-electra-base/vocab.txt
ADDED
The diff for this file is too large to render.
packages.txt
ADDED
@@ -0,0 +1 @@
poppler-utils
requirements.txt
ADDED
@@ -0,0 +1,13 @@
torch==1.10.0
torchvision
opencv-python
timm
pdfminer.six
pdf2image
pypdf2
spacy
pytesseract
transformers==4.20
sentence-transformers
https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/detectron2-0.6%2Bcpu-cp38-cp38-linux_x86_64.whl
gradio