Spaces:

OFA-Sys
/

OFA-OCR

Runtime error

File size: 8,352 Bytes

ee21b96

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from functools import lru_cache


def convert_sentence_to_json(sentence):
    if "_" in sentence:
        prefix, rest = sentence.split("_", 1)
        query, rest = rest.split("_", 1)
        query_index = len(prefix.rstrip().split(" "))
    else:
        query, query_index = None, None

    prefix, rest = sentence.split("[", 1)
    pronoun, rest = rest.split("]", 1)
    pronoun_index = len(prefix.rstrip().split(" "))

    sentence = sentence.replace("_", "").replace("[", "").replace("]", "")

    return {
        "idx": 0,
        "text": sentence,
        "target": {
            "span1_index": query_index,
            "span1_text": query,
            "span2_index": pronoun_index,
            "span2_text": pronoun,
        },
    }


def extended_noun_chunks(sentence):
    noun_chunks = {(np.start, np.end) for np in sentence.noun_chunks}
    np_start, cur_np = 0, "NONE"
    for i, token in enumerate(sentence):
        np_type = token.pos_ if token.pos_ in {"NOUN", "PROPN"} else "NONE"
        if np_type != cur_np:
            if cur_np != "NONE":
                noun_chunks.add((np_start, i))
            if np_type != "NONE":
                np_start = i
            cur_np = np_type
    if cur_np != "NONE":
        noun_chunks.add((np_start, len(sentence)))
    return [sentence[s:e] for (s, e) in sorted(noun_chunks)]


def find_token(sentence, start_pos):
    found_tok = None
    for tok in sentence:
        if tok.idx == start_pos:
            found_tok = tok
            break
    return found_tok


def find_span(sentence, search_text, start=0):
    search_text = search_text.lower()
    for tok in sentence[start:]:
        remainder = sentence[tok.i :].text.lower()
        if remainder.startswith(search_text):
            len_to_consume = len(search_text)
            start_idx = tok.idx
            for next_tok in sentence[tok.i :]:
                end_idx = next_tok.idx + len(next_tok.text)
                if end_idx - start_idx == len_to_consume:
                    span = sentence[tok.i : next_tok.i + 1]
                    return span
    return None


@lru_cache(maxsize=1)
def get_detokenizer():
    from sacremoses import MosesDetokenizer

    detok = MosesDetokenizer(lang="en")
    return detok


@lru_cache(maxsize=1)
def get_spacy_nlp():
    import en_core_web_lg

    nlp = en_core_web_lg.load()
    return nlp


def jsonl_iterator(input_fname, positive_only=False, ngram_order=3, eval=False):
    detok = get_detokenizer()
    nlp = get_spacy_nlp()

    with open(input_fname) as fin:
        for line in fin:
            sample = json.loads(line.strip())

            if positive_only and "label" in sample and not sample["label"]:
                # only consider examples where the query is correct
                continue

            target = sample["target"]

            # clean up the query
            query = target["span1_text"]
            if query is not None:
                if "\n" in query:
                    continue
                if query.endswith(".") or query.endswith(","):
                    query = query[:-1]

            # split tokens
            tokens = sample["text"].split(" ")

            def strip_pronoun(x):
                return x.rstrip('.,"')

            # find the pronoun
            pronoun_idx = target["span2_index"]
            pronoun = strip_pronoun(target["span2_text"])
            if strip_pronoun(tokens[pronoun_idx]) != pronoun:
                # hack: sometimes the index is misaligned
                if strip_pronoun(tokens[pronoun_idx + 1]) == pronoun:
                    pronoun_idx += 1
                else:
                    raise Exception("Misaligned pronoun!")
            assert strip_pronoun(tokens[pronoun_idx]) == pronoun

            # split tokens before and after the pronoun
            before = tokens[:pronoun_idx]
            after = tokens[pronoun_idx + 1 :]

            # the GPT BPE attaches leading spaces to tokens, so we keep track
            # of whether we need spaces before or after the pronoun
            leading_space = " " if pronoun_idx > 0 else ""
            trailing_space = " " if len(after) > 0 else ""

            # detokenize
            before = detok.detokenize(before, return_str=True)
            pronoun = detok.detokenize([pronoun], return_str=True)
            after = detok.detokenize(after, return_str=True)

            # hack: when the pronoun ends in a period (or comma), move the
            # punctuation to the "after" part
            if pronoun.endswith(".") or pronoun.endswith(","):
                after = pronoun[-1] + trailing_space + after
                pronoun = pronoun[:-1]

            # hack: when the "after" part begins with a comma or period, remove
            # the trailing space
            if after.startswith(".") or after.startswith(","):
                trailing_space = ""

            # parse sentence with spacy
            sentence = nlp(before + leading_space + pronoun + trailing_space + after)

            # find pronoun span
            start = len(before + leading_space)
            first_pronoun_tok = find_token(sentence, start_pos=start)
            pronoun_span = find_span(sentence, pronoun, start=first_pronoun_tok.i)
            assert pronoun_span.text == pronoun

            if eval:
                # convert to format where pronoun is surrounded by "[]" and
                # query is surrounded by "_"
                query_span = find_span(sentence, query)
                query_with_ws = "_{}_{}".format(
                    query_span.text,
                    (" " if query_span.text_with_ws.endswith(" ") else ""),
                )
                pronoun_with_ws = "[{}]{}".format(
                    pronoun_span.text,
                    (" " if pronoun_span.text_with_ws.endswith(" ") else ""),
                )
                if query_span.start < pronoun_span.start:
                    first = (query_span, query_with_ws)
                    second = (pronoun_span, pronoun_with_ws)
                else:
                    first = (pronoun_span, pronoun_with_ws)
                    second = (query_span, query_with_ws)
                sentence = (
                    sentence[: first[0].start].text_with_ws
                    + first[1]
                    + sentence[first[0].end : second[0].start].text_with_ws
                    + second[1]
                    + sentence[second[0].end :].text
                )
                yield sentence, sample.get("label", None)
            else:
                yield sentence, pronoun_span, query, sample.get("label", None)


def winogrande_jsonl_iterator(input_fname, eval=False):
    with open(input_fname) as fin:
        for line in fin:
            sample = json.loads(line.strip())
            sentence, option1, option2 = (
                sample["sentence"],
                sample["option1"],
                sample["option2"],
            )

            pronoun_span = (sentence.index("_"), sentence.index("_") + 1)

            if eval:
                query, cand = option1, option2
            else:
                query = option1 if sample["answer"] == "1" else option2
                cand = option2 if sample["answer"] == "1" else option1
            yield sentence, pronoun_span, query, cand


def filter_noun_chunks(
    chunks, exclude_pronouns=False, exclude_query=None, exact_match=False
):
    if exclude_pronouns:
        chunks = [
            np
            for np in chunks
            if (np.lemma_ != "-PRON-" and not all(tok.pos_ == "PRON" for tok in np))
        ]

    if exclude_query is not None:
        excl_txt = [exclude_query.lower()]
        filtered_chunks = []
        for chunk in chunks:
            lower_chunk = chunk.text.lower()
            found = False
            for excl in excl_txt:
                if (
                    not exact_match and (lower_chunk in excl or excl in lower_chunk)
                ) or lower_chunk == excl:
                    found = True
                    break
            if not found:
                filtered_chunks.append(chunk)
        chunks = filtered_chunks

    return chunks