xusenlin committed
Commit 26443e3
1 Parent(s): 738a065

Upload 9 files
added_tokens.json CHANGED
@@ -1,3 +1,3 @@
-{
-  "[UNK]": 39979
-}
+{
+  "[UNK]": 39979
+}
config.json CHANGED
@@ -1,30 +1,30 @@
-{
-  "_name_or_path": "uie_base_pytorch",
-  "architectures": [
-    "UIEModel"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "auto_map": {
-    "AutoModel": "modeling_uie.UIEModel"
-  },
-  "classifier_dropout": null,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 2048,
-  "model_type": "ernie",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
-  "task_type_vocab_size": 3,
-  "torch_dtype": "float32",
-  "transformers_version": "4.39.1",
-  "type_vocab_size": 4,
-  "use_cache": true,
-  "use_task_id": true,
-  "vocab_size": 40000
-}
+{
+  "_name_or_path": "uie_base_pytorch",
+  "architectures": [
+    "UIEModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoModel": "modeling_uie.UIEModel"
+  },
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 2048,
+  "model_type": "ernie",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "task_type_vocab_size": 3,
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.2",
+  "type_vocab_size": 4,
+  "use_cache": true,
+  "use_task_id": true,
+  "vocab_size": 40000
+}
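
Note: the `auto_map` entry in this config registers the custom class shipped with the repo, so loading goes through Transformers' remote-code path. A minimal loading sketch (the repo id below is a placeholder, not part of the commit):

    from transformers import AutoModel, AutoTokenizer

    repo_id = "namespace/uie-base"  # placeholder: substitute this repo's actual id
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    # trust_remote_code=True lets AutoModel resolve "modeling_uie.UIEModel" from auto_map
    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)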
decode_utils.py CHANGED
@@ -1,571 +1,697 @@
-import math
-import re
-from typing import (
-    List,
-    Union,
-    Any,
-    Optional,
-)
-
-import numpy as np
-import torch
-import torch.nn as nn
-from tqdm import tqdm
-from transformers import PreTrainedTokenizer
-
-
-def get_id_and_prob(spans, offset_map):
-    prompt_length = 0
-    for i in range(1, len(offset_map)):
-        if offset_map[i] != [0, 0]:
-            prompt_length += 1
-        else:
-            break
-
-    for i in range(1, prompt_length + 1):
-        offset_map[i][0] -= (prompt_length + 1)
-        offset_map[i][1] -= (prompt_length + 1)
-
-    sentence_id = []
-    prob = []
-    for start, end in spans:
-        prob.append(start[1] * end[1])
-        sentence_id.append(
-            (offset_map[start[0]][0], offset_map[end[0]][1]))
-    return sentence_id, prob
-
-
-def get_span(start_ids, end_ids, with_prob=False):
-    """
-    Get span set from position start and end list.
-    Args:
-        start_ids (List[int]/List[tuple]): The start index list.
-        end_ids (List[int]/List[tuple]): The end index list.
-        with_prob (bool): If True, each element for start_ids and end_ids is a tuple like: (index, probability).
-    Returns:
-        set: The span set without overlapping, every id can only be used once.
-    """
-    if with_prob:
-        start_ids = sorted(start_ids, key=lambda x: x[0])
-        end_ids = sorted(end_ids, key=lambda x: x[0])
-    else:
-        start_ids = sorted(start_ids)
-        end_ids = sorted(end_ids)
-
-    start_pointer = 0
-    end_pointer = 0
-    len_start = len(start_ids)
-    len_end = len(end_ids)
-    couple_dict = {}
-
-    # Pair the start/end token ids of each span (nearest match, assuming no overlapping spans)
-    while start_pointer < len_start and end_pointer < len_end:
-        if with_prob:
-            start_id = start_ids[start_pointer][0]
-            end_id = end_ids[end_pointer][0]
-        else:
-            start_id = start_ids[start_pointer]
-            end_id = end_ids[end_pointer]
-
-        if start_id == end_id:
-            couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
-            start_pointer += 1
-            end_pointer += 1
-            continue
-
-        if start_id < end_id:
-            couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
-            start_pointer += 1
-            continue
-
-        if start_id > end_id:
-            end_pointer += 1
-            continue
-
-    result = [(couple_dict[end], end) for end in couple_dict]
-    result = set(result)
-    return result
-
-
-def get_bool_ids_greater_than(probs, limit=0.5, return_prob=False):
-    """
-    Get idx of the last dimension in probability arrays, which is greater than a limitation.
-    Args:
-        probs (List[List[float]]): The input probability arrays.
-        limit (float): The limitation for probability.
-        return_prob (bool): Whether to return the probability
-    Returns:
-        List[List[int]]: The indices of the last dimension that meet the conditions.
-    """
-    probs = np.array(probs)
-    dim_len = len(probs.shape)
-    if dim_len > 1:
-        result = []
-        for p in probs:
-            result.append(get_bool_ids_greater_than(p, limit, return_prob))
-        return result
-    else:
-        result = []
-        for i, p in enumerate(probs):
-            if p > limit:
-                if return_prob:
-                    result.append((i, p))
-                else:
-                    result.append(i)
-        return result
-
-
-def dbc2sbc(s):
-    rs = ""
-    for char in s:
-        code = ord(char)
-        if code == 0x3000:
-            code = 0x0020
-        else:
-            code -= 0xfee0
-        if not (0x0021 <= code <= 0x7e):
-            rs += char
-            continue
-        rs += chr(code)
-    return rs
-
-
-def cut_chinese_sent(para):
-    """
-    Cut the Chinese sentences more precisely, reference to
-    "https://blog.csdn.net/blmoistawinde/article/details/82379256".
-    """
-    para = re.sub(r'([。!?\?])([^”’])', r'\1\n\2', para)
-    para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
-    para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)
-    para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
-    para = para.rstrip()
-    return para.split("\n")
-
-
-def auto_splitter(input_texts, max_text_len, split_sentence=False):
-    """
-    Split the raw texts automatically for model inference.
-    Args:
-        input_texts (List[str]): input raw texts.
-        max_text_len (int): cutting length.
-        split_sentence (bool): If True, sentence-level split will be performed.
-    return:
-        short_input_texts (List[str]): the short input texts for model inference.
-        input_mapping (dict): mapping between raw text and short input texts.
-    """
-    input_mapping = {}
-    short_input_texts = []
-    cnt_short = 0
-    for cnt_org, text in enumerate(input_texts):
-        sens = cut_chinese_sent(text) if split_sentence else [text]
-        for sen in sens:
-            lens = len(sen)
-            if lens <= max_text_len:
-                short_input_texts.append(sen)
-                if cnt_org in input_mapping:
-                    input_mapping[cnt_org].append(cnt_short)
-                else:
-                    input_mapping[cnt_org] = [cnt_short]
-                cnt_short += 1
-            else:
-                temp_text_list = [sen[i: i + max_text_len] for i in range(0, lens, max_text_len)]
-
-                short_input_texts.extend(temp_text_list)
-                short_idx = cnt_short
-                cnt_short += math.ceil(lens / max_text_len)
-                temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
-                if cnt_org in input_mapping:
-                    input_mapping[cnt_org].extend(temp_text_id)
-                else:
-                    input_mapping[cnt_org] = temp_text_id
-    return short_input_texts, input_mapping
-
-
-class UIEDecoder(nn.Module):
-
-    keys_to_ignore_on_gpu = ["offset_mapping", "texts"]
-
-    @torch.inference_mode()
-    def predict(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        texts: Union[List[str], str],
-        schema: Optional[Any] = None,
-        batch_size: int = 64,
-        max_length: int = 512,
-        split_sentence: bool = False,
-        position_prob: float = 0.5,
-        is_english: bool = False,
-        disable_tqdm: bool = True,
-    ) -> List[Any]:
-        self.eval()
-        self.tokenizer = tokenizer
-        self.is_english = is_english
-        if schema is not None:
-            self.set_schema(schema)
-
-        texts = texts
-        if isinstance(texts, str):
-            texts = [texts]
-        return self._multi_stage_predict(
-            texts, batch_size, max_length, split_sentence, position_prob, disable_tqdm
-        )
-
-    def set_schema(self, schema):
-        if isinstance(schema, (dict, str)):
-            schema = [schema]
-        self._schema_tree = self._build_tree(schema)
-
-    def _multi_stage_predict(
-        self,
-        texts: List[str],
-        batch_size: int = 64,
-        max_length: int = 512,
-        split_sentence: bool = False,
-        position_prob: float = 0.5,
-        disable_tqdm: bool = True,
-    ) -> List[Any]:
-        """ Traverse the schema tree and do multi-stage prediction. """
-        results = [{} for _ in range(len(texts))]
-        if len(texts) < 1 or self._schema_tree is None:
-            return results
-
-        schema_list = self._schema_tree.children[:]
-        while len(schema_list) > 0:
-            node = schema_list.pop(0)
-            examples = []
-            input_map = {}
-            cnt = 0
-            idx = 0
-            if not node.prefix:
-                for data in texts:
-                    examples.append({"text": data, "prompt": dbc2sbc(node.name)})
-                    input_map[cnt] = [idx]
-                    idx += 1
-                    cnt += 1
-            else:
-                for pre, data in zip(node.prefix, texts):
-                    if len(pre) == 0:
-                        input_map[cnt] = []
-                    else:
-                        for p in pre:
-                            if self.is_english:
-                                if re.search(r'\[.*?\]$', node.name):
-                                    prompt_prefix = node.name[:node.name.find("[", 1)].strip()
-                                    cls_options = re.search(r'\[.*?\]$', node.name).group()
-                                    # Sentiment classification of xxx [positive, negative]
-                                    prompt = prompt_prefix + p + " " + cls_options
-                                else:
-                                    prompt = node.name + p
-                            else:
-                                prompt = p + node.name
-                            examples.append(
-                                {
-                                    "text": data,
-                                    "prompt": dbc2sbc(prompt)
-                                }
-                            )
-                        input_map[cnt] = [i + idx for i in range(len(pre))]
-                        idx += len(pre)
-                    cnt += 1
-
-            result_list = self._single_stage_predict(
-                examples, batch_size, max_length, split_sentence, position_prob, disable_tqdm
-            ) if examples else []
-            if not node.parent_relations:
-                relations = [[] for _ in range(len(texts))]
-                for k, v in input_map.items():
-                    for idx in v:
-                        if len(result_list[idx]) == 0:
-                            continue
-                        if node.name not in results[k].keys():
-                            results[k][node.name] = result_list[idx]
-                        else:
-                            results[k][node.name].extend(result_list[idx])
-                    if node.name in results[k].keys():
-                        relations[k].extend(results[k][node.name])
-            else:
-                relations = node.parent_relations
-                for k, v in input_map.items():
-                    for i in range(len(v)):
-                        if len(result_list[v[i]]) == 0:
-                            continue
-                        if "relations" not in relations[k][i].keys():
-                            relations[k][i]["relations"] = {node.name: result_list[v[i]]}
-                        elif node.name not in relations[k][i]["relations"].keys():
-                            relations[k][i]["relations"][node.name] = result_list[v[i]]
-                        else:
-                            relations[k][i]["relations"][node.name].extend(result_list[v[i]])
-
-                new_relations = [[] for _ in range(len(texts))]
-                for i in range(len(relations)):
-                    for j in range(len(relations[i])):
-                        if "relations" in relations[i][j].keys() and node.name in relations[i][j]["relations"].keys():
-                            for k in range(len(relations[i][j]["relations"][node.name])):
-                                new_relations[i].append(relations[i][j]["relations"][node.name][k])
-                relations = new_relations
-
-            prefix = [[] for _ in range(len(texts))]
-            for k, v in input_map.items():
-                for idx in v:
-                    for i in range(len(result_list[idx])):
-                        if self.is_english:
-                            prefix[k].append(" of " + result_list[idx][i]["text"])
-                        else:
-                            prefix[k].append(result_list[idx][i]["text"] + "的")
-
-            for child in node.children:
-                child.prefix = prefix
-                child.parent_relations = relations
-                schema_list.append(child)
-
-        return results
-
-    def _convert_ids_to_results(self, examples, sentence_ids, probs):
-        """ Convert ids to raw text in a single stage. """
-        results = []
-        for example, sentence_id, prob in zip(examples, sentence_ids, probs):
-            if len(sentence_id) == 0:
-                results.append([])
-                continue
-            result_list = []
-            text = example["text"]
-            prompt = example["prompt"]
-            for i in range(len(sentence_id)):
-                start, end = sentence_id[i]
-                if start < 0 and end >= 0:
-                    continue
-                if end < 0:
-                    start += len(prompt) + 1
-                    end += len(prompt) + 1
-                    result = {"text": prompt[start: end], "probability": prob[i]}
-                else:
-                    result = {"text": text[start: end], "start": start, "end": end, "probability": prob[i]}
-
-                result_list.append(result)
-            results.append(result_list)
-        return results
-
-    def _auto_splitter(self, input_texts, max_text_len, split_sentence=False):
-        """
-        Split the raw texts automatically for model inference.
-        Args:
-            input_texts (List[str]): input raw texts.
-            max_text_len (int): cutting length.
-            split_sentence (bool): If True, sentence-level split will be performed.
-        return:
-            short_input_texts (List[str]): the short input texts for model inference.
-            input_mapping (dict): mapping between raw text and short input texts.
-        """
-        input_mapping = {}
-        short_input_texts = []
-        cnt_short = 0
-        for cnt_org, text in enumerate(input_texts):
-            sens = cut_chinese_sent(text) if split_sentence else [text]
-            for sen in sens:
-                lens = len(sen)
-                if lens <= max_text_len:
-                    short_input_texts.append(sen)
-                    if cnt_org in input_mapping:
-                        input_mapping[cnt_org].append(cnt_short)
-                    else:
-                        input_mapping[cnt_org] = [cnt_short]
-                    cnt_short += 1
-                else:
-                    temp_text_list = [sen[i: i + max_text_len] for i in range(0, lens, max_text_len)]
-
-                    short_input_texts.extend(temp_text_list)
-                    short_idx = cnt_short
-                    cnt_short += math.ceil(lens / max_text_len)
-                    temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
-                    if cnt_org in input_mapping:
-                        input_mapping[cnt_org].extend(temp_text_id)
-                    else:
-                        input_mapping[cnt_org] = temp_text_id
-        return short_input_texts, input_mapping
-
-    def _single_stage_predict(
-        self,
-        inputs: List[dict],
-        batch_size: int = 64,
-        max_length: int = 512,
-        split_sentence: bool = False,
-        position_prob: float = 0.5,
-        disable_tqdm: bool = True,
-    ):
-        input_texts = []
-        prompts = []
-        for i in range(len(inputs)):
-            input_texts.append(inputs[i]["text"])
-            prompts.append(inputs[i]["prompt"])
-        # max predict length should exclude the length of prompt and summary tokens
-        max_predict_len = max_length - len(max(prompts)) - 3
-
-        short_input_texts, input_mapping = self._auto_splitter(
-            input_texts, max_predict_len, split_sentence=split_sentence
-        )
-
-        short_texts_prompts = []
-        for k, v in input_mapping.items():
-            short_texts_prompts.extend([prompts[k] for _ in range(len(v))])
-        short_inputs = [
-            {
-                "text": short_input_texts[i],
-                "prompt": short_texts_prompts[i]
-            }
-            for i in range(len(short_input_texts))
-        ]
-
-        encoded_inputs = self.tokenizer(
-            text=short_texts_prompts,
-            text_pair=short_input_texts,
-            stride=2,
-            truncation=True,
-            max_length=max_length,
-            padding="longest",
-            add_special_tokens=True,
-            return_offsets_mapping=True,
-            return_tensors="np")
-        offset_maps = encoded_inputs["offset_mapping"]
-
-        start_prob_concat, end_prob_concat = [], []
-        if disable_tqdm:
-            batch_iterator = range(0, len(short_input_texts), batch_size)
-        else:
-            batch_iterator = tqdm(range(0, len(short_input_texts), batch_size), desc="Predicting", unit="batch")
-        for batch_start in batch_iterator:
-            batch = {
-                key:
-                np.array(value[batch_start: batch_start + batch_size], dtype="int64")
-                for key, value in encoded_inputs.items() if key not in self.keys_to_ignore_on_gpu
-            }
-
-            for k, v in batch.items():
-                batch[k] = torch.tensor(v, device=self.device)
-
-            outputs = self(**batch)
-            start_prob, end_prob = outputs[0], outputs[1]
-            if self.device != torch.device("cpu"):
-                start_prob, end_prob = start_prob.cpu(), end_prob.cpu()
-            start_prob_concat.append(start_prob.detach().numpy())
-            end_prob_concat.append(end_prob.detach().numpy())
-
-        start_prob_concat = np.concatenate(start_prob_concat)
-        end_prob_concat = np.concatenate(end_prob_concat)
-
-        start_ids_list = get_bool_ids_greater_than(start_prob_concat, limit=position_prob, return_prob=True)
-        end_ids_list = get_bool_ids_greater_than(end_prob_concat, limit=position_prob, return_prob=True)
-
-        input_ids = encoded_inputs['input_ids'].tolist()
-        sentence_ids, probs = [], []
-        for start_ids, end_ids, ids, offset_map in zip(start_ids_list, end_ids_list, input_ids, offset_maps):
-            span_list = get_span(start_ids, end_ids, with_prob=True)
-            sentence_id, prob = get_id_and_prob(span_list, offset_map.tolist())
-            sentence_ids.append(sentence_id)
-            probs.append(prob)
-
-        results = self._convert_ids_to_results(short_inputs, sentence_ids, probs)
-        results = self._auto_joiner(results, short_input_texts, input_mapping)
-        return results
-
-    def _auto_joiner(self, short_results, short_inputs, input_mapping):
-        concat_results = []
-        is_cls_task = False
-        for short_result in short_results:
-            if not short_result:
-                continue
-            elif 'start' not in short_result[0].keys() and 'end' not in short_result[0].keys():
-                is_cls_task = True
-                break
-            else:
-                break
-        for k, vs in input_mapping.items():
-            single_results = []
-            if is_cls_task:
-                cls_options = {}
-                for v in vs:
-                    if len(short_results[v]) == 0:
-                        continue
-                    if short_results[v][0]['text'] in cls_options:
-                        cls_options[short_results[v][0]["text"]][0] += 1
-                        cls_options[short_results[v][0]["text"]][1] += short_results[v][0]["probability"]
-
-                    else:
-                        cls_options[short_results[v][0]["text"]] = [1, short_results[v][0]["probability"]]
-
-                if cls_options:
-                    cls_res, cls_info = max(cls_options.items(), key=lambda x: x[1])
-                    concat_results.append(
-                        [
-                            {"text": cls_res, "probability": cls_info[1] / cls_info[0]}
-                        ]
-                    )
-
-                else:
-                    concat_results.append([])
-            else:
-                offset = 0
-                for v in vs:
-                    if v == 0:
-                        single_results = short_results[v]
-                        offset += len(short_inputs[v])
-                    else:
-                        for i in range(len(short_results[v])):
-                            if 'start' not in short_results[v][i] or 'end' not in short_results[v][i]:
-                                continue
-                            short_results[v][i]["start"] += offset
-                            short_results[v][i]["end"] += offset
-                        offset += len(short_inputs[v])
-                        single_results.extend(short_results[v])
-                concat_results.append(single_results)
-        return concat_results
-
-    @classmethod
-    def _build_tree(cls, schema, name='root'):
-        """
-        Build the schema tree.
-        """
-        schema_tree = SchemaTree(name)
-        for s in schema:
-            if isinstance(s, str):
-                schema_tree.add_child(SchemaTree(s))
-            elif isinstance(s, dict):
-                for k, v in s.items():
-                    if isinstance(v, str):
-                        child = [v]
-                    elif isinstance(v, list):
-                        child = v
-                    else:
-                        raise TypeError(
-                            f"Invalid schema, value for each key:value pairs should be list or string"
-                            f"but {type(v)} received")
-                    schema_tree.add_child(cls._build_tree(child, name=k))
-            else:
-                raise TypeError(f"Invalid schema, element should be string or dict, but {type(s)} received")
-
-        return schema_tree
-
-
-class SchemaTree(object):
-    """
-    Implementation of SchemaTree
-    """
-
-    def __init__(self, name='root', children=None):
-        self.name = name
-        self.children = []
-        self.prefix = None
-        self.parent_relations = None
-        if children is not None:
-            for child in children:
-                self.add_child(child)
-
-    def __repr__(self):
-        return self.name
-
-    def add_child(self, node):
-        assert isinstance(
-            node, SchemaTree
-        ), "The children of a node should be an instance of SchemaTree."
-        self.children.append(node)
+import logging
+import math
+import os
+import queue
+import re
+from multiprocessing import Queue
+from typing import (
+    List,
+    Tuple,
+    Union,
+    Dict,
+    Any,
+    Set,
+    TYPE_CHECKING,
+    Optional,
+    Literal,
+)
+
+import numpy as np
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from tqdm import tqdm
+from transformers import is_torch_npu_available
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+
+os.environ["PYTHONWARNINGS"] = "ignore"
+logger = logging.getLogger("FASTIE")
+
+
+def get_id_and_prob(spans, offset_map):
+    prompt_length = 0
+    for i in range(1, len(offset_map)):
+        if offset_map[i] != [0, 0]:
+            prompt_length += 1
+        else:
+            break
+
+    for i in range(1, prompt_length + 1):
+        offset_map[i][0] -= (prompt_length + 1)
+        offset_map[i][1] -= (prompt_length + 1)
+
+    sentence_id = []
+    prob = []
+    for start, end in spans:
+        prob.append(float(start[1] * end[1]))
+        sentence_id.append(
+            (offset_map[start[0]][0], offset_map[end[0]][1]))
+    return sentence_id, prob
+
+
+def get_span(
+    start_ids: Union[List[int], List[Tuple[int, float]]],
+    end_ids: Union[List[int], List[Tuple[int, float]]],
+    with_prob: bool = False
+) -> Set[Tuple[int, int]]:
+    """
+    Get span set from position start and end list.
+    Args:
+        start_ids (List[int]/List[tuple]): The start index list.
+        end_ids (List[int]/List[tuple]): The end index list.
+        with_prob (bool): If True, each element for start_ids and end_ids is a tuple like: (index, probability).
+    Returns:
+        set: The span set without overlapping, every id can only be used once.
+    """
+    if with_prob:
+        start_ids = sorted(start_ids, key=lambda x: x[0])
+        end_ids = sorted(end_ids, key=lambda x: x[0])
+    else:
+        start_ids = sorted(start_ids)
+        end_ids = sorted(end_ids)
+
+    start_pointer = 0
+    end_pointer = 0
+    len_start = len(start_ids)
+    len_end = len(end_ids)
+    couple_dict = {}
+
+    # Pair the start/end token ids of each span (nearest match, assuming no overlapping spans)
+    while start_pointer < len_start and end_pointer < len_end:
+        if with_prob:
+            start_id = start_ids[start_pointer][0]
+            end_id = end_ids[end_pointer][0]
+        else:
+            start_id = start_ids[start_pointer]
+            end_id = end_ids[end_pointer]
+
+        if start_id == end_id:
+            couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
+            start_pointer += 1
+            end_pointer += 1
+            continue
+
+        if start_id < end_id:
+            couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
+            start_pointer += 1
+            continue
+
+        if start_id > end_id:
+            end_pointer += 1
+            continue
+
+    result = [(couple_dict[end], end) for end in couple_dict]
+    result = set(result)
+    return result
+
+
+def get_bool_ids_greater_than(
+    probs: List[List[float]], limit: float = 0.5, return_prob: bool = False
+) -> List[List[int]]:
+    """
+    Get idx of the last dimension in probability arrays, which is greater than a limitation.
+    Args:
+        probs (List[List[float]]): The input probability arrays.
+        limit (float): The limitation for probability.
+        return_prob (bool): Whether to return the probability
+    Returns:
+        List[List[int]]: The indices of the last dimension that meet the conditions.
+    """
+    probs = np.array(probs)
+    dim_len = len(probs.shape)
+    if dim_len > 1:
+        result = []
+        for p in probs:
+            result.append(get_bool_ids_greater_than(p, limit, return_prob))
+        return result
+    else:
+        result = []
+        for i, p in enumerate(probs):
+            if p > limit:
+                if return_prob:
+                    result.append((i, p))
+                else:
+                    result.append(i)
+        return result
+
+
+def dbc2sbc(s) -> str:
+    rs = ""
+    for char in s:
+        code = ord(char)
+        if code == 0x3000:
+            code = 0x0020
+        else:
+            code -= 0xfee0
+        if not (0x0021 <= code <= 0x7e):
+            rs += char
+            continue
+        rs += chr(code)
+    return rs
+
+
+def cut_chinese_sent(para: str) -> List[str]:
+    """
+    Cut the Chinese sentences more precisely, reference to
+    "https://blog.csdn.net/blmoistawinde/article/details/82379256".
+    """
+    para = re.sub(r'([。!?\?])([^”’])', r'\1\n\2', para)
+    para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
+    para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)
+    para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
+    para = para.rstrip()
+    return para.split("\n")
+
+
+class UIEDecoder(nn.Module):
+
+    keys_to_ignore_on_gpu = ["offset_mapping", "texts"]
+
+    @torch.inference_mode()
+    def predict(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        texts: Union[List[str], str],
+        schema: Optional[Any] = None,
+        batch_size: int = 64,
+        max_length: int = 512,
+        split_sentence: bool = False,
+        position_prob: float = 0.5,
+        language: Optional[str] = "zh",
+        show_progress_bar: bool = None,
+        device: Optional[str] = None,
+    ) -> List[Any]:
+        self.eval()
+        self.is_english = False if language.lower() in ["zh", "zh-cn", "chinese"] else True
+        if schema is not None:
+            self.set_schema(schema)
+
+        if show_progress_bar is None:
+            show_progress_bar = (
+                logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
+            )
+        # Cast an individual text to a list with length 1
+        if isinstance(texts, str) or not hasattr(texts, "__len__"):
+            texts = [texts]
+
+        if device is None:
+            device = next(self.parameters()).device
+
+        self.to(device)
+
+        return self._multi_stage_predict(
+            tokenizer, texts, batch_size, max_length, split_sentence, position_prob, show_progress_bar
+        )
+
+    def set_schema(self, schema):
+        if isinstance(schema, (dict, str)):
+            schema = [schema]
+        self._schema_tree = self._build_tree(schema)
+
+    def _multi_stage_predict(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        texts: List[str],
+        batch_size: int = 64,
+        max_length: int = 512,
+        split_sentence: bool = False,
+        position_prob: float = 0.5,
+        show_progress_bar: bool = False,
+    ) -> List[Any]:
+        """ Traverse the schema tree and do multi-stage prediction. """
+        results = [{} for _ in range(len(texts))]
+        if len(texts) < 1 or self._schema_tree is None:
+            return results
+
+        schema_list = self._schema_tree.children[:]
+        while len(schema_list) > 0:
+            node = schema_list.pop(0)
+            examples = []
+            input_map = {}
+            cnt = 0
+            idx = 0
+            if not node.prefix:
+                for data in texts:
+                    examples.append({"text": data, "prompt": dbc2sbc(node.name)})
+                    input_map[cnt] = [idx]
+                    idx += 1
+                    cnt += 1
+            else:
+                for pre, data in zip(node.prefix, texts):
+                    if len(pre) == 0:
+                        input_map[cnt] = []
+                    else:
+                        for p in pre:
+                            if self.is_english:
+                                if re.search(r'\[.*?\]$', node.name):
+                                    prompt_prefix = node.name[:node.name.find("[", 1)].strip()
+                                    cls_options = re.search(r'\[.*?\]$', node.name).group()
+                                    # Sentiment classification of xxx [positive, negative]
+                                    prompt = prompt_prefix + p + " " + cls_options
+                                else:
+                                    prompt = node.name + p
+                            else:
+                                prompt = p + node.name
+                            examples.append(
+                                {
+                                    "text": data,
+                                    "prompt": dbc2sbc(prompt)
+                                }
+                            )
+                        input_map[cnt] = [i + idx for i in range(len(pre))]
+                        idx += len(pre)
+                    cnt += 1
+
+            result_list = self._single_stage_predict(
+                tokenizer, examples, batch_size, max_length, split_sentence, position_prob, show_progress_bar
+            ) if examples else []
+            if not node.parent_relations:
+                relations = [[] for _ in range(len(texts))]
+                for k, v in input_map.items():
+                    for idx in v:
+                        if len(result_list[idx]) == 0:
+                            continue
+                        if node.name not in results[k].keys():
+                            results[k][node.name] = result_list[idx]
+                        else:
+                            results[k][node.name].extend(result_list[idx])
+                    if node.name in results[k].keys():
+                        relations[k].extend(results[k][node.name])
+            else:
+                relations = node.parent_relations
+                for k, v in input_map.items():
+                    for i in range(len(v)):
+                        if len(result_list[v[i]]) == 0:
+                            continue
+                        if "relations" not in relations[k][i].keys():
+                            relations[k][i]["relations"] = {node.name: result_list[v[i]]}
+                        elif node.name not in relations[k][i]["relations"].keys():
+                            relations[k][i]["relations"][node.name] = result_list[v[i]]
+                        else:
+                            relations[k][i]["relations"][node.name].extend(result_list[v[i]])
+
+                new_relations = [[] for _ in range(len(texts))]
+                for i in range(len(relations)):
+                    for j in range(len(relations[i])):
+                        if "relations" in relations[i][j].keys() and node.name in relations[i][j]["relations"].keys():
+                            for k in range(len(relations[i][j]["relations"][node.name])):
+                                new_relations[i].append(relations[i][j]["relations"][node.name][k])
+                relations = new_relations
+
+            prefix = [[] for _ in range(len(texts))]
+            for k, v in input_map.items():
+                for idx in v:
+                    for i in range(len(result_list[idx])):
+                        if self.is_english:
+                            prefix[k].append(" of " + result_list[idx][i]["text"])
+                        else:
+                            prefix[k].append(result_list[idx][i]["text"] + "的")
+
+            for child in node.children:
+                child.prefix = prefix
+                child.parent_relations = relations
+                schema_list.append(child)
+
+        return results
+
+    def _convert_ids_to_results(self, examples, sentence_ids, probs):
+        """ Convert ids to raw text in a single stage. """
+        results = []
+        for example, sentence_id, prob in zip(examples, sentence_ids, probs):
+            if len(sentence_id) == 0:
+                results.append([])
+                continue
+            result_list = []
+            text = example["text"]
+            prompt = example["prompt"]
+            for i in range(len(sentence_id)):
+                start, end = sentence_id[i]
+                if start < 0 and end >= 0:
+                    continue
+                if end < 0:
+                    start += len(prompt) + 1
+                    end += len(prompt) + 1
+                    result = {"text": prompt[start: end], "probability": float(prob[i])}
+                else:
+                    result = {"text": text[start: end], "start": start, "end": end, "probability": float(prob[i])}
+
+                result_list.append(result)
+            results.append(result_list)
+        return results
+
+    def _auto_splitter(self, input_texts, max_text_len, split_sentence=False):
+        """
+        Split the raw texts automatically for model inference.
+        Args:
+            input_texts (List[str]): input raw texts.
+            max_text_len (int): cutting length.
+            split_sentence (bool): If True, sentence-level split will be performed.
+        return:
+            short_input_texts (List[str]): the short input texts for model inference.
+            input_mapping (dict): mapping between raw text and short input texts.
+        """
+        input_mapping = {}
+        short_input_texts = []
+        cnt_short = 0
+        for cnt_org, text in enumerate(input_texts):
+            sens = cut_chinese_sent(text) if split_sentence else [text]
+            for sen in sens:
+                lens = len(sen)
+                if lens <= max_text_len:
+                    short_input_texts.append(sen)
+                    if cnt_org in input_mapping:
+                        input_mapping[cnt_org].append(cnt_short)
+                    else:
+                        input_mapping[cnt_org] = [cnt_short]
+                    cnt_short += 1
+                else:
+                    temp_text_list = [sen[i: i + max_text_len] for i in range(0, lens, max_text_len)]
+
+                    short_input_texts.extend(temp_text_list)
+                    short_idx = cnt_short
+                    cnt_short += math.ceil(lens / max_text_len)
+                    temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
+                    if cnt_org in input_mapping:
+                        input_mapping[cnt_org].extend(temp_text_id)
+                    else:
+                        input_mapping[cnt_org] = temp_text_id
+        return short_input_texts, input_mapping
+
+    def _single_stage_predict(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        inputs: List[dict],
+        batch_size: int = 64,
+        max_length: int = 512,
+        split_sentence: bool = False,
+        position_prob: float = 0.5,
+        show_progress_bar: bool = False,
+    ) -> List[Any]:
+        input_texts = []
+        prompts = []
+        for i in range(len(inputs)):
+            input_texts.append(inputs[i]["text"])
+            prompts.append(inputs[i]["prompt"])
+        # max predict length should exclude the length of prompt and summary tokens
+        max_predict_len = max_length - len(max(prompts)) - 3
+
+        short_input_texts, input_mapping = self._auto_splitter(
+            input_texts, max_predict_len, split_sentence=split_sentence
+        )
+
+        short_texts_prompts = []
+        for k, v in input_mapping.items():
+            short_texts_prompts.extend([prompts[k] for _ in range(len(v))])
+        short_inputs = [
+            {
+                "text": short_input_texts[i],
+                "prompt": short_texts_prompts[i]
+            }
+            for i in range(len(short_input_texts))
+        ]
+
+        encoded_inputs = tokenizer(
+            text=short_texts_prompts,
+            text_pair=short_input_texts,
+            stride=2,
+            truncation=True,
+            max_length=512,
+            padding="max_length",
+            add_special_tokens=True,
+            return_offsets_mapping=True,
+            return_tensors="np",
+        )
+        offset_maps = encoded_inputs["offset_mapping"]
+
+        start_prob_concat, end_prob_concat = [], []
+
+        batch_iterator = tqdm(range(0, len(short_input_texts), batch_size), desc="Batches", disable=not show_progress_bar)
+        for batch_start in batch_iterator:
+            batch = {
+                key:
+                np.array(value[batch_start: batch_start + batch_size], dtype="int64")
+                for key, value in encoded_inputs.items() if key not in self.keys_to_ignore_on_gpu
+            }
+
+            for k, v in batch.items():
+                batch[k] = torch.tensor(v, device=self.device)
+
+            outputs = self(**batch)
+            start_prob, end_prob = outputs[0], outputs[1]
+            if self.device != torch.device("cpu"):
+                start_prob, end_prob = start_prob.cpu(), end_prob.cpu()
+            start_prob_concat.append(start_prob.detach().numpy())
+            end_prob_concat.append(end_prob.detach().numpy())
+
+        start_prob_concat = np.concatenate(start_prob_concat)
+        end_prob_concat = np.concatenate(end_prob_concat)
+
+        start_ids_list = get_bool_ids_greater_than(start_prob_concat, limit=position_prob, return_prob=True)
+        end_ids_list = get_bool_ids_greater_than(end_prob_concat, limit=position_prob, return_prob=True)
+
+        input_ids = encoded_inputs["input_ids"].tolist()
+        sentence_ids, probs = [], []
+        for start_ids, end_ids, ids, offset_map in zip(start_ids_list, end_ids_list, input_ids, offset_maps):
+            span_list = get_span(start_ids, end_ids, with_prob=True)
+            sentence_id, prob = get_id_and_prob(span_list, offset_map.tolist())
+            sentence_ids.append(sentence_id)
+            probs.append(prob)
+
+        results = self._convert_ids_to_results(short_inputs, sentence_ids, probs)
+        results = self._auto_joiner(results, short_input_texts, input_mapping)
+        return results
+
+    def _auto_joiner(self, short_results, short_inputs, input_mapping):
+        concat_results = []
+        is_cls_task = False
+        for short_result in short_results:
+            if not short_result:
+                continue
+            elif 'start' not in short_result[0].keys() and 'end' not in short_result[0].keys():
+                is_cls_task = True
+                break
+            else:
+                break
+        for k, vs in input_mapping.items():
+            single_results = []
+            if is_cls_task:
+                cls_options = {}
+                for v in vs:
+                    if len(short_results[v]) == 0:
+                        continue
+                    if short_results[v][0]['text'] in cls_options:
+                        cls_options[short_results[v][0]["text"]][0] += 1
+                        cls_options[short_results[v][0]["text"]][1] += short_results[v][0]["probability"]
+
+                    else:
+                        cls_options[short_results[v][0]["text"]] = [1, short_results[v][0]["probability"]]
+
+                if cls_options:
+                    cls_res, cls_info = max(cls_options.items(), key=lambda x: x[1])
+                    concat_results.append(
+                        [
+                            {"text": cls_res, "probability": cls_info[1] / cls_info[0]}
+                        ]
+                    )
+
+                else:
+                    concat_results.append([])
+            else:
+                offset = 0
+                for v in vs:
+                    if v == 0:
+                        single_results = short_results[v]
+                        offset += len(short_inputs[v])
+                    else:
+                        for i in range(len(short_results[v])):
+                            if "start" not in short_results[v][i] or 'end' not in short_results[v][i]:
+                                continue
+                            short_results[v][i]["start"] += offset
+                            short_results[v][i]["end"] += offset
+                        offset += len(short_inputs[v])
+                        single_results.extend(short_results[v])
+                concat_results.append(single_results)
+        return concat_results
+
+    @classmethod
+    def _build_tree(cls, schema, name="root"):
+        """
+        Build the schema tree.
+        """
+        schema_tree = SchemaTree(name)
+        for s in schema:
+            if isinstance(s, str):
+                schema_tree.add_child(SchemaTree(s))
+            elif isinstance(s, dict):
+                for k, v in s.items():
+                    if isinstance(v, str):
+                        child = [v]
+                    elif isinstance(v, list):
+                        child = v
+                    else:
+                        raise TypeError(
+                            f"Invalid schema, value for each key:value pairs should be list or string"
+                            f"but {type(v)} received")
+                    schema_tree.add_child(cls._build_tree(child, name=k))
+            else:
+                raise TypeError(f"Invalid schema, element should be string or dict, but {type(s)} received")
+
+        return schema_tree
+
+    def start_multi_process_pool(self, target_devices: List[str] = None) -> Dict[
+        Literal["input", "output", "processes"], Any]:
+        """Start a multi-process pool and run prediction in several independent processes.
+        Recommended when predicting on multiple GPUs or CPUs; start only one process per GPU.
+
+        Args:
+            target_devices (List[str], optional): PyTorch target devices, e.g. ["cuda:0", "cuda:1", ...],
+                ["npu:0", "npu:1", ...], or ["cpu", "cpu", "cpu", "cpu"]. If target_devices is None and CUDA/NPU
+                is available, then all available CUDA/NPU devices will be used. If target_devices is None and
+                CUDA/NPU is not available, then 4 CPU devices will be used.
+
+        Returns:
+            Dict[str, Any]: A dictionary with the target processes, an input queue, and an output queue.
+        """
+        if target_devices is None:
+            if torch.cuda.is_available():
+                target_devices = ["cuda:{}".format(i) for i in range(torch.cuda.device_count())]
+            elif is_torch_npu_available():
+                target_devices = ["npu:{}".format(i) for i in range(torch.npu.device_count())]
+            else:
+                logger.info("CUDA/NPU is not available. Starting 4 CPU workers")
+                target_devices = ["cpu"] * 4
+
+        logger.info("Start multi-process pool on devices: {}".format(", ".join(map(str, target_devices))))
+
+        self.to("cpu")
+        self.share_memory()
+        ctx = mp.get_context("spawn")
+        input_queue = ctx.Queue()
+        output_queue = ctx.Queue()
+        processes = []
+
+        for device_id in target_devices:
+            p = ctx.Process(
+                target=UIEDecoder._predict_multi_process_worker,
+                args=(device_id, self, input_queue, output_queue),
+                daemon=True,
+            )
+            p.start()
+            processes.append(p)
+
+        return {"input": input_queue, "output": output_queue, "processes": processes}
+
+    @staticmethod
+    def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], Any]) -> None:
+        """
+        Stops all processes started with start_multi_process_pool.
+
+        Args:
+            pool (Dict[str, object]): A dictionary containing the input queue, output queue, and process list.
+
+        Returns:
+            None
+        """
+        for p in pool["processes"]:
+            p.terminate()
+
+        for p in pool["processes"]:
+            p.join()
+            p.close()
+
+        pool["input"].close()
+        pool["output"].close()
+
+    def predict_multi_process(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        texts: List[str],
+        pool: Dict[Literal["input", "output", "processes"], Any],
+        batch_size: int = 64,
+        max_length: int = 512,
+        split_sentence: bool = False,
+        language: Optional[str] = "zh",
+        position_prob: float = 0.5,
+        chunk_size: Optional[int] = None,
+    ) -> List[Any]:
+        if chunk_size is None:
+            chunk_size = min(math.ceil(len(texts) / len(pool["processes"]) / 10), 5000)
+
+        logger.debug(f"Chunk data into {math.ceil(len(texts) / chunk_size)} packages of size {chunk_size}")
+
+        input_queue = pool["input"]
+        last_chunk_id = 0
+        chunk = []
+
+        for text in texts:
+            chunk.append(text)
+            if len(chunk) >= chunk_size:
+                input_queue.put(
+                    [last_chunk_id, tokenizer, batch_size, chunk, max_length, split_sentence, language, position_prob]
+                )
+                last_chunk_id += 1
+                chunk = []
+
+        if len(chunk) > 0:
+            input_queue.put(
+                [last_chunk_id, tokenizer, batch_size, chunk, max_length, split_sentence, language, position_prob]
+            )
+            last_chunk_id += 1
+
+        output_queue = pool["output"]
+        results_list = sorted([output_queue.get() for _ in range(last_chunk_id)], key=lambda x: x[0])
+        return sum([result[1] for result in results_list], [])
+
+    @staticmethod
+    def _predict_multi_process_worker(
+        target_device: str, model: "UIEDecoder", input_queue: Queue, results_queue: Queue
+    ) -> None:
+        """
+        Internal working process to predict in multi-process setup
+        """
+        while True:
+            try:
+                chunk_id, tokenizer, batch_size, chunk, max_length, split_sentence, language, position_prob = (
+                    input_queue.get()
+                )
+                results = model.predict(
+                    tokenizer,
+                    chunk,
+                    batch_size=batch_size,
+                    max_length=max_length,
+                    split_sentence=split_sentence,
+                    language=language,
+                    show_progress_bar=False,
+                    device=target_device,
+                )
+
+                results_queue.put([chunk_id, results])
+            except queue.Empty:
+                break
+
+
+class SchemaTree(object):
+    """
+    Implementation of SchemaTree
+    """
+
+    def __init__(self, name='root', children=None):
+        self.name = name
+        self.children = []
+        self.prefix = None
+        self.parent_relations = None
+        if children is not None:
+            for child in children:
+                self.add_child(child)
+
+    def __repr__(self):
+        return self.name
+
+    def add_child(self, node):
+        assert isinstance(
+            node, SchemaTree
+        ), "The children of a node should be an instance of SchemaTree."
+        self.children.append(node)
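
For reference, a minimal usage sketch of the decoder API introduced above (the schema, texts, and device list are illustrative; `model` and `tokenizer` are assumed to be loaded as shown earlier):

    # Single-process extraction: predict() builds prompts from the schema tree,
    # splits long texts, and returns one result dict per input text.
    schema = {"竞赛名称": ["主办方", "承办方"]}  # an entity type with two relations
    model.set_schema(schema)
    results = model.predict(tokenizer, ["第十四届全国知识图谱大会由中国中文信息学会主办。"], language="zh")

    # Multi-device extraction: set the schema before starting the pool, because each
    # spawned worker receives a copy of the model and calls predict() without a schema.
    if __name__ == "__main__":
        pool = model.start_multi_process_pool(["cuda:0", "cuda:1"])  # assumed devices
        results = model.predict_multi_process(tokenizer, many_texts, pool)  # many_texts: placeholder list of strings
        model.stop_multi_process_pool(pool)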
modeling_uie.py CHANGED
@@ -1,162 +1,162 @@
1
- from dataclasses import dataclass
2
- from typing import Optional, Tuple
3
-
4
- import torch
5
- import torch.nn as nn
6
- from transformers import ErnieModel, ErniePreTrainedModel, PretrainedConfig
7
- from transformers.file_utils import ModelOutput
8
-
9
- from .decode_utils import UIEDecoder
10
-
11
-
12
- @dataclass
13
- class UIEModelOutput(ModelOutput):
14
- """
15
- Output class for outputs of UIE.
16
- losses (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
17
- Total spn extraction losses is the sum of a Cross-Entropy for the start and end positions.
18
- start_prob (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
19
- Span-start scores (after Sigmoid).
20
- end_prob (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
21
- Span-end scores (after Sigmoid).
22
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
23
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layers, +
24
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
25
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
26
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
27
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
28
- sequence_length)`.
29
- Attention weights after the attention softmax, used to compute the weighted average in the self-attention
30
- heads.
31
- """
32
- loss: Optional[torch.FloatTensor] = None
33
- start_prob: torch.FloatTensor = None
34
- end_prob: torch.FloatTensor = None
35
- start_positions: torch.FloatTensor = None
36
- end_positions: torch.FloatTensor = None
37
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
38
- attentions: Optional[Tuple[torch.FloatTensor]] = None
39
-
40
-
41
- class UIEModel(ErniePreTrainedModel, UIEDecoder):
42
- """
43
- UIE model based on Bert model.
44
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
45
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
46
- etc.)
47
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
48
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
49
- and behavior.
50
- Parameters:
51
- config ([`PretrainedConfig`]): Model configuration class with all the parameters of the model.
52
- Initializing with a config file does not load the weights associated with the model, only the
53
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
54
- """
55
-
56
- def __init__(self, config: PretrainedConfig):
57
- super(UIEModel, self).__init__(config)
58
- self.encoder = ErnieModel(config)
59
- self.config = config
60
- hidden_size = self.config.hidden_size
61
-
62
- self.linear_start = nn.Linear(hidden_size, 1)
63
- self.linear_end = nn.Linear(hidden_size, 1)
64
- self.sigmoid = nn.Sigmoid()
65
-
66
- self.post_init()
67
-
68
- def forward(
69
- self,
70
- input_ids: Optional[torch.Tensor] = None,
71
- token_type_ids: Optional[torch.Tensor] = None,
72
- position_ids: Optional[torch.Tensor] = None,
73
- attention_mask: Optional[torch.Tensor] = None,
74
- head_mask: Optional[torch.Tensor] = None,
75
- inputs_embeds: Optional[torch.Tensor] = None,
76
- start_positions: Optional[torch.Tensor] = None,
77
- end_positions: Optional[torch.Tensor] = None,
78
- output_attentions: Optional[bool] = None,
79
- output_hidden_states: Optional[bool] = None,
80
- ) -> UIEModelOutput:
81
- """
82
- Args:
83
- input_ids (`torch.LongTensor` of shape `({0})`):
84
- Indices of input sequence tokens in the vocabulary.
85
- Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
86
- [`PreTrainedTokenizer.__call__`] for details.
87
- [What are input IDs?](../glossary#input-ids)
88
- attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
89
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
90
- - 1 for tokens that are **not masked**,
91
- - 0 for tokens that are **masked**.
92
- [What are attention masks?](../glossary#attention-mask)
93
- token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
94
- Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
95
- 1]`:
96
- - 0 corresponds to a *sentence A* token,
97
- - 1 corresponds to a *sentence B* token.
98
- [What are token type IDs?](../glossary#token-type-ids)
99
- position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
100
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
101
- config.max_position_embeddings - 1]`.
102
- [What are position IDs?](../glossary#position-ids)
103
- head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
104
- Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
105
- - 1 indicates the head is **not masked**,
106
- - 0 indicates the head is **masked**.
107
- inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
108
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
109
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
110
- model's internal embedding lookup matrix.
111
- start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
112
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
113
- Positions are clamped to the length of the sequence (`sequence_length`). Position outsides of the sequence
114
- are not taken into account for computing the loss.
115
- end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
116
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
117
- Positions are clamped to the length of the sequence (`sequence_length`). Position outsides of the sequence
118
- are not taken into account for computing the loss.
119
- output_attentions (`bool`, *optional*):
120
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
121
- tensors for more detail.
122
- output_hidden_states (`bool`, *optional*):
123
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
124
- more detail.
125
- return_dict (`bool`, *optional*):
126
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
127
- """
128
- outputs = self.encoder(
129
- input_ids=input_ids,
130
- token_type_ids=token_type_ids,
131
- position_ids=position_ids,
132
- attention_mask=attention_mask,
133
- head_mask=head_mask,
134
- inputs_embeds=inputs_embeds,
135
- output_attentions=output_attentions,
136
- output_hidden_states=output_hidden_states,
137
- )
138
- sequence_output = outputs[0]
139
-
140
- start_logits = self.linear_start(sequence_output)
141
- start_logits = torch.squeeze(start_logits, -1)
142
- start_prob = self.sigmoid(start_logits)
143
-
144
- end_logits = self.linear_end(sequence_output)
145
- end_logits = torch.squeeze(end_logits, -1)
146
- end_prob = self.sigmoid(end_logits)
147
-
148
- total_loss = None
149
- if start_positions is not None and end_positions is not None:
150
- loss_fct = nn.BCELoss()
151
- start_loss = loss_fct(start_prob, start_positions)
152
- end_loss = loss_fct(end_prob, end_positions)
153
-
154
- total_loss = (start_loss + end_loss) / 2.0
155
-
156
- return UIEModelOutput(
157
- loss=total_loss,
158
- start_prob=start_prob,
159
- end_prob=end_prob,
160
- hidden_states=outputs.hidden_states,
161
- attentions=outputs.attentions,
162
- )
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import ErnieModel, ErniePreTrainedModel, PretrainedConfig
7
+ from transformers.file_utils import ModelOutput
8
+
9
+ from .decode_utils import UIEDecoder
10
+
11
+
12
+ @dataclass
13
+ class UIEModelOutput(ModelOutput):
14
+ """
15
+ Output class for outputs of UIE.
16
+ losses (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
17
+ Total spn extraction losses is the sum of a Cross-Entropy for the start and end positions.
18
+ start_prob (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
19
+ Span-start scores (after Sigmoid).
20
+ end_prob (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
21
+ Span-end scores (after Sigmoid).
22
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
23
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layers, +
24
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
25
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
26
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
27
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
28
+ sequence_length)`.
29
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
30
+ heads.
31
+ """
32
+ loss: Optional[torch.FloatTensor] = None
33
+ start_prob: torch.FloatTensor = None
34
+ end_prob: torch.FloatTensor = None
35
+ start_positions: torch.FloatTensor = None
36
+ end_positions: torch.FloatTensor = None
37
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
38
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
39
+
40
+
41
+ class UIEModel(ErniePreTrainedModel, UIEDecoder):
42
+ """
43
+ UIE model based on Bert model.
44
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
45
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
46
+ etc.)
47
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
48
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
49
+ and behavior.
50
+ Parameters:
51
+ config ([`PretrainedConfig`]): Model configuration class with all the parameters of the model.
52
+ Initializing with a config file does not load the weights associated with the model, only the
53
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
54
+ """
55
+
56
+ def __init__(self, config: PretrainedConfig):
57
+ super(UIEModel, self).__init__(config)
58
+ self.encoder = ErnieModel(config)
59
+ self.config = config
60
+ hidden_size = self.config.hidden_size
61
+
62
+ self.linear_start = nn.Linear(hidden_size, 1)
63
+ self.linear_end = nn.Linear(hidden_size, 1)
64
+ self.sigmoid = nn.Sigmoid()
65
+
66
+ self.post_init()
67
+
68
+ def forward(
69
+ self,
70
+ input_ids: Optional[torch.Tensor] = None,
71
+ token_type_ids: Optional[torch.Tensor] = None,
72
+ position_ids: Optional[torch.Tensor] = None,
73
+ attention_mask: Optional[torch.Tensor] = None,
74
+ head_mask: Optional[torch.Tensor] = None,
75
+ inputs_embeds: Optional[torch.Tensor] = None,
76
+ start_positions: Optional[torch.Tensor] = None,
77
+ end_positions: Optional[torch.Tensor] = None,
78
+ output_attentions: Optional[bool] = None,
79
+ output_hidden_states: Optional[bool] = None,
80
+ ) -> UIEModelOutput:
+         """
+         Args:
+             input_ids (`torch.LongTensor` of shape `({0})`):
+                 Indices of input sequence tokens in the vocabulary.
+                 Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                 [`PreTrainedTokenizer.__call__`] for details.
+                 [What are input IDs?](../glossary#input-ids)
+             attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                 - 1 for tokens that are **not masked**,
+                 - 0 for tokens that are **masked**.
+                 [What are attention masks?](../glossary#attention-mask)
+             token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+                 Segment token indices to indicate first and second portions of the inputs. Indices are selected in
+                 `[0, 1]`:
+                 - 0 corresponds to a *sentence A* token,
+                 - 1 corresponds to a *sentence B* token.
+                 [What are token type IDs?](../glossary#token-type-ids)
+             position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+                 Indices of positions of each input sequence token in the position embeddings. Selected in the range
+                 `[0, config.max_position_embeddings - 1]`.
+                 [What are position IDs?](../glossary#position-ids)
+             head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+                 Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+                 - 1 indicates the head is **not masked**,
+                 - 0 indicates the head is **masked**.
+             inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+                 representation. This is useful if you want more control over how to convert `input_ids` indices
+                 into associated vectors than the model's internal embedding lookup matrix.
+             start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                 Labels for the position (index) of the start of the labelled span, used to compute the token
+                 classification loss. Positions are clamped to the length of the sequence (`sequence_length`).
+                 Positions outside of the sequence are not taken into account for computing the loss.
+             end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                 Labels for the position (index) of the end of the labelled span, used to compute the token
+                 classification loss. Positions are clamped to the length of the sequence (`sequence_length`).
+                 Positions outside of the sequence are not taken into account for computing the loss.
+             output_attentions (`bool`, *optional*):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+             output_hidden_states (`bool`, *optional*):
+                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned
+                 tensors for more detail.
+         """
+         outputs = self.encoder(
+             input_ids=input_ids,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+         )
+         sequence_output = outputs[0]
+
+         # Project each token's hidden state to a scalar and squash it into a
+         # per-token probability of being a span start / span end.
+         start_logits = self.linear_start(sequence_output)
+         start_logits = torch.squeeze(start_logits, -1)
+         start_prob = self.sigmoid(start_logits)
+
+         end_logits = self.linear_end(sequence_output)
+         end_logits = torch.squeeze(end_logits, -1)
+         end_prob = self.sigmoid(end_logits)
+
+         # Binary cross-entropy on the two pointer heads, averaged.
+         total_loss = None
+         if start_positions is not None and end_positions is not None:
+             loss_fct = nn.BCELoss()
+             start_loss = loss_fct(start_prob, start_positions)
+             end_loss = loss_fct(end_prob, end_positions)
+             total_loss = (start_loss + end_loss) / 2.0
+
+         return UIEModelOutput(
+             loss=total_loss,
+             start_prob=start_prob,
+             end_prob=end_prob,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
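
As a quick sanity check of the exported model, here is a minimal usage sketch. It assumes the repository has been downloaded to a local `./uie-base-pytorch` directory (a placeholder path, not the published model id); since `UIEModel` is custom code shipped with the repo, loading through `AutoModel` requires `trust_remote_code=True`.

import torch
from transformers import AutoModel, AutoTokenizer

path = "./uie-base-pytorch"  # placeholder: point this at the downloaded repo
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path, trust_remote_code=True)

# UIE encodes the extraction prompt and the text as a sentence pair.
inputs = tokenizer("时间", "2023年3月1日下午3点与编辑部同事开会", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Per-token probabilities that each position starts or ends an answer span.
print(outputs.start_prob.shape, outputs.end_prob.shape)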
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32be889914931873a60d481926e9208625823baa2166d7f1f89359f22f15a778
+ size 471852986
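
The weights file above is a Git LFS pointer, so a clone without LFS resolves to a three-line stub instead of the ~472 MB binary. A minimal sketch for verifying the downloaded file against the pointer's `oid` and `size`:

import hashlib
import os

path = "pytorch_model.bin"
expected_oid = "32be889914931873a60d481926e9208625823baa2166d7f1f89359f22f15a778"
expected_size = 471852986

# A size mismatch usually means the LFS pointer was never resolved.
assert os.path.getsize(path) == expected_size, "size mismatch (pointer not resolved?)"

# Hash in 1 MiB chunks to keep memory use flat.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")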
special_tokens_map.json CHANGED
@@ -1,7 +1,7 @@
- {
-   "cls_token": "[CLS]",
-   "mask_token": "[MASK]",
-   "pad_token": "[PAD]",
-   "sep_token": "[SEP]",
-   "unk_token": "[UNK]"
- }

+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json CHANGED
@@ -1,19 +1,7 @@
  {
    "version": "1.0",
-   "truncation": {
-     "direction": "Right",
-     "max_length": 512,
-     "strategy": "LongestFirst",
-     "stride": 2
-   },
-   "padding": {
-     "strategy": "BatchLongest",
-     "direction": "Right",
-     "pad_to_multiple_of": null,
-     "pad_id": 0,
-     "pad_type_id": 0,
-     "pad_token": "[PAD]"
-   },
+   "truncation": null,
+   "padding": null,
    "added_tokens": [
      {
        "id": 0,
@@ -12241,7 +12229,7 @@
    "ー": 12081,
    "★": 12082,
    "’": 12083,
-   "$$": 12084,
+   "’’": 12084,
    "{": 12085,
    "}": 12086,
    "‘": 12087,
tokenizer_config.json CHANGED
@@ -1,57 +1,57 @@
- {
-   "added_tokens_decoder": {
-     "0": {
-       "content": "[PAD]",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "[CLS]",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "[SEP]",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "3": {
-       "content": "[MASK]",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "39979": {
-       "content": "[UNK]",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "clean_up_tokenization_spaces": true,
-   "cls_token": "[CLS]",
-   "do_basic_tokenize": true,
-   "do_lower_case": true,
-   "mask_token": "[MASK]",
-   "model_max_length": 1000000000000000019884624838656,
-   "never_split": null,
-   "pad_token": "[PAD]",
-   "sep_token": "[SEP]",
-   "strip_accents": null,
-   "tokenize_chinese_chars": true,
-   "tokenizer_class": "BertTokenizer",
-   "unk_token": "[UNK]"
- }

+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "39979": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
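
For completeness, a short sketch (same placeholder path) confirming that the slow-tokenizer config above round-trips the declared special tokens and their ids:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("./uie-base-pytorch")  # placeholder path

# Expected ids from added_tokens_decoder: [PAD]=0, [CLS]=1, [SEP]=2, [MASK]=3, [UNK]=39979
tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
print(dict(zip(tokens, tokenizer.convert_tokens_to_ids(tokens))))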