feat(similar.py): update pipeline

Files changed (8) hide show

README.md CHANGED Viewed

@@ -1,3 +1,11 @@
 ---
 license: apache-2.0
 ---

 ---
+language:
+    - zh
 license: apache-2.0
+tags:
+    - bert
+    - similar
+pipeline_tag: other
 ---
+### BERT 中文相似度计算

config.json ADDED Viewed

+{
+  "_name_or_path": "minskiter/simbert-chinese-bert-wwm-ext",
+  "architectures": [
+    "SimBertModel"
+  ],
+  "custom_pipelines": {
+    "sentences_sim": {
+      "impl": "minskiter/simbert-chinese-bert-wwm-ext--similar.SimilarPipeline",
+      "pt": "AutoModel",
+      "tf": []
+    }
+  },
+  "auto_map":{
+    "AutoModel": "minskiter/simbert-chinese-bert-wwm-ext--modeling_bert.SimBertModel",
+    "AutoConfig": "minskiter/simbert-chinese-bert-wwm-ext--configuration_bert.SimBertConfig"
+  },
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "simbert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 21128
+}

configuration_bert.py ADDED Viewed

+from transformers import BertConfig
+class SimBertConfig(BertConfig):
+    """BertConfig subclass whose only change is the ``"simbert"`` model type."""
+    model_type = "simbert"

modeling_bert.py ADDED Viewed

+from transformers import PretrainedConfig, PreTrainedModel, BertModel, BertConfig
+from torch import nn
+class SimBertModel(PreTrainedModel):
+    """ SimBert Model
+    """
+    config_class = BertConfig
+    def __init__(
+            self,
+            config: PretrainedConfig
+        ) -> None:
+            super().__init__(config)
+            self.bert = BertModel(config=config, add_pooling_layer=True)
+            self.fc = nn.Linear(config.hidden_size, 2)
+            # self.loss_fct = nn.CrossEntropyLoss()
+            self.loss_fct = nn.MSELoss()
+            self.softmax = nn.Softmax(dim=1)
+    def forward(
+        self,
+        input_ids,
+        token_type_ids,
+        attention_mask,
+        labels=None
+    ):
+        outputs = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids
+        )
+        pooled_output = outputs.pooler_output
+        logits = self.fc(pooled_output)
+        logits = self.softmax(logits)[:,1]
+        if labels is not None:
+            loss = self.loss_fct(logits.view(-1), labels.view(-1))
+            return loss, logits
+        return None, logits

pytorch_model.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:c73394cb3357f031cd1946df0acd09dfd7d0983bbb172aa9df3916028acf4a22
+size 409149557

similar.py ADDED Viewed

+from typing import Any, Dict, Tuple
+from transformers import Pipeline
+from transformers.pipelines.base import GenericTensor
+from transformers.utils import ModelOutput
+from typing import Union,List
+class SimilarPipeline(Pipeline):
+    def __init__(self, max_length=512,*args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_length = max_length
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return {},{},{}
+    def preprocess(self, input: Union[Tuple[str],List[Tuple[str]]], **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+        if isinstance(input, list):
+            a = list(map(lambda x: x[0], input))
+            b = list(map(lambda x: x[1], input))
+        else:
+            a = input[0]
+            b = input[1]
+        tensors = self.tokenizer(
+            a,
+            b,
+            max_length=self.max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        return tensors
+    def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+        _,logits = self.model(**input_tensors)
+        return logits.tolist()
+    def postprocess(
+            self,
+            model_outputs: ModelOutput,
+            **postprocess_parameters: Dict
+        ) -> Any:
+        return model_outputs

tokenizer_config.json ADDED Viewed

+{
+    "padding": "max_length",
+    "max_length": 512,
+    "name_or_path": "hfl/chinese-bert-wwm-ext",
+    "tokenizer_class": "BertTokenizer"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff