arubenruben commited on
Commit
de0dec6
1 Parent(s): 2968f5e

commit files to HF hub

Browse files
Files changed (2) hide show
  1. config.json +12 -3
  2. deploy_pipeline.py +103 -0
config.json CHANGED
@@ -1,13 +1,22 @@
1
  {
2
- "_name_or_path": "/notebooks/src/hugging_face_pipeline/BERT-CRF/out/model",
3
  "architectures": [
4
  "BERT_CRF"
5
  ],
6
  "auto_map": {
7
- "AutoConfig": "model.BERT_CRF_Config",
8
- "AutoModelForTokenClassification": "model.BERT_CRF"
9
  },
10
  "bert_name": "neuralmind/bert-large-portuguese-cased",
 
 
 
 
 
 
 
 
 
11
  "id2label": {
12
  "0": "O",
13
  "1": "B-PESSOA",
 
1
  {
2
+ "_name_or_path": "arubenruben/PT-BERT-Large-CRF-HAREM-Default",
3
  "architectures": [
4
  "BERT_CRF"
5
  ],
6
  "auto_map": {
7
+ "AutoConfig": "arubenruben/PT-BERT-Large-CRF-HAREM-Default--model.BERT_CRF_Config",
8
+ "AutoModelForTokenClassification": "arubenruben/PT-BERT-Large-CRF-HAREM-Default--model.BERT_CRF"
9
  },
10
  "bert_name": "neuralmind/bert-large-portuguese-cased",
11
+ "custom_pipelines": {
12
+ "arubenruben/PT-BERT-Large-CRF-HAREM-Default-pipeline": {
13
+ "impl": "deploy_pipeline.BERT_CRF_Pipeline",
14
+ "pt": [
15
+ "AutoModelForTokenClassification"
16
+ ],
17
+ "tf": []
18
+ }
19
+ },
20
  "id2label": {
21
  "0": "O",
22
  "1": "B-PESSOA",
deploy_pipeline.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import Pipeline
3
+ from transformers import AutoTokenizer
4
+ from transformers.pipelines import PIPELINE_REGISTRY
5
+ from transformers import pipeline
6
+ from transformers import AutoModelForTokenClassification
7
+ from huggingface_hub import Repository
8
+ import sys
9
+ import os
10
+
11
+
12
class TokenizeAndAlignLabelsStep():

    # Adapted from: https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer):
        """Tokenize *examples* and attach a boolean ``labels_mask``.

        The mask is True only at the first sub-token of each word, so that
        downstream CRF decoding labels one position per word; special tokens
        ([CLS]/[SEP]/padding, reported as ``None`` word ids) and continuation
        sub-tokens are masked out.
        """
        encoding = tokenizer(examples, padding='max_length', max_length=512)

        # word_ids() maps every sub-token position to its source word index
        # (None for special tokens).
        word_ids = encoding.word_ids()

        # A position is selected exactly when its word id is not None and
        # differs from the id immediately before it — i.e. it starts a new
        # word. Pairing each id with its predecessor (None before the first)
        # reproduces the original previous_word_idx bookkeeping.
        mask = [
            current is not None and current != previous
            for previous, current in zip([None] + list(word_ids), word_ids)
        ]

        encoding["tokens"] = examples
        encoding["ner_tags"] = []
        encoding["labels"] = []
        encoding["labels_mask"] = mask

        return encoding
43
+
44
+
45
class BERT_CRF_Pipeline(Pipeline):
    """Custom token-classification pipeline for the BERT-CRF model.

    ``preprocess`` tokenizes raw text and builds the first-subtoken mask,
    ``_forward`` runs the CRF-decorated model in inference mode, and
    ``postprocess`` maps predicted tag ids back to string labels via
    ``config.id2label``.
    """

    def _sanitize_parameters(self, **kwargs):
        # No extra pipeline parameters are supported; run with defaults.
        return {}, {}, {}

    def preprocess(self, text):
        # Fix: the original called tokenize_and_align_labels twice per input
        # (discarding the first result) and rebuilt the tokenizer on every
        # call. Build the tokenizer once and cache it on the instance.
        # NOTE(review): config.json names bert-LARGE
        # ("neuralmind/bert-large-portuguese-cased") while this tokenizer is
        # bert-BASE — confirm which checkpoint the model was trained with.
        if not hasattr(self, "_cached_tokenizer"):
            self._cached_tokenizer = AutoTokenizer.from_pretrained(
                "neuralmind/bert-base-portuguese-cased", do_lower_case=False)

        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=text, tokenizer=self._cached_tokenizer)

    def _forward(self, tokenizer_results):
        # Promote each field to a (1, seq_len) tensor: the model expects a
        # leading batch dimension.
        input_ids = torch.tensor(
            tokenizer_results['input_ids'], dtype=torch.long).unsqueeze(0)

        token_type_ids = torch.tensor(
            tokenizer_results['token_type_ids'], dtype=torch.long).unsqueeze(0)

        attention_mask = torch.tensor(
            tokenizer_results['attention_mask'], dtype=torch.bool).unsqueeze(0)

        labels_mask = torch.tensor(
            tokenizer_results['labels_mask'], dtype=torch.bool).unsqueeze(0)

        # labels=None → inference only, no loss computation.
        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, labels=None,
                             labels_mask=labels_mask)

        return outputs

    def postprocess(self, model_outputs):
        # From ner tag ids to ner labels, rewritten in place.
        # assumes model_outputs[0] is a mutable sequence of int tag ids
        # — TODO confirm against BERT_CRF.forward's return shape.
        for i, tag_id in enumerate(model_outputs[0]):
            model_outputs[0][i] = self.model.config.id2label[tag_id]

        return model_outputs[0]
86
+
87
+
88
def main():
    """Register the custom BERT-CRF pipeline, build it from the Hub
    checkpoint, and push the serialized pipeline back to the Hub repo.

    Side effects: downloads the model/tokenizer, clones the target repo into
    ``<script dir>/out/pipeline``, and pushes a commit (needs auth token).
    """
    # Single source of truth for the repo/task ids — the original repeated
    # these literals three times, once as a placeholder-less f-string.
    model_id = "arubenruben/PT-BERT-Large-CRF-HAREM-Default"
    task_name = f"{model_id}-pipeline"

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    # trust_remote_code is required: the model class is resolved from the
    # repository itself (see auto_map in config.json).
    classifier = pipeline(
        task_name,
        model=model_id,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        trust_remote_code=True,
    )

    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=model_id, use_auth_token=True)

    # repo.git_pull()

    classifier.save_pretrained(out_path)
    repo.push_to_hub()