arubenruben committed on
Commit a8437dd
1 Parent(s): 66ca236

Update deploy_pipeline.py

Files changed (1)
  1. deploy_pipeline.py +69 -0
deploy_pipeline.py CHANGED
@@ -1,3 +1,23 @@
+import torch
+from transformers import Pipeline
+from transformers import AutoTokenizer
+from transformers.pipelines import PIPELINE_REGISTRY
+from transformers import pipeline
+from transformers import AutoModelForTokenClassification
+from huggingface_hub import Repository
+import sys
+import os
+
+
 class TokenizeAndAlignLabelsStep():
 
     # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
     def tokenize_and_align_labels(self, examples, tokenizer):
 
         tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128)
 
         # Map tokens to their respective word.
         word_ids = tokenized_inputs.word_ids()
@@ -29,6 +49,55 @@ class TokenizeAndAlignLabelsStep():
         tokenized_inputs["labels_mask"] = labels_mask
 
         return tokenized_inputs
+
+
+class BERT_CRF_Pipeline(Pipeline):
+
+    def _sanitize_parameters(self, **kwargs):
+        return {}, {}, {}
+
+    def preprocess(self, text):
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            "neuralmind/bert-base-portuguese-cased", do_lower_case=False)
+
+        TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
+            examples=text, tokenizer=tokenizer)
+
+        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(examples=text, tokenizer=tokenizer)
+
+    def _forward(self, tokenizer_results):
+
+        input_ids = torch.tensor(
+            tokenizer_results['input_ids'], dtype=torch.long, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
+
+        token_type_ids = torch.tensor(
+            tokenizer_results['token_type_ids'], dtype=torch.long, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
+
+        attention_mask = torch.tensor(
+            tokenizer_results['attention_mask'], dtype=torch.bool, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
+
+        labels_mask = torch.tensor(
+            tokenizer_results['labels_mask'], dtype=torch.bool, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
+
+        # input_ids, token_type_ids, attention_mask, labels, labels_mask
+        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
+                             attention_mask=attention_mask, labels=None, labels_mask=labels_mask)
+
+        return {'outputs': outputs, 'tokens': tokenizer_results['tokens']}
+
+    def postprocess(self, outputs):
+
+        model_outputs = outputs['outputs']
+        tokens = outputs['tokens']
+
+        # From Ner_tags to Ner_labels
+        for i, label in enumerate(model_outputs[0]):
+            model_outputs[0][i] = self.model.config.id2label[label]
+
+        return model_outputs[0], tokens
+
+
 def main():
 
     PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline",
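
Usage note (not part of this commit): the register_pipeline(...) call in main() is cut off in the diff, so the sketch below only illustrates how a custom pipeline registered under this task name is typically bound and invoked. It assumes the standard keyword arguments of PIPELINE_REGISTRY.register_pipeline, that it runs inside deploy_pipeline.py (where BERT_CRF_Pipeline is defined), and a hypothetical checkpoint id; the actual arguments in this repository may differ.

from transformers import AutoModelForTokenClassification, pipeline
from transformers.pipelines import PIPELINE_REGISTRY

# Bind the task name from the diff to the BERT_CRF_Pipeline class defined above.
# pipeline_class / pt_model follow the documented register_pipeline signature and
# are assumptions here, since the call in the commit is truncated.
PIPELINE_REGISTRY.register_pipeline(
    "PT-BERT-Large-CRF-HAREM-Default-pipeline",
    pipeline_class=BERT_CRF_Pipeline,
    pt_model=AutoModelForTokenClassification,
)

# Hypothetical invocation: load a compatible BERT-CRF token-classification
# checkpoint (repository id below is assumed) and tag a Portuguese sentence.
ner = pipeline(
    "PT-BERT-Large-CRF-HAREM-Default-pipeline",
    model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",  # assumed checkpoint id
    trust_remote_code=True,  # assuming the CRF model class ships with the repo
)

# postprocess() above returns (ner_labels, tokens) for the single input sequence.
labels, tokens = ner("A Maria mora em Lisboa.")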