arubenruben committed on
Commit
66ca236
1 Parent(s): dc7dbbc

Update deploy_pipeline.py

Browse files
Files changed (1) hide show
  1. deploy_pipeline.py +4 -62
deploy_pipeline.py CHANGED
@@ -1,14 +1,3 @@
1
- import torch
2
- from transformers import Pipeline
3
- from transformers import AutoTokenizer
4
- from transformers.pipelines import PIPELINE_REGISTRY
5
- from transformers import pipeline
6
- from transformers import AutoModelForTokenClassification
7
- from huggingface_hub import Repository
8
- import sys
9
- import os
10
-
11
-
12
  class TokenizeAndAlignLabelsStep():
13
 
14
  # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
@@ -21,7 +10,7 @@ class TokenizeAndAlignLabelsStep():
21
 
22
  previous_word_idx = None
23
 
24
-
25
  labels_mask = []
26
 
27
  for word_idx in word_ids: # Set the special tokens to -100.
@@ -29,64 +18,17 @@ class TokenizeAndAlignLabelsStep():
29
  labels_mask.append(False)
30
  # Only label the first token of a given word.
31
  elif word_idx != previous_word_idx:
32
- labels_mask.append(True)
 
33
  else:
34
  labels_mask.append(False)
35
 
36
  previous_word_idx = word_idx
37
 
38
- tokenized_inputs["tokens"] = tokenizer.decode(tokenized_inputs["input_ids"], skip_special_tokens=True)
39
  tokenized_inputs["labels_mask"] = labels_mask
40
 
41
  return tokenized_inputs
42
-
43
-
44
- class BERT_CRF_Pipeline(Pipeline):
45
-
46
- def _sanitize_parameters(self, **kwargs):
47
- return {}, {}, {}
48
-
49
- def preprocess(self, text):
50
-
51
- tokenizer = AutoTokenizer.from_pretrained(
52
- "neuralmind/bert-base-portuguese-cased", do_lower_case=False)
53
-
54
- TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
55
- examples=text, tokenizer=tokenizer)
56
-
57
- return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(examples=text, tokenizer=tokenizer)
58
-
59
- def _forward(self, tokenizer_results):
60
-
61
- input_ids = torch.tensor(
62
- tokenizer_results['input_ids'], dtype=torch.long, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
63
-
64
- token_type_ids = torch.tensor(
65
- tokenizer_results['token_type_ids'], dtype=torch.long, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
66
-
67
- attention_mask = torch.tensor(
68
- tokenizer_results['attention_mask'], dtype=torch.bool, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
69
-
70
- labels_mask = torch.tensor(
71
- tokenizer_results['labels_mask'], dtype=torch.bool, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)
72
-
73
- # input_ids, token_type_ids, attention_mask, labels, labels_mask
74
- outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
75
- attention_mask=attention_mask, labels=None, labels_mask=labels_mask)
76
-
77
- return {'outputs': outputs, 'tokens': tokenizer_results['tokens']}
78
-
79
- def postprocess(self, outputs):
80
-
81
- model_outputs = outputs['outputs']
82
- tokens = outputs['tokens']
83
-
84
- # From Ner_tags to Ner_labels
85
- for i, label in enumerate(model_outputs[0]):
86
- model_outputs[0][i] = self.model.config.id2label[label]
87
-
88
- return model_outputs[0], tokens
89
-
90
  def main():
91
 
92
  PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline",
 
 
 
 
 
 
 
 
 
 
 
 
1
  class TokenizeAndAlignLabelsStep():
2
 
3
  # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
 
10
 
11
  previous_word_idx = None
12
 
13
+ tokens= []
14
  labels_mask = []
15
 
16
  for word_idx in word_ids: # Set the special tokens to -100.
 
18
  labels_mask.append(False)
19
  # Only label the first token of a given word.
20
  elif word_idx != previous_word_idx:
21
+ labels_mask.append(True)
22
+ tokens.append(tokenized_inputs["input_ids"][word_idx])
23
  else:
24
  labels_mask.append(False)
25
 
26
  previous_word_idx = word_idx
27
 
28
+ tokenized_inputs["tokens"] = tokens
29
  tokenized_inputs["labels_mask"] = labels_mask
30
 
31
  return tokenized_inputs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def main():
33
 
34
  PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline",