# NER-PT-BERT-CRF-HAREM-Default/deploy_pipeline.py
import os
import sys

import torch
from huggingface_hub import Repository
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          Pipeline, pipeline)
from transformers.pipelines import PIPELINE_REGISTRY
class TokenizeAndAlignLabelsStep():
    # Adapted from: https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer):
        tokenized_inputs = tokenizer(
            examples, padding='max_length', truncation=True, max_length=128)

        # Map each token to the word it came from.
        word_ids = tokenized_inputs.word_ids()

        previous_word_idx = None
        tokens = []
        labels_mask = []

        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                # Special tokens ([CLS], [SEP], padding) are never labeled.
                labels_mask.append(False)
            elif word_idx != previous_word_idx:
                # Only the first sub-token of each word is labeled.
                labels_mask.append(True)
                # Index input_ids by token position, not by word index.
                tokens.append(tokenized_inputs["input_ids"][token_idx])
            else:
                # Remaining sub-tokens of the same word are masked out.
                labels_mask.append(False)
            previous_word_idx = word_idx

        tokenized_inputs["tokens"] = tokens
        tokenized_inputs["labels_mask"] = labels_mask

        return tokenized_inputs
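
# A minimal usage sketch (illustration only, not executed by this script):
# assuming the BERTimbau tokenizer used below is reachable, the step can be
# exercised on a single sentence like this.
#
#   tokenizer = AutoTokenizer.from_pretrained(
#       "neuralmind/bert-base-portuguese-cased", do_lower_case=False)
#   encoded = TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
#       examples="A Maria mora no Porto.", tokenizer=tokenizer)
#   # encoded["labels_mask"] is True only at the first sub-token of each word,
#   # and encoded["tokens"] holds the input ids of exactly those sub-tokens.
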
class BERT_CRF_Pipeline(Pipeline):

    def _sanitize_parameters(self, **kwargs):
        # No extra parameters are accepted at call time.
        return {}, {}, {}

    def preprocess(self, text):
        tokenizer = AutoTokenizer.from_pretrained(
            "neuralmind/bert-base-portuguese-cased", do_lower_case=False)

        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=text, tokenizer=tokenizer)
    def _forward(self, tokenizer_results):
        device = torch.device(
            "cuda") if torch.cuda.is_available() else torch.device("cpu")

        input_ids = torch.tensor(
            tokenizer_results['input_ids'], dtype=torch.long, device=device).unsqueeze(0)
        token_type_ids = torch.tensor(
            tokenizer_results['token_type_ids'], dtype=torch.long, device=device).unsqueeze(0)
        attention_mask = torch.tensor(
            tokenizer_results['attention_mask'], dtype=torch.bool, device=device).unsqueeze(0)
        labels_mask = torch.tensor(
            tokenizer_results['labels_mask'], dtype=torch.bool, device=device).unsqueeze(0)

        # With labels=None the BERT-CRF model decodes the best tag sequence
        # instead of computing a training loss.
        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, labels=None, labels_mask=labels_mask)

        return {'outputs': outputs, 'tokens': tokenizer_results['tokens']}
    def postprocess(self, outputs):
        model_outputs = outputs['outputs']
        tokens = outputs['tokens']

        # Map predicted NER tag ids to their string labels.
        for i, tag_id in enumerate(model_outputs[0]):
            model_outputs[0][i] = self.model.config.id2label[tag_id]

        return model_outputs[0], tokens
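
# Illustrative note (not executed): given the methods above, one call to the
# pipeline yields a (labels, tokens) pair. The CRF decode appears to return
# one tag per position where labels_mask is True, i.e. one tag per word.
#
#   labels, token_ids = classifier("A Maria mora no Porto.")
#   # labels: string tags from the model's id2label mapping
#   # token_ids: input ids of the first sub-token of each word
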
def main():
    PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline",
                                        pipeline_class=BERT_CRF_Pipeline,
                                        pt_model=AutoModelForTokenClassification)

    classifier = pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline",
                          model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
                          device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
                          trust_remote_code=True)

    # Clone the model repository, save the custom pipeline into it, and push.
    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(
        out_path, clone_from="arubenruben/PT-BERT-Large-CRF-HAREM-Default", use_auth_token=True)
    # repo.git_pull()

    classifier.save_pretrained(out_path)
    repo.push_to_hub()


if __name__ == "__main__":
    main()
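
# Once pushed, end users can load the pipeline back from the Hub. A hedged
# sketch (assumes the repo's custom pipeline config lets transformers infer
# the task, and that the remote code is trusted):
#
#   from transformers import pipeline
#
#   ner = pipeline(model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
#                  trust_remote_code=True)
#   tags, token_ids = ner("O Rubén mora em Aveiro.")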