Spaces:
Runtime error
Runtime error
feat(models): update models and deploy app.py
Browse files- .gitignore +1 -0
- README.md +2 -2
- app.py +22 -0
- models/bert/__init__.py +1 -0
- models/bert/configuration_bert.py +51 -0
- models/bert/model_bert.py +41 -0
- models/crf/__init__.py +1 -0
- models/crf/model_crf.py +166 -0
- pipelines/__init__.py +1 -0
- pipelines/ner_pipeline.py +114 -0
- register.py +8 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
**/__pycache__
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: π
|
4 |
colorFrom: indigo
|
5 |
colorTo: red
|
@@ -7,7 +7,7 @@ sdk: gradio
|
|
7 |
sdk_version: 3.36.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
license:
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Resume Basic
|
3 |
emoji: π
|
4 |
colorFrom: indigo
|
5 |
colorTo: red
|
|
|
7 |
sdk_version: 3.36.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import BertTokenizer,AutoModel
|
2 |
+
from transformers.pipelines import pipeline
|
3 |
+
from register import register
|
4 |
+
import gradio as gr
|
5 |
+
from huggingface_hub import login
|
6 |
+
import os
|
7 |
+
# Make the custom "ner_predictor" pipeline and the "bert_crf" config/model
# known to transformers before anything is loaded.
register()

# Authenticate against the Hub with the token stored in the HF_Token
# environment variable (raises KeyError when it is not set).
login(os.environ["HF_Token"])

# Both the tokenizer and the model weights come from the same Hub repository.
MODEL_REPO = "minskiter/resume_token_classification"

tokenizer = BertTokenizer.from_pretrained(MODEL_REPO, use_auth_token=True)
model = AutoModel.from_pretrained(MODEL_REPO, use_auth_token=True)
ner_predictor = pipeline(
    "ner_predictor",
    model=model,
    tokenizer=tokenizer,
    device="cpu",
)


def ner_predictor_gradio(input):
    """Run the NER pipeline on the text submitted through the Gradio UI.

    The pipeline result is returned unchanged; Gradio renders it through the
    "text" output component.
    """
    return ner_predictor(input)


demo = gr.Interface(fn=ner_predictor_gradio, inputs="text", outputs="text")
demo.launch()
|
models/bert/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .model_bert import BertCrfModel,BertCrfConfig
|
models/bert/configuration_bert.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PretrainedConfig
|
2 |
+
|
3 |
+
class BertCrfConfig(PretrainedConfig):
    """Configuration for the BERT + BiLSTM + CRF tagging model.

    Besides the BERT hyper-parameters stored here, this config carries three
    tagging-specific settings:

    * ``lstm_hidden_state`` -- hidden size of the LSTM placed on top of BERT.
    * ``num_tags`` -- number of output tags (START/STOP are not counted).
    * ``tag2id`` / ``id2tag`` -- tag-name <-> tag-id mappings. ``id2tag`` is
      keyed by the *string* form of the id.

    When ``tag2id`` or ``id2tag`` is omitted (``None``), a fresh default
    two-tag mapping (``"O"``/``"I"``) is created for this instance, so
    instances never share a mutable default dict.
    """

    model_type = "bert_crf"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        lstm_hidden_state=300,
        num_tags=2,
        tag2id=None,
        id2tag=None,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
        self.lstm_hidden_state = lstm_hidden_state
        self.num_tags = num_tags
        # Build the defaults per instance: a dict literal in the signature
        # would be one shared object mutated by every config that uses it.
        self.tag2id = {"O": 0, "I": 1} if tag2id is None else tag2id
        self.id2tag = {"0": "O", "1": "I"} if id2tag is None else id2tag
|
51 |
+
|
models/bert/model_bert.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PreTrainedModel,BertModel
|
2 |
+
from torch import nn
|
3 |
+
from transformers.configuration_utils import PretrainedConfig
|
4 |
+
from ..crf import CRF
|
5 |
+
from .configuration_bert import BertCrfConfig
|
6 |
+
|
7 |
+
class BertCrfModel(PreTrainedModel):
    """BERT encoder followed by a bidirectional LSTM, a linear layer and a CRF.

    Args:
        config (BertCrfConfig): model configuration.
        num_tags (int, optional): when given, overrides ``config.num_tags``
            before the tagging layers are built.

    ``forward`` returns ``(loss, (best_path, labels))``:
        loss: (torch.Tensor) CRF loss for the batch, or ``None`` when no
            labels were passed.
        best_path: decoded tag ids per sequence, with the first and last
            decoded position of each path dropped.
        labels: the input labels moved to CPU, or ``None``.
    """
    config_class = BertCrfConfig

    def __init__(self, config, num_tags = None):
        super().__init__(config)
        # An explicit num_tags wins over the value stored in the config.
        if num_tags is not None:
            config.num_tags = num_tags
        tag_count = config.num_tags
        lstm_width = config.lstm_hidden_state

        self.bert = BertModel(config=config, add_pooling_layer=False)
        self.lstm = nn.LSTM(
            config.hidden_size,
            lstm_width,
            1,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(tag_count)
        # The LSTM is bidirectional, hence twice the hidden width going in.
        self.fc = nn.Linear(lstm_width * 2, tag_count)

    def forward(self, input_ids, attention_mask, token_type_ids, input_mask, labels=None):
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        token_states = encoder_out[0]
        lstm_out, _ = self.lstm(token_states)
        emission_scores = self.fc(lstm_out)

        # The CRF treats positions where input_mask == 0 as the active ones.
        active = input_mask == 0

        loss = None
        if labels is not None:
            loss = self.crf.loss(emission_scores, labels, active)

        _, decoded = self.crf(emission_scores, active)
        # Drop the first and last decoded tag of every path
        # (presumably the [CLS]/[SEP] positions -- confirm against the tokenizer).
        trimmed = [path[1:-1] for path in decoded]
        gold = None if labels is None else labels.cpu()
        return loss, (trimmed, gold)
|
models/crf/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .model_crf import CRF
|
models/crf/model_crf.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
|
5 |
+
def log_sum_exp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Uses the identity log(sum(exp(x))) = max(x) + log(sum(exp(x - max(x)))),
    so the largest exponent evaluated is exp(0).
    """
    peak, _ = x.max(-1)
    shifted = x - peak.unsqueeze(-1)
    return peak + shifted.exp().sum(-1).log()
|
10 |
+
|
11 |
+
|
12 |
+
# Score used for transitions/emissions that must never be selected.
IMPOSSIBLE = -1e4


class CRF(nn.Module):
    """Linear-chain CRF over precomputed per-tag emission scores.

    The module owns only a transition matrix; callers pass emission scores
    that are already in tag space. Two internal tags, START and STOP, are
    appended after the user tags.

    :param num_tags: number of tags. DO NOT include START, STOP tags, they are added internally.
    """

    def __init__(self, num_tags):
        super(CRF, self).__init__()

        # User tags occupy [0, num_tags); START and STOP take the last two slots.
        self.num_tags = num_tags + 2
        self.start_idx = self.num_tags - 2
        self.stop_idx = self.num_tags - 1

        # transition factor, Tij mean transition from j to i
        self.transitions = nn.Parameter(torch.randn(self.num_tags, self.num_tags), requires_grad=True)
        # Nothing may move into START, and nothing may leave STOP.
        self.transitions.data[self.start_idx, :] = IMPOSSIBLE
        self.transitions.data[:, self.stop_idx] = IMPOSSIBLE

    def __get_emission_score(self, features):
        """Append IMPOSSIBLE emission columns for START and STOP.

        :param features: [B, L, C] emission scores over the user tags
        :return: [B, L, C + 2] scores
        """
        b, seq, _ = features.size()
        # new_full allocates directly with the dtype and device of `features`,
        # avoiding a CPU allocation plus copy and a dtype mismatch in the cat.
        boundary_scores = features.new_full((b, seq, 2), IMPOSSIBLE)
        return torch.cat([features, boundary_scores], dim=-1)

    def forward(self, features, masks):
        """decode tags
        :param features: [B, L, C], batch of unary scores
        :param masks: [B, L] masks, non-zero for positions to decode
        :return: (best_score, best_paths)
            best_score: [B]
            best_paths: list of B tag-id lists, one entry per unmasked position
        """
        features = self.__get_emission_score(features)  # [B,L,C] => [B,L,T]
        return self.__viterbi_decode(features, masks[:, :features.size(1)].float())

    def loss(self, features, ys, masks):
        """negative log likelihood loss
        B: batch size, L: sequence length, D: dimension
        :param features: [B, L, D]
        :param ys: tags, [B, L]
        :param masks: masks for padding, [B, L]
        :return: loss, averaged over the batch
        """
        features = self.__get_emission_score(features)  # [B,L,C] => [B,L,T]

        L = features.size(1)
        masks_ = masks[:, :L].float()
        forward_score = self.__forward_algorithm(features, masks_)
        # Work on a copy so the caller's tensor is untouched; negative tag ids
        # are replaced by tag 0 so they can be used as gather indices.
        ys = ys.clone().detach()
        ys[ys < 0] = 0
        gold_score = self.__score_sentence(features, ys[:, :L].long(), masks_)
        loss = (forward_score - gold_score).mean()
        return loss

    def __score_sentence(self, features, tags, masks):
        """Gives the score of a provided tag sequence
        :param features: [B, L, C]
        :param tags: [B, L]
        :param masks: [B, L]
        :return: [B] score in the log space
        """
        B, L, C = features.shape

        # emission score
        emit_scores = features.gather(dim=2, index=tags.unsqueeze(-1)).squeeze(-1)

        # transition score
        start_tag = torch.full((B, 1), self.start_idx, dtype=torch.long, device=tags.device)
        tags = torch.cat([start_tag, tags], dim=1)  # [B, L+1]
        trans_scores = self.transitions[tags[:, 1:], tags[:, :-1]]

        # last transition score to STOP tag; because START was prepended,
        # index `sum(mask)` is the last unmasked tag of each sequence
        last_tag = tags.gather(dim=1, index=masks.sum(1).long().unsqueeze(1)).squeeze(1)  # [B]
        last_score = self.transitions[self.stop_idx, last_tag]

        score = ((trans_scores + emit_scores) * masks).sum(1) + last_score
        return score

    def __viterbi_decode(self, features, masks):
        """decode to tags using viterbi algorithm
        :param features: [B, L, C], batch of unary scores
        :param masks: [B, L] masks
        :return: (best_score, best_paths)
            best_score: [B]
            best_paths: list of B tag-id lists
        """
        B, L, C = features.shape

        bps = torch.zeros(B, L, C, dtype=torch.long, device=features.device)  # back pointers

        # Initialize the viterbi variables in log space
        max_score = torch.full((B, C), IMPOSSIBLE, device=features.device)  # [B, C]
        max_score[:, self.start_idx] = 0

        for t in range(L):
            mask_t = masks[:, t].unsqueeze(1)  # [B, 1]
            emit_score_t = features[:, t]  # [B, C]

            # [B, 1, C] + [C, C]
            acc_score_t = max_score.unsqueeze(1) + self.transitions  # [B, C, C]
            acc_score_t, bps[:, t, :] = acc_score_t.max(dim=-1)
            acc_score_t += emit_score_t
            # masked positions keep the previous score
            max_score = acc_score_t * mask_t + max_score * (1 - mask_t)  # max_score or acc_score_t

        # Transition to STOP_TAG
        max_score += self.transitions[self.stop_idx]
        best_score, best_tag = max_score.max(dim=-1)

        # Follow the back pointers to decode the best path.
        best_paths = []
        bps = bps.cpu().numpy()
        for b in range(B):
            best_tag_b = best_tag[b].item()
            # the first `seq_len` steps are walked back, i.e. the unmasked
            # positions are taken to be a prefix of the sequence
            seq_len = int(masks[b, :].sum().item())

            best_path = [best_tag_b]
            for bps_t in reversed(bps[b, :seq_len]):
                best_tag_b = bps_t[best_tag_b]
                best_path.append(best_tag_b)
            # drop the last tag and reverse the left
            best_paths.append(best_path[-2::-1])

        return best_score, best_paths

    def __forward_algorithm(self, features, masks):
        """calculate the partition function with forward algorithm.
        TRICK: log_sum_exp([x1, x2, x3, x4, ...]) = log_sum_exp([log_sum_exp([x1, x2]), log_sum_exp([x3, x4]), ...])
        :param features: features. [B, L, C]
        :param masks: [B, L] masks
        :return: [B], score in the log space
        """
        B, L, C = features.shape

        scores = torch.full((B, C), IMPOSSIBLE, device=features.device)  # [B, C]
        scores[:, self.start_idx] = 0.
        trans = self.transitions.unsqueeze(0)  # [1, C, C]

        # Iterate through the sentence
        for t in range(L):
            emit_score_t = features[:, t].unsqueeze(2)  # [B, C, 1]
            score_t = scores.unsqueeze(1) + trans + emit_score_t  # [B, 1, C] + [1, C, C] + [B, C, 1] => [B, C, C]
            score_t = log_sum_exp(score_t)  # [B, C]

            mask_t = masks[:, t].unsqueeze(1)  # [B, 1]
            # masked positions keep the previous scores
            scores = score_t * mask_t + scores * (1 - mask_t)
        scores = log_sum_exp(scores + self.transitions[self.stop_idx])
        return scores
|
165 |
+
|
166 |
+
|
pipelines/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .ner_pipeline import NERPredictorPipe
|
pipelines/ner_pipeline.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Pipeline
|
2 |
+
from typing import Dict, Any, Union
|
3 |
+
from transformers.pipelines.base import GenericTensor
|
4 |
+
from transformers.modeling_outputs import ModelOutput
|
5 |
+
import torch
|
6 |
+
|
7 |
+
class NERPredictorPipe(Pipeline):
    """Pipeline that tokenizes text, runs the tagging model and groups the
    predicted tags into entity slices.

    Each input yields a list of dicts with the keys ``text``, ``start``,
    ``end`` and ``label``. ``start``/``end`` are inclusive positions in the
    predicted label sequence.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split call kwargs into (preprocess, forward, postprocess) dicts.

        Only ``max_length`` is recognised; it is forwarded to ``preprocess``.
        All other keywords are ignored.
        """
        preprocess_kwargs = {}
        if "max_length" in kwargs:
            preprocess_kwargs["max_length"] = kwargs["max_length"]
        return preprocess_kwargs, {}, {}

    def __token_preprocess(self, input, tokenizer, max_length=512):
        """Tokenize ``input``, padded and truncated to ``max_length``, as PyTorch tensors."""
        tokenized = tokenizer(
            input,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        return tokenized

    def preprocess(self, sentence: Union[str,list], max_length=512) -> Dict[str, GenericTensor]:
        """Tokenize ``sentence`` and add the ``input_mask`` tensor the model expects.

        ``input_mask`` is 1 where ``input_ids`` is not positive and 0 elsewhere.
        """
        input_tensors = self.__token_preprocess(
            sentence,
            self.tokenizer,
            max_length=max_length
        )
        input_tensors["input_mask"] = (~(input_tensors["input_ids"]>0)).long()
        for key in input_tensors:
            if input_tensors[key] is not None:
                input_tensors[key] = input_tensors[key].to(self.device)
        return input_tensors

    def _forward(self, input_tensors: Dict[str, GenericTensor]) -> ModelOutput:
        """Run the model without gradients; return ``(input_ids as lists, decoded paths)``."""
        self.model.eval()
        with torch.no_grad():
            _,(best_path,_) = self.model(**input_tensors)
        return (input_tensors["input_ids"].tolist(),best_path)

    def __format_output(self, start, end, text, label):
        """Build the dict describing one entity slice."""
        return {
            "text": text,
            "start": start,
            "end": end,
            "label": label
        }

    def __build_slice(self, input_ids, start, end, label):
        """Format label positions ``start..end`` (inclusive) as one entity.

        Token ids are read with an offset of one relative to the label
        positions, i.e. label position ``i`` corresponds to ``input_ids[i + 1]``.
        """
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[start+1:end+2])
        return self.__format_output(start, end, ''.join(tokens), label)

    def postprocess(self, model_outputs: ModelOutput) -> Any:
        """Group predicted tags into entity slices.

        Tags are read as ``<prefix>-<type>``: ``B-`` opens an entity,
        ``I-``/``M-``/``E-`` extend it (or open a new one when the type
        changes), and ``S-`` is a single-token entity. Any other tag leaves
        the current state untouched.

        :param model_outputs: ``(input_ids_list, label_ids_list)`` from ``_forward``
        :return: one list of entity dicts per input sequence
        """
        batch_slices = []
        input_ids_list = model_outputs[0]
        label_ids_list = model_outputs[1]
        id2tag = self.model.config.id2tag
        for input_ids,label_ids in zip(input_ids_list,label_ids_list):
            slices = []
            # id2tag is keyed by the string form of the tag id
            labels = [id2tag[str(tag_id)] for tag_id in label_ids]
            # state of the entity currently being built
            past = "O"
            start = -1
            end = -1
            for i,label in enumerate(labels):
                # entity type = everything after the first "-"
                cur = "-".join(label.split("-")[1:])
                if label.startswith("B-"):
                    if start!=-1 and end!=-1:
                        slices.append(self.__build_slice(input_ids, start, end, past))
                    start = i
                    end = i
                    past = cur
                elif label.startswith(("I-", "M-", "E-")):
                    if cur!=past:
                        # cut and skip to next entity
                        if start!=-1 and end!=-1:
                            slices.append(self.__build_slice(input_ids, start, end, past))
                        start = i
                        past = cur
                    end = i
                elif label.startswith("S-"):
                    if start!=-1 and end!=-1:
                        slices.append(self.__build_slice(input_ids, start, end, past))
                    # A single-token entity carries its own type. Previously it
                    # was emitted with `past`, i.e. the type of the preceding
                    # entity or "O".
                    slices.append(self.__build_slice(input_ids, i, i, cur))
                    start = -1
                    end = -1
                    past = "O"
                # NOTE(review): tags without a recognised prefix (e.g. "O") do
                # not close an open entity, so a later continuation tag of the
                # same type extends it across the gap -- confirm this is intended.
            if start!=-1 and end!=-1:
                slices.append(self.__build_slice(input_ids, start, end, past))
            batch_slices.append(slices)
        return batch_slices
|
register.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers.pipelines import PIPELINE_REGISTRY,AutoModel,AutoConfig
|
2 |
+
from models.bert import BertCrfModel,BertCrfConfig
|
3 |
+
from pipelines import NERPredictorPipe
|
4 |
+
|
5 |
+
def register():
    """Register this project's custom components with transformers.

    Adds the "ner_predictor" pipeline task, the "bert_crf" model type for
    AutoConfig, and the BertCrfConfig -> BertCrfModel mapping for AutoModel.
    """
    PIPELINE_REGISTRY.register_pipeline(
        "ner_predictor",
        pipeline_class=NERPredictorPipe,
    )
    AutoConfig.register("bert_crf", BertCrfConfig)
    AutoModel.register(BertCrfConfig, BertCrfModel)
|