Spaces:
Running
Running
File size: 7,714 Bytes
df81629 02758ba df81629 02758ba df81629 02758ba df81629 02758ba df81629 02758ba df81629 02758ba df81629 02758ba df81629 0f2eb8c df81629 2720879 969d94d 2720879 969d94d 2720879 525d89c 2720879 176f061 2720879 7f9c127 2720879 df81629 0537017 df81629 31aac55 e7a0f80 31aac55 df81629 66db8e5 87b9111 2720879 176f061 0d7455a 2720879 d492135 8a2ba57 e7a0f80 2720879 8e41965 2720879 8e41965 2720879 8e41965 2720879 87b9111 3743aa7 66db8e5 87b9111 2720879 df81629 7ecd7c4 df81629 66db8e5 2720879 df81629 87b9111 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import os
import transformers
from transformers import pipeline
from transformers.pipelines.token_classification import TokenClassificationPipeline
import py_vncorenlp
# --- VnCoreNLP word-segmenter setup (runs once at import time) ---
# Directory that receives the downloaded VnCoreNLP files and is then read
# back by the segmenter below.
VNCORENLP_DIR = '/home/user/app/vncorenlp'

# Diagnostic shell calls: print the working directory and the Java
# alternatives known to the system.
os.system('pwd')
os.system('sudo update-alternatives --config java')

# `os.mkdir` raises FileExistsError when the directory is already there
# (e.g. when the app process is restarted); `makedirs(..., exist_ok=True)`
# makes this step safe to repeat.
os.makedirs(VNCORENLP_DIR, exist_ok=True)
py_vncorenlp.download_model(save_dir=VNCORENLP_DIR)

# Only the word-segmentation ("wseg") annotator is loaded.
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VNCORENLP_DIR)
# I have to make some changes to the preprocess() method since they (Hugging Face) had changed some attributes
class MyPipeline(TokenClassificationPipeline):
    """Token-classification pipeline with a hand-built offset mapping.

    ``preprocess`` is overridden so that every yielded model input carries an
    ``offset_mapping`` computed from the tokenizer's string tokens, instead of
    relying on offsets returned by the tokenizer itself.
    """

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        """Tokenize ``sentence`` and yield one model-input dict per chunk.

        Args:
            sentence: Text to tag. The offset computation assumes that tokens
                not ending in ``'@@'`` are followed by exactly one character
                in ``sentence`` (presumably a single space -- confirm with the
                caller).
            offset_mapping: Accepted for signature compatibility; it is not
                read by this override.
            **preprocess_params: May contain ``tokenizer_params``, a dict of
                extra keyword arguments forwarded to the tokenizer call.

        Yields:
            Dicts holding the tokenizer outputs for one chunk (with a leading
            batch axis added) plus ``offset_mapping``, ``sentence`` (only on
            the first chunk, ``None`` afterwards) and ``is_last``.
        """
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        max_length = self.tokenizer.model_max_length
        truncation = bool(max_length and max_length > 0)

        inputs = self.tokenizer(
            sentence,
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
            **tokenizer_params,
        )
        inputs.pop("overflow_to_sample_mapping", None)
        num_chunks = len(inputs["input_ids"])

        # Build the character offsets by walking the string tokens.
        # The "- 2" presumably discounts two special tokens around the
        # sequence; they are represented by the (0, 0) entries at both ends.
        content_length = len(inputs['input_ids'][0]) - 2
        pieces = self.tokenizer.tokenize(sentence)
        spans = [(0, 0)]
        cursor = 0
        for index in range(content_length):
            piece = pieces[index]
            if piece.endswith('@@'):
                # The '@@' suffix is not part of the text, and the next piece
                # continues immediately (no separator is skipped).
                width = len(piece) - 2
                separator = 0
            else:
                # A complete piece: skip one separator character after it.
                width = len(piece)
                separator = 1
            spans.append((cursor, cursor + width))
            cursor += width + separator
        spans.append((0, 0))
        # Same nesting as the original: a single-element list of span lists,
        # shared by every yielded chunk.
        offset_mapping_list = [spans]

        last_chunk = num_chunks - 1
        for chunk_index in range(num_chunks):
            if self.framework == "tf":
                # NOTE(review): `tf` is not imported in the visible part of
                # this file, so this branch looks like it would raise
                # NameError under TensorFlow -- confirm.
                model_inputs = {
                    key: tf.expand_dims(value[chunk_index], 0) for key, value in inputs.items()
                }
            else:
                model_inputs = {
                    key: value[chunk_index].unsqueeze(0) for key, value in inputs.items()
                }

            model_inputs['offset_mapping'] = offset_mapping_list
            model_inputs["sentence"] = sentence if chunk_index == 0 else None
            model_inputs["is_last"] = chunk_index == last_chunk

            yield model_inputs
# Checkpoint identifier of the disfluency tagger passed to `pipeline`.
model_checkpoint = "DD0101/disfluency-large"

# Token-classification pipeline that uses MyPipeline for preprocessing and
# the "simple" aggregation strategy for grouping token predictions.
my_classifier = pipeline(
    "token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
    pipeline_class=MyPipeline,
)
#################### IDSF #######################
from Customized_IDSF.utils import get_intent_labels, get_slot_labels, load_tokenizer
import argparse
import Customized_IDSF.load_model as lm
# Prediction options for the intent-detection / slot-filling (IDSF) model.
# They are read from the process command line; with no arguments the
# defaults below apply.
parser = argparse.ArgumentParser()
# parser.add_argument("--input_file", default="sample_pred_in.txt", type=str, help="Input file for prediction")
# parser.add_argument("--output_file", default="sample_pred_out.txt", type=str, help="Output file for prediction")
parser.add_argument(
    "--model_dir",
    default="/home/user/app/Customized_IDSF/JointBERT-CRF_PhoBERTencoder",
    type=str,
    help="Path to save, load model",
)
parser.add_argument(
    "--batch_size",
    default=32,
    type=int,
    help="Batch size for prediction",
)
parser.add_argument(
    "--no_cuda",
    action="store_true",
    help="Avoid using CUDA when available",
)
pred_config = parser.parse_args()

# Load the training arguments, pick the device and load the model.
args = lm.get_args(pred_config)
device = lm.get_device(pred_config)
model = lm.load_model(pred_config, args, device)

# Label vocabularies used to decode the model's predictions.
intent_label_lst = get_intent_labels(args)
slot_label_lst = get_slot_labels(args)

# Objects handed to `lm.predict` on every request.
pad_token_label_id = args.ignore_index
tokenizer = load_tokenizer(args)
#################### END IDSF #######################
def remove_disfluency(example, prediction):
    """Return ``example`` with every predicted span cut out.

    The spans are sorted and merged before cutting, so the result does not
    depend on the order of ``prediction`` and overlapping spans are removed
    exactly once.

    Args:
        example: The sentence the character offsets refer to.
        prediction: Iterable of dicts, each with integer ``'start'`` and
            ``'end'`` character offsets (end exclusive) into ``example``.

    Returns:
        The remaining text with runs of whitespace collapsed to single
        spaces and leading/trailing whitespace removed.
    """
    spans = sorted((entity['start'], entity['end']) for entity in prediction)

    kept_parts = []
    cursor = 0  # first character not yet kept or removed
    for start, end in spans:
        if start > cursor:
            kept_parts.append(example[cursor:start])
            cursor = start
        if end > cursor:
            cursor = end
    kept_parts.append(example[cursor:])

    # split()/join() normalises the gaps left behind by the removed spans.
    return " ".join("".join(kept_parts).split())
import gradio as gr
def _merge_slot_tokens(words, slot_preds):
    """Turn per-word BIO slot tags into ``(text, label)`` pairs.

    Every word contributes one ``(text, label)`` tuple followed by a
    ``(" ", None)`` separator. A word tagged ``I-XXX`` is appended to the
    preceding tuple when that tuple carries the label ``XXX``; otherwise
    (no preceding tuple, or a different label) it starts a new ``XXX`` span
    instead of raising or being folded into an unrelated word.
    """
    slot_tokens = []
    for word, pred in zip(words, slot_preds):
        word = word.replace("_", " ")
        if pred == 'O':
            slot_tokens.extend([(word, None), (" ", None)])
        elif pred.startswith('I-'):
            label = pred[2:]
            # slot_tokens[-1] is always a separator, so the previous word
            # (if any) lives at index -2.
            if len(slot_tokens) >= 2 and slot_tokens[-2][1] == label:
                previous_text = slot_tokens[-2][0]
                slot_tokens[-2] = (f'{previous_text} {word}', label)
            else:
                slot_tokens.extend([(word, label), (" ", None)])
        elif pred.startswith('B-'):
            # Drop the "B-" prefix so the highlight shows the bare slot name.
            slot_tokens.extend([(word, pred[2:]), (" ", None)])
        else:  # PAD or UNK tags
            slot_tokens.extend([(word, pred), (" ", None)])
    return slot_tokens


def ner(text):
    """Detect disfluencies in ``text``, then predict intent and slots.

    Args:
        text: The sentence entered by the user.

    Returns:
        A 4-tuple in the order of the Gradio outputs:
        ``{'text': ..., 'entities': ...}`` for the disfluency highlight,
        the sentence with the detected spans removed, a list of
        ``(text, label)`` pairs for the slot highlight, and the intent label.
        When nothing is left after removing the detected spans, the last
        three values are ``""``, ``[]`` and ``""``.
    """
    text = " ".join(rdrsegmenter.word_segment(text))
    # Some words in lowercase like "đà nẵng" come out wrongly joined by the
    # segmenter: move the underscore so "đà" attaches to the following word.
    text = text.replace("_đà ", " đà_").replace("_Đà ", " Đà_")

    output = my_classifier(text)
    for entity in output:
        entity['entity'] = entity.pop('entity_group')

    # Remove the detected spans to obtain the "fluent" version of the sentence.
    fluency_sentence = remove_disfluency(text, output)

    # "_" -> " " keeps the string length, so the entity offsets stay valid.
    highlighted = {'text': text.replace("_", " "), 'entities': output}

    if not fluency_sentence:
        # Empty input, or every word was tagged as disfluent: there is
        # nothing to classify, and indexing the first word/character below
        # would raise IndexError.
        return highlighted, "", [], ""

    #################### IDSF #######################
    prediction = lm.predict([fluency_sentence.split()], pred_config, args, tokenizer, pad_token_label_id, model, device,
                            intent_label_lst, slot_label_lst)
    words, slot_preds, intent_pred = prediction[0][0], prediction[1][0], prediction[2][0]

    if words:
        # Capitalize the first word of the sentence.
        words[0] = words[0][:1].upper() + words[0][1:]
    slot_tokens = _merge_slot_tokens(words, slot_preds)

    intent_label = intent_label_lst[intent_pred]
    #################### END IDSF #######################

    # str.capitalize() would lowercase the rest of the sentence, so only the
    # first character is upper-cased by hand.
    fluency_sentence = fluency_sentence[0].upper() + fluency_sentence[1:]
    # Replace words like "Đà_Nẵng" with "Đà Nẵng".
    fluency_sentence = fluency_sentence.replace("_", " ")

    return highlighted, fluency_sentence, slot_tokens, intent_label
################################### Gradio Demo ####################################
# Sample utterances offered as one-click inputs in the demo.
examples = [
    'Tôi cần thuê à tôi muốn bay một chuyến khứ hồi từ Đà Nẵng đến Đà Lạt',
    (
        'Giá vé một chiều à không khứ hồi từ Đà Nẵng đến Vinh dưới 2 triệu đồng '
        'giá vé khứ hồi từ Quy Nhơn đến Vinh dưới 3 triệu đồng '
        'giá vé khứ hồi từ Buôn Ma Thuột đến Quy Nhơn à đến Vinh dưới 4 triệu rưỡi'
    ),
    'Cho tôi biết các chuyến bay đến Đà Nẵng vào ngày 12 mà không ngày 14 tháng sáu',
    (
        'Những chuyến bay nào khởi hành từ Thành phố Hồ Chí Minh bay đến Frankfurt '
        'mà nối chuyến ở Singapore và hạ cánh trước 10 giờ ý tôi là 9 giờ tối'
    ),
    (
        'Thành Phố nào có VNA ừm thôi cho tôi xem tất cả các chuyến bay từ Thanh Hóa '
        'hay Nghệ An nhỉ à Thanh Hóa đến Đà Lạt vào Thứ ba à thôi tôi cần vào Thứ hai'
    ),
]
# Gradio UI: one sentence in; four outputs in the order returned by `ner`
# (disfluency highlight, fluent sentence, slot highlight, intent label).
demo = gr.Interface(
    ner,
    gr.Textbox(label='Sentence', placeholder="Enter your sentence here..."),
    outputs=[
        gr.HighlightedText(label='Disfluency Highlighted'),
        gr.Textbox(label='"Fluency" version'),
        gr.HighlightedText(label='Slot Filling Highlighted'),
        gr.Textbox(label='Intent Label'),
    ],
    examples=examples,
    title="Disfluency Detection",
    # Implicit concatenation instead of a backslash continuation inside the
    # literal, which would embed the next line's leading whitespace in the
    # displayed text.
    description=(
        "This is an easy-to-use built in Gradio for demonstrating a NER System that identifies "
        "disfluency-entities in Vietnamese utterances"
    ),
    theme=gr.themes.Soft(),
)
demo.launch()