import time

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from onnxruntime import InferenceSession
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

models = {
    "Base model": "bert-large-uncased-whole-word-masking-finetuned-squad",
    "Pruned model": "madlag/bert-large-uncased-wwm-squadv2-x2.63-f82.6-d16-hybrid-v1",
    "Pruned ONNX Optimized FP16": "tryolabs/bert-large-uncased-wwm-squadv2-optimized-f16",
}


def run_ort_inference(model_name, inputs):
    # Download the ONNX graph from the Hub and run it on CPU,
    # timing only the forward pass.
    model_path = hf_hub_download(repo_id=models[model_name], filename="model.onnx")
    sess = InferenceSession(model_path, providers=["CPUExecutionProvider"])
    start_time = time.time()
    output = sess.run(None, input_feed=inputs)
    end_time = time.time()
    return (output[0], output[1]), (end_time - start_time)


def run_normal_hf(model_name, inputs):
    # Load the PyTorch model before starting the timer so that, as in the
    # ONNX path, only the forward pass is timed (not the model download/load).
    model = AutoModelForQuestionAnswering.from_pretrained(models[model_name])
    start_time = time.time()
    with torch.no_grad():
        output = model(**inputs)
    end_time = time.time()
    return (output.start_logits, output.end_logits), (end_time - start_time)


def inference(model_name, context, question):
    tokenizer = AutoTokenizer.from_pretrained(models[model_name])

    if model_name == "Pruned ONNX Optimized FP16":
        # ONNX Runtime expects a dict of NumPy arrays keyed by input name.
        inputs = dict(tokenizer(question, context, return_tensors="np"))
        output, inference_time = run_ort_inference(model_name, inputs)
        answer_start_scores = torch.tensor(output[0])
        answer_end_scores = torch.tensor(output[1])
    else:
        inputs = tokenizer(question, context, return_tensors="pt")
        output, inference_time = run_normal_hf(model_name, inputs)
        answer_start_scores, answer_end_scores = output

    # Take the most likely start/end token positions and decode that span
    # back into a string.
    input_ids = inputs["input_ids"].tolist()[0]
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    return answer, f"{inference_time:.4f}s"


model_field = gr.Dropdown(
    choices=["Base model", "Pruned model", "Pruned ONNX Optimized FP16"],
    value="Pruned ONNX Optimized FP16",
    label="Model",
)
input_text_field = gr.Textbox(placeholder="Enter the text here", label="Text")
input_question_field = gr.Text(placeholder="Enter the question here", label="Question")
output_model = gr.Text(label="Model output")
output_inference_time = gr.Text(label="Inference time in seconds")

demo = gr.Interface(
    inference,
    title="Optimizing Transformers - Question Answering Demo",
    inputs=[model_field, input_text_field, input_question_field],
    outputs=[output_model, output_inference_time],
)
demo.launch()
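
# A minimal sketch of exercising inference() directly, without the Gradio UI
# (e.g. as a quick smoke test). The context and question strings below are
# illustrative placeholders, not part of the original demo; comment out
# demo.launch() above before running this, since launch() blocks.
#
# answer, elapsed = inference(
#     "Pruned ONNX Optimized FP16",
#     "The Amazon rainforest covers most of the Amazon basin of South America.",
#     "What does the Amazon rainforest cover?",
# )
# print(f"Answer: {answer} (took {elapsed})")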