File size: 6,098 Bytes
2a2c864
b8db52d
ba7c814
abf3d53
 
6128b93
a6ed751
773755f
a6ed751
 
6128b93
a6ed751
81fdd84
ba7c814
a6ed751
773755f
2a2c864
 
7dcde50
33ff5cc
 
 
7dcde50
 
 
 
33ff5cc
ba7c814
 
a6ed751
ba7c814
 
 
6571d18
 
ba7c814
a6ed751
 
 
 
 
ba7c814
6571d18
b8db52d
abf3d53
ba7c814
a6ed751
7dcde50
45cd238
6af5526
abf3d53
8cea305
a6ed751
8cea305
33ff5cc
08009f0
cc29eef
6571d18
 
 
 
a6ed751
 
ba7c814
6af5526
 
3ecf378
33ff5cc
 
3ecf378
a6ed751
6571d18
0e961d2
33ff5cc
a6ed751
 
33ff5cc
b8db52d
a6ed751
 
 
 
33ff5cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Gradio demo: extractive question answering with a sparse (80% pruned) BERT model.
import gradio as gr
from transformers import pipeline
import time
# Optional INT8 quantization path (kept for reference, not used in this demo):
# import neural_compressor
# from optimum.intel.neural_compressor import IncQuantizedModelForQuestionAnswering

# Alternative checkpoints experimented with during development:
# model_id = "Intel/bert-base-uncased-squadv1.1-sparse-80-1x4-block-pruneofa"
# model_id = "Intel/distilbert-base-uncased-squadv1.1-sparse-80-1x4-block-pruneofa-int8"
# model_id = "Intel/distilbert-base-uncased-squadv1.1-sparse-80-1X4-block"
# int8_model = IncQuantizedModelForQuestionAnswering.from_pretrained(model_id)

# Load the sparse QA pipeline once at import time (downloads the model on the
# first run); the same pipeline object is reused for every prediction.
sparse_qa_pipeline = pipeline(task="question-answering",model="Intel/bert-base-uncased-squadv1.1-sparse-80-1x4-block-pruneofa")
# sparse_qa_pipeline = pipeline(task="question-answering",model="Intel/distilbert-base-uncased-squadv1.1-sparse-80-1x4-block-pruneofa-int8")

# Dense baseline pipelines for latency comparison (currently disabled):
# dense_qa_pipeline = pipeline(task="question-answering",model="csarron/bert-base-uncased-squad-v1")
# dense_qa_pipeline = pipeline(task="question-answering",model="distilbert-base-uncased-distilled-squad")

def predict(context, question):
    '''
    Answer `question` using only the text in `context` via the sparse QA pipeline.

    Parameters
    ----------
    context : str
        The passage of text to draw the answer from.
    question : str
        The question to answer about the passage.

    Returns
    -------
    tuple
        (answer, score, start, latency_ms), ordered to match the
        Interface's `outputs` list below:
        - answer: the answer span extracted from the context
        - score: probability confidence score of the prediction
        - start: starting character index of the answer within the context
        - latency_ms: wall-clock inference time in milliseconds

    A raw pipeline prediction is a dictionary of the form:
    {'score': 0.9376363158226013, 'start': 10, 'end': 15, 'answer': 'seven'}
    '''
    sparse_start_time = time.perf_counter()
    sparse_predictions = sparse_qa_pipeline(context=context, question=question)
    sparse_end_time = time.perf_counter()
    # Convert seconds to milliseconds for display.
    sparse_duration = (sparse_end_time - sparse_start_time) * 1000

    # BUG FIX: the return order must match the Interface's outputs list
    # [sparse_answer, sparse_score, sparse_start, sparse_duration]. The
    # original returned (answer, duration, score, start), so latency was
    # displayed in the "Probability score" box, score in "Starting
    # character", and start in "Latency (ms)".
    return (
        sparse_predictions['answer'],
        sparse_predictions['score'],
        sparse_predictions['start'],
        sparse_duration,
    )

# App description rendered as Markdown above the interface.
# FIX: the second citation of the Prune Once for All paper read
# "Zafrir et al. (2016)" although the paper is correctly cited as (2021)
# (arXiv:2111.05754) two sentences earlier; the year is now consistent.
md = """This prediction model is designed to answer a question about a given input text--reading comprehension. The model does not just answer questions in general -- it only works from the text that you provide. However, automated reading comprehension can be a valuable task.

The model is based on the Zafrir et al. (2021) paper: [Prune Once for All: Sparse Pre-Trained Language Models](https://arxiv.org/abs/2111.05754). The model can be found [here](https://huggingface.co/Intel/bert-base-uncased-squadv1.1-sparse-80-1x4-block-pruneofa). It has had weight pruning and model distillation applied to create a sparse weight pattern that is maintained even after fine-tuning has been applied. According to Zafrir et al. (2021), their "results show the best compression-to-accuracy ratio for BERT-Base". This model is still in FP32, but can be quantized to INT8 with the [Intel® Neural Compressor](https://github.com/intel/neural-compressor) for further compression. 

The training dataset used is the English Wikipedia dataset (2500M words), and then fine-tuned on the SQuADv1.1 dataset containing 89K training examples, compiled by Rajpurkar et al. (2016): [100, 000+ Questions for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250).

Author of Hugging Face Space: Benjamin Consolvo, AI Solutions Engineer Manager at Intel | Date last updated: 03/28/2023
"""
# The main idea of this BERT-Base model is that it is much faster and more efficient in deployment than its dense counterpart: (https://huggingface.co/csarron/bert-base-uncased-squad-v1).

# predict()
# Gradio input components: a multi-line context box and a single-line question box.
context=gr.Text(lines=10,label="Context")
question=gr.Text(label="Question")
# Gradio output components for the sparse model's prediction fields.
sparse_answer=gr.Text(label="Answer")
sparse_duration=gr.Text(label="Latency (ms)")
sparse_score=gr.Text(label="Probability score")
sparse_start=gr.Text(label="Starting character")
# Dense-model output boxes (disabled along with the dense pipeline above):
# dense_answer=gr.Text(label="Dense Answer")
# dense_duration=gr.Text(label="Dense latency (ms)")

# Pre-filled example shown in the UI's examples section.
apple_context = "An apple is an edible fruit produced by an apple tree (Malus domestica). Apple trees are cultivated worldwide and are the most widely grown species in the genus Malus. The tree originated in Central Asia, where its wild ancestor, Malus sieversii, is still found today. Apples have been grown for thousands of years in Asia and Europe and were brought to North America by European colonists. Apples have religious and mythological significance in many cultures, including Norse, Greek, and European Christian tradition. Apples grown from seed tend to be very different from those of their parents, and the resultant fruit frequently lacks desired characteristics. Generally, apple cultivars are propagated by clonal grafting onto rootstocks. Apple trees grown without rootstocks tend to be larger and much slower to fruit after planting. Rootstocks are used to control the speed of growth and the size of the resulting tree, allowing for easier harvesting."
apple_question = "How many years have apples been grown for?"

# Legal notice text required on Intel demo pages.
legal = "Performance varies by use, configuration and other factors. Learn more at www.Intel.com/PerformanceIndex. Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates. See backup for configuration details. No product or component can be absolutely secure. © Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others."

iface = gr.Interface(
    fn=predict,
    inputs=[context, question],
    outputs=[sparse_answer, sparse_score, sparse_start, sparse_duration],
    examples=[[apple_context, apple_question]],
    title="Question & Answer with Sparse BERT using the SQuAD dataset",
    description=md,
    # BUG FIX: the original called gr.Markdown(...) at module level, outside
    # any Blocks/Interface context, so the notices were never rendered.
    # `article` renders Markdown below the interface, making the legal
    # disclaimer actually visible.
    article="**Notices and Disclaimers:**\n\n" + legal,
)

iface.launch()