# NOTE: Web-scrape page chrome (Hugging Face Spaces status badges, file size,
# git blame hashes, and a column of line numbers) was removed from here — it
# was never part of the program source.
import os
import gradio as gr
import sys
import copy
import spaces
from datasets import load_dataset
from string import Template
from tigerscore import TIGERScorer
DESCRIPTIONS = """
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
[**Website**](https://tiger-ai-lab.github.io/TIGERScore/) |
[**Paper**](https://arxiv.org/abs/2310.00752) |
[**Code**](https://github.com/TIGER-AI-Lab/TIGERScore) |
[**TIGERScore-7B**](https://huggingface.co/TIGER-Lab/TIGERScore-7B) |
[**TIGERScore-13B**](https://huggingface.co/TIGER-Lab/TIGERScore-13B)
"""
# Stream the MetricInstruct training split (streaming=True avoids downloading
# the full dataset) and take 100 shuffled rows to seed the gr.Examples widget.
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train", streaming=True)
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
# Each example row is [instruction, input_context, hypo_output], matching the
# order of the three input textboxes wired up below.
fields = ["instruction", "input_context", "hypo_output"]
print("Loading examples...")
EXAMPLES = [[ex[field] for field in fields] for ex in SHUFFLED_EXAMPLES_DATASET.take(100)]
TEMPLATE = """You are evaluating errors in a model-generated output for a given instruction.
Instruction:
${generation_instruction}
${input_context}
Model-generated Output:
${hypothesis_output}
For each error you give in the response, please also elaborate the following information:
- error location (the words that are wrong in the output)
- error aspect it belongs to.
- explanation why it's an error, and the correction suggestions.
- severity of the error ("Major" or "Minor").
- reduction of score (between 0.5 and 5 given the severity of the error)
Your evaluation output:
"""
# NOTE(review): alternative llama.cpp backend kept disabled; if re-enabled,
# generate_text_llamacpp below depends on this `llm` object existing.
# from huggingface_hub import hf_hub_download
# from llama_cpp import Llama
# llm = Llama(
#     model_path=hf_hub_download(
#         repo_id=os.environ.get("REPO_ID", "TIGER-Lab/TIGERScore-13B-GGUF"),
#         filename=os.environ.get("MODEL_FILE", "ggml-model-q4_0.gguf"),
#     ),
#     n_ctx=2048,
#     n_gpu_layers=50,  # change n_gpu_layers if you have more or less VRAM
# )
# Module-level scorer used by generate_text_hf; loaded once at startup.
scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B")
@spaces.GPU(duration=60)
def generate_text_hf(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream a TIGERScore evaluation of *hypo_output* for the given task.

    Moves the scorer's model to the GPU (ZeroGPU allocates it for the
    decorated call) and yields progressively longer output strings as
    tokens are generated, so Gradio can render them incrementally.
    """
    # `scorer` is only read and mutated in place here, never rebound,
    # so no `global` declaration is needed.
    scorer.model = scorer.model.to("cuda")
    yield from scorer.generate_stream(
        generation_instruction,
        hypo_output,
        input_context,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )
def generate_text_llamacpp(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream a TIGERScore evaluation via the llama.cpp backend.

    Fills TEMPLATE with the task fields, runs the module-level ``llm``
    in streaming mode, and yields the accumulated text after each chunk
    so the UI shows partial output. Requires the (currently commented
    out) ``llm`` object to be initialized.
    """
    prompt = Template(TEMPLATE).substitute(
        generation_instruction=generation_instruction,
        input_context=input_context,
        hypothesis_output=hypo_output,
    ).strip("\n ")
    gen_params = {
        "max_tokens": max_new_tokens,
        "top_p": top_p,
        "top_k": 40,
        "temperature": temperature,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
        "echo": False,   # do not repeat the prompt in the output
        "stream": True,  # yield chunks instead of one final completion
    }
    accumulated = ""
    for chunk in llm(prompt, **gen_params):
        # Read the streamed text delta directly — deep-copying every
        # chunk (as the original did) is unnecessary work.
        accumulated += chunk["choices"][0]["text"]
        yield accumulated
def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
    """Echo the selected example's three fields back into the input widgets.

    Identity pass-through used as the ``fn`` of gr.Examples: each argument
    is returned unchanged, in the same order.
    """
    values = (inst_textbox, input_textbox, hypo_output_textbox)
    return values
def clear_all(inst_textbox, input_textbox, hypo_output_textbox):
    """Reset the three input textboxes to empty strings.

    The current textbox values are accepted (Gradio passes them) but
    deliberately ignored; an empty string is returned for each output.
    """
    empty = ""
    return empty, empty, empty
# Build the Gradio UI: description header, three task-input textboxes,
# advanced generation options, the streamed evaluation output, and the
# example picker. Event wiring targets generate_text_hf (the HF backend).
with gr.Blocks(theme='gradio/soft') as demo:
    gr.Markdown("# 🐯 TIGERScore Demo")
    with gr.Row():
        gr.Markdown(DESCRIPTIONS)
        # Paper/model preview image shown beside the description.
        gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")
    gr.Markdown("## TIGERScore Inputs")
    inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
    input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
    hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
    with gr.Row():
        clear_button = gr.Button('Clear', variant='primary')
        submit_button = gr.Button('Submit', variant='primary')
    # Sliders exposing the sampling parameters of the generator functions.
    with gr.Accordion(label='Advanced options', open=False):
        max_new_tokens = gr.Slider(
            label='Max new tokens to generate',
            minimum=256,
            maximum=1024,
            step=1,
            value=1024,
        )
        temperature = gr.Slider(
            label='Temperature of generation',
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7,
        )
        top_p = gr.Slider(
            label='Top-p of generation',
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        )
    gr.Markdown("## TIGERScore Outputs")
    evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)
    # NOTE: inputs are ordered (input_context, instruction, hypo_output, ...)
    # to match generate_text_hf's parameter order, which differs from the
    # top-to-bottom order of the textboxes on the page.
    submit_button.click(
        fn=generate_text_hf,
        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
    )
    # Clear replaces all three inputs with empty strings via clear_all.
    clear_button.click(
        fn=clear_all,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )
    # Clickable examples drawn from MetricInstruct (EXAMPLES, loaded above);
    # cache_examples=True precomputes get_examples for each row.
    batch_examples = gr.Examples(
        examples=EXAMPLES,
        fn=get_examples,
        cache_examples=True,
        examples_per_page=5,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )
    citations = gr.Markdown("""## Citation
```txt
@article{jiang2023TIGERScore,
title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks},
author={Dongfu Jiang, Yishan Li, Ge Zhang, Wenhao Huang, Bill Yuchen Lin, Wenhu Chen},
journal={arXiv preprint arXiv:2310.00752},
year={2023}
}
```""")
# Enable request queuing (at most 20 pending requests) and start the app.
# The stray trailing "|" on this line was scrape residue — a syntax error.
demo.queue(max_size=20).launch()