Spaces:
Running
on
Zero
Running
on
Zero
DongfuJiang
committed on
Commit
•
04efd2c
1
Parent(s):
4a2ff24
update
Browse files
- app.py +22 -21
- requirements.txt +2 -1
- utils.py +0 -85
app.py
CHANGED
@@ -3,7 +3,7 @@ import sys
|
|
3 |
import os
|
4 |
from datasets import load_dataset
|
5 |
from typing import List
|
6 |
-
import
|
7 |
|
8 |
|
9 |
DESCRIPTIONS = """
|
@@ -13,10 +13,10 @@ We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction *
|
|
13 |
|
14 |
"""
|
15 |
|
16 |
-
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="
|
17 |
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
|
18 |
EXAMPLES = []
|
19 |
-
fields = ["
|
20 |
print("Loading examples...")
|
21 |
for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
|
22 |
if any([not ex[field] for field in fields]):
|
@@ -25,13 +25,19 @@ for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
|
|
25 |
if i >= 100:
|
26 |
break
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
|
37 |
return gr.Dropdown.update(value=task), inst_textbox, input_textbox, hypo_output_textbox
|
@@ -39,10 +45,6 @@ def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
|
|
39 |
def clear_all(task, inst_textbox, input_textbox, hypo_output_textbox):
|
40 |
return gr.Dropdown.update(value=task), "", "", ""
|
41 |
|
42 |
-
## initialize the model
|
43 |
-
print("Loading TIGERScore model...")
|
44 |
-
utils.load_tigerscore("7b")
|
45 |
-
|
46 |
with gr.Blocks(theme='gradio/soft') as demo:
|
47 |
|
48 |
gr.Markdown("# 🐯 TIGERScore Demo")
|
@@ -51,7 +53,6 @@ with gr.Blocks(theme='gradio/soft') as demo:
|
|
51 |
gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")
|
52 |
|
53 |
gr.Markdown("## TIGERScore Inputs")
|
54 |
-
tasks_dropdown = gr.Dropdown(label="Task", choices=utils.tasks, value="translation", show_label=True, allow_custom_value=True)
|
55 |
inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
|
56 |
input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
|
57 |
hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
|
@@ -88,15 +89,15 @@ with gr.Blocks(theme='gradio/soft') as demo:
|
|
88 |
|
89 |
|
90 |
submit_button.click(
|
91 |
-
fn=
|
92 |
-
inputs=[
|
93 |
outputs=evaluation_output_textbox,
|
94 |
)
|
95 |
|
96 |
clear_button.click(
|
97 |
fn=clear_all,
|
98 |
-
inputs=[
|
99 |
-
outputs=[
|
100 |
)
|
101 |
|
102 |
batch_examples = gr.Examples(
|
@@ -104,8 +105,8 @@ with gr.Blocks(theme='gradio/soft') as demo:
|
|
104 |
fn=get_examples,
|
105 |
cache_examples=True,
|
106 |
examples_per_page=5,
|
107 |
-
inputs=[
|
108 |
-
outputs=[
|
109 |
)
|
110 |
|
111 |
citations = gr.Markdown("""## Citation
|
|
|
3 |
import os
|
4 |
from datasets import load_dataset
|
5 |
from typing import List
|
6 |
+
from tigerscore import TIGERScorer
|
7 |
|
8 |
|
9 |
DESCRIPTIONS = """
|
|
|
13 |
|
14 |
"""
|
15 |
|
16 |
+
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train")
|
17 |
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
|
18 |
EXAMPLES = []
|
19 |
+
fields = ["instruction", "input_context", "hypo_output"]
|
20 |
print("Loading examples...")
|
21 |
for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
|
22 |
if any([not ex[field] for field in fields]):
|
|
|
25 |
if i >= 100:
|
26 |
break
|
27 |
|
28 |
+
scorer = TIGERScorer("TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True)
|
29 |
+
|
30 |
+
def submit_fn(input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
|
31 |
+
return scorer.score(
|
32 |
+
insts=[generation_instruction],
|
33 |
+
hypo_outputs=[hypo_output],
|
34 |
+
input_contexts=[input_context],
|
35 |
+
max_new_tokens=max_new_tokens,
|
36 |
+
temperature=temperature,
|
37 |
+
top_p=top_p,
|
38 |
+
)[0]['raw_output'].strip()
|
39 |
+
|
40 |
+
|
41 |
|
42 |
def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
|
43 |
return gr.Dropdown.update(value=task), inst_textbox, input_textbox, hypo_output_textbox
|
|
|
45 |
def clear_all(task, inst_textbox, input_textbox, hypo_output_textbox):
|
46 |
return gr.Dropdown.update(value=task), "", "", ""
|
47 |
|
|
|
|
|
|
|
|
|
48 |
with gr.Blocks(theme='gradio/soft') as demo:
|
49 |
|
50 |
gr.Markdown("# 🐯 TIGERScore Demo")
|
|
|
53 |
gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")
|
54 |
|
55 |
gr.Markdown("## TIGERScore Inputs")
|
|
|
56 |
inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
|
57 |
input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
|
58 |
hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
|
|
|
89 |
|
90 |
|
91 |
submit_button.click(
|
92 |
+
fn=submit_fn,
|
93 |
+
inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
|
94 |
outputs=evaluation_output_textbox,
|
95 |
)
|
96 |
|
97 |
clear_button.click(
|
98 |
fn=clear_all,
|
99 |
+
inputs=[inst_textbox, input_textbox, hypo_output_textbox],
|
100 |
+
outputs=[inst_textbox, input_textbox, hypo_output_textbox],
|
101 |
)
|
102 |
|
103 |
batch_examples = gr.Examples(
|
|
|
105 |
fn=get_examples,
|
106 |
cache_examples=True,
|
107 |
examples_per_page=5,
|
108 |
+
inputs=[inst_textbox, input_textbox, hypo_output_textbox],
|
109 |
+
outputs=[inst_textbox, input_textbox, hypo_output_textbox],
|
110 |
)
|
111 |
|
112 |
citations = gr.Markdown("""## Citation
|
requirements.txt
CHANGED
@@ -28,4 +28,5 @@ rouge_score
|
|
28 |
bs4
|
29 |
py7zr
|
30 |
sacrebleu
|
31 |
-
gdown
|
|
|
|
28 |
bs4
|
29 |
py7zr
|
30 |
sacrebleu
|
31 |
+
gdown
|
32 |
+
bitsandbytes
|
utils.py
DELETED
@@ -1,85 +0,0 @@
|
|
1 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
2 |
-
from string import Template
|
3 |
-
import torch
|
4 |
-
|
5 |
-
FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task."
|
6 |
-
FINETUNE_INPUT = """\
|
7 |
-
Task instruction: ${generation_instruction}
|
8 |
-
Source: ${input_context}
|
9 |
-
Model-generated Output: ${hypothesis_output}
|
10 |
-
|
11 |
-
Based on the given task instruction and source, identify errors in this model-generated output.
|
12 |
-
For each error you give in the response, please also elaborate the following information:
|
13 |
-
- error location (the words that are wrong in the output)
|
14 |
-
- error aspect it belongs to.
|
15 |
-
- explanation why it's an error, and the correction suggestions.
|
16 |
-
- severity of the error ("Major" or "Minor").
|
17 |
-
- reduction of score (between 0.5 and 5 given the severity of the error)
|
18 |
-
|
19 |
-
Your evaluation output:
|
20 |
-
"""
|
21 |
-
|
22 |
-
TIGERScore_model_map = {
|
23 |
-
"7b": "TIGER-Lab/TIGERScore-7B-V1.0",
|
24 |
-
"13b": "TIGER-Lab/TIGERScore-13B-V1.0",
|
25 |
-
}
|
26 |
-
tigerscore_model = None
|
27 |
-
tigerscore_tokenizer = None
|
28 |
-
|
29 |
-
tasks = [
|
30 |
-
"translation",
|
31 |
-
"summarization",
|
32 |
-
"data2text",
|
33 |
-
"mathQA",
|
34 |
-
"long-form QA",
|
35 |
-
"instruction-following",
|
36 |
-
]
|
37 |
-
|
38 |
-
def load_tigerscore(model_size):
|
39 |
-
assert model_size in TIGERScore_model_map
|
40 |
-
model_name = TIGERScore_model_map[model_size]
|
41 |
-
global tigerscore_model, tigerscore_tokenizer
|
42 |
-
tigerscore_model = AutoModelForCausalLM.from_pretrained(
|
43 |
-
model_name,
|
44 |
-
torch_dtype=torch.bfloat16,
|
45 |
-
device_map="auto"
|
46 |
-
)
|
47 |
-
tigerscore_tokenizer = AutoTokenizer.from_pretrained(
|
48 |
-
model_name,
|
49 |
-
use_fast=True
|
50 |
-
)
|
51 |
-
|
52 |
-
def generate(task, input_context, generation_instruction, hypo_output, **generate_kwargs):
|
53 |
-
inst_part = Template(FINETUNE_INST)
|
54 |
-
inst_part = inst_part.substitute(task=task)
|
55 |
-
input_part = Template(FINETUNE_INPUT)
|
56 |
-
input_part = input_part.substitute(
|
57 |
-
generation_instruction=generation_instruction,
|
58 |
-
input_context=input_context,
|
59 |
-
hypothesis_output=hypo_output
|
60 |
-
)
|
61 |
-
prompt = (inst_part + "\n" + input_part).strip("\n ") + "\n"
|
62 |
-
encodings = tigerscore_tokenizer(prompt, return_tensors="pt")
|
63 |
-
input_ids = encodings["input_ids"].to(tigerscore_model.device)
|
64 |
-
attention_mask = encodings["attention_mask"].to(tigerscore_model.device)
|
65 |
-
gen_params = {
|
66 |
-
"input_ids": input_ids,
|
67 |
-
"attention_mask": attention_mask,
|
68 |
-
"max_new_tokens": 512,
|
69 |
-
"do_sample": True,
|
70 |
-
"top_k": 1,
|
71 |
-
"num_return_sequences": 1,
|
72 |
-
}
|
73 |
-
gen_params.update(generate_kwargs)
|
74 |
-
output = tigerscore_model.generate(**gen_params)
|
75 |
-
output = tigerscore_tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
|
76 |
-
return output
|
77 |
-
|
78 |
-
if __name__ == "__main__":
|
79 |
-
task = "translation"
|
80 |
-
input_context = "Der künftige EM-Cheforganisator Philipp Lahm soll laut Grindel im DFB-Präsidium mitarbeiten."
|
81 |
-
generation_instruction = "Translate the following text from German to English."
|
82 |
-
hypo_output = "According to Grindel, the future head of the European Championships, Philipp Lahm, is to participate in the DFB Presidency."
|
83 |
-
output = generate(task, input_context, generation_instruction, hypo_output)
|
84 |
-
print(output)
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|