Spaces:

TIGER-Lab
/

TIGERScore

Running on Zero

App Files Community

DongfuJiang committed on Dec 3, 2023

Commit

04efd2c

•

1 Parent(s): 4a2ff24

update

Browse files

Files changed (3) hide show

app.py +22 -21
requirements.txt +2 -1
utils.py +0 -85

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import sys
 import os
 from datasets import load_dataset
 from typing import List
-import utils
 DESCRIPTIONS = """
@@ -13,10 +13,10 @@ We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction *
 """
-EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train_mix")
 SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
 EXAMPLES = []
-fields = ["task", "instruction", "input_context", "hypo_output"]
 print("Loading examples...")
 for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
     if any([not ex[field] for field in fields]):
@@ -25,13 +25,19 @@ for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
     if i >= 100:
         break
-def tigerscore(task, input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
-    return utils.generate(
-        task, input_context,
-        generation_instruction, hypo_output,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature, top_p=top_p
-    )
 def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
     return gr.Dropdown.update(value=task), inst_textbox, input_textbox, hypo_output_textbox
@@ -39,10 +45,6 @@ def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
 def clear_all(task, inst_textbox, input_textbox, hypo_output_textbox):
     return gr.Dropdown.update(value=task), "", "", ""
-## initialize the model
-print("Loading TIGERScore model...")
-utils.load_tigerscore("7b")
 with gr.Blocks(theme='gradio/soft') as demo:
     gr.Markdown("# 🐯 TIGERScore Demo")
@@ -51,7 +53,6 @@ with gr.Blocks(theme='gradio/soft') as demo:
         gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")
     gr.Markdown("## TIGERScore Inputs")
-    tasks_dropdown = gr.Dropdown(label="Task", choices=utils.tasks, value="translation", show_label=True, allow_custom_value=True)
     inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
     input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
     hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
@@ -88,15 +89,15 @@ with gr.Blocks(theme='gradio/soft') as demo:
     submit_button.click(
-        fn=tigerscore,
-        inputs=[tasks_dropdown, input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
         outputs=evaluation_output_textbox,
     )
     clear_button.click(
         fn=clear_all,
-        inputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
-        outputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
     )
     batch_examples = gr.Examples(
@@ -104,8 +105,8 @@ with gr.Blocks(theme='gradio/soft') as demo:
         fn=get_examples,
         cache_examples=True,
         examples_per_page=5,
-        inputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
-        outputs=[tasks_dropdown, inst_textbox, input_textbox, hypo_output_textbox],
     )
     citations = gr.Markdown("""## Citation

 import os
 from datasets import load_dataset
 from typing import List
+from tigerscore import TIGERScorer
 DESCRIPTIONS = """
 """
+EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train")
 SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
 EXAMPLES = []
+fields = ["instruction", "input_context", "hypo_output"]
 print("Loading examples...")
 for i, ex in enumerate(SHUFFLED_EXAMPLES_DATASET):
     if any([not ex[field] for field in fields]):
     if i >= 100:
         break
+scorer = TIGERScorer("TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True)
+def submit_fn(input_context, generation_instruction, hypo_output, max_new_tokens=512, temperature=0.7, top_p=1.0):
+    return scorer.score(
+        insts=[generation_instruction],
+        hypo_outputs=[hypo_output],
+        input_contexts=[input_context],
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )[0]['raw_output'].strip()
 def get_examples(task, inst_textbox, input_textbox, hypo_output_textbox):
     return gr.Dropdown.update(value=task), inst_textbox, input_textbox, hypo_output_textbox
 def clear_all(task, inst_textbox, input_textbox, hypo_output_textbox):
     return gr.Dropdown.update(value=task), "", "", ""
 with gr.Blocks(theme='gradio/soft') as demo:
     gr.Markdown("# 🐯 TIGERScore Demo")
         gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")
     gr.Markdown("## TIGERScore Inputs")
     inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
     input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
     hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)
     submit_button.click(
+        fn=submit_fn,
+        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
         outputs=evaluation_output_textbox,
     )
     clear_button.click(
         fn=clear_all,
+        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
+        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
     )
     batch_examples = gr.Examples(
         fn=get_examples,
         cache_examples=True,
         examples_per_page=5,
+        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
+        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
     )
     citations = gr.Markdown("""## Citation

requirements.txt CHANGED Viewed

@@ -28,4 +28,5 @@ rouge_score
 bs4
 py7zr
 sacrebleu
-gdown

 bs4
 py7zr
 sacrebleu
+gdown
+bitsandbytes

utils.py DELETED Viewed

@@ -1,85 +0,0 @@
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from string import Template
-import torch
-FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task."
-FINETUNE_INPUT = """\
-Task instruction: ${generation_instruction}
-Source: ${input_context}
-Model-generated Output: ${hypothesis_output}
-Based on the given task instruction and source, identify errors in this model-generated output.
-For each error you give in the response, please also elaborate the following information:
-- error location (the words that are wrong in the output)
-- error aspect it belongs to.
-- explanation why it's an error, and the correction suggestions.
-- severity of the error ("Major" or "Minor").
-- reduction of score (between 0.5 and 5 given the severity of the error)
-Your evaluation output:
-"""
-TIGERScore_model_map = {
-    "7b": "TIGER-Lab/TIGERScore-7B-V1.0",
-    "13b": "TIGER-Lab/TIGERScore-13B-V1.0",
-}
-tigerscore_model = None
-tigerscore_tokenizer = None
-tasks = [
-    "translation",
-    "summarization",
-    "data2text",
-    "mathQA",
-    "long-form QA",
-    "instruction-following",
-]
-def load_tigerscore(model_size):
-    assert model_size in TIGERScore_model_map
-    model_name = TIGERScore_model_map[model_size]
-    global tigerscore_model, tigerscore_tokenizer
-    tigerscore_model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.bfloat16,
-        device_map="auto"
-    )
-    tigerscore_tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        use_fast=True
-    )
-def generate(task, input_context, generation_instruction, hypo_output, **generate_kwargs):
-    inst_part = Template(FINETUNE_INST)
-    inst_part = inst_part.substitute(task=task)
-    input_part = Template(FINETUNE_INPUT)
-    input_part = input_part.substitute(
-        generation_instruction=generation_instruction,
-        input_context=input_context,
-        hypothesis_output=hypo_output
-    )
-    prompt = (inst_part + "\n" + input_part).strip("\n ") + "\n"
-    encodings = tigerscore_tokenizer(prompt, return_tensors="pt")
-    input_ids = encodings["input_ids"].to(tigerscore_model.device)
-    attention_mask = encodings["attention_mask"].to(tigerscore_model.device)
-    gen_params = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "max_new_tokens": 512,
-        "do_sample": True,
-        "top_k": 1,
-        "num_return_sequences": 1,
-    }
-    gen_params.update(generate_kwargs)
-    output = tigerscore_model.generate(**gen_params)
-    output = tigerscore_tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
-    return output
-if __name__ == "__main__":
-    task = "translation"
-    input_context = "Der künftige EM-Cheforganisator Philipp Lahm soll laut Grindel im DFB-Präsidium mitarbeiten."
-    generation_instruction = "Translate the following text from German to English."
-    hypo_output = "According to Grindel, the future head of the European Championships, Philipp Lahm, is to participate in the DFB Presidency."
-    output = generate(task, input_context, generation_instruction, hypo_output)
-    print(output)