Trayvon committed on
Commit
19cef65
1 Parent(s): 9d930db

Upload 9 files

Browse files
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ from src.css_html import custom_css
5
+ from src.utils import (
6
+ AutoEvalColumn,
7
+ fields,
8
+ make_clickable_names,
9
+ make_plot_data
10
+ )
11
+ from src.demo import (
12
+ generate,
13
+ random_examples,
14
+ )
15
+
16
+
17
+ DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
18
+ MAX_MAX_NEW_TOKENS = 1024
19
+ DEFAULT_MAX_NEW_TOKENS = 512
20
+
21
+
22
+ df = pd.read_csv("data/eval_board.csv")
23
+
24
+ COLS = [c.name for c in fields(AutoEvalColumn)]
25
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
26
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
27
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
28
+
29
+
30
def select_columns(df, columns):
    """Return ``df`` restricted to the model column plus the selected columns.

    The model-name column always comes first. The remaining columns are
    emitted in the order they appear in ``COLS`` (not the order of
    ``columns``), and only when they are present in ``df``.
    """
    pinned = [AutoEvalColumn.model.name]

    # Walk COLS rather than `columns` so the output keeps the canonical order.
    selected = []
    for col in COLS:
        if col in df.columns and col in columns:
            selected.append(col)

    return df[pinned + selected]
39
+
40
+
41
+ df["pure_name"] = df['Models']
42
+ df = make_clickable_names(df)
43
+ demo = gr.Blocks(css=custom_css)
44
+
45
+ with demo:
46
+ with gr.Row():
47
+ gr.Markdown(
48
+ """<div style="text-align: center;"><h1> 🤖ConvRe🤯 <span style='color: #e6b800;'>Leaderboard</span></h1></div>\
49
+ <br>\
50
+ <p> 🤖ConvRe🤯 is the benchmark proposed in our EMNLP 2023 paper: <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"> An Investigation of LLMs’ Inefficacy in Understanding Converse Relations</a>. It aims to evaluate LLMs' ability on understanding converse relations. Converse relation is defined as the opposite of semantic relation while keeping the surface form of the triple unchanged. For example, the triple (x, has part, y) is interpreted as "x has a part called y" in normal relation, while "y has a part called x" in converse relation 🔁.
51
+
52
+ The experiments in our paper suggested that LLMs often resort to shortcut learning (or superficial correlations) and still face challenges on our 🤖ConvRe🤯 benchmark even for powerful models like GPT-4.
53
+ </p>""",
54
+ elem_classes="markdown-text",
55
+ )
56
+
57
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
58
+ with gr.TabItem("🔢 Data", id=0):
59
+ with gr.Accordion("➡️ See All Columns", open=False):
60
+ shown_columns = gr.CheckboxGroup(
61
+ choices=[
62
+ c for c in COLS if c not in [AutoEvalColumn.model.name]
63
+ ],
64
+ value=[
65
+ c for c in COLS_LITE if c not in [AutoEvalColumn.model.name]
66
+ ],
67
+ label="",
68
+ elem_id="column-select",
69
+ interactive=True
70
+ )
71
+ leaderboard_df_re2text = gr.components.Dataframe(
72
+ value=df[
73
+ [
74
+ AutoEvalColumn.model.name,
75
+ ] + shown_columns.value
76
+ ],
77
+ headers=[
78
+ AutoEvalColumn.model.name,
79
+ ] + shown_columns.value,
80
+ datatype=TYPES,
81
+ elem_id="leaderboard-table",
82
+ interactive=False,
83
+ )
84
+
85
+ hidden_leaderboard_df_re2text = gr.components.DataFrame(
86
+ value=df,
87
+ headers=COLS,
88
+ datatype=["str" for _ in range(len(COLS))],
89
+ visible=False,
90
+ )
91
+
92
+ shown_columns.change(
93
+ select_columns,
94
+ [hidden_leaderboard_df_re2text, shown_columns],
95
+ leaderboard_df_re2text
96
+ )
97
+
98
+ with gr.TabItem("📊 Plot", id=1):
99
+ with gr.Row():
100
+ with gr.Column():
101
+ gr.LinePlot(
102
+ make_plot_data(df, task="Re2Text"),
103
+ x="Setting",
104
+ y="Accuracy",
105
+ color="Symbol",
106
+ title="Re2Text",
107
+ y_lim=[0, 100],
108
+ x_label_angle=0,
109
+ height=400,
110
+ width=500,
111
+ )
112
+
113
+ with gr.Column():
114
+ gr.LinePlot(
115
+ make_plot_data(df, task="Text2Re"),
116
+ x="Setting",
117
+ y="Accuracy",
118
+ color="Symbol",
119
+ title="Text2Re",
120
+ y_lim=[0, 100],
121
+ x_label_angle=0,
122
+ height=400,
123
+ width=500,
124
+ )
125
+
126
+ with gr.Column():
127
+ gr.Markdown(
128
+ """<div style="text-align: center;"><h2> 🤖ConvRe🤯 Demo </h2></div>\
129
+ <br>\
130
+ """,
131
+ elem_classes="markdown-text",
132
+ )
133
+
134
+ output_box = gr.Textbox(lines=10, max_lines=10, label="ChatBot")
135
+
136
+ input_box = gr.Textbox(lines=12, max_lines=12, label="Input")
137
+
138
+ with gr.Row():
139
+ gr.ClearButton([input_box, output_box])
140
+ gr.Button("Submit")
141
+
142
+ with gr.Row():
143
+ re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
144
+ re2text_easy_btn.click(
145
+ fn=random_examples,
146
+ inputs=gr.Text("re2text-easy", visible=False),
147
+ outputs = input_box,
148
+ api_name="re2text-easy"
149
+ )
150
+
151
+ re2text_hard_btn = gr.Button("Random Re2Text Hard Example 🤯")
152
+ re2text_hard_btn.click(
153
+ fn=random_examples,
154
+ inputs=gr.Text("re2text-hard", visible=False),
155
+ outputs=input_box,
156
+ )
157
+
158
+ text2re_easy_btn = gr.Button("Random Text2Re Easy Example 😄")
159
+ text2re_easy_btn.click(
160
+ fn=random_examples,
161
+ inputs=gr.Text("text2re-easy", visible=False),
162
+ outputs = input_box,
163
+ )
164
+
165
+ text2re_hard_btn = gr.Button("Random Text2Re Hard Example 🤯")
166
+ text2re_hard_btn.click(
167
+ fn=random_examples,
168
+ inputs=gr.Text("text2re-hard", visible=False),
169
+ outputs = input_box,
170
+ )
171
+
172
+ with gr.Accordion("Additional Inputs", open=False):
173
+ gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6),
174
+
175
+ gr.Slider(
176
+ label="Max new tokens",
177
+ minimum=1,
178
+ maximum=MAX_MAX_NEW_TOKENS,
179
+ step=1,
180
+ value=DEFAULT_MAX_NEW_TOKENS,
181
+ ),
182
+
183
+ gr.Slider(
184
+ label="Temperature",
185
+ minimum=0,
186
+ maximum=4.0,
187
+ step=0.05,
188
+ value=0,
189
+ ),
190
+
191
+ demo.launch()
data/eval_board.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Models,Re2Text-Easy,Text2Re-Easy,Re2Text-Hard,Text2Re-Hard,Avg,Links
2
+ llama-2-7b-chat-hf,0,0,0,0,0,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
3
+ qwen-7b-chat,0,0,0,0,0,https://huggingface.co/Qwen/Qwen-7B-Chat
4
+ internlm-7b-chat,0,0,0,0,0,https://huggingface.co/internlm/internlm-chat-7b
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==3.47.1
2
+ torch==2.0.0
3
+ transformers==4.34.0
4
+ datasets==2.14.5
src/__pycache__/css_html.cpython-38.pyc ADDED
Binary file (1.44 kB). View file
 
src/__pycache__/demo.cpython-38.pyc ADDED
Binary file (1.22 kB). View file
 
src/__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.3 kB). View file
 
src/css_html.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/assets/css_html_js.py
2
+ custom_css = """
3
+ #changelog-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #changelog-text h2 {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ .markdown-text {
12
+ font-size: 16px !important;
13
+ }
14
+
15
+ #models-to-add-text {
16
+ font-size: 18px !important;
17
+ }
18
+
19
+ #citation-button span {
20
+ font-size: 16px !important;
21
+ }
22
+
23
+ #citation-button textarea {
24
+ font-size: 16px !important;
25
+ }
26
+
27
+ #citation-button > label > button {
28
+ margin: 6px;
29
+ transform: scale(1.3);
30
+ }
31
+
32
+ #leaderboard-table {
33
+ margin-top: 15px
34
+ }
35
+
36
+ #leaderboard-table-lite {
37
+ margin-top: 15px
38
+ }
39
+
40
+ #search-bar-table-box > div:first-child {
41
+ background: none;
42
+ border: none;
43
+ }
44
+
45
+ #search-bar {
46
+ padding: 0px;
47
+ }
48
+
49
+ /* Hides the final AutoEvalColumn */
50
+ #llm-benchmark-tab-table table td:last-child,
51
+ #llm-benchmark-tab-table table th:last-child {
52
+ display: none;
53
+ }
54
+
55
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
56
+ table td:first-child,
57
+ table th:first-child {
58
+ max-width: 400px;
59
+ overflow: auto;
60
+ white-space: nowrap;
61
+ }
62
+
63
+ .tab-buttons button {
64
+ font-size: 20px;
65
+ }
66
+
67
+ #scale-logo {
68
+ border-style: none !important;
69
+ box-shadow: none;
70
+ display: block;
71
+ margin-left: auto;
72
+ margin-right: auto;
73
+ max-width: 600px;
74
+ }
75
+
76
+ #scale-logo .download {
77
+ display: none;
78
+ }
79
+ """
src/demo.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ from threading import Thread
4
+ from typing import Iterable
5
+
6
+ import torch
7
+ from huggingface_hub import HfApi
8
+ from datasets import load_dataset
9
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
10
+ from transformers import AutoTokenizer, AutoModelForCausalLM
11
+
12
+
13
# Value of the HF_TOKEN environment variable, or None when it is unset.
TOKEN = os.environ.get("HF_TOKEN")


model_id = "meta-llama/Llama-2-7b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)


# (lookup key, dataset config name, split name) for every example pool,
# listed in the order the pools are loaded.
_CONVRE_SPLITS = (
    ("re2text-easy", "en-re2text", "prompt1"),
    ("re2text-hard", "en-re2text", "prompt4"),
    ("text2re-easy", "en-text2re", "prompt1"),
    ("text2re-hard", "en-text2re", "prompt3"),
)

# Loaded eagerly at import time; each entry is one split of 3B-Group/ConvRe.
type2dataset = {
    key: load_dataset('3B-Group/ConvRe', config, token=True, split=split)
    for key, config, split in _CONVRE_SPLITS
}
27
+
28
+ # type2dataset = {}
29
+
30
+
31
def generate():
    """Return the fixed placeholder reply ``"1"``; no model is invoked here."""
    placeholder_reply = "1"
    return placeholder_reply
33
+
34
+
35
def random_examples(dataset_key) -> str:
    """Return the ``query`` text of one randomly chosen example.

    ``dataset_key`` selects the pool inside ``type2dataset``; a key that is
    not present there raises ``KeyError``.
    """
    pool = type2dataset[dataset_key]

    # randrange(n) draws from 0..n-1, the same range as randint(0, n - 1).
    pick = random.randrange(len(pool))
    return pool[pick]['query']
42
+
43
+
44
+
src/utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import pandas as pd
3
+
4
+
5
@dataclass
class ColumnContent:
    """Description of a single leaderboard column."""

    # Column name; the instances declared in this module use the CSV headers.
    name: str
    # Datatype label for the column; this module uses "markdown", "number"
    # and "str".
    type: str
    # Whether the column belongs to the default selection.
    displayed_by_default: bool
    # Columns flagged hidden are left out of the default selection as well.
    hidden: bool = False
11
+
12
+
13
def fields(raw_class):
    """Return the values of every non-dunder attribute defined on ``raw_class``.

    Only attributes stored directly in ``raw_class.__dict__`` are considered
    (inherited ones are not), and they are returned in definition order.
    A name is skipped only when it both starts and ends with a double
    underscore, so ordinary names such as ``score__`` are kept.

    Note: this is unrelated to ``dataclasses.fields``; it inspects plain
    class attributes.
    """
    return [
        value
        for key, value in raw_class.__dict__.items()
        if not (key.startswith("__") and key.endswith("__"))
    ]
15
+
16
+
17
@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Table of the leaderboard columns, one ``ColumnContent`` per column.

    None of the attributes below is annotated, so ``@dataclass`` registers no
    dataclass fields; they are plain class attributes and are meant to be
    read from the class itself (e.g. ``AutoEvalColumn.model.name``) or
    enumerated with the module-level ``fields`` helper.
    """

    # Model name column, typed as markdown.
    model = ColumnContent("Models", "markdown", True)
    # One numeric column per task/difficulty pair, plus the average.
    re2text_easy = ColumnContent("Re2Text-Easy", "number", True)
    text2re_easy = ColumnContent("Text2Re-Easy", "number", True)
    re2text_hard = ColumnContent("Re2Text-Hard", "number", True)
    text2re_hard = ColumnContent("Text2Re-Hard", "number", True)
    avg = ColumnContent("Avg", "number", True)

    # Link column; not displayed by default.
    link = ColumnContent("Links", "str", False)
27
+
28
+
29
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens ``link`` in a new tab, labelled ``model_name``.

    Both values are HTML-escaped before interpolation so that characters such
    as ``&``, ``"`` or ``<`` in a URL or model name cannot break out of the
    ``href`` attribute or inject markup. Values without special characters
    produce exactly the same string as before.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    safe_link = escape(str(link), quote=True)
    safe_name = escape(str(model_name), quote=True)
    return f'<a target="_blank" href="{safe_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{safe_name}</a>'
31
+
32
+
33
def make_clickable_names(df):
    """Replace the ``Models`` column with HTML links built from ``Links``.

    The frame is modified in place and also returned. Each new ``Models``
    value is ``model_hyperlink(row["Links"], row["Models"])``.

    The links are built positionally from the two columns instead of with
    ``df.apply(..., axis=1)``: on an empty frame ``apply`` hands back a
    DataFrame rather than a Series, which makes the column assignment fail.
    """
    df["Models"] = [
        model_hyperlink(link, name)
        for link, name in zip(df["Links"], df["Models"])
    ]
    return df
38
+
39
+
40
def make_plot_data(df, task):
    """Reshape the leaderboard into long format for plotting one task.

    For every row of ``df`` two output rows are produced, first the
    ``"{task}-Easy"`` value and then the ``"{task}-Hard"`` value.

    Args:
        df: frame with a ``pure_name`` column and the two ``{task}-Easy`` /
            ``{task}-Hard`` columns.
        task: task prefix used to build the two column names.

    Returns:
        A new DataFrame with columns ``Symbol`` (the ``pure_name``),
        ``Setting`` (the source column name) and ``Accuracy`` (the value).

    Raises:
        KeyError: if any of the three required columns is missing.
    """
    easy_col = f"{task}-Easy"
    hard_col = f"{task}-Hard"

    symbols = []
    settings = []
    accuracies = []
    # Iterate column values positionally; label lookups via df.loc would
    # return whole Series when index labels repeat.
    for name, easy, hard in zip(df["pure_name"], df[easy_col], df[hard_col]):
        symbols.extend((name, name))
        settings.extend((easy_col, hard_col))
        accuracies.extend((easy, hard))

    return pd.DataFrame(
        {
            "Symbol": symbols,
            "Setting": settings,
            "Accuracy": accuracies,
        }
    )
63
+
64
+
65
+