Spaces:
Build error
Build error
eval with few-shots prompting
Browse files- .env.example +1 -1
- llm_toolkit/eval.py +46 -20
- llm_toolkit/translation_utils.py +13 -3
- notebooks/00_Data Analysis.ipynb +0 -0
- results/mac-results_metrics.csv +19 -0
- scripts/eval-mac.sh +5 -4
.env.example
CHANGED
@@ -7,4 +7,4 @@ HF_TOKEN=
|
|
7 |
LOAD_IN_4BIT=false
|
8 |
|
9 |
DATA_PATH=datasets/mac/mac.tsv
|
10 |
-
RESULTS_PATH=results/mac-
|
|
|
7 |
LOAD_IN_4BIT=false
|
8 |
|
9 |
DATA_PATH=datasets/mac/mac.tsv
|
10 |
+
RESULTS_PATH=results/mac-results_few_shots.csv
|
llm_toolkit/eval.py
CHANGED
@@ -28,7 +28,6 @@ results_path = os.getenv("RESULTS_PATH")
|
|
28 |
batch_size = int(os.getenv("BATCH_SIZE", 1))
|
29 |
use_english_datasets = os.getenv("USE_ENGLISH_DATASETS") == "true"
|
30 |
max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))
|
31 |
-
start_repetition_penalty = float(os.getenv("START_REPETITION_PENALTY", 1.0))
|
32 |
|
33 |
print(
|
34 |
model_name,
|
@@ -62,41 +61,68 @@ if is_cuda:
|
|
62 |
print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
|
63 |
print(f"{start_gpu_memory} GB of memory reserved.")
|
64 |
|
65 |
-
datasets = load_translation_dataset(data_path, tokenizer)
|
66 |
|
67 |
-
|
68 |
-
num = int(sys.argv[1])
|
69 |
-
if num > 0:
|
70 |
-
print(f"--- evaluating {num} entries")
|
71 |
-
datasets["test"] = datasets["test"].select(range(num))
|
72 |
-
|
73 |
-
print_row_details(datasets["test"].to_pandas(), indices=[0, -1])
|
74 |
-
|
75 |
-
|
76 |
-
def on_repetition_penalty_step_completed(model_name, predictions):
|
77 |
save_results(
|
78 |
model_name,
|
79 |
results_path,
|
80 |
-
|
81 |
predictions,
|
82 |
)
|
83 |
|
84 |
-
metrics = calc_metrics(
|
85 |
print(f"{model_name} metrics: {metrics}")
|
86 |
|
87 |
|
88 |
if adapter_name_or_path is not None:
|
89 |
model_name += "/" + adapter_name_or_path.split("/")[-1]
|
90 |
|
91 |
-
|
|
|
92 |
model,
|
93 |
tokenizer,
|
94 |
model_name,
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
batch_size=batch_size,
|
101 |
max_new_tokens=max_new_tokens,
|
102 |
device=device,
|
|
|
28 |
batch_size = int(os.getenv("BATCH_SIZE", 1))
|
29 |
use_english_datasets = os.getenv("USE_ENGLISH_DATASETS") == "true"
|
30 |
max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))
|
|
|
31 |
|
32 |
print(
|
33 |
model_name,
|
|
|
61 |
print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
|
62 |
print(f"{start_gpu_memory} GB of memory reserved.")
|
63 |
|
|
|
64 |
|
65 |
+
def on_num_shots_step_completed(model_name, dataset, predictions):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
save_results(
|
67 |
model_name,
|
68 |
results_path,
|
69 |
+
dataset,
|
70 |
predictions,
|
71 |
)
|
72 |
|
73 |
+
metrics = calc_metrics(dataset["english"], predictions, debug=True)
|
74 |
print(f"{model_name} metrics: {metrics}")
|
75 |
|
76 |
|
77 |
if adapter_name_or_path is not None:
|
78 |
model_name += "/" + adapter_name_or_path.split("/")[-1]
|
79 |
|
80 |
+
|
81 |
+
def evaluate_model_with_num_shots(
|
82 |
model,
|
83 |
tokenizer,
|
84 |
model_name,
|
85 |
+
data_path,
|
86 |
+
range_num_shots=[0, 1, 3, 5, 10, 50],
|
87 |
+
batch_size=1,
|
88 |
+
max_new_tokens=2048,
|
89 |
+
device="cuda",
|
90 |
+
):
|
91 |
+
print(f"Evaluating model: {model_name} on {device}")
|
92 |
+
|
93 |
+
for num_shots in range_num_shots:
|
94 |
+
print(f"*** Evaluating with num_shots: {num_shots}")
|
95 |
+
|
96 |
+
datasets = load_translation_dataset(data_path, tokenizer, num_shots=num_shots)
|
97 |
+
print_row_details(datasets["test"].to_pandas())
|
98 |
+
|
99 |
+
predictions = eval_model(
|
100 |
+
model,
|
101 |
+
tokenizer,
|
102 |
+
datasets["test"],
|
103 |
+
device=device,
|
104 |
+
num_shots=num_shots,
|
105 |
+
batch_size=batch_size,
|
106 |
+
max_new_tokens=max_new_tokens,
|
107 |
+
)
|
108 |
+
|
109 |
+
model_name_with_rp = f"{model_name}/shots-{num_shots:02d}"
|
110 |
+
|
111 |
+
try:
|
112 |
+
on_num_shots_step_completed(
|
113 |
+
model_name_with_rp,
|
114 |
+
datasets["test"],
|
115 |
+
predictions,
|
116 |
+
)
|
117 |
+
except Exception as e:
|
118 |
+
print(e)
|
119 |
+
|
120 |
+
|
121 |
+
evaluate_model_with_num_shots(
|
122 |
+
model,
|
123 |
+
tokenizer,
|
124 |
+
model_name,
|
125 |
+
data_path,
|
126 |
batch_size=batch_size,
|
127 |
max_new_tokens=max_new_tokens,
|
128 |
device=device,
|
llm_toolkit/translation_utils.py
CHANGED
@@ -91,7 +91,7 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
|
|
91 |
df.to_csv(results_path, index=False)
|
92 |
|
93 |
|
94 |
-
def load_translation_dataset(data_path, tokenizer=None):
|
95 |
train_data_file = data_path.replace(".tsv", "-train.tsv")
|
96 |
test_data_file = data_path.replace(".tsv", "-test.tsv")
|
97 |
|
@@ -122,7 +122,17 @@ def load_translation_dataset(data_path, tokenizer=None):
|
|
122 |
)
|
123 |
|
124 |
if tokenizer:
|
125 |
-
translation_prompt = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
def formatting_prompts_func(examples):
|
128 |
inputs = examples["chinese"]
|
@@ -131,7 +141,7 @@ def load_translation_dataset(data_path, tokenizer=None):
|
|
131 |
messages = [
|
132 |
{
|
133 |
"role": "system",
|
134 |
-
"content": "You are
|
135 |
},
|
136 |
None,
|
137 |
]
|
|
|
91 |
df.to_csv(results_path, index=False)
|
92 |
|
93 |
|
94 |
+
def load_translation_dataset(data_path, tokenizer=None, num_shots=5):
|
95 |
train_data_file = data_path.replace(".tsv", "-train.tsv")
|
96 |
test_data_file = data_path.replace(".tsv", "-test.tsv")
|
97 |
|
|
|
122 |
)
|
123 |
|
124 |
if tokenizer:
|
125 |
+
translation_prompt = "You will be given a Chinese sentence to translate. If it is an incomplete sentence, or if you are unsure about the meaning, simply copy the input text as your output. Do not output any additional sentence such as explanation or reasoning.\n\n"
|
126 |
+
if num_shots > 0:
|
127 |
+
example_translations = "Example Translations:\n"
|
128 |
+
for i in range(num_shots):
|
129 |
+
example_translations += f"Chinese: {datasets['train'][i]['chinese']}\n"
|
130 |
+
example_translations += (
|
131 |
+
f"English: {datasets['train'][i]['english']}\n\n"
|
132 |
+
)
|
133 |
+
translation_prompt = translation_prompt + example_translations
|
134 |
+
|
135 |
+
translation_prompt = translation_prompt + "Chinese: {}\nEnglish:"
|
136 |
|
137 |
def formatting_prompts_func(examples):
|
138 |
inputs = examples["chinese"]
|
|
|
141 |
messages = [
|
142 |
{
|
143 |
"role": "system",
|
144 |
+
"content": "You are a helpful assistant that translates Chinese to English.",
|
145 |
},
|
146 |
None,
|
147 |
]
|
notebooks/00_Data Analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
results/mac-results_metrics.csv
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap,num_max_output_tokens
|
|
|
2 |
01-ai/Yi-1.5-9B-Chat,1.00,0.3463725436435439,0.09312113035602035,0.33287597095291194,0.0,0.35127978817299205,0.35127978817299205,0.34125573890735983,2
|
3 |
01-ai/Yi-1.5-9B-Chat,1.02,0.3471185374158656,0.09126513887574451,0.3325894211716421,0.0,0.264783759929391,0.264783759929391,0.3432230230787291,4
|
4 |
01-ai/Yi-1.5-9B-Chat,1.04,0.3471882673119874,0.09019886552461354,0.33194600115482237,0.0,0.37775816416593117,0.37775816416593117,0.3416859125059273,8
|
@@ -11,6 +12,10 @@ model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap
|
|
11 |
01-ai/Yi-1.5-9B-Chat,1.18,0.3365273541015462,0.06786555450924157,0.31409006189011535,0.0,0.30979699911738745,0.30979699911738745,0.33212663825955735,26
|
12 |
01-ai/Yi-1.5-9B-Chat,1.20,0.3355307477803475,0.06314678954328107,0.3110853370928648,0.0,0.22241835834068843,0.22241835834068843,0.33235553904085485,36
|
13 |
01-ai/Yi-1.5-9B-Chat,1.22,0.33363375306882515,0.06214712430276763,0.3086913255065605,0.11827007943512798,0.25772285966460723,0.3715798764342454,0.32842981018186884,33
|
|
|
|
|
|
|
|
|
14 |
Qwen/Qwen2-72B-Instruct,1.00,0.3928168861285181,0.12345162681603773,0.3843593208981698,0.0,0.17563989408649602,0.17563989408649602,0.389868803763904,0
|
15 |
Qwen/Qwen2-72B-Instruct,1.02,0.3936651928828143,0.12446659906815814,0.3844415446718956,0.0,0.147396293027361,0.147396293027361,0.39117939588436124,0
|
16 |
Qwen/Qwen2-72B-Instruct,1.04,0.39263683565035906,0.12496255366843562,0.38481746782098636,0.0,0.15798764342453664,0.15798764342453664,0.38998196316138,0
|
@@ -43,9 +48,23 @@ Qwen/Qwen2-7B-Instruct,1.24,0.352755026120472,0.08591470945904531,0.339935275908
|
|
43 |
Qwen/Qwen2-7B-Instruct,1.26,0.3483233677173315,0.07972359456247886,0.3352291660383133,0.0,0.08561341571050309,0.08561341571050309,0.3470385209221742,0
|
44 |
Qwen/Qwen2-7B-Instruct,1.28,0.34450122231539704,0.07518096876457613,0.33099576010918924,0.0,0.09179170344218888,0.09179170344218888,0.34313954918633316,1
|
45 |
Qwen/Qwen2-7B-Instruct,1.30,0.3401098279932269,0.07026740554261787,0.32623150769341913,0.0,0.09002647837599294,0.09002647837599294,0.3387911491977248,3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.3815423445635067,0.11524878188694271,0.37042646286690667,0.0,0.1968225948808473,0.1968225948808473,0.37833975022913946,0
|
47 |
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.3814400195917603,0.11481993983759356,0.3699786095406088,0.0,0.2118270079435128,0.2118270079435128,0.3779989256169545,0
|
48 |
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.3804874897233306,0.11332999451398112,0.36959666465385377,0.0,0.1879964695498676,0.1879964695498676,0.3774345028777672,0
|
|
|
|
|
|
|
|
|
|
|
49 |
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3579680086793429,0.10207096308148353,0.3460132814937531,0.0,0.19240953221535745,0.19240953221535745,0.3550294775004645,0
|
50 |
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.3572226770743513,0.10061303169730976,0.3450507994469454,0.0,0.1615180935569285,0.1615180935569285,0.3547540871288482,0
|
51 |
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.35670586983276636,0.10074138007196803,0.3450245802338977,0.0,0.1615180935569285,0.1615180935569285,0.3542408512875192,0
|
|
|
1 |
model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap,num_max_output_tokens
|
2 |
+
01-ai/Yi-1.5-34B-Chat,1.00,0.3755299490820739,0.105238895159134,0.35899145567662555,0.0,0.43777581641659313,0.43777581641659313,0.3686697552536047,4
|
3 |
01-ai/Yi-1.5-9B-Chat,1.00,0.3463725436435439,0.09312113035602035,0.33287597095291194,0.0,0.35127978817299205,0.35127978817299205,0.34125573890735983,2
|
4 |
01-ai/Yi-1.5-9B-Chat,1.02,0.3471185374158656,0.09126513887574451,0.3325894211716421,0.0,0.264783759929391,0.264783759929391,0.3432230230787291,4
|
5 |
01-ai/Yi-1.5-9B-Chat,1.04,0.3471882673119874,0.09019886552461354,0.33194600115482237,0.0,0.37775816416593117,0.37775816416593117,0.3416859125059273,8
|
|
|
12 |
01-ai/Yi-1.5-9B-Chat,1.18,0.3365273541015462,0.06786555450924157,0.31409006189011535,0.0,0.30979699911738745,0.30979699911738745,0.33212663825955735,26
|
13 |
01-ai/Yi-1.5-9B-Chat,1.20,0.3355307477803475,0.06314678954328107,0.3110853370928648,0.0,0.22241835834068843,0.22241835834068843,0.33235553904085485,36
|
14 |
01-ai/Yi-1.5-9B-Chat,1.22,0.33363375306882515,0.06214712430276763,0.3086913255065605,0.11827007943512798,0.25772285966460723,0.3715798764342454,0.32842981018186884,33
|
15 |
+
01-ai/Yi-1.5-9B-Chat,1.24,0.32989818112961883,0.0570096687900327,0.30266509467774577,0.0970873786407767,0.09796999117387467,0.19064430714916153,0.3272144846442304,43
|
16 |
+
01-ai/Yi-1.5-9B-Chat,1.26,0.3273962142810874,0.055237435454242444,0.3014277229065493,0.10944395410414828,0.40158870255957635,0.5110326566637247,0.32045973971100133,42
|
17 |
+
01-ai/Yi-1.5-9B-Chat,1.28,0.323837066013506,0.051278882542576266,0.2953285400642388,0.03000882612533098,0.1059135039717564,0.13592233009708737,0.32194938854685645,42
|
18 |
+
01-ai/Yi-1.5-9B-Chat,1.30,0.32093338081030204,0.047270589835958714,0.2909437718168147,0.02912621359223301,0.3698146513680494,0.3989408649602824,0.31557209228338484,49
|
19 |
Qwen/Qwen2-72B-Instruct,1.00,0.3928168861285181,0.12345162681603773,0.3843593208981698,0.0,0.17563989408649602,0.17563989408649602,0.389868803763904,0
|
20 |
Qwen/Qwen2-72B-Instruct,1.02,0.3936651928828143,0.12446659906815814,0.3844415446718956,0.0,0.147396293027361,0.147396293027361,0.39117939588436124,0
|
21 |
Qwen/Qwen2-72B-Instruct,1.04,0.39263683565035906,0.12496255366843562,0.38481746782098636,0.0,0.15798764342453664,0.15798764342453664,0.38998196316138,0
|
|
|
48 |
Qwen/Qwen2-7B-Instruct,1.26,0.3483233677173315,0.07972359456247886,0.3352291660383133,0.0,0.08561341571050309,0.08561341571050309,0.3470385209221742,0
|
49 |
Qwen/Qwen2-7B-Instruct,1.28,0.34450122231539704,0.07518096876457613,0.33099576010918924,0.0,0.09179170344218888,0.09179170344218888,0.34313954918633316,1
|
50 |
Qwen/Qwen2-7B-Instruct,1.30,0.3401098279932269,0.07026740554261787,0.32623150769341913,0.0,0.09002647837599294,0.09002647837599294,0.3387911491977248,3
|
51 |
+
internlm/internlm2_5-7b-chat,1.00,0.3671999390104764,0.11113560012478008,0.35982569682606647,0.0,0.14916151809355693,0.14916151809355693,0.36485386875796905,0
|
52 |
+
internlm/internlm2_5-7b-chat,1.02,0.3669871024548418,0.11052834918066008,0.36047543966393214,0.0,0.11738746690203,0.11738746690203,0.36513645123895805,0
|
53 |
+
internlm/internlm2_5-7b-chat,1.04,0.36431458209018286,0.10500475486417327,0.35796005034467937,0.0,0.09620476610767872,0.09620476610767872,0.36280598035582484,0
|
54 |
+
internlm/internlm2_5-7b-chat,1.06,0.35852062221276365,0.1007746558766565,0.35340523977241706,0.0,0.09532215357458076,0.09532215357458076,0.3570495131117442,0
|
55 |
+
internlm/internlm2_5-7b-chat,1.08,0.35020798408499415,0.0934055868567434,0.3442244092114163,0.0,0.11562224183583407,0.11562224183583407,0.3484682207171237,0
|
56 |
+
internlm/internlm2_5-7b-chat,1.10,0.34191587814517765,0.07320041801997382,0.32676144300373294,0.0,0.1262135922330097,0.1262135922330097,0.3400635236612148,1
|
57 |
+
internlm/internlm2_5-7b-chat,1.12,0.3343574558292014,0.052927581567650116,0.298405410760479,0.0,0.12886142983230361,0.12886142983230361,0.33250849918959285,6
|
58 |
+
internlm/internlm2_5-7b-chat,1.14,0.31458167312569474,0.03414901931637522,0.2613753073822461,0.01323918799646955,0.1562224183583407,0.16946160635481025,0.31230250164005646,50
|
59 |
+
internlm/internlm2_5-7b-chat,1.16,0.2895154344950096,0.021563541388547387,0.21947098060882128,0.1297440423654016,0.18446601941747573,0.3142100617828773,0.28567708514627493,155
|
60 |
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.3815423445635067,0.11524878188694271,0.37042646286690667,0.0,0.1968225948808473,0.1968225948808473,0.37833975022913946,0
|
61 |
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.3814400195917603,0.11481993983759356,0.3699786095406088,0.0,0.2118270079435128,0.2118270079435128,0.3779989256169545,0
|
62 |
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.3804874897233306,0.11332999451398112,0.36959666465385377,0.0,0.1879964695498676,0.1879964695498676,0.3774345028777672,0
|
63 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.3794618813204196,0.11264147540594219,0.3691306547739815,0.0,0.1879964695498676,0.1879964695498676,0.37641712383599774,0
|
64 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.3776298001192138,0.11139124543548622,0.3679447010986864,0.0,0.21977052074139453,0.21977052074139453,0.3740978880627333,0
|
65 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.3774364800258169,0.11069681096128998,0.36781965660594534,0.0,0.21712268314210062,0.21712268314210062,0.37394806502228206,0
|
66 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.12,0.3762356152613159,0.10985732529615087,0.36685494442700517,0.0,0.21712268314210062,0.21712268314210062,0.37275829911781017,0
|
67 |
+
shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.14,0.37576687641484824,0.10927960097864851,0.36660320893621023,0.0,0.1968225948808473,0.1968225948808473,0.3726127602686422,0
|
68 |
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3579680086793429,0.10207096308148353,0.3460132814937531,0.0,0.19240953221535745,0.19240953221535745,0.3550294775004645,0
|
69 |
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.3572226770743513,0.10061303169730976,0.3450507994469454,0.0,0.1615180935569285,0.1615180935569285,0.3547540871288482,0
|
70 |
shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.35670586983276636,0.10074138007196803,0.3450245802338977,0.0,0.1615180935569285,0.1615180935569285,0.3542408512875192,0
|
scripts/eval-mac.sh
CHANGED
@@ -14,12 +14,13 @@ grep MemTotal /proc/meminfo
|
|
14 |
# pip install torch torchvision torchaudio
|
15 |
# pip install -r requirements.txt
|
16 |
|
17 |
-
|
|
|
|
|
18 |
|
19 |
./scripts/eval-model.sh Qwen/Qwen2-7B-Instruct
|
20 |
|
21 |
-
./scripts/eval-model.sh shenzhi-wang/
|
22 |
|
23 |
-
|
24 |
|
25 |
-
#./scripts/eval-model.sh internlm/internlm2_5-7b-chat
|
|
|
14 |
# pip install torch torchvision torchaudio
|
15 |
# pip install -r requirements.txt
|
16 |
|
17 |
+
./scripts/eval-model.sh 01-ai/Yi-1.5-9B-Chat
|
18 |
+
|
19 |
+
./scripts/eval-model.sh internlm/internlm2_5-7b-chat
|
20 |
|
21 |
./scripts/eval-model.sh Qwen/Qwen2-7B-Instruct
|
22 |
|
23 |
+
./scripts/eval-model.sh shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat
|
24 |
|
25 |
+
./scripts/eval-model.sh shenzhi-wang/Llama3.1-8B-Chinese-Chat
|
26 |
|
|