dh-mc committed on
Commit
6c91c84
·
1 Parent(s): eb7323d

eval with few-shot prompting

Browse files
.env.example CHANGED
@@ -7,4 +7,4 @@ HF_TOKEN=
7
  LOAD_IN_4BIT=false
8
 
9
  DATA_PATH=datasets/mac/mac.tsv
10
- RESULTS_PATH=results/mac-results.csv
 
7
  LOAD_IN_4BIT=false
8
 
9
  DATA_PATH=datasets/mac/mac.tsv
10
+ RESULTS_PATH=results/mac-results_few_shots.csv
llm_toolkit/eval.py CHANGED
@@ -28,7 +28,6 @@ results_path = os.getenv("RESULTS_PATH")
28
  batch_size = int(os.getenv("BATCH_SIZE", 1))
29
  use_english_datasets = os.getenv("USE_ENGLISH_DATASETS") == "true"
30
  max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))
31
- start_repetition_penalty = float(os.getenv("START_REPETITION_PENALTY", 1.0))
32
 
33
  print(
34
  model_name,
@@ -62,41 +61,68 @@ if is_cuda:
62
  print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
63
  print(f"{start_gpu_memory} GB of memory reserved.")
64
 
65
- datasets = load_translation_dataset(data_path, tokenizer)
66
 
67
- if len(sys.argv) > 1:
68
- num = int(sys.argv[1])
69
- if num > 0:
70
- print(f"--- evaluating {num} entries")
71
- datasets["test"] = datasets["test"].select(range(num))
72
-
73
- print_row_details(datasets["test"].to_pandas(), indices=[0, -1])
74
-
75
-
76
- def on_repetition_penalty_step_completed(model_name, predictions):
77
  save_results(
78
  model_name,
79
  results_path,
80
- datasets["test"],
81
  predictions,
82
  )
83
 
84
- metrics = calc_metrics(datasets["test"]["english"], predictions, debug=True)
85
  print(f"{model_name} metrics: {metrics}")
86
 
87
 
88
  if adapter_name_or_path is not None:
89
  model_name += "/" + adapter_name_or_path.split("/")[-1]
90
 
91
- evaluate_model_with_repetition_penalty(
 
92
  model,
93
  tokenizer,
94
  model_name,
95
- datasets["test"],
96
- on_repetition_penalty_step_completed,
97
- start_repetition_penalty=start_repetition_penalty,
98
- end_repetition_penalty=1.3,
99
- step_repetition_penalty=0.02,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  batch_size=batch_size,
101
  max_new_tokens=max_new_tokens,
102
  device=device,
 
28
  batch_size = int(os.getenv("BATCH_SIZE", 1))
29
  use_english_datasets = os.getenv("USE_ENGLISH_DATASETS") == "true"
30
  max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 2048))
 
31
 
32
  print(
33
  model_name,
 
61
  print(f"(2) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
62
  print(f"{start_gpu_memory} GB of memory reserved.")
63
 
 
64
 
65
+ def on_num_shots_step_completed(model_name, dataset, predictions):
 
 
 
 
 
 
 
 
 
66
  save_results(
67
  model_name,
68
  results_path,
69
+ dataset,
70
  predictions,
71
  )
72
 
73
+ metrics = calc_metrics(dataset["english"], predictions, debug=True)
74
  print(f"{model_name} metrics: {metrics}")
75
 
76
 
77
  if adapter_name_or_path is not None:
78
  model_name += "/" + adapter_name_or_path.split("/")[-1]
79
 
80
+
81
+ def evaluate_model_with_num_shots(
82
  model,
83
  tokenizer,
84
  model_name,
85
+ data_path,
86
+ range_num_shots=[0, 1, 3, 5, 10, 50],
87
+ batch_size=1,
88
+ max_new_tokens=2048,
89
+ device="cuda",
90
+ ):
91
+ print(f"Evaluating model: {model_name} on {device}")
92
+
93
+ for num_shots in range_num_shots:
94
+ print(f"*** Evaluating with num_shots: {num_shots}")
95
+
96
+ datasets = load_translation_dataset(data_path, tokenizer, num_shots=num_shots)
97
+ print_row_details(datasets["test"].to_pandas())
98
+
99
+ predictions = eval_model(
100
+ model,
101
+ tokenizer,
102
+ datasets["test"],
103
+ device=device,
104
+ num_shots=num_shots,
105
+ batch_size=batch_size,
106
+ max_new_tokens=max_new_tokens,
107
+ )
108
+
109
+ model_name_with_rp = f"{model_name}/shots-{num_shots:02d}"
110
+
111
+ try:
112
+ on_num_shots_step_completed(
113
+ model_name_with_rp,
114
+ datasets["test"],
115
+ predictions,
116
+ )
117
+ except Exception as e:
118
+ print(e)
119
+
120
+
121
+ evaluate_model_with_num_shots(
122
+ model,
123
+ tokenizer,
124
+ model_name,
125
+ data_path,
126
  batch_size=batch_size,
127
  max_new_tokens=max_new_tokens,
128
  device=device,
llm_toolkit/translation_utils.py CHANGED
@@ -91,7 +91,7 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
91
  df.to_csv(results_path, index=False)
92
 
93
 
94
- def load_translation_dataset(data_path, tokenizer=None):
95
  train_data_file = data_path.replace(".tsv", "-train.tsv")
96
  test_data_file = data_path.replace(".tsv", "-test.tsv")
97
 
@@ -122,7 +122,17 @@ def load_translation_dataset(data_path, tokenizer=None):
122
  )
123
 
124
  if tokenizer:
125
- translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
 
 
 
 
 
 
 
 
 
 
126
 
127
  def formatting_prompts_func(examples):
128
  inputs = examples["chinese"]
@@ -131,7 +141,7 @@ def load_translation_dataset(data_path, tokenizer=None):
131
  messages = [
132
  {
133
  "role": "system",
134
- "content": "You are an expert in translating Chinese to English.",
135
  },
136
  None,
137
  ]
 
91
  df.to_csv(results_path, index=False)
92
 
93
 
94
+ def load_translation_dataset(data_path, tokenizer=None, num_shots=5):
95
  train_data_file = data_path.replace(".tsv", "-train.tsv")
96
  test_data_file = data_path.replace(".tsv", "-test.tsv")
97
 
 
122
  )
123
 
124
  if tokenizer:
125
+ translation_prompt = "You will be given a Chinese sentence to translate. If it is an incomplete sentence, or if you are unsure about the meaning, simply copy the input text as your output. Do not output any additional sentence such as explanation or reasoning.\n\n"
126
+ if num_shots > 0:
127
+ example_translations = "Example Translations:\n"
128
+ for i in range(num_shots):
129
+ example_translations += f"Chinese: {datasets['train'][i]['chinese']}\n"
130
+ example_translations += (
131
+ f"English: {datasets['train'][i]['english']}\n\n"
132
+ )
133
+ translation_prompt = translation_prompt + example_translations
134
+
135
+ translation_prompt = translation_prompt + "Chinese: {}\nEnglish:"
136
 
137
  def formatting_prompts_func(examples):
138
  inputs = examples["chinese"]
 
141
  messages = [
142
  {
143
  "role": "system",
144
+ "content": "You are a helpful assistant that translates Chinese to English.",
145
  },
146
  None,
147
  ]
notebooks/00_Data Analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
results/mac-results_metrics.csv CHANGED
@@ -1,4 +1,5 @@
1
  model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap,num_max_output_tokens
 
2
  01-ai/Yi-1.5-9B-Chat,1.00,0.3463725436435439,0.09312113035602035,0.33287597095291194,0.0,0.35127978817299205,0.35127978817299205,0.34125573890735983,2
3
  01-ai/Yi-1.5-9B-Chat,1.02,0.3471185374158656,0.09126513887574451,0.3325894211716421,0.0,0.264783759929391,0.264783759929391,0.3432230230787291,4
4
  01-ai/Yi-1.5-9B-Chat,1.04,0.3471882673119874,0.09019886552461354,0.33194600115482237,0.0,0.37775816416593117,0.37775816416593117,0.3416859125059273,8
@@ -11,6 +12,10 @@ model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap
11
  01-ai/Yi-1.5-9B-Chat,1.18,0.3365273541015462,0.06786555450924157,0.31409006189011535,0.0,0.30979699911738745,0.30979699911738745,0.33212663825955735,26
12
  01-ai/Yi-1.5-9B-Chat,1.20,0.3355307477803475,0.06314678954328107,0.3110853370928648,0.0,0.22241835834068843,0.22241835834068843,0.33235553904085485,36
13
  01-ai/Yi-1.5-9B-Chat,1.22,0.33363375306882515,0.06214712430276763,0.3086913255065605,0.11827007943512798,0.25772285966460723,0.3715798764342454,0.32842981018186884,33
 
 
 
 
14
  Qwen/Qwen2-72B-Instruct,1.00,0.3928168861285181,0.12345162681603773,0.3843593208981698,0.0,0.17563989408649602,0.17563989408649602,0.389868803763904,0
15
  Qwen/Qwen2-72B-Instruct,1.02,0.3936651928828143,0.12446659906815814,0.3844415446718956,0.0,0.147396293027361,0.147396293027361,0.39117939588436124,0
16
  Qwen/Qwen2-72B-Instruct,1.04,0.39263683565035906,0.12496255366843562,0.38481746782098636,0.0,0.15798764342453664,0.15798764342453664,0.38998196316138,0
@@ -43,9 +48,23 @@ Qwen/Qwen2-7B-Instruct,1.24,0.352755026120472,0.08591470945904531,0.339935275908
43
  Qwen/Qwen2-7B-Instruct,1.26,0.3483233677173315,0.07972359456247886,0.3352291660383133,0.0,0.08561341571050309,0.08561341571050309,0.3470385209221742,0
44
  Qwen/Qwen2-7B-Instruct,1.28,0.34450122231539704,0.07518096876457613,0.33099576010918924,0.0,0.09179170344218888,0.09179170344218888,0.34313954918633316,1
45
  Qwen/Qwen2-7B-Instruct,1.30,0.3401098279932269,0.07026740554261787,0.32623150769341913,0.0,0.09002647837599294,0.09002647837599294,0.3387911491977248,3
 
 
 
 
 
 
 
 
 
46
  shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.3815423445635067,0.11524878188694271,0.37042646286690667,0.0,0.1968225948808473,0.1968225948808473,0.37833975022913946,0
47
  shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.3814400195917603,0.11481993983759356,0.3699786095406088,0.0,0.2118270079435128,0.2118270079435128,0.3779989256169545,0
48
  shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.3804874897233306,0.11332999451398112,0.36959666465385377,0.0,0.1879964695498676,0.1879964695498676,0.3774345028777672,0
 
 
 
 
 
49
  shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3579680086793429,0.10207096308148353,0.3460132814937531,0.0,0.19240953221535745,0.19240953221535745,0.3550294775004645,0
50
  shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.3572226770743513,0.10061303169730976,0.3450507994469454,0.0,0.1615180935569285,0.1615180935569285,0.3547540871288482,0
51
  shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.35670586983276636,0.10074138007196803,0.3450245802338977,0.0,0.1615180935569285,0.1615180935569285,0.3542408512875192,0
 
1
  model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap,num_max_output_tokens
2
+ 01-ai/Yi-1.5-34B-Chat,1.00,0.3755299490820739,0.105238895159134,0.35899145567662555,0.0,0.43777581641659313,0.43777581641659313,0.3686697552536047,4
3
  01-ai/Yi-1.5-9B-Chat,1.00,0.3463725436435439,0.09312113035602035,0.33287597095291194,0.0,0.35127978817299205,0.35127978817299205,0.34125573890735983,2
4
  01-ai/Yi-1.5-9B-Chat,1.02,0.3471185374158656,0.09126513887574451,0.3325894211716421,0.0,0.264783759929391,0.264783759929391,0.3432230230787291,4
5
  01-ai/Yi-1.5-9B-Chat,1.04,0.3471882673119874,0.09019886552461354,0.33194600115482237,0.0,0.37775816416593117,0.37775816416593117,0.3416859125059273,8
 
12
  01-ai/Yi-1.5-9B-Chat,1.18,0.3365273541015462,0.06786555450924157,0.31409006189011535,0.0,0.30979699911738745,0.30979699911738745,0.33212663825955735,26
13
  01-ai/Yi-1.5-9B-Chat,1.20,0.3355307477803475,0.06314678954328107,0.3110853370928648,0.0,0.22241835834068843,0.22241835834068843,0.33235553904085485,36
14
  01-ai/Yi-1.5-9B-Chat,1.22,0.33363375306882515,0.06214712430276763,0.3086913255065605,0.11827007943512798,0.25772285966460723,0.3715798764342454,0.32842981018186884,33
15
+ 01-ai/Yi-1.5-9B-Chat,1.24,0.32989818112961883,0.0570096687900327,0.30266509467774577,0.0970873786407767,0.09796999117387467,0.19064430714916153,0.3272144846442304,43
16
+ 01-ai/Yi-1.5-9B-Chat,1.26,0.3273962142810874,0.055237435454242444,0.3014277229065493,0.10944395410414828,0.40158870255957635,0.5110326566637247,0.32045973971100133,42
17
+ 01-ai/Yi-1.5-9B-Chat,1.28,0.323837066013506,0.051278882542576266,0.2953285400642388,0.03000882612533098,0.1059135039717564,0.13592233009708737,0.32194938854685645,42
18
+ 01-ai/Yi-1.5-9B-Chat,1.30,0.32093338081030204,0.047270589835958714,0.2909437718168147,0.02912621359223301,0.3698146513680494,0.3989408649602824,0.31557209228338484,49
19
  Qwen/Qwen2-72B-Instruct,1.00,0.3928168861285181,0.12345162681603773,0.3843593208981698,0.0,0.17563989408649602,0.17563989408649602,0.389868803763904,0
20
  Qwen/Qwen2-72B-Instruct,1.02,0.3936651928828143,0.12446659906815814,0.3844415446718956,0.0,0.147396293027361,0.147396293027361,0.39117939588436124,0
21
  Qwen/Qwen2-72B-Instruct,1.04,0.39263683565035906,0.12496255366843562,0.38481746782098636,0.0,0.15798764342453664,0.15798764342453664,0.38998196316138,0
 
48
  Qwen/Qwen2-7B-Instruct,1.26,0.3483233677173315,0.07972359456247886,0.3352291660383133,0.0,0.08561341571050309,0.08561341571050309,0.3470385209221742,0
49
  Qwen/Qwen2-7B-Instruct,1.28,0.34450122231539704,0.07518096876457613,0.33099576010918924,0.0,0.09179170344218888,0.09179170344218888,0.34313954918633316,1
50
  Qwen/Qwen2-7B-Instruct,1.30,0.3401098279932269,0.07026740554261787,0.32623150769341913,0.0,0.09002647837599294,0.09002647837599294,0.3387911491977248,3
51
+ internlm/internlm2_5-7b-chat,1.00,0.3671999390104764,0.11113560012478008,0.35982569682606647,0.0,0.14916151809355693,0.14916151809355693,0.36485386875796905,0
52
+ internlm/internlm2_5-7b-chat,1.02,0.3669871024548418,0.11052834918066008,0.36047543966393214,0.0,0.11738746690203,0.11738746690203,0.36513645123895805,0
53
+ internlm/internlm2_5-7b-chat,1.04,0.36431458209018286,0.10500475486417327,0.35796005034467937,0.0,0.09620476610767872,0.09620476610767872,0.36280598035582484,0
54
+ internlm/internlm2_5-7b-chat,1.06,0.35852062221276365,0.1007746558766565,0.35340523977241706,0.0,0.09532215357458076,0.09532215357458076,0.3570495131117442,0
55
+ internlm/internlm2_5-7b-chat,1.08,0.35020798408499415,0.0934055868567434,0.3442244092114163,0.0,0.11562224183583407,0.11562224183583407,0.3484682207171237,0
56
+ internlm/internlm2_5-7b-chat,1.10,0.34191587814517765,0.07320041801997382,0.32676144300373294,0.0,0.1262135922330097,0.1262135922330097,0.3400635236612148,1
57
+ internlm/internlm2_5-7b-chat,1.12,0.3343574558292014,0.052927581567650116,0.298405410760479,0.0,0.12886142983230361,0.12886142983230361,0.33250849918959285,6
58
+ internlm/internlm2_5-7b-chat,1.14,0.31458167312569474,0.03414901931637522,0.2613753073822461,0.01323918799646955,0.1562224183583407,0.16946160635481025,0.31230250164005646,50
59
+ internlm/internlm2_5-7b-chat,1.16,0.2895154344950096,0.021563541388547387,0.21947098060882128,0.1297440423654016,0.18446601941747573,0.3142100617828773,0.28567708514627493,155
60
  shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.3815423445635067,0.11524878188694271,0.37042646286690667,0.0,0.1968225948808473,0.1968225948808473,0.37833975022913946,0
61
  shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.02,0.3814400195917603,0.11481993983759356,0.3699786095406088,0.0,0.2118270079435128,0.2118270079435128,0.3779989256169545,0
62
  shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.04,0.3804874897233306,0.11332999451398112,0.36959666465385377,0.0,0.1879964695498676,0.1879964695498676,0.3774345028777672,0
63
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.06,0.3794618813204196,0.11264147540594219,0.3691306547739815,0.0,0.1879964695498676,0.1879964695498676,0.37641712383599774,0
64
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.08,0.3776298001192138,0.11139124543548622,0.3679447010986864,0.0,0.21977052074139453,0.21977052074139453,0.3740978880627333,0
65
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.10,0.3774364800258169,0.11069681096128998,0.36781965660594534,0.0,0.21712268314210062,0.21712268314210062,0.37394806502228206,0
66
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.12,0.3762356152613159,0.10985732529615087,0.36685494442700517,0.0,0.21712268314210062,0.21712268314210062,0.37275829911781017,0
67
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.14,0.37576687641484824,0.10927960097864851,0.36660320893621023,0.0,0.1968225948808473,0.1968225948808473,0.3726127602686422,0
68
  shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3579680086793429,0.10207096308148353,0.3460132814937531,0.0,0.19240953221535745,0.19240953221535745,0.3550294775004645,0
69
  shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.3572226770743513,0.10061303169730976,0.3450507994469454,0.0,0.1615180935569285,0.1615180935569285,0.3547540871288482,0
70
  shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.35670586983276636,0.10074138007196803,0.3450245802338977,0.0,0.1615180935569285,0.1615180935569285,0.3542408512875192,0
scripts/eval-mac.sh CHANGED
@@ -14,12 +14,13 @@ grep MemTotal /proc/meminfo
14
  # pip install torch torchvision torchaudio
15
  # pip install -r requirements.txt
16
 
17
- #./scripts/eval-model.sh shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat
 
 
18
 
19
  ./scripts/eval-model.sh Qwen/Qwen2-7B-Instruct
20
 
21
- ./scripts/eval-model.sh shenzhi-wang/Llama3.1-8B-Chinese-Chat
22
 
23
- #./scripts/eval-model.sh 01-ai/Yi-1.5-9B-Chat
24
 
25
- #./scripts/eval-model.sh internlm/internlm2_5-7b-chat
 
14
  # pip install torch torchvision torchaudio
15
  # pip install -r requirements.txt
16
 
17
+ ./scripts/eval-model.sh 01-ai/Yi-1.5-9B-Chat
18
+
19
+ ./scripts/eval-model.sh internlm/internlm2_5-7b-chat
20
 
21
  ./scripts/eval-model.sh Qwen/Qwen2-7B-Instruct
22
 
23
+ ./scripts/eval-model.sh shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat
24
 
25
+ ./scripts/eval-model.sh shenzhi-wang/Llama3.1-8B-Chinese-Chat
26