dh-mc committed on
Commit
3f6b774
·
1 Parent(s): 64e39c6

complete gpt-4o-mini training

Browse files
datasets/mac/openai-training.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llm_toolkit/eval_openai.py CHANGED
@@ -29,7 +29,7 @@ print(
29
  )
30
 
31
 
32
- def on_num_shots_step_completed(model_name, dataset, predictions):
33
  save_results(
34
  model_name,
35
  results_path,
@@ -44,8 +44,10 @@ def on_num_shots_step_completed(model_name, dataset, predictions):
44
  def evaluate_model_with_num_shots(
45
  model_name,
46
  data_path,
 
47
  range_num_shots=[0, 1, 3, 5, 10, 50],
48
  max_new_tokens=2048,
 
49
  ):
50
  print(f"Evaluating model: {model_name}")
51
 
@@ -56,20 +58,24 @@ def evaluate_model_with_num_shots(
56
  print(f"*** Evaluating with num_shots: {num_shots}")
57
 
58
  predictions = eval_openai(num_shots, datasets, max_new_tokens=max_new_tokens)
59
- model_name_with_shorts = f"{model_name}/shots-{num_shots:02d}"
 
 
 
 
60
 
61
  try:
62
  on_num_shots_step_completed(
63
- model_name_with_shorts,
64
- datasets["test"],
65
- predictions,
66
  )
67
  except Exception as e:
68
  print(e)
69
 
70
 
71
- evaluate_model_with_num_shots(
72
- model_name,
73
- data_path,
74
- max_new_tokens=max_new_tokens,
75
- )
 
 
 
29
  )
30
 
31
 
32
+ def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
33
  save_results(
34
  model_name,
35
  results_path,
 
44
  def evaluate_model_with_num_shots(
45
  model_name,
46
  data_path,
47
+ results_path=None,
48
  range_num_shots=[0, 1, 3, 5, 10, 50],
49
  max_new_tokens=2048,
50
+ result_column_name=None,
51
  ):
52
  print(f"Evaluating model: {model_name}")
53
 
 
58
  print(f"*** Evaluating with num_shots: {num_shots}")
59
 
60
  predictions = eval_openai(num_shots, datasets, max_new_tokens=max_new_tokens)
61
+ model_name_with_shorts = (
62
+ result_column_name
63
+ if result_column_name
64
+ else f"{model_name}/shots-{num_shots:02d}"
65
+ )
66
 
67
  try:
68
  on_num_shots_step_completed(
69
+ model_name_with_shorts, datasets["test"], predictions, results_path
 
 
70
  )
71
  except Exception as e:
72
  print(e)
73
 
74
 
75
+ if __name__ == "__main__":
76
+ evaluate_model_with_num_shots(
77
+ model_name,
78
+ data_path,
79
+ results_path=results_path,
80
+ max_new_tokens=max_new_tokens,
81
+ )
llm_toolkit/translation_utils.py CHANGED
@@ -18,6 +18,7 @@ bleu = evaluate.load("bleu")
18
  rouge = evaluate.load("rouge")
19
  meteor = evaluate.load("meteor")
20
  accuracy = evaluate.load("accuracy")
 
21
 
22
 
23
  def extract_answer(text, debug=False):
@@ -54,6 +55,10 @@ def calc_metrics(references, predictions, debug=False):
54
  "meteor"
55
  ]
56
 
 
 
 
 
57
  results["bleu_scores"] = bleu.compute(
58
  predictions=predictions, references=references, max_order=4
59
  )
@@ -108,7 +113,7 @@ def get_few_shot_prompt(dataset, num_shots=5):
108
  return translation_prompt
109
 
110
 
111
- def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
112
  train_data_file = data_path.replace(".tsv", "-train.tsv")
113
  test_data_file = data_path.replace(".tsv", "-test.tsv")
114
 
@@ -138,7 +143,7 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
138
  delimiter="\t",
139
  )
140
 
141
- if tokenizer:
142
  translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)
143
 
144
  def formatting_prompts_func(examples):
@@ -164,11 +169,23 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
164
  prompt = translation_prompt.format(input=input)
165
  messages[-1] = {"role": "user", "content": prompt}
166
 
167
- prompt = tokenizer.apply_chat_template(
168
- messages, tokenize=False, add_generation_prompt=True
169
- )
170
- prompts.append(prompt)
171
- texts.append(prompt + output + tokenizer.eos_token)
 
 
 
 
 
 
 
 
 
 
 
 
172
  return {"text": texts, "prompt": prompts}
173
 
174
  datasets = datasets.map(
@@ -216,6 +233,11 @@ def detect_repetition_scores(row, col, debug=False):
216
  )
217
 
218
 
 
 
 
 
 
219
  def get_metrics(df, max_output_tokens=2048, variant="rpp"):
220
  metrics_df = pd.DataFrame(df.columns.T)[2:]
221
  metrics_df.rename(columns={0: "model"}, inplace=True)
@@ -235,12 +257,14 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
235
  tokenizers = {model: load_tokenizer(model) for model in models}
236
 
237
  meteor = []
 
238
  bleu_1 = []
239
  rouge_l = []
240
  ews_score = []
241
  repetition_score = []
242
  total_repetitions = []
243
  num_max_output_tokens = []
 
244
  columns = df.columns[2:]
245
 
246
  df[
@@ -256,6 +280,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
256
  print(f"{col}: {metrics}")
257
 
258
  meteor.append(metrics["meteor"])
 
259
  bleu_1.append(metrics["bleu_scores"]["bleu"])
260
  rouge_l.append(metrics["rouge_scores"]["rougeL"])
261
 
@@ -273,6 +298,10 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
273
  lambda x: len(tokenizers[model](x)["input_ids"])
274
  )
275
 
 
 
 
 
276
  new_col = f"output_tokens-{col}"
277
  df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
278
 
@@ -281,6 +310,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
281
  )
282
 
283
  metrics_df["meteor"] = meteor
 
284
  metrics_df["bleu_1"] = bleu_1
285
  metrics_df["rouge_l"] = rouge_l
286
  metrics_df["ews_score"] = ews_score
@@ -290,6 +320,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
290
  lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
291
  )
292
 
 
293
  metrics_df["num_max_output_tokens"] = num_max_output_tokens
294
 
295
  if variant != "rpp":
@@ -328,6 +359,12 @@ def analyze_translation_results(df, col, max_new_tokens=300, repetition_threshol
328
  )
329
  print_row_details(df2, range(len(df2)))
330
 
 
 
 
 
 
 
331
 
332
  def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
333
  plt.figure(figsize=figsize)
@@ -604,3 +641,44 @@ def load_alpaca_data(data_path):
604
  df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)
605
 
606
  return df_alpaca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  rouge = evaluate.load("rouge")
19
  meteor = evaluate.load("meteor")
20
  accuracy = evaluate.load("accuracy")
21
+ sacrebleu = evaluate.load("sacrebleu")
22
 
23
 
24
  def extract_answer(text, debug=False):
 
55
  "meteor"
56
  ]
57
 
58
+ results["sacrebleu"] = sacrebleu.compute(
59
+ predictions=predictions, references=references
60
+ )
61
+
62
  results["bleu_scores"] = bleu.compute(
63
  predictions=predictions, references=references, max_order=4
64
  )
 
113
  return translation_prompt
114
 
115
 
116
+ def load_translation_dataset(data_path, tokenizer=None, num_shots=0, for_openai=False):
117
  train_data_file = data_path.replace(".tsv", "-train.tsv")
118
  test_data_file = data_path.replace(".tsv", "-test.tsv")
119
 
 
143
  delimiter="\t",
144
  )
145
 
146
+ if tokenizer or for_openai:
147
  translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)
148
 
149
  def formatting_prompts_func(examples):
 
169
  prompt = translation_prompt.format(input=input)
170
  messages[-1] = {"role": "user", "content": prompt}
171
 
172
+ if for_openai:
173
+ prompts.append(messages.copy())
174
+ text = messages.copy()
175
+ text.append(
176
+ {
177
+ "role": "assistant",
178
+ "content": output,
179
+ }
180
+ )
181
+ texts.append(text)
182
+ else:
183
+ prompt = tokenizer.apply_chat_template(
184
+ messages, tokenize=False, add_generation_prompt=True
185
+ )
186
+ prompts.append(prompt)
187
+ texts.append(prompt + output + tokenizer.eos_token)
188
+
189
  return {"text": texts, "prompt": prompts}
190
 
191
  datasets = datasets.map(
 
233
  )
234
 
235
 
236
+ def contains_chinese(text):
237
+ chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
238
+ return 1 if chinese_char_pattern.search(text) else 0
239
+
240
+
241
  def get_metrics(df, max_output_tokens=2048, variant="rpp"):
242
  metrics_df = pd.DataFrame(df.columns.T)[2:]
243
  metrics_df.rename(columns={0: "model"}, inplace=True)
 
257
  tokenizers = {model: load_tokenizer(model) for model in models}
258
 
259
  meteor = []
260
+ spbleu = []
261
  bleu_1 = []
262
  rouge_l = []
263
  ews_score = []
264
  repetition_score = []
265
  total_repetitions = []
266
  num_max_output_tokens = []
267
+ num_incomplete_translations = []
268
  columns = df.columns[2:]
269
 
270
  df[
 
280
  print(f"{col}: {metrics}")
281
 
282
  meteor.append(metrics["meteor"])
283
+ spbleu.append(metrics["sacrebleu"]["score"])
284
  bleu_1.append(metrics["bleu_scores"]["bleu"])
285
  rouge_l.append(metrics["rouge_scores"]["rougeL"])
286
 
 
298
  lambda x: len(tokenizers[model](x)["input_ids"])
299
  )
300
 
301
+ new_col = f"contains_chinese-{col}"
302
+ df[new_col] = df[col].apply(contains_chinese)
303
+ num_incomplete_translations.append(df[new_col].sum())
304
+
305
  new_col = f"output_tokens-{col}"
306
  df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
307
 
 
310
  )
311
 
312
  metrics_df["meteor"] = meteor
313
+ metrics_df["spbleu"] = spbleu
314
  metrics_df["bleu_1"] = bleu_1
315
  metrics_df["rouge_l"] = rouge_l
316
  metrics_df["ews_score"] = ews_score
 
320
  lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
321
  )
322
 
323
+ metrics_df["num_incomplete_translations"] = num_incomplete_translations
324
  metrics_df["num_max_output_tokens"] = num_max_output_tokens
325
 
326
  if variant != "rpp":
 
359
  )
360
  print_row_details(df2, range(len(df2)))
361
 
362
+ contains_chinese = f"contains_chinese-{col}"
363
+ df3 = df[df[contains_chinese] > 0][["chinese", "english", col, contains_chinese]]
364
+
365
+ print(f"\n*** Found {len(df3)} rows with incomplete translations for {col}")
366
+ print_row_details(df3, range(len(df3)))
367
+
368
 
369
  def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
370
  plt.figure(figsize=figsize)
 
641
  df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)
642
 
643
  return df_alpaca
644
+
645
+
646
+ def load_openai_training_data(
647
+ data_path, openai_data_path="datasets/mac/openai-training.jsonl"
648
+ ):
649
+ if os.path.exists(openai_data_path):
650
+ print("loading existing data from:", openai_data_path)
651
+ data = pd.read_json(openai_data_path, orient="records", lines=True)
652
+ return data
653
+
654
+ datasets = load_translation_dataset(data_path)
655
+ prompt_template = get_few_shot_prompt(datasets["train"], num_shots=0)
656
+
657
+ df_train = datasets["train"].to_pandas()
658
+ messages = []
659
+
660
+ for i, row in df_train.iterrows():
661
+ messages.append(
662
+ [
663
+ {
664
+ "role": "system",
665
+ "content": system_prompt,
666
+ },
667
+ {
668
+ "role": "user",
669
+ "content": prompt_template.format(input=row["chinese"]),
670
+ },
671
+ {
672
+ "role": "assistant",
673
+ "content": row["english"],
674
+ },
675
+ ]
676
+ )
677
+
678
+ df_openai = pd.DataFrame(
679
+ {
680
+ "messages": messages,
681
+ }
682
+ )
683
+ df_openai.to_json(openai_data_path, orient="records", lines=True)
684
+ return df_openai
logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt} RENAMED
File without changes
logs/l40-4gpu-1.txt ADDED
The diff for this file is too large to render. See raw diff
 
logs/l40-4gpu.txt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:289a8bbbf208650bc4a0cc3b86578f8a7db73ef68bbefa3c55c3eedf94a38ed0
3
- size 878270
 
 
 
 
logs/openai-gpt-4o-mini-fine-tuned.txt ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Qwen/Qwen2-7B-Instruct None False datasets/mac/mac.tsv results/mac-results.csv False 300
2
+ loading env vars from: /Users/inflaton/code/engd/papers/rapget-translation/.env
3
+ workding dir: /Users/inflaton/code/engd/papers/rapget-translation
4
+ Python 3.11.9
5
+ Name: torch
6
+ Version: 2.4.0
7
+ Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
8
+ Home-page: https://pytorch.org/
9
+ Author: PyTorch Team
10
+ Author-email: packages@pytorch.org
11
+ License: BSD-3
12
+ Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
13
+ Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
14
+ Required-by: accelerate, peft, torchaudio, torchvision, trl
15
+ ---
16
+ Name: transformers
17
+ Version: 4.43.3
18
+ Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
19
+ Home-page: https://github.com/huggingface/transformers
20
+ Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
21
+ Author-email: transformers@huggingface.co
22
+ License: Apache 2.0 License
23
+ Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
24
+ Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
25
+ Required-by: llamafactory, peft, trl
26
+ CPU times: user 8.97 ms, sys: 13.7 ms, total: 22.7 ms
27
+ Wall time: 1.91 s
28
+ MPS is available
29
+ loading existing data from: logs/openai-training-sample.jsonl
30
+ messages
31
+ 0 [{'role': 'system', 'content': 'Marv is a fact...
32
+ 1 [{'role': 'system', 'content': 'Marv is a fact...
33
+ 2 [{'role': 'system', 'content': 'Marv is a fact...
34
+ FileObject(id='file-IokPHn4YWcniXL4wGnK4xVmn', bytes=3413094, created_at=1723269681, filename='openai-training.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
35
+ FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=6, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=[], seed=1046194933, status='validating_files', trained_tokens=None, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
36
+ FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs', finished_at=1723272532, hyperparameters=Hyperparameters(n_epochs=6, batch_size=18, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=['file-aCppW0GWhhytwe4yKwymNUZl'], seed=1046194933, status='succeeded', trained_tokens=3640956, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
37
+ Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ufuULvy
38
+ loading train/test data files
39
+ DatasetDict({
40
+ train: Dataset({
41
+ features: ['chinese', 'english'],
42
+ num_rows: 4528
43
+ })
44
+ test: Dataset({
45
+ features: ['chinese', 'english'],
46
+ num_rows: 1133
47
+ })
48
+ })
49
+ --------------------------------------------------
50
+ chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
51
+ --------------------------------------------------
52
+ english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
53
+ *** Evaluating with num_shots: 0
54
+ 100%|██████████| 1133/1133 [16:48<00:00, 1.12it/s]
55
+ gpt-4o-mini/epochs-01 metrics: {'meteor': 0.3785370331806402, 'sacrebleu': {'score': 12.052844230027103, 'counts': [12818, 4623, 2153, 1081], 'totals': [29097, 27964, 26850, 25740], 'precisions': [44.05265147609719, 16.53196967529681, 8.018621973929237, 4.1996891996892], 'bp': 0.9631327655852462, 'sys_len': 29097, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12052844230027103, 'precisions': [0.44052651476097193, 0.1653196967529681, 0.08018621973929237, 0.041996891996891994], 'brevity_penalty': 0.9631327655852462, 'length_ratio': 0.9637959589267969, 'translation_length': 29097, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4244007719128182, 'rouge2': 0.17601540674784633, 'rougeL': 0.3693615986543504, 'rougeLsum': 0.3696442718692141}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
56
+ Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug0Gt3w
57
+ loading train/test data files
58
+ DatasetDict({
59
+ train: Dataset({
60
+ features: ['chinese', 'english'],
61
+ num_rows: 4528
62
+ })
63
+ test: Dataset({
64
+ features: ['chinese', 'english'],
65
+ num_rows: 1133
66
+ })
67
+ })
68
+ --------------------------------------------------
69
+ chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
70
+ --------------------------------------------------
71
+ english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
72
+ *** Evaluating with num_shots: 0
73
+ 100%|██████████| 1133/1133 [17:56<00:00, 1.05it/s]
74
+ gpt-4o-mini/epochs-02 metrics: {'meteor': 0.3785921332515917, 'sacrebleu': {'score': 12.033706874864837, 'counts': [12801, 4628, 2150, 1076], 'totals': [29076, 27943, 26830, 25722], 'precisions': [44.02600082542303, 16.562287513867517, 8.013417815877748, 4.183189487598165], 'bp': 0.9624112877781842, 'sys_len': 29076, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12033706874864836, 'precisions': [0.4402600082542303, 0.16562287513867516, 0.08013417815877749, 0.04183189487598165], 'brevity_penalty': 0.9624112877781842, 'length_ratio': 0.9631003643590593, 'translation_length': 29076, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235104923203792, 'rouge2': 0.1758318317686482, 'rougeL': 0.36922125683186846, 'rougeLsum': 0.3693808162149962}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
75
+ Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug5PhpZ
76
+ loading train/test data files
77
+ DatasetDict({
78
+ train: Dataset({
79
+ features: ['chinese', 'english'],
80
+ num_rows: 4528
81
+ })
82
+ test: Dataset({
83
+ features: ['chinese', 'english'],
84
+ num_rows: 1133
85
+ })
86
+ })
87
+ --------------------------------------------------
88
+ chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
89
+ --------------------------------------------------
90
+ english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
91
+ *** Evaluating with num_shots: 0
92
+ 100%|██████████| 1133/1133 [17:02<00:00, 1.11it/s]
93
+ gpt-4o-mini/epochs-03 metrics: {'meteor': 0.37736228106121694, 'sacrebleu': {'score': 11.933111335430906, 'counts': [12779, 4601, 2124, 1061], 'totals': [29096, 27963, 26848, 25737], 'precisions': [43.920126477866376, 16.453885491542394, 7.911203814064362, 4.122469596301046], 'bp': 0.9630984208616785, 'sys_len': 29096, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933111335430906, 'precisions': [0.4392012647786637, 0.16453885491542394, 0.07911203814064362, 0.041224695963010455], 'brevity_penalty': 0.9630984208616785, 'length_ratio': 0.9637628353759523, 'translation_length': 29096, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235319934194407, 'rouge2': 0.17493309683581332, 'rougeL': 0.3685697120399035, 'rougeLsum': 0.3689298428303013}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
94
+ Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugPThQI
95
+ loading train/test data files
96
+ DatasetDict({
97
+ train: Dataset({
98
+ features: ['chinese', 'english'],
99
+ num_rows: 4528
100
+ })
101
+ test: Dataset({
102
+ features: ['chinese', 'english'],
103
+ num_rows: 1133
104
+ })
105
+ })
106
+ --------------------------------------------------
107
+ chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
108
+ --------------------------------------------------
109
+ english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
110
+ *** Evaluating with num_shots: 0
111
+ 100%|██████████| 1133/1133 [18:35<00:00, 1.02it/s]
112
+ gpt-4o-mini/epochs-04 metrics: {'meteor': 0.37818535038887346, 'sacrebleu': {'score': 11.933285526593995, 'counts': [12797, 4601, 2121, 1061], 'totals': [29110, 27977, 26861, 25749], 'precisions': [43.960838199931295, 16.445651785395146, 7.896206395889952, 4.120548370810517], 'bp': 0.9635791436286372, 'sys_len': 29110, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933285526593994, 'precisions': [0.43960838199931296, 0.16445651785395146, 0.07896206395889951, 0.041205483708105166], 'brevity_penalty': 0.9635791436286371, 'length_ratio': 0.9642265650877774, 'translation_length': 29110, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42372801674771476, 'rouge2': 0.17487358435014705, 'rougeL': 0.36931437347367646, 'rougeLsum': 0.36934766241132383}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
113
+
114
+ Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugVLmcB
115
+ loading train/test data files
116
+ DatasetDict({
117
+ train: Dataset({
118
+ features: ['chinese', 'english'],
119
+ num_rows: 4528
120
+ })
121
+ test: Dataset({
122
+ features: ['chinese', 'english'],
123
+ num_rows: 1133
124
+ })
125
+ })
126
+ --------------------------------------------------
127
+ chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
128
+ --------------------------------------------------
129
+ english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
130
+ *** Evaluating with num_shots: 0
131
+ 100%|██████████| 1133/1133 [15:47<00:00, 1.20it/s]
132
+ gpt-4o-mini/epochs-05 metrics: {'meteor': 0.3790673551140706, 'sacrebleu': {'score': 11.955698498650582, 'counts': [12808, 4609, 2126, 1064], 'totals': [29209, 28076, 26959, 25846], 'precisions': [43.849498442260945, 16.416156147599374, 7.88604918580066, 4.116691170780778], 'bp': 0.9669721941455759, 'sys_len': 29209, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11955698498650584, 'precisions': [0.4384949844226095, 0.16416156147599373, 0.0788604918580066, 0.041166911707807785], 'brevity_penalty': 0.9669721941455759, 'length_ratio': 0.9675057966213978, 'translation_length': 29209, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42476082012412075, 'rouge2': 0.17559955520032905, 'rougeL': 0.3700113513462385, 'rougeLsum': 0.37012014201963733}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
133
+ Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs
134
+ loading train/test data files
135
+ DatasetDict({
136
+ train: Dataset({
137
+ features: ['chinese', 'english'],
138
+ num_rows: 4528
139
+ })
140
+ test: Dataset({
141
+ features: ['chinese', 'english'],
142
+ num_rows: 1133
143
+ })
144
+ })
145
+ --------------------------------------------------
146
+ chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
147
+ --------------------------------------------------
148
+ english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
149
+ *** Evaluating with num_shots: 0
150
+ 100%|██████████| 1133/1133 [15:43<00:00, 1.20it/s]
151
+ gpt-4o-mini/epochs-06 metrics: {'meteor': 0.3792226866395673, 'sacrebleu': {'score': 11.982811850915233, 'counts': [12810, 4617, 2137, 1066], 'totals': [29116, 27983, 26868, 25757], 'precisions': [43.996428080780326, 16.499303148340065, 7.95369956825964, 4.138680746981403], 'bp': 0.9637850995333245, 'sys_len': 29116, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11982811850915229, 'precisions': [0.43996428080780325, 0.16499303148340064, 0.0795369956825964, 0.04138680746981403], 'brevity_penalty': 0.9637850995333245, 'length_ratio': 0.9644253063928453, 'translation_length': 29116, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4251187202203103, 'rouge2': 0.17553224521896635, 'rougeL': 0.37003282393672954, 'rougeLsum': 0.370114181474168}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
logs/openai-training-sample.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
2
+ {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
3
+ {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
notebooks/00b_Data Analysis_Few_Shots.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79fc6f45be27f13fe14be2e41598b8cc605cfcb71565a136603608711a5338fa
3
- size 1606476
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d7af735cc4e9c4395e8bba07f2d212edc159f0f8f8cf83281f25f1ea8c2d717
3
+ size 3332561
notebooks/00c_Data Analysis_Fine_Tuned.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/02_Fine_Tune_OpenAI.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -18,3 +18,4 @@ sentencepiece==0.2.0
18
  einops==0.8.0
19
  accelerate==0.32.0
20
  peft==0.11.1
 
 
18
  einops==0.8.0
19
  accelerate==0.32.0
20
  peft==0.11.1
21
+ sacrebleu==2.4.2
results/mac-results_few_shots_metrics.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40bd68a7831cf37a0bd5b4e290435873c574a15bca5a79400127a46ff2717672
3
- size 8156
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:945e48d5773ce3a870e793e410c79148bd34c1b427c7bcd8e9e5ec140e574fa7
3
+ size 9379
results/mac-results_few_shots_openai.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c797cbdebc17690da76da5e75f9533939bacaa80dca1b46a5d51aaaa316a0ee5
3
- size 1932132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eea324569b30d1696a51853cbfc5f7b992a569f464cae0db7a88a38c8024578a
3
+ size 2782816
results/mac-results_fine_tuned_metrics.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9443cd0d5ed360cccbbf9f58b0f26e41320c962bca19fe34ae4dbeb9334de610
3
- size 1158
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbf8e7661be99195444d1c6985179880efdc065fc1856c5fe5a78de14906c064
3
+ size 8321
scripts/eval-4gpu.sh CHANGED
@@ -16,11 +16,9 @@ grep MemTotal /proc/meminfo
16
  #pip install -r requirements.txt
17
 
18
  export BATCH_SIZE=1
19
- # export START_REPETITION_PENALTY=1.06
20
- export START_NUM_SHOTS=50
21
 
22
  #./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
23
 
24
- ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
25
 
26
- # ./scripts/eval-model.sh 01-ai/Yi-1.5-34B-Chat
 
16
  #pip install -r requirements.txt
17
 
18
  export BATCH_SIZE=1
 
 
19
 
20
  #./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
21
 
22
+ # ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
23
 
24
+ ./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-280