Spaces:
Build error
Build error
complete gpt-4o-mini training
Browse files
- datasets/mac/openai-training.jsonl +0 -0
- llm_toolkit/eval_openai.py +16 -10
- llm_toolkit/translation_utils.py +85 -7
- logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt} +0 -0
- logs/l40-4gpu-1.txt +0 -0
- logs/l40-4gpu.txt +0 -3
- logs/openai-gpt-4o-mini-fine-tuned.txt +151 -0
- logs/openai-training-sample.jsonl +3 -0
- notebooks/00b_Data Analysis_Few_Shots.ipynb +2 -2
- notebooks/00c_Data Analysis_Fine_Tuned.ipynb +0 -0
- notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb +0 -0
- notebooks/02_Fine_Tune_OpenAI.ipynb +0 -0
- requirements.txt +1 -0
- results/mac-results_few_shots_metrics.csv +2 -2
- results/mac-results_few_shots_openai.csv +2 -2
- results/mac-results_fine_tuned_metrics.csv +2 -2
- scripts/eval-4gpu.sh +2 -4
datasets/mac/openai-training.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
llm_toolkit/eval_openai.py
CHANGED
@@ -29,7 +29,7 @@ print(
|
|
29 |
)
|
30 |
|
31 |
|
32 |
-
def on_num_shots_step_completed(model_name, dataset, predictions):
|
33 |
save_results(
|
34 |
model_name,
|
35 |
results_path,
|
@@ -44,8 +44,10 @@ def on_num_shots_step_completed(model_name, dataset, predictions):
|
|
44 |
def evaluate_model_with_num_shots(
|
45 |
model_name,
|
46 |
data_path,
|
|
|
47 |
range_num_shots=[0, 1, 3, 5, 10, 50],
|
48 |
max_new_tokens=2048,
|
|
|
49 |
):
|
50 |
print(f"Evaluating model: {model_name}")
|
51 |
|
@@ -56,20 +58,24 @@ def evaluate_model_with_num_shots(
|
|
56 |
print(f"*** Evaluating with num_shots: {num_shots}")
|
57 |
|
58 |
predictions = eval_openai(num_shots, datasets, max_new_tokens=max_new_tokens)
|
59 |
-
model_name_with_shorts =
|
|
|
|
|
|
|
|
|
60 |
|
61 |
try:
|
62 |
on_num_shots_step_completed(
|
63 |
-
model_name_with_shorts,
|
64 |
-
datasets["test"],
|
65 |
-
predictions,
|
66 |
)
|
67 |
except Exception as e:
|
68 |
print(e)
|
69 |
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
29 |
)
|
30 |
|
31 |
|
32 |
+
def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
|
33 |
save_results(
|
34 |
model_name,
|
35 |
results_path,
|
|
|
44 |
def evaluate_model_with_num_shots(
|
45 |
model_name,
|
46 |
data_path,
|
47 |
+
results_path=None,
|
48 |
range_num_shots=[0, 1, 3, 5, 10, 50],
|
49 |
max_new_tokens=2048,
|
50 |
+
result_column_name=None,
|
51 |
):
|
52 |
print(f"Evaluating model: {model_name}")
|
53 |
|
|
|
58 |
print(f"*** Evaluating with num_shots: {num_shots}")
|
59 |
|
60 |
predictions = eval_openai(num_shots, datasets, max_new_tokens=max_new_tokens)
|
61 |
+
model_name_with_shorts = (
|
62 |
+
result_column_name
|
63 |
+
if result_column_name
|
64 |
+
else f"{model_name}/shots-{num_shots:02d}"
|
65 |
+
)
|
66 |
|
67 |
try:
|
68 |
on_num_shots_step_completed(
|
69 |
+
model_name_with_shorts, datasets["test"], predictions, results_path
|
|
|
|
|
70 |
)
|
71 |
except Exception as e:
|
72 |
print(e)
|
73 |
|
74 |
|
75 |
+
if __name__ == "__main__":
|
76 |
+
evaluate_model_with_num_shots(
|
77 |
+
model_name,
|
78 |
+
data_path,
|
79 |
+
results_path=results_path,
|
80 |
+
max_new_tokens=max_new_tokens,
|
81 |
+
)
|
llm_toolkit/translation_utils.py
CHANGED
@@ -18,6 +18,7 @@ bleu = evaluate.load("bleu")
|
|
18 |
rouge = evaluate.load("rouge")
|
19 |
meteor = evaluate.load("meteor")
|
20 |
accuracy = evaluate.load("accuracy")
|
|
|
21 |
|
22 |
|
23 |
def extract_answer(text, debug=False):
|
@@ -54,6 +55,10 @@ def calc_metrics(references, predictions, debug=False):
|
|
54 |
"meteor"
|
55 |
]
|
56 |
|
|
|
|
|
|
|
|
|
57 |
results["bleu_scores"] = bleu.compute(
|
58 |
predictions=predictions, references=references, max_order=4
|
59 |
)
|
@@ -108,7 +113,7 @@ def get_few_shot_prompt(dataset, num_shots=5):
|
|
108 |
return translation_prompt
|
109 |
|
110 |
|
111 |
-
def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
|
112 |
train_data_file = data_path.replace(".tsv", "-train.tsv")
|
113 |
test_data_file = data_path.replace(".tsv", "-test.tsv")
|
114 |
|
@@ -138,7 +143,7 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
|
|
138 |
delimiter="\t",
|
139 |
)
|
140 |
|
141 |
-
if tokenizer:
|
142 |
translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)
|
143 |
|
144 |
def formatting_prompts_func(examples):
|
@@ -164,11 +169,23 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
|
|
164 |
prompt = translation_prompt.format(input=input)
|
165 |
messages[-1] = {"role": "user", "content": prompt}
|
166 |
|
167 |
-
|
168 |
-
messages
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
return {"text": texts, "prompt": prompts}
|
173 |
|
174 |
datasets = datasets.map(
|
@@ -216,6 +233,11 @@ def detect_repetition_scores(row, col, debug=False):
|
|
216 |
)
|
217 |
|
218 |
|
|
|
|
|
|
|
|
|
|
|
219 |
def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
220 |
metrics_df = pd.DataFrame(df.columns.T)[2:]
|
221 |
metrics_df.rename(columns={0: "model"}, inplace=True)
|
@@ -235,12 +257,14 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
|
235 |
tokenizers = {model: load_tokenizer(model) for model in models}
|
236 |
|
237 |
meteor = []
|
|
|
238 |
bleu_1 = []
|
239 |
rouge_l = []
|
240 |
ews_score = []
|
241 |
repetition_score = []
|
242 |
total_repetitions = []
|
243 |
num_max_output_tokens = []
|
|
|
244 |
columns = df.columns[2:]
|
245 |
|
246 |
df[
|
@@ -256,6 +280,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
|
256 |
print(f"{col}: {metrics}")
|
257 |
|
258 |
meteor.append(metrics["meteor"])
|
|
|
259 |
bleu_1.append(metrics["bleu_scores"]["bleu"])
|
260 |
rouge_l.append(metrics["rouge_scores"]["rougeL"])
|
261 |
|
@@ -273,6 +298,10 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
|
273 |
lambda x: len(tokenizers[model](x)["input_ids"])
|
274 |
)
|
275 |
|
|
|
|
|
|
|
|
|
276 |
new_col = f"output_tokens-{col}"
|
277 |
df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
|
278 |
|
@@ -281,6 +310,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
|
281 |
)
|
282 |
|
283 |
metrics_df["meteor"] = meteor
|
|
|
284 |
metrics_df["bleu_1"] = bleu_1
|
285 |
metrics_df["rouge_l"] = rouge_l
|
286 |
metrics_df["ews_score"] = ews_score
|
@@ -290,6 +320,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
|
290 |
lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
|
291 |
)
|
292 |
|
|
|
293 |
metrics_df["num_max_output_tokens"] = num_max_output_tokens
|
294 |
|
295 |
if variant != "rpp":
|
@@ -328,6 +359,12 @@ def analyze_translation_results(df, col, max_new_tokens=300, repetition_threshol
|
|
328 |
)
|
329 |
print_row_details(df2, range(len(df2)))
|
330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
|
332 |
def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
|
333 |
plt.figure(figsize=figsize)
|
@@ -604,3 +641,44 @@ def load_alpaca_data(data_path):
|
|
604 |
df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)
|
605 |
|
606 |
return df_alpaca
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
rouge = evaluate.load("rouge")
|
19 |
meteor = evaluate.load("meteor")
|
20 |
accuracy = evaluate.load("accuracy")
|
21 |
+
sacrebleu = evaluate.load("sacrebleu")
|
22 |
|
23 |
|
24 |
def extract_answer(text, debug=False):
|
|
|
55 |
"meteor"
|
56 |
]
|
57 |
|
58 |
+
results["sacrebleu"] = sacrebleu.compute(
|
59 |
+
predictions=predictions, references=references
|
60 |
+
)
|
61 |
+
|
62 |
results["bleu_scores"] = bleu.compute(
|
63 |
predictions=predictions, references=references, max_order=4
|
64 |
)
|
|
|
113 |
return translation_prompt
|
114 |
|
115 |
|
116 |
+
def load_translation_dataset(data_path, tokenizer=None, num_shots=0, for_openai=False):
|
117 |
train_data_file = data_path.replace(".tsv", "-train.tsv")
|
118 |
test_data_file = data_path.replace(".tsv", "-test.tsv")
|
119 |
|
|
|
143 |
delimiter="\t",
|
144 |
)
|
145 |
|
146 |
+
if tokenizer or for_openai:
|
147 |
translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)
|
148 |
|
149 |
def formatting_prompts_func(examples):
|
|
|
169 |
prompt = translation_prompt.format(input=input)
|
170 |
messages[-1] = {"role": "user", "content": prompt}
|
171 |
|
172 |
+
if for_openai:
|
173 |
+
prompts.append(messages.copy())
|
174 |
+
text = messages.copy()
|
175 |
+
text.append(
|
176 |
+
{
|
177 |
+
"role": "assistant",
|
178 |
+
"content": output,
|
179 |
+
}
|
180 |
+
)
|
181 |
+
texts.append(text)
|
182 |
+
else:
|
183 |
+
prompt = tokenizer.apply_chat_template(
|
184 |
+
messages, tokenize=False, add_generation_prompt=True
|
185 |
+
)
|
186 |
+
prompts.append(prompt)
|
187 |
+
texts.append(prompt + output + tokenizer.eos_token)
|
188 |
+
|
189 |
return {"text": texts, "prompt": prompts}
|
190 |
|
191 |
datasets = datasets.map(
|
|
|
233 |
)
|
234 |
|
235 |
|
236 |
+
def contains_chinese(text):
|
237 |
+
chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
|
238 |
+
return 1 if chinese_char_pattern.search(text) else 0
|
239 |
+
|
240 |
+
|
241 |
def get_metrics(df, max_output_tokens=2048, variant="rpp"):
|
242 |
metrics_df = pd.DataFrame(df.columns.T)[2:]
|
243 |
metrics_df.rename(columns={0: "model"}, inplace=True)
|
|
|
257 |
tokenizers = {model: load_tokenizer(model) for model in models}
|
258 |
|
259 |
meteor = []
|
260 |
+
spbleu = []
|
261 |
bleu_1 = []
|
262 |
rouge_l = []
|
263 |
ews_score = []
|
264 |
repetition_score = []
|
265 |
total_repetitions = []
|
266 |
num_max_output_tokens = []
|
267 |
+
num_incomplete_translations = []
|
268 |
columns = df.columns[2:]
|
269 |
|
270 |
df[
|
|
|
280 |
print(f"{col}: {metrics}")
|
281 |
|
282 |
meteor.append(metrics["meteor"])
|
283 |
+
spbleu.append(metrics["sacrebleu"]["score"])
|
284 |
bleu_1.append(metrics["bleu_scores"]["bleu"])
|
285 |
rouge_l.append(metrics["rouge_scores"]["rougeL"])
|
286 |
|
|
|
298 |
lambda x: len(tokenizers[model](x)["input_ids"])
|
299 |
)
|
300 |
|
301 |
+
new_col = f"contains_chinese-{col}"
|
302 |
+
df[new_col] = df[col].apply(contains_chinese)
|
303 |
+
num_incomplete_translations.append(df[new_col].sum())
|
304 |
+
|
305 |
new_col = f"output_tokens-{col}"
|
306 |
df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
|
307 |
|
|
|
310 |
)
|
311 |
|
312 |
metrics_df["meteor"] = meteor
|
313 |
+
metrics_df["spbleu"] = spbleu
|
314 |
metrics_df["bleu_1"] = bleu_1
|
315 |
metrics_df["rouge_l"] = rouge_l
|
316 |
metrics_df["ews_score"] = ews_score
|
|
|
320 |
lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
|
321 |
)
|
322 |
|
323 |
+
metrics_df["num_incomplete_translations"] = num_incomplete_translations
|
324 |
metrics_df["num_max_output_tokens"] = num_max_output_tokens
|
325 |
|
326 |
if variant != "rpp":
|
|
|
359 |
)
|
360 |
print_row_details(df2, range(len(df2)))
|
361 |
|
362 |
+
contains_chinese = f"contains_chinese-{col}"
|
363 |
+
df3 = df[df[contains_chinese] > 0][["chinese", "english", col, contains_chinese]]
|
364 |
+
|
365 |
+
print(f"\n*** Found {len(df3)} rows with incomplete translations for {col}")
|
366 |
+
print_row_details(df3, range(len(df3)))
|
367 |
+
|
368 |
|
369 |
def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
|
370 |
plt.figure(figsize=figsize)
|
|
|
641 |
df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)
|
642 |
|
643 |
return df_alpaca
|
644 |
+
|
645 |
+
|
646 |
+
def load_openai_training_data(
|
647 |
+
data_path, openai_data_path="datasets/mac/openai-training.jsonl"
|
648 |
+
):
|
649 |
+
if os.path.exists(openai_data_path):
|
650 |
+
print("loading existing data from:", openai_data_path)
|
651 |
+
data = pd.read_json(openai_data_path, orient="records", lines=True)
|
652 |
+
return data
|
653 |
+
|
654 |
+
datasets = load_translation_dataset(data_path)
|
655 |
+
prompt_template = get_few_shot_prompt(datasets["train"], num_shots=0)
|
656 |
+
|
657 |
+
df_train = datasets["train"].to_pandas()
|
658 |
+
messages = []
|
659 |
+
|
660 |
+
for i, row in df_train.iterrows():
|
661 |
+
messages.append(
|
662 |
+
[
|
663 |
+
{
|
664 |
+
"role": "system",
|
665 |
+
"content": system_prompt,
|
666 |
+
},
|
667 |
+
{
|
668 |
+
"role": "user",
|
669 |
+
"content": prompt_template.format(input=row["chinese"]),
|
670 |
+
},
|
671 |
+
{
|
672 |
+
"role": "assistant",
|
673 |
+
"content": row["english"],
|
674 |
+
},
|
675 |
+
]
|
676 |
+
)
|
677 |
+
|
678 |
+
df_openai = pd.DataFrame(
|
679 |
+
{
|
680 |
+
"messages": messages,
|
681 |
+
}
|
682 |
+
)
|
683 |
+
df_openai.to_json(openai_data_path, orient="records", lines=True)
|
684 |
+
return df_openai
|
logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt}
RENAMED
File without changes
|
logs/l40-4gpu-1.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
logs/l40-4gpu.txt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:289a8bbbf208650bc4a0cc3b86578f8a7db73ef68bbefa3c55c3eedf94a38ed0
|
3 |
-
size 878270
|
|
|
|
|
|
|
|
logs/openai-gpt-4o-mini-fine-tuned.txt
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Qwen/Qwen2-7B-Instruct None False datasets/mac/mac.tsv results/mac-results.csv False 300
|
2 |
+
loading env vars from: /Users/inflaton/code/engd/papers/rapget-translation/.env
|
3 |
+
workding dir: /Users/inflaton/code/engd/papers/rapget-translation
|
4 |
+
Python 3.11.9
|
5 |
+
Name: torch
|
6 |
+
Version: 2.4.0
|
7 |
+
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
|
8 |
+
Home-page: https://pytorch.org/
|
9 |
+
Author: PyTorch Team
|
10 |
+
Author-email: packages@pytorch.org
|
11 |
+
License: BSD-3
|
12 |
+
Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
|
13 |
+
Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
|
14 |
+
Required-by: accelerate, peft, torchaudio, torchvision, trl
|
15 |
+
---
|
16 |
+
Name: transformers
|
17 |
+
Version: 4.43.3
|
18 |
+
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
|
19 |
+
Home-page: https://github.com/huggingface/transformers
|
20 |
+
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
|
21 |
+
Author-email: transformers@huggingface.co
|
22 |
+
License: Apache 2.0 License
|
23 |
+
Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
|
24 |
+
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
|
25 |
+
Required-by: llamafactory, peft, trl
|
26 |
+
CPU times: user 8.97 ms, sys: 13.7 ms, total: 22.7 ms
|
27 |
+
Wall time: 1.91 s
|
28 |
+
MPS is available
|
29 |
+
loading existing data from: logs/openai-training-sample.jsonl
|
30 |
+
messages
|
31 |
+
0 [{'role': 'system', 'content': 'Marv is a fact...
|
32 |
+
1 [{'role': 'system', 'content': 'Marv is a fact...
|
33 |
+
2 [{'role': 'system', 'content': 'Marv is a fact...
|
34 |
+
FileObject(id='file-IokPHn4YWcniXL4wGnK4xVmn', bytes=3413094, created_at=1723269681, filename='openai-training.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
|
35 |
+
FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=6, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=[], seed=1046194933, status='validating_files', trained_tokens=None, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
|
36 |
+
FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs', finished_at=1723272532, hyperparameters=Hyperparameters(n_epochs=6, batch_size=18, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=['file-aCppW0GWhhytwe4yKwymNUZl'], seed=1046194933, status='succeeded', trained_tokens=3640956, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
|
37 |
+
Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ufuULvy
|
38 |
+
loading train/test data files
|
39 |
+
DatasetDict({
|
40 |
+
train: Dataset({
|
41 |
+
features: ['chinese', 'english'],
|
42 |
+
num_rows: 4528
|
43 |
+
})
|
44 |
+
test: Dataset({
|
45 |
+
features: ['chinese', 'english'],
|
46 |
+
num_rows: 1133
|
47 |
+
})
|
48 |
+
})
|
49 |
+
--------------------------------------------------
|
50 |
+
chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
|
51 |
+
--------------------------------------------------
|
52 |
+
english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
|
53 |
+
*** Evaluating with num_shots: 0
|
54 |
+
100%|██████████| 1133/1133 [16:48<00:00, 1.12it/s]
|
55 |
+
gpt-4o-mini/epochs-01 metrics: {'meteor': 0.3785370331806402, 'sacrebleu': {'score': 12.052844230027103, 'counts': [12818, 4623, 2153, 1081], 'totals': [29097, 27964, 26850, 25740], 'precisions': [44.05265147609719, 16.53196967529681, 8.018621973929237, 4.1996891996892], 'bp': 0.9631327655852462, 'sys_len': 29097, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12052844230027103, 'precisions': [0.44052651476097193, 0.1653196967529681, 0.08018621973929237, 0.041996891996891994], 'brevity_penalty': 0.9631327655852462, 'length_ratio': 0.9637959589267969, 'translation_length': 29097, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4244007719128182, 'rouge2': 0.17601540674784633, 'rougeL': 0.3693615986543504, 'rougeLsum': 0.3696442718692141}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
|
56 |
+
Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug0Gt3w
|
57 |
+
loading train/test data files
|
58 |
+
DatasetDict({
|
59 |
+
train: Dataset({
|
60 |
+
features: ['chinese', 'english'],
|
61 |
+
num_rows: 4528
|
62 |
+
})
|
63 |
+
test: Dataset({
|
64 |
+
features: ['chinese', 'english'],
|
65 |
+
num_rows: 1133
|
66 |
+
})
|
67 |
+
})
|
68 |
+
--------------------------------------------------
|
69 |
+
chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
|
70 |
+
--------------------------------------------------
|
71 |
+
english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
|
72 |
+
*** Evaluating with num_shots: 0
|
73 |
+
100%|██████████| 1133/1133 [17:56<00:00, 1.05it/s]
|
74 |
+
gpt-4o-mini/epochs-02 metrics: {'meteor': 0.3785921332515917, 'sacrebleu': {'score': 12.033706874864837, 'counts': [12801, 4628, 2150, 1076], 'totals': [29076, 27943, 26830, 25722], 'precisions': [44.02600082542303, 16.562287513867517, 8.013417815877748, 4.183189487598165], 'bp': 0.9624112877781842, 'sys_len': 29076, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12033706874864836, 'precisions': [0.4402600082542303, 0.16562287513867516, 0.08013417815877749, 0.04183189487598165], 'brevity_penalty': 0.9624112877781842, 'length_ratio': 0.9631003643590593, 'translation_length': 29076, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235104923203792, 'rouge2': 0.1758318317686482, 'rougeL': 0.36922125683186846, 'rougeLsum': 0.3693808162149962}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
|
75 |
+
Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug5PhpZ
|
76 |
+
loading train/test data files
|
77 |
+
DatasetDict({
|
78 |
+
train: Dataset({
|
79 |
+
features: ['chinese', 'english'],
|
80 |
+
num_rows: 4528
|
81 |
+
})
|
82 |
+
test: Dataset({
|
83 |
+
features: ['chinese', 'english'],
|
84 |
+
num_rows: 1133
|
85 |
+
})
|
86 |
+
})
|
87 |
+
--------------------------------------------------
|
88 |
+
chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
|
89 |
+
--------------------------------------------------
|
90 |
+
english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
|
91 |
+
*** Evaluating with num_shots: 0
|
92 |
+
100%|██████████| 1133/1133 [17:02<00:00, 1.11it/s]
|
93 |
+
gpt-4o-mini/epochs-03 metrics: {'meteor': 0.37736228106121694, 'sacrebleu': {'score': 11.933111335430906, 'counts': [12779, 4601, 2124, 1061], 'totals': [29096, 27963, 26848, 25737], 'precisions': [43.920126477866376, 16.453885491542394, 7.911203814064362, 4.122469596301046], 'bp': 0.9630984208616785, 'sys_len': 29096, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933111335430906, 'precisions': [0.4392012647786637, 0.16453885491542394, 0.07911203814064362, 0.041224695963010455], 'brevity_penalty': 0.9630984208616785, 'length_ratio': 0.9637628353759523, 'translation_length': 29096, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235319934194407, 'rouge2': 0.17493309683581332, 'rougeL': 0.3685697120399035, 'rougeLsum': 0.3689298428303013}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
|
94 |
+
Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugPThQI
|
95 |
+
loading train/test data files
|
96 |
+
DatasetDict({
|
97 |
+
train: Dataset({
|
98 |
+
features: ['chinese', 'english'],
|
99 |
+
num_rows: 4528
|
100 |
+
})
|
101 |
+
test: Dataset({
|
102 |
+
features: ['chinese', 'english'],
|
103 |
+
num_rows: 1133
|
104 |
+
})
|
105 |
+
})
|
106 |
+
--------------------------------------------------
|
107 |
+
chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
|
108 |
+
--------------------------------------------------
|
109 |
+
english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
|
110 |
+
*** Evaluating with num_shots: 0
|
111 |
+
100%|██████████| 1133/1133 [18:35<00:00, 1.02it/s]
|
112 |
+
gpt-4o-mini/epochs-04 metrics: {'meteor': 0.37818535038887346, 'sacrebleu': {'score': 11.933285526593995, 'counts': [12797, 4601, 2121, 1061], 'totals': [29110, 27977, 26861, 25749], 'precisions': [43.960838199931295, 16.445651785395146, 7.896206395889952, 4.120548370810517], 'bp': 0.9635791436286372, 'sys_len': 29110, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933285526593994, 'precisions': [0.43960838199931296, 0.16445651785395146, 0.07896206395889951, 0.041205483708105166], 'brevity_penalty': 0.9635791436286371, 'length_ratio': 0.9642265650877774, 'translation_length': 29110, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42372801674771476, 'rouge2': 0.17487358435014705, 'rougeL': 0.36931437347367646, 'rougeLsum': 0.36934766241132383}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
|
113 |
+
|
114 |
+
Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugVLmcB
|
115 |
+
loading train/test data files
|
116 |
+
DatasetDict({
|
117 |
+
train: Dataset({
|
118 |
+
features: ['chinese', 'english'],
|
119 |
+
num_rows: 4528
|
120 |
+
})
|
121 |
+
test: Dataset({
|
122 |
+
features: ['chinese', 'english'],
|
123 |
+
num_rows: 1133
|
124 |
+
})
|
125 |
+
})
|
126 |
+
--------------------------------------------------
|
127 |
+
chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
|
128 |
+
--------------------------------------------------
|
129 |
+
english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
|
130 |
+
*** Evaluating with num_shots: 0
|
131 |
+
100%|██████████| 1133/1133 [15:47<00:00, 1.20it/s]
|
132 |
+
gpt-4o-mini/epochs-05 metrics: {'meteor': 0.3790673551140706, 'sacrebleu': {'score': 11.955698498650582, 'counts': [12808, 4609, 2126, 1064], 'totals': [29209, 28076, 26959, 25846], 'precisions': [43.849498442260945, 16.416156147599374, 7.88604918580066, 4.116691170780778], 'bp': 0.9669721941455759, 'sys_len': 29209, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11955698498650584, 'precisions': [0.4384949844226095, 0.16416156147599373, 0.0788604918580066, 0.041166911707807785], 'brevity_penalty': 0.9669721941455759, 'length_ratio': 0.9675057966213978, 'translation_length': 29209, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42476082012412075, 'rouge2': 0.17559955520032905, 'rougeL': 0.3700113513462385, 'rougeLsum': 0.37012014201963733}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
|
133 |
+
Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs
|
134 |
+
loading train/test data files
|
135 |
+
DatasetDict({
|
136 |
+
train: Dataset({
|
137 |
+
features: ['chinese', 'english'],
|
138 |
+
num_rows: 4528
|
139 |
+
})
|
140 |
+
test: Dataset({
|
141 |
+
features: ['chinese', 'english'],
|
142 |
+
num_rows: 1133
|
143 |
+
})
|
144 |
+
})
|
145 |
+
--------------------------------------------------
|
146 |
+
chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
|
147 |
+
--------------------------------------------------
|
148 |
+
english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
|
149 |
+
*** Evaluating with num_shots: 0
|
150 |
+
100%|██████████| 1133/1133 [15:43<00:00, 1.20it/s]
|
151 |
+
gpt-4o-mini/epochs-06 metrics: {'meteor': 0.3792226866395673, 'sacrebleu': {'score': 11.982811850915233, 'counts': [12810, 4617, 2137, 1066], 'totals': [29116, 27983, 26868, 25757], 'precisions': [43.996428080780326, 16.499303148340065, 7.95369956825964, 4.138680746981403], 'bp': 0.9637850995333245, 'sys_len': 29116, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11982811850915229, 'precisions': [0.43996428080780325, 0.16499303148340064, 0.0795369956825964, 0.04138680746981403], 'brevity_penalty': 0.9637850995333245, 'length_ratio': 0.9644253063928453, 'translation_length': 29116, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4251187202203103, 'rouge2': 0.17553224521896635, 'rougeL': 0.37003282393672954, 'rougeLsum': 0.370114181474168}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
|
logs/openai-training-sample.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
|
2 |
+
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
|
3 |
+
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
|
notebooks/00b_Data Analysis_Few_Shots.ipynb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d7af735cc4e9c4395e8bba07f2d212edc159f0f8f8cf83281f25f1ea8c2d717
|
3 |
+
size 3332561
|
notebooks/00c_Data Analysis_Fine_Tuned.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02_Fine_Tune_OpenAI.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -18,3 +18,4 @@ sentencepiece==0.2.0
|
|
18 |
einops==0.8.0
|
19 |
accelerate==0.32.0
|
20 |
peft==0.11.1
|
|
|
|
18 |
einops==0.8.0
|
19 |
accelerate==0.32.0
|
20 |
peft==0.11.1
|
21 |
+
sacrebleu==2.4.2
|
results/mac-results_few_shots_metrics.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:945e48d5773ce3a870e793e410c79148bd34c1b427c7bcd8e9e5ec140e574fa7
|
3 |
+
size 9379
|
results/mac-results_few_shots_openai.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eea324569b30d1696a51853cbfc5f7b992a569f464cae0db7a88a38c8024578a
|
3 |
+
size 2782816
|
results/mac-results_fine_tuned_metrics.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbf8e7661be99195444d1c6985179880efdc065fc1856c5fe5a78de14906c064
|
3 |
+
size 8321
|
scripts/eval-4gpu.sh
CHANGED
@@ -16,11 +16,9 @@ grep MemTotal /proc/meminfo
|
|
16 |
#pip install -r requirements.txt
|
17 |
|
18 |
export BATCH_SIZE=1
|
19 |
-
# export START_REPETITION_PENALTY=1.06
|
20 |
-
export START_NUM_SHOTS=50
|
21 |
|
22 |
#./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
|
23 |
|
24 |
-
./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
|
25 |
|
26 |
-
|
|
|
16 |
#pip install -r requirements.txt
|
17 |
|
18 |
export BATCH_SIZE=1
|
|
|
|
|
19 |
|
20 |
#./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
|
21 |
|
22 |
+
# ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
|
23 |
|
24 |
+
./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-280
|