Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Files Community

dh-mc committed on Sep 16, 2024

Commit

3db2ae5

1 Parent(s): d8cfffb

10-shot results ready for 7B/8B models

Browse files

Files changed (26) hide show

data/Llama3.1-8B-Chinese-Chat_metrics.csv +1 -1
data/Llama3.1-8B-Chinese-Chat_results.csv +0 -0
data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv +3 -0
data/Mistral-7B-v0.3-Chinese-Chat_metrics.csv +11 -11
data/Mistral-7B-v0.3-Chinese-Chat_results.csv +0 -0
data/Mistral-7B-v0.3-Chinese-Chat_shots_metrics.csv +3 -0
data/Qwen2-7B-Instruct_metrics.csv +1 -1
data/Qwen2-7B-Instruct_results.csv +0 -0
data/Qwen2-7B-Instruct_shots_metrics.csv +3 -0
data/internlm2_5-20b-chat_metrics.csv +12 -0
data/internlm2_5-20b-chat_results.csv +0 -0
data/internlm2_5-20b-chat_shots_metrics.csv +2 -0
data/internlm2_5-7b-chat-1m_metrics.csv +1 -1
data/internlm2_5-7b-chat-1m_results.csv +0 -0
data/internlm2_5-7b-chat-1m_shots_metrics.csv +3 -0
data/internlm2_5-7b-chat_metrics.csv +10 -9
data/internlm2_5-7b-chat_results.csv +0 -0
data/internlm2_5-7b-chat_shots_metrics.csv +3 -0
llm_toolkit/logical_reasoning_utils.py +8 -2
notebooks/01a_internlm2_5-20b-chat_analysis.ipynb +0 -0
notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb +0 -0
notebooks/01a_internlm2_5-7b-chat_analysis.ipynb +0 -0
notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb +0 -0
notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb +0 -0
notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb +0 -0
scripts/eval-mac.sh +0 -19

data/Llama3.1-8B-Chinese-Chat_metrics.csv CHANGED Viewed

@@ -1,5 +1,5 @@
 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat_torch.float16_lf,0.707,0.7631091217915184,0.707,0.7243940517731183,0.3923333333333333
 0.2,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-35_torch.float16_lf,0.709,0.7987219597893886,0.709,0.7427961200958145,1.0
 0.4,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-70_torch.float16_lf,0.7163333333333334,0.8058657875960304,0.7163333333333334,0.7487811196109319,0.9993333333333333
 0.6,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6996666666666667,0.802722482275839,0.6996666666666667,0.7370938556711591,1.0

 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat_torch.float16_lf,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
 0.2,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-35_torch.float16_lf,0.709,0.7987219597893886,0.709,0.7427961200958145,1.0
 0.4,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-70_torch.float16_lf,0.7163333333333334,0.8058657875960304,0.7163333333333334,0.7487811196109319,0.9993333333333333
 0.6,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6996666666666667,0.802722482275839,0.6996666666666667,0.7370938556711591,1.0

data/Llama3.1-8B-Chinese-Chat_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
+10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334

data/Mistral-7B-v0.3-Chinese-Chat_metrics.csv CHANGED Viewed

@@ -1,12 +1,12 @@
 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat_torch.float16_lf,0.7113333333333334,0.70220546362905,0.7113333333333334,0.6894974942637364,0.004
-0.2,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-35_torch.float16_lf,0.702,0.7932731014186957,0.702,0.7342714734731689,1.0
-0.4,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-70_torch.float16_lf,0.742,0.78982949223512,0.742,0.7536681109811127,1.0
-0.6,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6596666666666666,0.7923396753604393,0.6596666666666666,0.7067542301676931,1.0
-0.8,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7146666666666667,0.7861341885687435,0.7146666666666667,0.7404677278137267,1.0
-1.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-175_torch.float16_lf,0.7326666666666667,0.7876867721932461,0.7326666666666667,0.7471869515031995,1.0
-1.2,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7016666666666667,0.7903119228393193,0.7016666666666667,0.7348708822385348,1.0
-1.4,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-245_torch.float16_lf,0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
-1.6,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-280_torch.float16_lf,0.7156666666666667,0.7846106674095725,0.7156666666666667,0.7410042005708856,1.0
-1.8,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-315_torch.float16_lf,0.6916666666666667,0.7864256994491394,0.6916666666666667,0.7257499426487266,1.0
-2.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-350_torch.float16_lf,0.6976666666666667,0.7889443494370009,0.6976666666666667,0.7307996137659796,1.0

 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat_torch.float16_lf,0.6946666666666667,0.701136267898111,0.6946666666666667,0.6634078645357937,0.011666666666666667
+0.2,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-105_torch.float16_lf,0.6596666666666666,0.7923396753604393,0.6596666666666666,0.7067542301676931,1.0
+0.4,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-140_torch.float16_lf,0.7146666666666667,0.7861341885687435,0.7146666666666667,0.7404677278137267,1.0
+0.6,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-175_torch.float16_lf,0.7326666666666667,0.7876867721932461,0.7326666666666667,0.7471869515031995,1.0
+0.8,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-210_torch.float16_lf,0.7016666666666667,0.7903119228393193,0.7016666666666667,0.7348708822385348,1.0
+1.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-245_torch.float16_lf,0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
+1.2,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-280_torch.float16_lf,0.7156666666666667,0.7846106674095725,0.7156666666666667,0.7410042005708856,1.0
+1.4,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-315_torch.float16_lf,0.6916666666666667,0.7864256994491394,0.6916666666666667,0.7257499426487266,1.0
+1.6,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-35_torch.float16_lf,0.702,0.7932731014186957,0.702,0.7342714734731689,1.0
+1.8,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-350_torch.float16_lf,0.6976666666666667,0.7889443494370009,0.6976666666666667,0.7307996137659796,1.0
+2.0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/checkpoint-70_torch.float16_lf,0.742,0.78982949223512,0.742,0.7536681109811127,1.0

data/Mistral-7B-v0.3-Chinese-Chat_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/Mistral-7B-v0.3-Chinese-Chat_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/shots-00,0.6946666666666667,0.701136267898111,0.6946666666666667,0.6634078645357937,0.011666666666666667
+10,Mistral-7B-v0.3-Chinese-Chat,shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat/shots-10,0.6036666666666667,0.7334913867282189,0.6036666666666667,0.6493185547247415,0.10633333333333334

data/Qwen2-7B-Instruct_metrics.csv CHANGED Viewed

@@ -1,5 +1,5 @@
 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct_torch.float16_lf,0.6203333333333333,0.7554720257311661,0.6203333333333333,0.6731632664545455,0.9973333333333333
 0.2,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-35_torch.float16_lf,0.725,0.7840171468707405,0.725,0.748994536667058,0.9996666666666667
 0.4,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-70_torch.float16_lf,0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
 0.6,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-105_torch.float16_lf,0.6926666666666667,0.8039176975550218,0.6926666666666667,0.7332481528585848,1.0

 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct_torch.float16_lf,0.683,0.7493103872717293,0.683,0.710140098232232,0.9996666666666667
 0.2,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-35_torch.float16_lf,0.725,0.7840171468707405,0.725,0.748994536667058,0.9996666666666667
 0.4,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-70_torch.float16_lf,0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
 0.6,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/checkpoint-105_torch.float16_lf,0.6926666666666667,0.8039176975550218,0.6926666666666667,0.7332481528585848,1.0

data/Qwen2-7B-Instruct_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/Qwen2-7B-Instruct_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-00,0.683,0.7493103872717293,0.683,0.710140098232232,0.9996666666666667
+10,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-10,0.5646666666666667,0.7391197908117386,0.5646666666666667,0.6064049121095652,0.9896666666666667

data/internlm2_5-20b-chat_metrics.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat_torch.bfloat16_4bit_lf,0.564,0.7745256693833624,0.564,0.6352190975436365,0.6726666666666666
+0.2,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-105_torch.bfloat16_4bit_lf,0.7053333333333334,0.8070587351344375,0.7053333333333334,0.7421985241641746,1.0
+0.4,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-140_torch.bfloat16_4bit_lf,0.795,0.817457691710893,0.795,0.8027552955647029,1.0
+0.6,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-175_torch.bfloat16_4bit_lf,0.7786666666666666,0.8220512342362645,0.7786666666666666,0.7938353741035283,1.0
+0.8,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-210_torch.bfloat16_4bit_lf,0.7516666666666667,0.8264680853251051,0.7516666666666667,0.7787088167337303,1.0
+1.0,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-245_torch.bfloat16_4bit_lf,0.7876666666666666,0.8154190698395475,0.7876666666666666,0.7965399224841393,1.0
+1.2,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-280_torch.bfloat16_4bit_lf,0.7753333333333333,0.8181125383376948,0.7753333333333333,0.7899794199099057,1.0
+1.4,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-315_torch.bfloat16_4bit_lf,0.7583333333333333,0.8179523170315577,0.7583333333333333,0.7795358413482081,1.0
+1.6,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-35_torch.bfloat16_4bit_lf,0.7576666666666667,0.7960640143421251,0.7576666666666667,0.769346697622254,1.0
+1.8,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-350_torch.bfloat16_4bit_lf,0.7616666666666667,0.8208475549648238,0.7616666666666667,0.7826736174247095,1.0
+2.0,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/checkpoint-70_torch.bfloat16_4bit_lf,0.7743333333333333,0.8042791719587958,0.7743333333333333,0.7849233169481004,1.0

data/internlm2_5-20b-chat_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/internlm2_5-20b-chat_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,2 @@


+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/shots-00,0.564,0.7745256693833624,0.564,0.6352190975436365,0.6726666666666666

data/internlm2_5-7b-chat-1m_metrics.csv CHANGED Viewed

@@ -1,5 +1,5 @@
 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m_torch.bfloat16_lf,0.5106666666666667,0.743213901498142,0.5106666666666667,0.5357333853323308,1.0
 0.2,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-35_torch.bfloat16_lf,0.7843333333333333,0.7977648302848388,0.7843333333333333,0.7864944570659659,1.0
 0.4,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-70_torch.bfloat16_lf,0.7836666666666666,0.7996977262947886,0.7836666666666666,0.7886881726841081,1.0
 0.6,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-105_torch.bfloat16_lf,0.7243333333333334,0.8171172705912051,0.7243333333333334,0.7565804830382912,1.0

 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m_torch.bfloat16_lf,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
 0.2,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-35_torch.bfloat16_lf,0.7843333333333333,0.7977648302848388,0.7843333333333333,0.7864944570659659,1.0
 0.4,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-70_torch.bfloat16_lf,0.7836666666666666,0.7996977262947886,0.7836666666666666,0.7886881726841081,1.0
 0.6,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/checkpoint-105_torch.bfloat16_lf,0.7243333333333334,0.8171172705912051,0.7243333333333334,0.7565804830382912,1.0

data/internlm2_5-7b-chat-1m_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/internlm2_5-7b-chat-1m_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
+10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667

data/internlm2_5-7b-chat_metrics.csv CHANGED Viewed

@@ -1,11 +1,12 @@
 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-35_torch.bfloat16_lf,0.7193333333333334,0.7863486093365692,0.7193333333333334,0.7330498811142795,1.0
-0.2,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-70_torch.bfloat16_lf,0.726,0.7900250828103491,0.726,0.7396583495246526,1.0
-0.4,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-105_torch.bfloat16_lf,0.6736666666666666,0.8044565554629858,0.6736666666666666,0.7104123104529902,1.0
-0.6,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-140_torch.bfloat16_lf,0.7496666666666667,0.8041871978859686,0.7496666666666667,0.7660159670998776,1.0
-0.8,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-175_torch.bfloat16_lf,0.726,0.8094634420846424,0.726,0.751394838822856,1.0
-1.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-210_torch.bfloat16_lf,0.7276666666666667,0.8039673699820601,0.7276666666666667,0.7488653386949028,1.0
-1.2,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-245_torch.bfloat16_lf,0.747,0.8055537753403307,0.747,0.76527383722639,1.0
-1.4,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-280_torch.bfloat16_lf,0.7166666666666667,0.8059535682746547,0.7166666666666667,0.7432427946178835,1.0
-1.6,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-315_torch.bfloat16_lf,0.6983333333333334,0.8119110469658597,0.6983333333333334,0.7347246872892312,1.0
 1.8,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-350_torch.bfloat16_lf,0.7076666666666667,0.8120132783051135,0.7076666666666667,0.7408145046817652,1.0

 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat_torch.bfloat16_lf,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
+0.2,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-105_torch.bfloat16_lf,0.6736666666666666,0.8044565554629858,0.6736666666666666,0.7104123104529902,1.0
+0.4,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-140_torch.bfloat16_lf,0.7496666666666667,0.8041871978859686,0.7496666666666667,0.7660159670998776,1.0
+0.6,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-175_torch.bfloat16_lf,0.726,0.8094634420846424,0.726,0.751394838822856,1.0
+0.8,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-210_torch.bfloat16_lf,0.7276666666666667,0.8039673699820601,0.7276666666666667,0.7488653386949028,1.0
+1.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-245_torch.bfloat16_lf,0.747,0.8055537753403307,0.747,0.76527383722639,1.0
+1.2,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-280_torch.bfloat16_lf,0.7166666666666667,0.8059535682746547,0.7166666666666667,0.7432427946178835,1.0
+1.4,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-315_torch.bfloat16_lf,0.6983333333333334,0.8119110469658597,0.6983333333333334,0.7347246872892312,1.0
+1.6,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-35_torch.bfloat16_lf,0.7193333333333334,0.7863486093365692,0.7193333333333334,0.7330498811142795,1.0
 1.8,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-350_torch.bfloat16_lf,0.7076666666666667,0.8120132783051135,0.7076666666666667,0.7408145046817652,1.0
+2.0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/checkpoint-70_torch.bfloat16_lf,0.726,0.7900250828103491,0.726,0.7396583495246526,1.0

data/internlm2_5-7b-chat_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/internlm2_5-7b-chat_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
+10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333

llm_toolkit/logical_reasoning_utils.py CHANGED Viewed

@@ -429,7 +429,13 @@ def get_metrics_df(df, variant="epoch"):
     perf_df = pd.DataFrame(
         columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
     )
-    for i, col in enumerate(df.columns[5:]):
         metrics = calc_metrics(df["label"], df[col], debug=False)
         new_model_metrics = {
             variant: i / 5 if variant == "epoch" else i + 1,
@@ -439,7 +445,7 @@ def get_metrics_df(df, variant="epoch"):
         if variant == "shots":
             parts = col.split("/shots-")
             new_model_metrics["shots"] = int(parts[1])
-            new_model_metrics["model"] = parts[0]
         new_model_metrics.update(metrics)

     perf_df = pd.DataFrame(
         columns=[variant, "model", "run", "accuracy", "precision", "recall", "f1"]
     )
+    columns = [
+        col
+        for col in df.columns[5:]
+        if variant in col or variant == "epoch" and "_torch." in col
+    ]
+    print("columns:", columns)
+    for i, col in enumerate(columns):
         metrics = calc_metrics(df["label"], df[col], debug=False)
         new_model_metrics = {
             variant: i / 5 if variant == "epoch" else i + 1,
         if variant == "shots":
             parts = col.split("/shots-")
             new_model_metrics["shots"] = int(parts[1])
+            # new_model_metrics["model"] = parts[0]
         new_model_metrics.update(metrics)

notebooks/01a_internlm2_5-20b-chat_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/01a_internlm2_5-7b-chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

scripts/eval-mac.sh DELETED Viewed

@@ -1,19 +0,0 @@
-#!/bin/sh
-BASEDIR=$(dirname "$0")
-cd $BASEDIR/..
-echo Current Directory:
-pwd
-nvidia-smi
-uname -a
-cat /etc/os-release
-lscpu
-grep MemTotal /proc/meminfo
-export EVAL_BASE_MODEL=true
-export DO_FINE_TUNING=false
-export MODEL_NAME=$1
-echo Evaluating $MODEL_NAME
-python llm_toolkit/tune_mac.py