Spaces:
Runtime error
Runtime error
Subject,Condition,Claude 3 Opus (20240229),GPT-4o (2024-05-13),GPT-4 (0613),Gemini 1.5 Pro (0409 preview),GPT-4 Turbo (1106 preview),Llama 3 (70B),Palmyra X V3 (72B),PaLM-2 (Unicorn),Mixtral (8x22B),Gemini 1.5 Flash (0514 preview) | |
All,Overall,0.7876666666666666,0.781,0.7556666666666667,0.7666666666666667,0.7133333333333333,0.7273333333333334,0.7243333333333334,0.7243333333333334,0.7093333333333334,0.7266666666666667 | |
All,OK,0.8188119179482939,0.8143757592018139,0.7918679649580322,0.796233989023108,0.7481844316480185,0.7608898291200237,0.7549373327929648,0.7549373327929648,0.741889427285375,0.7556001104880412 | |
All,Erroneous,0.5850130408902339,0.5015649977053486,0.4581050117307427,0.467992747992748,0.4455619169654257,0.4657074493624201,0.5061395481278522,0.5061395481278522,0.45554439279585474,0.5376109876694672 | |
Anatomy,Overall,0.79,0.91,0.81,0.77,0.8,0.78,0.72,0.72,0.72,0.83 | |
Anatomy,OK,0.797979797979798,0.9191919191919192,0.8181818181818182,0.7777777777777778,0.8080808080808081,0.7878787878787878,0.7272727272727273,0.7272727272727273,0.7272727272727273,0.8383838383838383 | |
Anatomy,Erroneous,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 | |
Astronomy,Overall,0.98,0.95,0.93,0.91,0.96,0.91,0.87,0.87,0.87,0.86 | |
Astronomy,OK,0.989010989010989,0.978021978021978,0.967032967032967,0.9340659340659341,0.978021978021978,0.9230769230769231,0.9010989010989011,0.9010989010989011,0.8901098901098901,0.8791208791208791 | |
Astronomy,Erroneous,0.8888888888888888,0.6666666666666666,0.5555555555555556,0.6666666666666666,0.7777777777777778,0.7777777777777778,0.5555555555555556,0.5555555555555556,0.6666666666666666,0.6666666666666666 | |
Business ethics,Overall,0.86,0.85,0.79,0.8,0.78,0.83,0.83,0.83,0.74,0.82 | |
Business ethics,OK,0.9529411764705882,0.9647058823529412,0.9294117647058824,0.8823529411764706,0.9058823529411765,0.9294117647058824,0.9058823529411765,0.9058823529411765,0.8352941176470589,0.9058823529411765 | |
Business ethics,Erroneous,0.3333333333333333,0.2,0.0,0.3333333333333333,0.06666666666666667,0.26666666666666666,0.4,0.4,0.2,0.3333333333333333 | |
Clinical knowledge,Overall,0.84,0.89,0.86,0.85,0.87,0.86,0.83,0.83,0.8,0.84 | |
Clinical knowledge,OK,0.8383838383838383,0.8888888888888888,0.8585858585858586,0.8585858585858586,0.8686868686868687,0.8585858585858586,0.8282828282828283,0.8282828282828283,0.797979797979798,0.8383838383838383 | |
Clinical knowledge,Erroneous,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0 | |
College chemistry,Overall,0.6,0.61,0.55,0.58,0.47,0.56,0.59,0.59,0.57,0.6 | |
College chemistry,OK,0.72,0.7066666666666667,0.68,0.6933333333333334,0.6,0.6666666666666666,0.7066666666666667,0.7066666666666667,0.64,0.7066666666666667 | |
College chemistry,Erroneous,0.24,0.32,0.16,0.24,0.08,0.24,0.24,0.24,0.36,0.28 | |
College computer_science,Overall,0.81,0.77,0.76,0.78,0.69,0.7,0.65,0.65,0.7,0.64 | |
College computer_science,OK,0.8041237113402062,0.7628865979381443,0.7525773195876289,0.7731958762886598,0.6804123711340206,0.7010309278350515,0.6494845360824743,0.6494845360824743,0.6907216494845361,0.6288659793814433 | |
College computer_science,Erroneous,1.0,1.0,1.0,1.0,1.0,0.6666666666666666,0.6666666666666666,0.6666666666666666,1.0,1.0 | |
College mathematics,Overall,0.55,0.47,0.54,0.59,0.4,0.56,0.51,0.51,0.48,0.53 | |
College mathematics,OK,0.5454545454545454,0.46464646464646464,0.5353535353535354,0.5858585858585859,0.3939393939393939,0.5555555555555556,0.5050505050505051,0.5050505050505051,0.47474747474747475,0.5252525252525253 | |
College mathematics,Erroneous,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 | |
College medicine,Overall,0.84,0.82,0.78,0.81,0.78,0.82,0.77,0.77,0.78,0.75 | |
College medicine,OK,0.8735632183908046,0.8620689655172413,0.8160919540229885,0.8275862068965517,0.8045977011494253,0.8735632183908046,0.8160919540229885,0.8160919540229885,0.7931034482758621,0.7701149425287356 | |
College medicine,Erroneous,0.6153846153846154,0.5384615384615384,0.5384615384615384,0.6923076923076923,0.6153846153846154,0.46153846153846156,0.46153846153846156,0.46153846153846156,0.6923076923076923,0.6153846153846154 | |
College physics,Overall,0.72,0.69,0.64,0.8,0.41,0.53,0.55,0.55,0.58,0.68 | |
College physics,OK,0.72,0.69,0.64,0.8,0.41,0.53,0.55,0.55,0.58,0.68 | |
College physics,Erroneous,-,-,-,-,-,-,-,-,-,- | |
Conceptual physics:,Overall,0.85,0.88,0.85,0.88,0.87,0.82,0.79,0.79,0.79,0.82 | |
Conceptual physics:,OK,0.8804347826086957,0.9021739130434783,0.8695652173913043,0.8913043478260869,0.8913043478260869,0.8478260869565217,0.8152173913043478,0.8152173913043478,0.8152173913043478,0.8478260869565217 | |
Conceptual physics:,Erroneous,0.5,0.625,0.625,0.75,0.625,0.5,0.5,0.5,0.5,0.5 | |
Econometrics,Overall,0.8,0.69,0.68,0.76,0.68,0.69,0.66,0.66,0.69,0.66 | |
Econometrics,OK,0.7938144329896907,0.711340206185567,0.7010309278350515,0.7628865979381443,0.7010309278350515,0.7010309278350515,0.6804123711340206,0.6804123711340206,0.711340206185567,0.6597938144329897 | |
Econometrics,Erroneous,1.0,0.0,0.0,0.6666666666666666,0.0,0.3333333333333333,0.0,0.0,0.0,0.6666666666666666 | |
Electrical engineering,Overall,0.82,0.81,0.78,0.76,0.77,0.74,0.76,0.76,0.75,0.8 | |
Electrical engineering,OK,0.826530612244898,0.8163265306122449,0.7857142857142857,0.7755102040816326,0.7755102040816326,0.7448979591836735,0.7653061224489796,0.7653061224489796,0.7551020408163265,0.8061224489795918 | |
Electrical engineering,Erroneous,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5 | |
Formal logic,Overall,0.72,0.72,0.65,0.63,0.66,0.65,0.66,0.66,0.63,0.6 | |
Formal logic,OK,0.7126436781609196,0.7241379310344828,0.6666666666666666,0.632183908045977,0.6781609195402298,0.6206896551724138,0.6551724137931034,0.6551724137931034,0.6436781609195402,0.6091954022988506 | |
Formal logic,Erroneous,0.7692307692307693,0.6923076923076923,0.5384615384615384,0.6153846153846154,0.5384615384615384,0.8461538461538461,0.6923076923076923,0.6923076923076923,0.5384615384615384,0.5384615384615384 | |
Global facts,Overall,0.66,0.64,0.62,0.66,0.58,0.49,0.53,0.53,0.56,0.55 | |
Global facts,OK,0.7045454545454546,0.6704545454545454,0.6590909090909091,0.6931818181818182,0.5909090909090909,0.5113636363636364,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.5568181818181818 | |
Global facts,Erroneous,0.3333333333333333,0.4166666666666667,0.3333333333333333,0.4166666666666667,0.5,0.3333333333333333,0.4166666666666667,0.4166666666666667,0.6666666666666666,0.5 | |
High school chemistry,Overall,0.77,0.74,0.67,0.8,0.65,0.72,0.73,0.73,0.7,0.75 | |
High school chemistry,OK,0.7777777777777778,0.7474747474747475,0.6767676767676768,0.8080808080808081,0.6565656565656566,0.7272727272727273,0.7373737373737373,0.7373737373737373,0.7070707070707071,0.7575757575757576 | |
High school chemistry,Erroneous,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 | |
High school geography,Overall,0.94,0.94,0.94,0.9,0.91,0.93,0.91,0.91,0.94,0.92 | |
High school geography,OK,0.94,0.94,0.94,0.9,0.91,0.93,0.91,0.91,0.94,0.92 | |
High school geography,Erroneous,-,-,-,-,-,-,-,-,-,- | |
High school macroeconomics,Overall,0.83,0.89,0.88,0.83,0.88,0.79,0.78,0.78,0.76,0.82 | |
High school macroeconomics,OK,0.8409090909090909,0.8977272727272727,0.8863636363636364,0.8409090909090909,0.8863636363636364,0.8181818181818182,0.7954545454545454,0.7954545454545454,0.7840909090909091,0.8295454545454546 | |
High school macroeconomics,Erroneous,0.75,0.8333333333333334,0.8333333333333334,0.75,0.8333333333333334,0.5833333333333334,0.6666666666666666,0.6666666666666666,0.5833333333333334,0.75 | |
High school mathematics,Overall,0.58,0.46,0.55,0.64,0.11,0.51,0.52,0.52,0.49,0.55 | |
High school mathematics,OK,0.5757575757575758,0.46464646464646464,0.5555555555555556,0.6464646464646465,0.1111111111111111,0.5151515151515151,0.5151515151515151,0.5151515151515151,0.494949494949495,0.5454545454545454 | |
High school mathematics,Erroneous,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0 | |
High school physics,Overall,0.74,0.72,0.59,0.69,0.57,0.61,0.55,0.55,0.55,0.64 | |
High school physics,OK,0.7525773195876289,0.7319587628865979,0.5979381443298969,0.7010309278350515,0.5876288659793815,0.6185567010309279,0.5567010309278351,0.5567010309278351,0.5567010309278351,0.6494845360824743 | |
High school physics,Erroneous,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.0,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333,0.3333333333333333 | |
High school statistics,Overall,0.79,0.8,0.82,0.84,0.68,0.73,0.74,0.74,0.69,0.77 | |
High school statistics,OK,0.8061224489795918,0.8163265306122449,0.8367346938775511,0.8571428571428571,0.6938775510204082,0.7448979591836735,0.7551020408163265,0.7551020408163265,0.7040816326530612,0.7857142857142857 | |
High school statistics,Erroneous,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 | |
High school US history,Overall,0.94,0.95,0.93,0.92,0.91,0.95,0.94,0.94,0.92,0.91 | |
High school US history,OK,0.94,0.95,0.93,0.92,0.91,0.95,0.94,0.94,0.92,0.91 | |
High school US history,Erroneous,-,-,-,-,-,-,-,-,-,- | |
Human aging,Overall,0.82,0.84,0.8,0.8,0.83,0.78,0.79,0.79,0.73,0.77 | |
Human aging,OK,0.8735632183908046,0.8850574712643678,0.8390804597701149,0.8505747126436781,0.8850574712643678,0.8275862068965517,0.8505747126436781,0.8505747126436781,0.7816091954022989,0.8160919540229885 | |
Human aging,Erroneous,0.46153846153846156,0.5384615384615384,0.5384615384615384,0.46153846153846156,0.46153846153846156,0.46153846153846156,0.38461538461538464,0.38461538461538464,0.38461538461538464,0.46153846153846156 | |
Logical fallacies,Overall,0.9,0.89,0.89,0.88,0.86,0.85,0.9,0.9,0.9,0.86 | |
Logical fallacies,OK,0.9594594594594594,0.9864864864864865,0.9594594594594594,0.9594594594594594,0.9324324324324325,0.9594594594594594,0.9459459459459459,0.9459459459459459,0.9864864864864865,0.918918918918919 | |
Logical fallacies,Erroneous,0.7307692307692307,0.6153846153846154,0.6923076923076923,0.6538461538461539,0.6538461538461539,0.5384615384615384,0.7692307692307693,0.7692307692307693,0.6538461538461539,0.6923076923076923 | |
Machine learning,Overall,0.74,0.78,0.76,0.69,0.72,0.71,0.63,0.63,0.67,0.56 | |
Machine learning,OK,0.7640449438202247,0.797752808988764,0.7865168539325843,0.7078651685393258,0.7415730337078652,0.7303370786516854,0.651685393258427,0.651685393258427,0.6966292134831461,0.5842696629213483 | |
Machine learning,Erroneous,0.5454545454545454,0.6363636363636364,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.5454545454545454,0.45454545454545453,0.45454545454545453,0.45454545454545453,0.36363636363636365 | |
Miscellaneous,Overall,0.96,0.97,0.97,0.94,0.96,0.94,0.93,0.93,0.93,0.91 | |
Miscellaneous,OK,0.9777777777777777,0.9777777777777777,0.9888888888888889,0.9666666666666667,0.9888888888888889,0.9666666666666667,0.9555555555555556,0.9555555555555556,0.9555555555555556,0.9333333333333333 | |
Miscellaneous,Erroneous,0.8,0.9,0.8,0.7,0.7,0.7,0.7,0.7,0.7,0.7 | |
Philosophy,Overall,0.9,0.9,0.88,0.85,0.85,0.83,0.84,0.84,0.8,0.81 | |
Philosophy,OK,0.9213483146067416,0.9325842696629213,0.9213483146067416,0.8651685393258427,0.8764044943820225,0.8539325842696629,0.8539325842696629,0.8539325842696629,0.8314606741573034,0.8426966292134831 | |
Philosophy,Erroneous,0.7272727272727273,0.6363636363636364,0.5454545454545454,0.7272727272727273,0.6363636363636364,0.6363636363636364,0.7272727272727273,0.7272727272727273,0.5454545454545454,0.5454545454545454 | |
Professional accounting,Overall,0.82,0.76,0.72,0.65,0.66,0.67,0.71,0.71,0.63,0.64 | |
Professional accounting,OK,0.8152173913043478,0.7717391304347826,0.7282608695652174,0.6413043478260869,0.7065217391304348,0.6739130434782609,0.7065217391304348,0.7065217391304348,0.6413043478260869,0.6304347826086957 | |
Professional accounting,Erroneous,0.875,0.625,0.625,0.75,0.125,0.625,0.75,0.75,0.5,0.75 | |
Professional law,Overall,0.69,0.7,0.71,0.67,0.71,0.55,0.68,0.68,0.59,0.58 | |
Professional law,OK,0.7195121951219512,0.6951219512195121,0.7439024390243902,0.6951219512195121,0.7317073170731707,0.5609756097560976,0.6463414634146342,0.6463414634146342,0.6585365853658537,0.573170731707317 | |
Professional law,Erroneous,0.5555555555555556,0.7222222222222222,0.5555555555555556,0.5555555555555556,0.6111111111111112,0.5,0.8333333333333334,0.8333333333333334,0.2777777777777778,0.6111111111111112 | |
Public relations,Overall,0.83,0.83,0.76,0.77,0.82,0.76,0.8,0.8,0.76,0.8 | |
Public relations,OK,0.8571428571428571,0.8681318681318682,0.8021978021978022,0.8021978021978022,0.8571428571428571,0.7912087912087912,0.8461538461538461,0.8461538461538461,0.7912087912087912,0.8351648351648352 | |
Public relations,Erroneous,0.5555555555555556,0.4444444444444444,0.3333333333333333,0.4444444444444444,0.4444444444444444,0.4444444444444444,0.3333333333333333,0.3333333333333333,0.4444444444444444,0.4444444444444444 | |
Virology,Overall,0.54,0.56,0.56,0.55,0.56,0.55,0.56,0.56,0.56,0.53 | |
Virology,OK,0.8837209302325582,0.9069767441860465,0.8837209302325582,0.8372093023255814,0.8837209302325582,0.9069767441860465,0.9302325581395349,0.9302325581395349,0.9069767441860465,0.8837209302325582 | |
Virology,Erroneous,0.2807017543859649,0.2982456140350877,0.3157894736842105,0.3333333333333333,0.3157894736842105,0.2807017543859649,0.2807017543859649,0.2807017543859649,0.2982456140350877,0.2631578947368421 | |