Spaces:
Running
Running
update leaderboard
Browse files- all_results.json +0 -0
- app.py +32 -29
all_results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app.py
CHANGED
@@ -2031,89 +2031,92 @@ with block:
|
|
2031 |
with gr.TabItem("Cross-Lingual Consistency"):
|
2032 |
|
2033 |
# dataset 1: cross-mmlu
|
2034 |
-
|
|
|
|
|
|
|
2035 |
with gr.TabItem("Zero Shot"):
|
2036 |
with gr.TabItem("Overall"):
|
2037 |
with gr.Row():
|
2038 |
-
|
2039 |
-
|
2040 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2041 |
type="pandas",
|
2042 |
)
|
2043 |
with gr.TabItem("Language Performance"):
|
2044 |
|
2045 |
with gr.Row():
|
2046 |
-
|
2047 |
-
|
2048 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2049 |
type="pandas",
|
2050 |
)
|
2051 |
with gr.TabItem("Five Shot"):
|
2052 |
with gr.TabItem("Overall"):
|
2053 |
|
2054 |
with gr.Row():
|
2055 |
-
|
2056 |
-
|
2057 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2058 |
type="pandas",
|
2059 |
)
|
2060 |
with gr.TabItem("Language Performance"):
|
2061 |
|
2062 |
with gr.Row():
|
2063 |
gr.components.Dataframe(
|
2064 |
-
|
2065 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2066 |
type="pandas",
|
2067 |
)
|
2068 |
|
2069 |
with gr.Row():
|
2070 |
gr.Markdown("""
|
2071 |
-
**Cross-
|
2072 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
2073 |
-
- **Languages:** English, Chinese, Spanish, Vietnamese
|
2074 |
""")
|
2075 |
|
2076 |
-
|
2077 |
-
with gr.TabItem("Cross-
|
2078 |
with gr.TabItem("Zero Shot"):
|
2079 |
with gr.TabItem("Overall"):
|
2080 |
with gr.Row():
|
2081 |
-
|
2082 |
-
|
2083 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2084 |
type="pandas",
|
2085 |
)
|
2086 |
with gr.TabItem("Language Performance"):
|
2087 |
|
2088 |
with gr.Row():
|
2089 |
-
|
2090 |
-
|
2091 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2092 |
type="pandas",
|
2093 |
)
|
2094 |
with gr.TabItem("Five Shot"):
|
2095 |
with gr.TabItem("Overall"):
|
2096 |
|
2097 |
with gr.Row():
|
2098 |
-
|
2099 |
-
|
2100 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2101 |
type="pandas",
|
2102 |
)
|
2103 |
with gr.TabItem("Language Performance"):
|
2104 |
|
2105 |
with gr.Row():
|
2106 |
gr.components.Dataframe(
|
2107 |
-
|
2108 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
2109 |
type="pandas",
|
2110 |
)
|
2111 |
|
2112 |
with gr.Row():
|
2113 |
gr.Markdown("""
|
2114 |
-
**Cross-
|
2115 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
2116 |
-
- **Languages:** English, Chinese,
|
2117 |
""")
|
2118 |
|
2119 |
|
|
|
2031 |
with gr.TabItem("Cross-Lingual Consistency"):
|
2032 |
|
2033 |
# dataset 1: cross-mmlu
|
2034 |
+
|
2035 |
+
|
2036 |
+
# dataset 1: cross-mmlu
|
2037 |
+
with gr.TabItem("Cross-MMLU"):
|
2038 |
with gr.TabItem("Zero Shot"):
|
2039 |
with gr.TabItem("Overall"):
|
2040 |
with gr.Row():
|
2041 |
+
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
|
2042 |
+
CROSS_MMLU_ZERO_SHOT_OVERALL,
|
2043 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
|
2044 |
type="pandas",
|
2045 |
)
|
2046 |
with gr.TabItem("Language Performance"):
|
2047 |
|
2048 |
with gr.Row():
|
2049 |
+
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
|
2050 |
+
CROSS_MMLU_ZERO_SHOT_LANGUAGE,
|
2051 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
|
2052 |
type="pandas",
|
2053 |
)
|
2054 |
with gr.TabItem("Five Shot"):
|
2055 |
with gr.TabItem("Overall"):
|
2056 |
|
2057 |
with gr.Row():
|
2058 |
+
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
|
2059 |
+
CROSS_MMLU_FIVE_SHOT_OVERALL,
|
2060 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
|
2061 |
type="pandas",
|
2062 |
)
|
2063 |
with gr.TabItem("Language Performance"):
|
2064 |
|
2065 |
with gr.Row():
|
2066 |
gr.components.Dataframe(
|
2067 |
+
CROSS_MMLU_FIVE_SHOT_LANGUAGE,
|
2068 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
|
2069 |
type="pandas",
|
2070 |
)
|
2071 |
|
2072 |
with gr.Row():
|
2073 |
gr.Markdown("""
|
2074 |
+
**Cross-MMLU Leaderboard** 🔮
|
2075 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
2076 |
+
- **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
|
2077 |
""")
|
2078 |
|
2079 |
+
|
2080 |
+
with gr.TabItem("Cross-XQUAD"):
|
2081 |
with gr.TabItem("Zero Shot"):
|
2082 |
with gr.TabItem("Overall"):
|
2083 |
with gr.Row():
|
2084 |
+
cross_xquad_zero_shot_overall = gr.components.Dataframe(
|
2085 |
+
CROSS_XQUAD_ZERO_SHOT_OVERALL,
|
2086 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
|
2087 |
type="pandas",
|
2088 |
)
|
2089 |
with gr.TabItem("Language Performance"):
|
2090 |
|
2091 |
with gr.Row():
|
2092 |
+
cross_xquad_zero_shot_overall = gr.components.Dataframe(
|
2093 |
+
CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
|
2094 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
|
2095 |
type="pandas",
|
2096 |
)
|
2097 |
with gr.TabItem("Five Shot"):
|
2098 |
with gr.TabItem("Overall"):
|
2099 |
|
2100 |
with gr.Row():
|
2101 |
+
cross_xquad_zero_shot_overall = gr.components.Dataframe(
|
2102 |
+
CROSS_XQUAD_FIVE_SHOT_OVERALL,
|
2103 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
|
2104 |
type="pandas",
|
2105 |
)
|
2106 |
with gr.TabItem("Language Performance"):
|
2107 |
|
2108 |
with gr.Row():
|
2109 |
gr.components.Dataframe(
|
2110 |
+
CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
|
2111 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
|
2112 |
type="pandas",
|
2113 |
)
|
2114 |
|
2115 |
with gr.Row():
|
2116 |
gr.Markdown("""
|
2117 |
+
**Cross-XQUAD Leaderboard** 🔮
|
2118 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
2119 |
+
- **Languages:** English, Chinese, Spanish, Vietnamese
|
2120 |
""")
|
2121 |
|
2122 |
|