oceansweep committed on
Commit
0595d11
·
verified ·
1 Parent(s): f596ec4

Delete App_Function_Libraries/Gradio_UI/MMLU_Pro_tab.py

Browse files
App_Function_Libraries/Gradio_UI/MMLU_Pro_tab.py DELETED
@@ -1,115 +0,0 @@
1
- # MMLU_Pro_tab.py
2
- # is a library that contains the Gradio UI code for the MMLU-Pro benchmarking tool.
3
- #
4
- ##############################################################################################################
5
- # Imports
6
- import os
7
-
8
- import gradio as gr
9
- import logging
10
- #
11
- # External Imports
12
- from tqdm import tqdm
13
- # Local Imports
14
- from App_Function_Libraries.Benchmarks_Evaluations.MMLU_Pro.MMLU_Pro_rewritten import (
15
- load_mmlu_pro, run_mmlu_pro_benchmark, mmlu_pro_main, load_mmlu_pro_config
16
- )
17
- #
18
- ##############################################################################################################
19
- #
20
- # Functions:
21
-
22
# Set up logging: create this module's logger, then request INFO-level
# root configuration with a timestamp / level / message line format.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
25
-
26
-
27
def get_categories():
    """Return the category names found in the MMLU-Pro test data.

    The test data is obtained from ``load_mmlu_pro()`` (the first element of
    the pair it returns) and its keys are returned as a list.

    Returns:
        A list of the test data's keys, or the single-element list
        ``["Error loading categories"]`` if loading raised an exception
        (the exception is logged, not propagated).
    """
    try:
        # load_mmlu_pro() yields a pair; only the first element is used here.
        test_split, _ignored = load_mmlu_pro()
        category_names = list(test_split.keys())
    except Exception as e:
        logger.error(f"Failed to load categories: {e}")
        return ["Error loading categories"]
    return category_names
35
-
36
-
37
def load_categories():
    """Build the Gradio update that fills the category dropdown.

    Returns:
        ``gr.update(...)`` whose choices are the result of
        ``get_categories()`` with the first entry selected, or an
        "Error loading categories" placeholder when that result is empty.
    """
    available = get_categories()

    # Guard: an empty result leaves nothing to select, so show the placeholder.
    if not available:
        return gr.update(choices=["Error loading categories"], value="Error loading categories")

    first_choice = available[0]
    return gr.update(choices=available, value=first_choice)
44
-
45
-
46
def run_benchmark_from_ui(url, api_key, model, timeout, category, parallel, verbosity, log_prompt):
    """Run the MMLU-Pro benchmark using values entered in the UI.

    The UI values are forwarded as keyword arguments to
    ``load_mmlu_pro_config``; ``mmlu_pro_main()`` is then invoked and the
    text of the resulting ``final_report.txt`` is returned.

    Args:
        url, api_key, model, timeout, parallel, verbosity, log_prompt:
            Passed through unchanged to ``load_mmlu_pro_config``.
        category: A single category; wrapped in a one-element list when
            truthy, otherwise ``None`` is passed as ``categories``.

    Returns:
        The contents of the final report file, or an error message string
        if running the benchmark or reading the report raised an exception.
    """
    # A falsy category is forwarded as None rather than as a list.
    selected_categories = None
    if category:
        selected_categories = [category]

    ui_overrides = {
        "url": url,
        "api_key": api_key,
        "model": model,
        "timeout": timeout,
        "categories": selected_categories,
        "parallel": parallel,
        "verbosity": verbosity,
        "log_prompt": log_prompt,
    }
    config = load_mmlu_pro_config(**ui_overrides)

    try:
        # NOTE(review): mmlu_pro_main() is called with no arguments, so the
        # config built above is not handed to it explicitly — confirm that it
        # actually picks up these UI overrides.
        mmlu_pro_main()

        # The report is assumed to live under "eval_results/<model>/", with
        # any "/" in the model name replaced by "-".
        model_dir = config["server"]["model"].replace("/", "-")
        report_path = os.path.join("eval_results", model_dir, "final_report.txt")

        with open(report_path, "r") as report_file:
            return report_file.read()
    except Exception as e:
        logger.error(f"An error occurred during benchmark execution: {e}")
        return f"An error occurred during benchmark execution. Please check the logs for more information. Error: {str(e)}"
77
-
78
-
79
def create_mmlu_pro_tab():
    """Build the "MMLU-Pro Benchmark" Gradio tab and wire up its buttons.

    The left column holds the benchmark inputs and a "Load Categories"
    button; the right column holds the "Run Benchmark" button and the
    results textbox.

    Returns:
        A list of the created components in this order: url, api_key, model,
        timeout, category, parallel, verbosity, log_prompt, run button,
        output textbox.
    """
    with gr.Tab("MMLU-Pro Benchmark"):
        gr.Markdown("## Run MMLU-Pro Benchmark")

        with gr.Row():
            # Left column: benchmark inputs (creation order is kept as-is).
            with gr.Column():
                url_box = gr.Textbox(label="Server URL")
                api_key_box = gr.Textbox(label="API Key", type="password")
                model_box = gr.Textbox(label="Model Name")
                timeout_box = gr.Number(label="Timeout (seconds)", value=30)
                category_dropdown = gr.Dropdown(label="Category", choices=["Load categories..."])
                load_categories_btn = gr.Button("Load Categories")
                parallel_slider = gr.Slider(
                    label="Parallel Requests", minimum=1, maximum=10, step=1, value=1
                )
                verbosity_slider = gr.Slider(
                    label="Verbosity Level", minimum=0, maximum=2, step=1, value=1
                )
                log_prompt_box = gr.Checkbox(label="Log Prompt")

            # Right column: run button and result display.
            with gr.Column():
                run_button = gr.Button("Run Benchmark")
                results_box = gr.Textbox(label="Benchmark Results", lines=20)

        # Input components, in the positional order run_benchmark_from_ui expects.
        benchmark_inputs = [
            url_box,
            api_key_box,
            model_box,
            timeout_box,
            category_dropdown,
            parallel_slider,
            verbosity_slider,
            log_prompt_box,
        ]

        # "Load Categories" refreshes the dropdown's choices.
        load_categories_btn.click(load_categories, outputs=category_dropdown)

        # "Run Benchmark" sends the inputs to the runner and shows its report.
        run_button.click(
            run_benchmark_from_ui,
            inputs=benchmark_inputs,
            outputs=results_box,
        )

    return [*benchmark_inputs, run_button, results_box]