Refactor the code
Browse files- app.py +6 -2
- src/logic/data_fetching.py +52 -20
- src/logic/data_processing.py +9 -5
- src/logic/graph_settings.py +36 -0
- src/logic/plotting.py +71 -16
- src/view/help_tab.py +37 -0
- src/view/metric_view_tab.py +222 -0
- src/view/reverse_search_tab.py +53 -0
- src/view/view.py +12 -309
app.py
CHANGED
@@ -1,4 +1,8 @@
|
|
1 |
from src.view.view import create_interface
|
2 |
|
3 |
-
demo
|
4 |
-
demo
|
|
|
|
|
|
|
|
|
|
1 |
from src.view.view import create_interface
|
2 |
|
3 |
+
global demo
|
4 |
+
demo = None
|
5 |
+
|
6 |
+
if __name__ == "__main__":
|
7 |
+
demo = create_interface()
|
8 |
+
demo.launch()
|
src/logic/data_fetching.py
CHANGED
@@ -1,5 +1,7 @@
|
|
|
|
1 |
import os
|
2 |
import json
|
|
|
3 |
import tempfile
|
4 |
from pathlib import Path
|
5 |
from concurrent.futures import ThreadPoolExecutor
|
@@ -9,30 +11,25 @@ from datatrove.utils.stats import MetricStatsDict
|
|
9 |
import gradio as gr
|
10 |
import tenacity
|
11 |
|
|
|
|
|
12 |
def find_folders(base_folder: str, path: str) -> List[str]:
|
13 |
-
|
14 |
-
if not
|
15 |
return []
|
16 |
return sorted(
|
17 |
[
|
18 |
-
folder
|
19 |
-
for folder in
|
20 |
-
if
|
21 |
]
|
22 |
)
|
23 |
|
24 |
-
def find_metrics_folders(base_folder: str) -> List[str]:
|
25 |
-
base_data_df = get_datafolder(base_folder)
|
26 |
-
dirs = sorted(
|
27 |
-
folder
|
28 |
-
for folder, info in base_data_df.find("", detail=True, maxdepth=1, withdirs=True).items()
|
29 |
-
if info["type"] == "directory"
|
30 |
-
)
|
31 |
-
return sorted(list(set(dirs)))
|
32 |
-
|
33 |
def fetch_datasets(base_folder: str):
|
34 |
-
datasets = sorted(
|
35 |
-
|
|
|
|
|
36 |
|
37 |
def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
|
38 |
if not datasets:
|
@@ -55,7 +52,7 @@ def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: s
|
|
55 |
if not value and len(new_choices) == 1:
|
56 |
value = list(new_choices)[0]
|
57 |
|
58 |
-
return gr.
|
59 |
|
60 |
def fetch_metrics(base_folder: str, datasets: List[str], group: str, old_metrics: str, type: str = "intersection"):
|
61 |
if not group:
|
@@ -79,7 +76,7 @@ def fetch_metrics(base_folder: str, datasets: List[str], group: str, old_metrics
|
|
79 |
if not value and len(new_possibles_choices) == 1:
|
80 |
value = list(new_possibles_choices)[0]
|
81 |
|
82 |
-
return gr.
|
83 |
|
84 |
def reverse_search(base_folder: str, possible_datasets: List[str], grouping: str, metric_name: str) -> str:
|
85 |
with ThreadPoolExecutor() as executor:
|
@@ -91,7 +88,7 @@ def reverse_search(base_folder: str, possible_datasets: List[str], grouping: str
|
|
91 |
|
92 |
def reverse_search_add(datasets: List[str], reverse_search_results: str) -> List[str]:
|
93 |
datasets = datasets or []
|
94 |
-
return
|
95 |
|
96 |
def metric_exists(base_folder: str, path: str, metric_name: str, group_by: str) -> bool:
|
97 |
base_folder = get_datafolder(base_folder)
|
@@ -105,4 +102,39 @@ def load_metrics(base_folder: str, path: str, metric_name: str, group_by: str) -
|
|
105 |
return MetricStatsDict.from_dict(json_metric)
|
106 |
|
107 |
def load_data(dataset_path: str, base_folder: str, grouping: str, metric_name: str) -> MetricStatsDict:
|
108 |
-
return load_metrics(base_folder, dataset_path, metric_name, grouping)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
import os
|
3 |
import json
|
4 |
+
import re
|
5 |
import tempfile
|
6 |
from pathlib import Path
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
11 |
import gradio as gr
|
12 |
import tenacity
|
13 |
|
14 |
+
from src.logic.graph_settings import Grouping
|
15 |
+
|
16 |
def find_folders(base_folder: str, path: str) -> List[str]:
|
17 |
+
base_folder_df = get_datafolder(base_folder)
|
18 |
+
if not base_folder_df.exists(path):
|
19 |
return []
|
20 |
return sorted(
|
21 |
[
|
22 |
+
folder
|
23 |
+
for folder,info in base_folder_df.find(path, maxdepth=1, withdirs=True, detail=True).items()
|
24 |
+
if info["type"] == "directory" and not (folder.rstrip("/") == path.rstrip("/"))
|
25 |
]
|
26 |
)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def fetch_datasets(base_folder: str):
|
29 |
+
datasets = sorted(find_folders(base_folder, ""))
|
30 |
+
if len(datasets) == 0:
|
31 |
+
raise ValueError("No datasets found")
|
32 |
+
return datasets
|
33 |
|
34 |
def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
|
35 |
if not datasets:
|
|
|
52 |
if not value and len(new_choices) == 1:
|
53 |
value = list(new_choices)[0]
|
54 |
|
55 |
+
return gr.Dropdown(choices=sorted(list(new_choices)), value=value)
|
56 |
|
57 |
def fetch_metrics(base_folder: str, datasets: List[str], group: str, old_metrics: str, type: str = "intersection"):
|
58 |
if not group:
|
|
|
76 |
if not value and len(new_possibles_choices) == 1:
|
77 |
value = list(new_possibles_choices)[0]
|
78 |
|
79 |
+
return gr.Dropdown(choices=sorted(list(new_possibles_choices)), value=value)
|
80 |
|
81 |
def reverse_search(base_folder: str, possible_datasets: List[str], grouping: str, metric_name: str) -> str:
|
82 |
with ThreadPoolExecutor() as executor:
|
|
|
88 |
|
89 |
def reverse_search_add(datasets: List[str], reverse_search_results: str) -> List[str]:
|
90 |
datasets = datasets or []
|
91 |
+
return list(set(datasets + reverse_search_results.strip().split("\n")))
|
92 |
|
93 |
def metric_exists(base_folder: str, path: str, metric_name: str, group_by: str) -> bool:
|
94 |
base_folder = get_datafolder(base_folder)
|
|
|
102 |
return MetricStatsDict.from_dict(json_metric)
|
103 |
|
104 |
def load_data(dataset_path: str, base_folder: str, grouping: str, metric_name: str) -> MetricStatsDict:
|
105 |
+
return load_metrics(base_folder, dataset_path, metric_name, grouping)
|
106 |
+
|
107 |
+
|
108 |
+
def fetch_graph_data(
|
109 |
+
base_folder: str,
|
110 |
+
datasets: List[str],
|
111 |
+
metric_name: str,
|
112 |
+
grouping: Grouping,
|
113 |
+
progress=gr.Progress(),
|
114 |
+
):
|
115 |
+
if len(datasets) <= 0 or not metric_name or not grouping:
|
116 |
+
return None
|
117 |
+
|
118 |
+
with ThreadPoolExecutor() as pool:
|
119 |
+
data = list(
|
120 |
+
progress.tqdm(
|
121 |
+
pool.map(
|
122 |
+
partial(load_data, base_folder=base_folder, metric_name=metric_name, grouping=grouping),
|
123 |
+
datasets,
|
124 |
+
),
|
125 |
+
total=len(datasets),
|
126 |
+
desc="Loading data...",
|
127 |
+
)
|
128 |
+
)
|
129 |
+
|
130 |
+
data = {path: result for path, result in zip(datasets, data)}
|
131 |
+
return data, None
|
132 |
+
|
133 |
+
def update_datasets_with_regex(regex: str, selected_runs: List[str], all_runs: List[str]):
|
134 |
+
if not regex:
|
135 |
+
return []
|
136 |
+
new_dsts = {run for run in all_runs if re.search(regex, run)}
|
137 |
+
if not new_dsts:
|
138 |
+
return selected_runs
|
139 |
+
dst_union = new_dsts.union(selected_runs or [])
|
140 |
+
return sorted(list(dst_union))
|
src/logic/data_processing.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import json
|
2 |
import re
|
3 |
import heapq
|
@@ -7,6 +8,8 @@ from typing import Dict, Tuple, List, Literal
|
|
7 |
import gradio as gr
|
8 |
from datatrove.utils.stats import MetricStatsDict
|
9 |
|
|
|
|
|
10 |
PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
|
11 |
|
12 |
def prepare_for_non_grouped_plotting(metric: Dict[str, MetricStatsDict], normalization: bool, rounding: int) -> Dict[float, float]:
|
@@ -35,13 +38,14 @@ def prepare_for_group_plotting(metric: Dict[str, MetricStatsDict], top_k: int, d
|
|
35 |
stds = [metric[key].standard_deviation for key in keys]
|
36 |
return keys, means, stds
|
37 |
|
38 |
-
def export_data(exported_data: Dict[str, MetricStatsDict], metric_name: str):
|
39 |
if not exported_data:
|
40 |
return None
|
41 |
-
|
|
|
|
|
42 |
json.dump({
|
43 |
name: sorted([{"value": key, **value} for key, value in dt.to_dict().items()], key=lambda x: x["value"])
|
44 |
for name, dt in exported_data.items()
|
45 |
-
},
|
46 |
-
|
47 |
-
return gr.update(visible=True, value=temp_path)
|
|
|
1 |
+
from datetime import datetime
|
2 |
import json
|
3 |
import re
|
4 |
import heapq
|
|
|
8 |
import gradio as gr
|
9 |
from datatrove.utils.stats import MetricStatsDict
|
10 |
|
11 |
+
from src.logic.graph_settings import Grouping
|
12 |
+
|
13 |
PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
|
14 |
|
15 |
def prepare_for_non_grouped_plotting(metric: Dict[str, MetricStatsDict], normalization: bool, rounding: int) -> Dict[float, float]:
|
|
|
38 |
stds = [metric[key].standard_deviation for key in keys]
|
39 |
return keys, means, stds
|
40 |
|
41 |
+
def export_data(exported_data: Dict[str, MetricStatsDict], metric_name: str, grouping: Grouping):
|
42 |
if not exported_data:
|
43 |
return None
|
44 |
+
|
45 |
+
file_name = f"{metric_name}_{grouping}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
|
46 |
+
with open(file_name, 'w') as f:
|
47 |
json.dump({
|
48 |
name: sorted([{"value": key, **value} for key, value in dt.to_dict().items()], key=lambda x: x["value"])
|
49 |
for name, dt in exported_data.items()
|
50 |
+
}, f, indent=2)
|
51 |
+
return gr.File(value=file_name, visible=True)
|
|
src/logic/graph_settings.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Literal
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
|
5 |
+
Grouping = Literal["histogram", "fqdn", "suffix", "summary"]
|
6 |
+
|
7 |
+
def update_graph_options(grouping: Grouping):
|
8 |
+
"""
|
9 |
+
Updates visibility of the graph options based on the grouping type.
|
10 |
+
The return should be in following order:
|
11 |
+
group_settings, histogram_settings
|
12 |
+
"""
|
13 |
+
if grouping == "histogram":
|
14 |
+
return [
|
15 |
+
gr.TabItem(visible=False),
|
16 |
+
gr.TabItem(visible=True),
|
17 |
+
gr.TabItem(visible=False),
|
18 |
+
]
|
19 |
+
elif grouping in ["fqdn", "suffix"]:
|
20 |
+
return [
|
21 |
+
gr.Column(visible=True),
|
22 |
+
gr.Column(visible=False),
|
23 |
+
gr.Column(visible=False),
|
24 |
+
]
|
25 |
+
elif grouping == "summary":
|
26 |
+
return [
|
27 |
+
gr.Column(visible=False),
|
28 |
+
gr.Column(visible=False),
|
29 |
+
gr.Column(visible=True),
|
30 |
+
]
|
31 |
+
|
32 |
+
return [
|
33 |
+
gr.Column(visible=False),
|
34 |
+
gr.Column(visible=False),
|
35 |
+
gr.Column(visible=False),
|
36 |
+
]
|
src/logic/plotting.py
CHANGED
@@ -4,8 +4,11 @@ import plotly.graph_objects as go
|
|
4 |
import numpy as np
|
5 |
import gradio as gr
|
6 |
from typing import Dict, List
|
7 |
-
|
8 |
-
from .
|
|
|
|
|
|
|
9 |
|
10 |
def plot_scatter(
|
11 |
data: Dict[str, Dict[float, float]],
|
@@ -55,14 +58,15 @@ def plot_scatter(
|
|
55 |
return fig
|
56 |
|
57 |
def plot_bars(
|
58 |
-
data: Dict[str,
|
59 |
metric_name: str,
|
60 |
top_k: int,
|
61 |
-
direction:
|
62 |
regex: str | None,
|
63 |
rounding: int,
|
64 |
log_scale_x: bool,
|
65 |
log_scale_y: bool,
|
|
|
66 |
progress: gr.Progress,
|
67 |
):
|
68 |
fig = go.Figure()
|
@@ -77,7 +81,7 @@ def plot_bars(
|
|
77 |
y=y,
|
78 |
name=f"{name} Mean",
|
79 |
marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
|
80 |
-
error_y=dict(type='data', array=stds, visible=
|
81 |
))
|
82 |
|
83 |
fig.update_layout(
|
@@ -94,14 +98,65 @@ def plot_bars(
|
|
94 |
|
95 |
return fig
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import numpy as np
|
5 |
import gradio as gr
|
6 |
from typing import Dict, List
|
7 |
+
|
8 |
+
from src.logic.data_processing import PARTITION_OPTIONS, prepare_for_non_grouped_plotting, prepare_for_group_plotting
|
9 |
+
from src.logic.graph_settings import Grouping
|
10 |
+
from src.logic.utils import set_alpha
|
11 |
+
from datatrove.utils.stats import MetricStatsDict
|
12 |
|
13 |
def plot_scatter(
|
14 |
data: Dict[str, Dict[float, float]],
|
|
|
58 |
return fig
|
59 |
|
60 |
def plot_bars(
|
61 |
+
data: Dict[str, MetricStatsDict],
|
62 |
metric_name: str,
|
63 |
top_k: int,
|
64 |
+
direction: PARTITION_OPTIONS,
|
65 |
regex: str | None,
|
66 |
rounding: int,
|
67 |
log_scale_x: bool,
|
68 |
log_scale_y: bool,
|
69 |
+
show_stds: bool,
|
70 |
progress: gr.Progress,
|
71 |
):
|
72 |
fig = go.Figure()
|
|
|
81 |
y=y,
|
82 |
name=f"{name} Mean",
|
83 |
marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
|
84 |
+
error_y=dict(type='data', array=stds, visible=show_stds)
|
85 |
))
|
86 |
|
87 |
fig.update_layout(
|
|
|
98 |
|
99 |
return fig
|
100 |
|
101 |
+
|
102 |
+
# Add any other necessary functions
|
103 |
+
|
104 |
+
def plot_data(
|
105 |
+
metric_data: Dict[str, MetricStatsDict],
|
106 |
+
metric_name: str,
|
107 |
+
normalize: bool,
|
108 |
+
rounding: int,
|
109 |
+
grouping: Grouping,
|
110 |
+
top_n: int,
|
111 |
+
direction: PARTITION_OPTIONS,
|
112 |
+
group_regex: str,
|
113 |
+
log_scale_x: bool,
|
114 |
+
log_scale_y: bool,
|
115 |
+
cdf: bool,
|
116 |
+
perc: bool,
|
117 |
+
show_stds: bool,
|
118 |
+
) -> tuple[go.Figure, gr.Row, str]:
|
119 |
+
if grouping == "histogram":
|
120 |
+
fig = plot_scatter(
|
121 |
+
metric_data,
|
122 |
+
metric_name,
|
123 |
+
log_scale_x,
|
124 |
+
log_scale_y,
|
125 |
+
normalize,
|
126 |
+
rounding,
|
127 |
+
cdf,
|
128 |
+
perc,
|
129 |
+
gr.Progress(),
|
130 |
+
)
|
131 |
+
min_max_hist_data = generate_min_max_hist_data(metric_data)
|
132 |
+
return fig, gr.Row.update(visible=True), min_max_hist_data
|
133 |
+
else:
|
134 |
+
fig = plot_bars(
|
135 |
+
metric_data,
|
136 |
+
metric_name,
|
137 |
+
top_n,
|
138 |
+
direction,
|
139 |
+
group_regex,
|
140 |
+
rounding,
|
141 |
+
log_scale_x,
|
142 |
+
log_scale_y,
|
143 |
+
show_stds,
|
144 |
+
gr.Progress(),
|
145 |
+
)
|
146 |
+
return fig, gr.Row.update(visible=True), ""
|
147 |
+
|
148 |
+
def generate_min_max_hist_data(data: Dict[str, MetricStatsDict]) -> str:
|
149 |
+
runs_data = {
|
150 |
+
run: {
|
151 |
+
"min": min(map(float, dato.keys())),
|
152 |
+
"max": max(map(float, dato.keys())),
|
153 |
+
}
|
154 |
+
for run, dato in data.items()
|
155 |
+
}
|
156 |
+
|
157 |
+
runs_rows = [
|
158 |
+
f"| {run} | {values['min']:.4f} | {values['max']:.4f} |"
|
159 |
+
for run, values in runs_data.items()
|
160 |
+
]
|
161 |
+
header = "| Run | Min | Max |\n|-----|-----|-----|\n"
|
162 |
+
return header + "\n".join(runs_rows)
|
src/view/help_tab.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
def create_help_tab():
|
4 |
+
gr.Markdown(
|
5 |
+
label="Readme",
|
6 |
+
value="""
|
7 |
+
|
8 |
+
# Dataset Metrics Explorer
|
9 |
+
## Features:
|
10 |
+
- View metrics for various datasets you computed using datatrove
|
11 |
+
- Search for metrics across datasets
|
12 |
+
|
13 |
+
## View metrics Usage:
|
14 |
+
1) Specify Metrics location (Stats block `output_folder`) and click "Fetch Datasets"
|
15 |
+
2) Select datasets you are interested in using the dropdown or regex filter
|
16 |
+
3) Specify Grouping (histogram/summary/fqdn/suffix) and Metric name
|
17 |
+
4) Click "Render Metric", adjust Graph settings and see the result
|
18 |
+
|
19 |
+
### Groupings:
|
20 |
+
- **histogram**: Creates a line plot of values with their frequencies.
|
21 |
+
* normalize: Normalize the histogram to sum to 1
|
22 |
+
* CDF: Show the plot as cumulative distribution function
|
23 |
+
* %: Show the plot as percentage of the total
|
24 |
+
- **(fqdn/suffix)**: Creates a bar plot of the avg. values of the metric for full qualifed domain name/suffix of domain.
|
25 |
+
* k: the number of groups to show
|
26 |
+
* Top/Bottom/Most frequent (n_docs): Groups with the top/bottom k values/most prevalant docs are shown
|
27 |
+
- **summary**: Shows the average value of given metric for every dataset
|
28 |
+
* show_stds: Show the standard deviation from mean for every datasets
|
29 |
+
|
30 |
+
## Reverse search Usage:
|
31 |
+
To search for datasets containing a grouping and certain metric, use the Reverse search section.
|
32 |
+
Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
|
33 |
+
|
34 |
+
## Note:
|
35 |
+
The data might not be 100% representative, due to the sampling and optimistic merging of the metrics (fqdn/suffix).
|
36 |
+
"""
|
37 |
+
)
|
src/view/metric_view_tab.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
import tempfile
|
3 |
+
from typing import Callable
|
4 |
+
import gradio as gr
|
5 |
+
from functools import partial
|
6 |
+
import re
|
7 |
+
import json
|
8 |
+
|
9 |
+
from src.logic.data_fetching import fetch_datasets, fetch_graph_data, fetch_groups, fetch_metrics, update_datasets_with_regex
|
10 |
+
from src.logic.data_processing import export_data
|
11 |
+
from src.logic.graph_settings import update_graph_options
|
12 |
+
from src.logic.plotting import plot_data
|
13 |
+
|
14 |
+
def create_metric_view_tab(METRICS_LOCATION_DEFAULT: str, available_datasets: gr.State, selected_datasets: gr.State):
|
15 |
+
metric_data = gr.State([])
|
16 |
+
|
17 |
+
with gr.Row():
|
18 |
+
with gr.Column(scale=2):
|
19 |
+
with gr.Row():
|
20 |
+
with gr.Column(scale=1):
|
21 |
+
base_folder = gr.Textbox(
|
22 |
+
label="Metrics Location",
|
23 |
+
value=METRICS_LOCATION_DEFAULT,
|
24 |
+
)
|
25 |
+
datasets_fetch = gr.Button("Fetch Datasets")
|
26 |
+
|
27 |
+
with gr.Column(scale=1):
|
28 |
+
regex_select = gr.Text(label="Regex filter", value=".*")
|
29 |
+
regex_button = gr.Button("Search")
|
30 |
+
with gr.Row():
|
31 |
+
selected_datasets_dropdown = gr.Dropdown(
|
32 |
+
choices=[],
|
33 |
+
label="Datasets",
|
34 |
+
multiselect=True,
|
35 |
+
interactive=True,
|
36 |
+
)
|
37 |
+
|
38 |
+
with gr.Column(scale=1):
|
39 |
+
grouping_dropdown = gr.Dropdown(
|
40 |
+
choices=[],
|
41 |
+
label="Grouping",
|
42 |
+
multiselect=False,
|
43 |
+
)
|
44 |
+
metric_name_dropdown = gr.Dropdown(
|
45 |
+
choices=[],
|
46 |
+
label="Metric name",
|
47 |
+
multiselect=False,
|
48 |
+
)
|
49 |
+
|
50 |
+
render_button = gr.Button("Render Metric", variant="primary")
|
51 |
+
|
52 |
+
with gr.Tabs():
|
53 |
+
with gr.TabItem("Graph Settings"):
|
54 |
+
log_scale_x_checkbox = gr.Checkbox(
|
55 |
+
label="Log scale x",
|
56 |
+
value=False,
|
57 |
+
)
|
58 |
+
log_scale_y_checkbox = gr.Checkbox(
|
59 |
+
label="Log scale y",
|
60 |
+
value=False,
|
61 |
+
)
|
62 |
+
rounding = gr.Number(
|
63 |
+
label="Rounding",
|
64 |
+
value=2,
|
65 |
+
)
|
66 |
+
|
67 |
+
with gr.TabItem("Grouping Settings") as group_settings:
|
68 |
+
with gr.Row() as group_choices:
|
69 |
+
with gr.Column(scale=2):
|
70 |
+
group_regex = gr.Text(
|
71 |
+
label="Group Regex",
|
72 |
+
value=None,
|
73 |
+
)
|
74 |
+
with gr.Row():
|
75 |
+
top_select = gr.Number(
|
76 |
+
label="N Groups",
|
77 |
+
value=100,
|
78 |
+
interactive=True,
|
79 |
+
)
|
80 |
+
|
81 |
+
direction_checkbox = gr.Radio(
|
82 |
+
label="Partition",
|
83 |
+
choices=[
|
84 |
+
"Top",
|
85 |
+
"Bottom",
|
86 |
+
"Most frequent (n_docs)",
|
87 |
+
],
|
88 |
+
value="Most frequent (n_docs)",
|
89 |
+
)
|
90 |
+
|
91 |
+
with gr.TabItem("Histogram Settings") as histogram_settings:
|
92 |
+
normalization_checkbox = gr.Checkbox(
|
93 |
+
label="Normalize",
|
94 |
+
value=True,
|
95 |
+
visible=False
|
96 |
+
)
|
97 |
+
cdf_checkbox = gr.Checkbox(
|
98 |
+
label="CDF",
|
99 |
+
value=False,
|
100 |
+
)
|
101 |
+
perc_checkbox = gr.Checkbox(
|
102 |
+
label="%",
|
103 |
+
value=False,
|
104 |
+
)
|
105 |
+
|
106 |
+
with gr.TabItem("Summary Settings") as summary_settings:
|
107 |
+
show_stds_checkbox = gr.Checkbox(
|
108 |
+
label="Show standard deviations",
|
109 |
+
value=False,
|
110 |
+
)
|
111 |
+
|
112 |
+
with gr.Row():
|
113 |
+
graph_output = gr.Plot(label="Graph")
|
114 |
+
with gr.Row(visible=False) as min_max_hist:
|
115 |
+
with gr.Column(scale=3):
|
116 |
+
min_max_hist_data = gr.Markdown()
|
117 |
+
with gr.Column(scale=1):
|
118 |
+
export_data_button = gr.Button("Export Data")
|
119 |
+
export_data_json = gr.File(visible=False)
|
120 |
+
|
121 |
+
|
122 |
+
|
123 |
+
def update_selected_datasets_dropdown(available_datasets, selected_datasets):
|
124 |
+
return gr.Dropdown(choices=available_datasets, value=sorted(selected_datasets))
|
125 |
+
|
126 |
+
|
127 |
+
datasets_fetch.click(
|
128 |
+
fn=fetch_datasets,
|
129 |
+
inputs=[base_folder],
|
130 |
+
outputs=[available_datasets],
|
131 |
+
)
|
132 |
+
|
133 |
+
available_datasets.change(
|
134 |
+
fn=update_selected_datasets_dropdown,
|
135 |
+
inputs=[available_datasets, selected_datasets],
|
136 |
+
outputs=selected_datasets_dropdown,
|
137 |
+
)
|
138 |
+
|
139 |
+
regex_button.click(
|
140 |
+
fn=update_datasets_with_regex,
|
141 |
+
inputs=[regex_select, selected_datasets, available_datasets],
|
142 |
+
outputs=selected_datasets,
|
143 |
+
)
|
144 |
+
|
145 |
+
def update_selected_datasets(selected_datasets_dropdown):
|
146 |
+
return selected_datasets_dropdown
|
147 |
+
|
148 |
+
selected_datasets_dropdown.change(
|
149 |
+
fn=update_selected_datasets,
|
150 |
+
inputs=[selected_datasets_dropdown],
|
151 |
+
outputs=selected_datasets,
|
152 |
+
)
|
153 |
+
|
154 |
+
selected_datasets.change(
|
155 |
+
fn=update_selected_datasets_dropdown,
|
156 |
+
inputs=[available_datasets, selected_datasets],
|
157 |
+
outputs=selected_datasets_dropdown,
|
158 |
+
)
|
159 |
+
|
160 |
+
|
161 |
+
selected_datasets.change(
|
162 |
+
fn=fetch_groups,
|
163 |
+
inputs=[base_folder, selected_datasets, grouping_dropdown],
|
164 |
+
outputs=grouping_dropdown,
|
165 |
+
)
|
166 |
+
|
167 |
+
grouping_dropdown.change(
|
168 |
+
fn=fetch_metrics,
|
169 |
+
inputs=[base_folder, selected_datasets, grouping_dropdown, metric_name_dropdown],
|
170 |
+
outputs=metric_name_dropdown,
|
171 |
+
)
|
172 |
+
|
173 |
+
render_button.click(
|
174 |
+
fn=fetch_graph_data,
|
175 |
+
inputs=[
|
176 |
+
base_folder,
|
177 |
+
selected_datasets,
|
178 |
+
metric_name_dropdown,
|
179 |
+
grouping_dropdown,
|
180 |
+
],
|
181 |
+
# We also output the graph_output = None to show the progress
|
182 |
+
outputs=[metric_data, graph_output],
|
183 |
+
)
|
184 |
+
|
185 |
+
|
186 |
+
grouping_dropdown.change(
|
187 |
+
fn=update_graph_options,
|
188 |
+
inputs=[grouping_dropdown],
|
189 |
+
outputs=[group_settings, histogram_settings, summary_settings],
|
190 |
+
)
|
191 |
+
|
192 |
+
|
193 |
+
gr.on(
|
194 |
+
triggers=[normalization_checkbox.input, rounding.input, group_regex.input, direction_checkbox.input,
|
195 |
+
top_select.input, log_scale_x_checkbox.input,
|
196 |
+
log_scale_y_checkbox.input, cdf_checkbox.input, perc_checkbox.input, show_stds_checkbox.input, metric_data.change],
|
197 |
+
fn=plot_data,
|
198 |
+
inputs=[
|
199 |
+
metric_data,
|
200 |
+
metric_name_dropdown,
|
201 |
+
normalization_checkbox,
|
202 |
+
rounding,
|
203 |
+
grouping_dropdown,
|
204 |
+
top_select,
|
205 |
+
direction_checkbox,
|
206 |
+
group_regex,
|
207 |
+
log_scale_x_checkbox,
|
208 |
+
log_scale_y_checkbox,
|
209 |
+
cdf_checkbox,
|
210 |
+
perc_checkbox,
|
211 |
+
show_stds_checkbox
|
212 |
+
],
|
213 |
+
outputs=[graph_output, min_max_hist, min_max_hist_data],
|
214 |
+
)
|
215 |
+
|
216 |
+
export_data_button.click(
|
217 |
+
fn=export_data,
|
218 |
+
inputs=[metric_data, metric_name_dropdown, grouping_dropdown],
|
219 |
+
outputs=[export_data_json],
|
220 |
+
)
|
221 |
+
|
222 |
+
return base_folder
|
src/view/reverse_search_tab.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
from src.logic.data_fetching import fetch_groups, fetch_metrics, reverse_search, reverse_search_add
|
5 |
+
|
6 |
+
def create_reverse_search_tab(base_folder: gr.Textbox, datasets_available: gr.State, datasets_selected: gr.State):
|
7 |
+
reverse_search_headline = gr.Markdown(value="# Reverse Metrics Search")
|
8 |
+
|
9 |
+
with gr.Row():
|
10 |
+
with gr.Column(scale=1):
|
11 |
+
reverse_grouping_dropdown = gr.Dropdown(
|
12 |
+
choices=[],
|
13 |
+
label="Grouping",
|
14 |
+
multiselect=False,
|
15 |
+
)
|
16 |
+
reverse_metric_name_dropdown = gr.Dropdown(
|
17 |
+
choices=[],
|
18 |
+
label="Metric Name",
|
19 |
+
multiselect=False,
|
20 |
+
)
|
21 |
+
reverse_search_button = gr.Button("Search")
|
22 |
+
reverse_search_add_button = gr.Button("Add to selection")
|
23 |
+
|
24 |
+
with gr.Column(scale=2):
|
25 |
+
reverse_search_results = gr.Textbox(
|
26 |
+
label="Found datasets",
|
27 |
+
lines=10,
|
28 |
+
placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
|
29 |
+
)
|
30 |
+
|
31 |
+
datasets_available.change(
|
32 |
+
fn=partial(fetch_groups, type="union"),
|
33 |
+
inputs=[base_folder, datasets_available, reverse_grouping_dropdown],
|
34 |
+
outputs=[reverse_grouping_dropdown],
|
35 |
+
)
|
36 |
+
|
37 |
+
reverse_grouping_dropdown.select(
|
38 |
+
fn=partial(fetch_metrics, type="union"),
|
39 |
+
inputs=[base_folder, datasets_available, reverse_grouping_dropdown, reverse_metric_name_dropdown],
|
40 |
+
outputs=reverse_metric_name_dropdown,
|
41 |
+
)
|
42 |
+
|
43 |
+
reverse_search_button.click(
|
44 |
+
fn=partial(reverse_search),
|
45 |
+
inputs=[base_folder, datasets_available, reverse_grouping_dropdown, reverse_metric_name_dropdown],
|
46 |
+
outputs=reverse_search_results,
|
47 |
+
)
|
48 |
+
|
49 |
+
reverse_search_add_button.click(
|
50 |
+
fn=reverse_search_add,
|
51 |
+
inputs=[datasets_selected, reverse_search_results],
|
52 |
+
outputs=datasets_selected,
|
53 |
+
)
|
src/view/view.py
CHANGED
@@ -1,325 +1,28 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
-
from src.logic.data_fetching import fetch_datasets, fetch_groups, fetch_metrics, load_data, reverse_search, reverse_search_add
|
3 |
-
from src.logic.data_processing import export_data
|
4 |
-
from src.logic.plotting import plot_data
|
5 |
-
from src.logic.utils import get_desc
|
6 |
-
from concurrent.futures import ThreadPoolExecutor
|
7 |
from functools import partial
|
8 |
-
import os
|
9 |
import re
|
10 |
-
|
11 |
-
|
|
|
|
|
12 |
|
13 |
METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files")
|
14 |
|
15 |
-
def update_graph(
|
16 |
-
base_folder,
|
17 |
-
datasets,
|
18 |
-
metric_name,
|
19 |
-
grouping,
|
20 |
-
log_scale_x,
|
21 |
-
log_scale_y,
|
22 |
-
rounding,
|
23 |
-
normalization,
|
24 |
-
top_k,
|
25 |
-
direction,
|
26 |
-
regex,
|
27 |
-
cumsum,
|
28 |
-
perc,
|
29 |
-
progress=gr.Progress(),
|
30 |
-
):
|
31 |
-
if len(datasets) <= 0 or not metric_name or not grouping:
|
32 |
-
return None
|
33 |
-
|
34 |
-
with ThreadPoolExecutor() as pool:
|
35 |
-
data = list(
|
36 |
-
progress.tqdm(
|
37 |
-
pool.map(
|
38 |
-
partial(load_data, base_folder=base_folder, metric_name=metric_name, grouping=grouping),
|
39 |
-
datasets,
|
40 |
-
),
|
41 |
-
total=len(datasets),
|
42 |
-
desc="Loading data...",
|
43 |
-
)
|
44 |
-
)
|
45 |
-
|
46 |
-
data = {path: result for path, result in zip(datasets, data)}
|
47 |
-
return plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x,
|
48 |
-
log_scale_y, cumsum, perc, progress), data, export_data(data, metric_name), get_desc(data)
|
49 |
-
|
50 |
def create_interface():
|
51 |
with gr.Blocks() as demo:
|
52 |
-
datasets = gr.State([])
|
53 |
-
exported_data = gr.State([])
|
54 |
metrics_headline = gr.Markdown(value="# Metrics Exploration")
|
|
|
|
|
55 |
|
56 |
with gr.Tabs():
|
57 |
-
with gr.
|
58 |
-
|
59 |
-
|
60 |
-
value="""
|
61 |
-
## How to use:
|
62 |
-
1) Specify Metrics location (Stats block `output_folder` without the last path segment) and click "Fetch Datasets"
|
63 |
-
2) Select datasets you are interested in using the dropdown or regex filter
|
64 |
-
3) Specify Grouping (global average/value/fqdn/suffix) and Metric name
|
65 |
-
4) Click "Render Metric"
|
66 |
-
|
67 |
-
|
68 |
-
## Groupings:
|
69 |
-
- **histogram**: Creates a line plot of values with their frequencies. If normalization is on, the frequencies sum to 1.
|
70 |
-
* normalize:
|
71 |
-
- **(fqdn/suffix)**: Creates a bar plot of the avg. values of the metric for full qualifed domain name/suffix of domain.
|
72 |
-
* k: the number of groups to show
|
73 |
-
* Top/Bottom/Most frequent (n_docs): Groups with the top/bottom k values/most prevalant docs are shown
|
74 |
-
- **none**: Shows the average value of given metric
|
75 |
-
|
76 |
-
## Reverse search:
|
77 |
-
To search for datasets containing a grouping and certain metric, use the Reverse search section.
|
78 |
-
Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
|
79 |
-
|
80 |
-
## Note:
|
81 |
-
The data might not be 100% representative, due to the sampling and optimistic merging of the metrics (fqdn/suffix).
|
82 |
-
""",
|
83 |
-
)
|
84 |
-
|
85 |
with gr.TabItem("Metric View"):
|
86 |
-
|
87 |
-
with gr.Column(scale=2):
|
88 |
-
with gr.Row():
|
89 |
-
with gr.Column(scale=1):
|
90 |
-
base_folder = gr.Textbox(
|
91 |
-
label="Metrics Location",
|
92 |
-
value=METRICS_LOCATION_DEFAULT,
|
93 |
-
)
|
94 |
-
datasets_refetch = gr.Button("Fetch Datasets")
|
95 |
-
|
96 |
-
with gr.Column(scale=1):
|
97 |
-
regex_select = gr.Text(label="Regex filter", value=".*")
|
98 |
-
regex_button = gr.Button("Search")
|
99 |
-
with gr.Row():
|
100 |
-
datasets_selected = gr.Dropdown(
|
101 |
-
choices=[],
|
102 |
-
label="Datasets",
|
103 |
-
multiselect=True,
|
104 |
-
)
|
105 |
-
|
106 |
-
with gr.Column(scale=1):
|
107 |
-
grouping_dropdown = gr.Dropdown(
|
108 |
-
choices=[],
|
109 |
-
label="Grouping",
|
110 |
-
multiselect=False,
|
111 |
-
)
|
112 |
-
metric_name_dropdown = gr.Dropdown(
|
113 |
-
choices=[],
|
114 |
-
label="Metric name",
|
115 |
-
multiselect=False,
|
116 |
-
)
|
117 |
-
|
118 |
-
render_button = gr.Button("Render Metric", variant="primary")
|
119 |
-
|
120 |
-
with gr.Tabs():
|
121 |
-
with gr.TabItem("Graph Settings"):
|
122 |
-
log_scale_x_checkbox = gr.Checkbox(
|
123 |
-
label="Log scale x",
|
124 |
-
value=False,
|
125 |
-
)
|
126 |
-
log_scale_y_checkbox = gr.Checkbox(
|
127 |
-
label="Log scale y",
|
128 |
-
value=False,
|
129 |
-
)
|
130 |
-
rounding = gr.Number(
|
131 |
-
label="Rounding",
|
132 |
-
value=2,
|
133 |
-
)
|
134 |
-
normalization_checkbox = gr.Checkbox(
|
135 |
-
label="Normalize",
|
136 |
-
value=True,
|
137 |
-
visible=False
|
138 |
-
)
|
139 |
-
with gr.Row():
|
140 |
-
export_data_json = gr.File(visible=False)
|
141 |
-
|
142 |
-
with gr.TabItem("Grouping Settings"):
|
143 |
-
with gr.Row(visible=False) as group_choices:
|
144 |
-
with gr.Column(scale=2):
|
145 |
-
group_regex = gr.Text(
|
146 |
-
label="Group Regex",
|
147 |
-
value=None,
|
148 |
-
)
|
149 |
-
with gr.Row():
|
150 |
-
top_select = gr.Number(
|
151 |
-
label="N Groups",
|
152 |
-
value=100,
|
153 |
-
interactive=True,
|
154 |
-
)
|
155 |
-
|
156 |
-
direction_checkbox = gr.Radio(
|
157 |
-
label="Partition",
|
158 |
-
choices=[
|
159 |
-
"Top",
|
160 |
-
"Bottom",
|
161 |
-
"Most frequent (n_docs)",
|
162 |
-
],
|
163 |
-
value="Most frequent (n_docs)",
|
164 |
-
)
|
165 |
-
|
166 |
-
with gr.TabItem("Histogram Settings") as histogram_settings:
|
167 |
-
cdf_checkbox = gr.Checkbox(
|
168 |
-
label="CDF",
|
169 |
-
value=False,
|
170 |
-
)
|
171 |
-
perc_checkbox = gr.Checkbox(
|
172 |
-
label="%",
|
173 |
-
value=False,
|
174 |
-
)
|
175 |
-
with gr.Column(visible=False) as min_max_hist:
|
176 |
-
min_max_hist_data = gr.Markdown()
|
177 |
-
|
178 |
-
with gr.Row():
|
179 |
-
graph_output = gr.Plot(label="Graph")
|
180 |
|
181 |
with gr.TabItem("Reverse Metrics Search"):
|
182 |
-
|
183 |
-
|
184 |
-
with gr.Row():
|
185 |
-
with gr.Column(scale=1):
|
186 |
-
reverse_grouping_dropdown = gr.Dropdown(
|
187 |
-
choices=[],
|
188 |
-
label="Grouping",
|
189 |
-
multiselect=False,
|
190 |
-
)
|
191 |
-
reverse_metric_name_dropdown = gr.Dropdown(
|
192 |
-
choices=[],
|
193 |
-
label="Metric Name",
|
194 |
-
multiselect=False,
|
195 |
-
)
|
196 |
-
reverse_search_button = gr.Button("Search")
|
197 |
-
reverse_search_add_button = gr.Button("Add to selection")
|
198 |
-
|
199 |
-
with gr.Column(scale=2):
|
200 |
-
reverse_search_results = gr.Textbox(
|
201 |
-
label="Found datasets",
|
202 |
-
lines=10,
|
203 |
-
placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
|
204 |
-
)
|
205 |
-
|
206 |
-
render_button.click(
|
207 |
-
fn=update_graph,
|
208 |
-
inputs=[
|
209 |
-
base_folder,
|
210 |
-
datasets_selected,
|
211 |
-
metric_name_dropdown,
|
212 |
-
grouping_dropdown,
|
213 |
-
log_scale_x_checkbox,
|
214 |
-
log_scale_y_checkbox,
|
215 |
-
rounding,
|
216 |
-
normalization_checkbox,
|
217 |
-
top_select,
|
218 |
-
direction_checkbox,
|
219 |
-
group_regex,
|
220 |
-
cdf_checkbox,
|
221 |
-
perc_checkbox
|
222 |
-
],
|
223 |
-
outputs=[graph_output, exported_data, export_data_json, min_max_hist_data],
|
224 |
-
)
|
225 |
-
|
226 |
-
gr.on(
|
227 |
-
triggers=[normalization_checkbox.change, rounding.change, group_regex.change, direction_checkbox.change,
|
228 |
-
top_select.change, log_scale_x_checkbox.change,
|
229 |
-
log_scale_y_checkbox.change, cdf_checkbox.change, perc_checkbox.change],
|
230 |
-
fn=plot_data,
|
231 |
-
inputs=[
|
232 |
-
exported_data,
|
233 |
-
metric_name_dropdown,
|
234 |
-
normalization_checkbox,
|
235 |
-
rounding,
|
236 |
-
grouping_dropdown,
|
237 |
-
top_select,
|
238 |
-
direction_checkbox,
|
239 |
-
group_regex,
|
240 |
-
log_scale_x_checkbox,
|
241 |
-
log_scale_y_checkbox,
|
242 |
-
cdf_checkbox,
|
243 |
-
perc_checkbox
|
244 |
-
],
|
245 |
-
outputs=[graph_output],
|
246 |
-
)
|
247 |
-
|
248 |
-
datasets_selected.change(
|
249 |
-
fn=fetch_groups,
|
250 |
-
inputs=[base_folder, datasets_selected, grouping_dropdown],
|
251 |
-
outputs=grouping_dropdown,
|
252 |
-
)
|
253 |
-
|
254 |
-
grouping_dropdown.change(
|
255 |
-
fn=fetch_metrics,
|
256 |
-
inputs=[base_folder, datasets_selected, grouping_dropdown, metric_name_dropdown],
|
257 |
-
outputs=metric_name_dropdown,
|
258 |
-
)
|
259 |
-
|
260 |
-
reverse_grouping_dropdown.select(
|
261 |
-
fn=partial(fetch_metrics, type="union"),
|
262 |
-
inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
|
263 |
-
outputs=reverse_metric_name_dropdown,
|
264 |
-
)
|
265 |
-
|
266 |
-
reverse_search_button.click(
|
267 |
-
fn=reverse_search,
|
268 |
-
inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
|
269 |
-
outputs=reverse_search_results,
|
270 |
-
)
|
271 |
-
|
272 |
-
reverse_search_add_button.click(
|
273 |
-
fn=reverse_search_add,
|
274 |
-
inputs=[datasets_selected, reverse_search_results],
|
275 |
-
outputs=datasets_selected,
|
276 |
-
)
|
277 |
-
|
278 |
-
datasets_refetch.click(
|
279 |
-
fn=fetch_datasets,
|
280 |
-
inputs=[base_folder],
|
281 |
-
outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
|
282 |
-
)
|
283 |
-
|
284 |
-
|
285 |
-
def update_datasets_with_regex(regex, selected_runs, all_runs):
|
286 |
-
if not regex:
|
287 |
-
return
|
288 |
-
new_dsts = {run for run in all_runs if re.search(regex, run)}
|
289 |
-
if not new_dsts:
|
290 |
-
return gr.update(value=list(selected_runs))
|
291 |
-
dst_union = new_dsts.union(selected_runs or [])
|
292 |
-
return gr.update(value=sorted(list(dst_union)))
|
293 |
-
|
294 |
-
|
295 |
-
regex_button.click(
|
296 |
-
fn=update_datasets_with_regex,
|
297 |
-
inputs=[regex_select, datasets_selected, datasets],
|
298 |
-
outputs=datasets_selected,
|
299 |
-
)
|
300 |
-
|
301 |
-
|
302 |
-
def update_grouping_options(grouping):
|
303 |
-
if grouping == "histogram":
|
304 |
-
return {
|
305 |
-
normalization_checkbox: gr.Column(visible=True),
|
306 |
-
group_choices: gr.Column(visible=False),
|
307 |
-
min_max_hist: gr.Column(visible=True),
|
308 |
-
histogram_settings: gr.TabItem(visible=True),
|
309 |
-
}
|
310 |
-
else:
|
311 |
-
return {
|
312 |
-
normalization_checkbox: gr.Column(visible=False),
|
313 |
-
group_choices: gr.Column(visible=True),
|
314 |
-
min_max_hist: gr.Column(visible=False),
|
315 |
-
histogram_settings: gr.TabItem(visible=False),
|
316 |
-
}
|
317 |
-
|
318 |
-
|
319 |
-
grouping_dropdown.change(
|
320 |
-
fn=update_grouping_options,
|
321 |
-
inputs=[grouping_dropdown],
|
322 |
-
outputs=[normalization_checkbox, group_choices, min_max_hist, histogram_settings],
|
323 |
-
)
|
324 |
|
325 |
return demo
|
|
|
1 |
+
import os
|
2 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
3 |
from functools import partial
|
|
|
4 |
import re
|
5 |
+
from src.view.help_tab import create_help_tab
|
6 |
+
from src.view.metric_view_tab import create_metric_view_tab
|
7 |
+
from src.view.reverse_search_tab import create_reverse_search_tab
|
8 |
+
from src.logic.data_fetching import fetch_datasets, fetch_groups, fetch_metrics, reverse_search, reverse_search_add
|
9 |
|
10 |
METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files")
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
def create_interface():
|
13 |
with gr.Blocks() as demo:
|
|
|
|
|
14 |
metrics_headline = gr.Markdown(value="# Metrics Exploration")
|
15 |
+
available_datasets = gr.State([])
|
16 |
+
selected_datasets = gr.State([])
|
17 |
|
18 |
with gr.Tabs():
|
19 |
+
with gr.Tab("Help"):
|
20 |
+
create_help_tab()
|
21 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
with gr.TabItem("Metric View"):
|
23 |
+
base_folder = create_metric_view_tab(METRICS_LOCATION_DEFAULT, available_datasets, selected_datasets)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
with gr.TabItem("Reverse Metrics Search"):
|
26 |
+
create_reverse_search_tab(base_folder, available_datasets, selected_datasets)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
return demo
|